Support UTF-32 position encoding

Looks like this is a native encoding for Emacs at least!
author: Aleksey Kladov <aleksey.kladov@gmail.com> 2023-02-14 00:56:28 +0000
committer: Aleksey Kladov <aleksey.kladov@gmail.com> 2023-02-14 01:09:50 +0000
commit: 0da27376cf3768d92bcd0c094b52da50146dc70f (patch)
tree: 0249938c5a632d60c548a1ca8767c30892804b7f
parent: c97aae38f20f64daede9877212aff83c259a4faa (diff)
download: rust-0da27376cf3768d92bcd0c094b52da50146dc70f.tar.gz
rust-0da27376cf3768d92bcd0c094b52da50146dc70f.zip
18 files changed, 211 insertions, 159 deletions
diff --git a/Cargo.lock b/Cargo.lock
index ef0316f30fb..6d8a3eeb739 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -711,6 +711,7 @@ dependencies = [
  "limit",
  "memchr",
  "once_cell",
+ "oorandom",
  "parser",
  "profile",
  "rayon",
diff --git a/crates/ide-db/Cargo.toml b/crates/ide-db/Cargo.toml
index 9672bb9b7b5..57daaf623df 100644
--- a/crates/ide-db/Cargo.toml
+++ b/crates/ide-db/Cargo.toml
@@ -37,8 +37,9 @@ text-edit.workspace = true
 hir.workspace = true
 
 [dev-dependencies]
-xshell = "0.2.2"
 expect-test = "1.4.0"
+oorandom = "11.1.3"
+xshell = "0.2.2"
 
 # local deps
 test-utils.workspace = true
diff --git a/crates/ide-db/src/line_index.rs b/crates/ide-db/src/line_index.rs
index 8f12ab33409..c17ca95f5d0 100644
--- a/crates/ide-db/src/line_index.rs
+++ b/crates/ide-db/src/line_index.rs
@@ -7,56 +7,72 @@ use syntax::{TextRange, TextSize};
 
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub struct LineIndex {
-    /// Offset the the beginning of each line, zero-based
+    /// Offset of the beginning of each line, zero-based.
     pub(crate) newlines: Vec<TextSize>,
-    /// List of non-ASCII characters on each line
-    pub(crate) utf16_lines: NoHashHashMap<u32, Vec<Utf16Char>>,
+    /// List of non-ASCII characters on each line.
+    pub(crate) line_wide_chars: NoHashHashMap<u32, Vec<WideChar>>,
 }
 
+/// Line/Column information in native, utf8 format.
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
-pub struct LineColUtf16 {
+pub struct LineCol {
     /// Zero-based
     pub line: u32,
-    /// Zero-based
+    /// Zero-based utf8 offset
     pub col: u32,
 }
 
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
-pub struct LineCol {
+pub enum WideEncoding {
+    Utf16,
+    Utf32,
+}
+
+/// Line/Column information in legacy encodings.
+///
+/// Deliberately not a generic type and different from `LineCol`.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub struct WideLineCol {
     /// Zero-based
     pub line: u32,
-    /// Zero-based utf8 offset
+    /// Zero-based
     pub col: u32,
 }
 
 #[derive(Clone, Debug, Hash, PartialEq, Eq)]
-pub(crate) struct Utf16Char {
+pub(crate) struct WideChar {
     /// Start offset of a character inside a line, zero-based
     pub(crate) start: TextSize,
     /// End offset of a character inside a line, zero-based
     pub(crate) end: TextSize,
 }
 
-impl Utf16Char {
+impl WideChar {
     /// Returns the length in 8-bit UTF-8 code units.
     fn len(&self) -> TextSize {
         self.end - self.start
     }
 
-    /// Returns the length in 16-bit UTF-16 code units.
-    fn len_utf16(&self) -> usize {
-        if self.len() == TextSize::from(4) {
-            2
-        } else {
-            1
+    /// Returns the length in UTF-16 or UTF-32 code units.
+    fn wide_len(&self, enc: WideEncoding) -> usize {
+        match enc {
+            WideEncoding::Utf16 => {
+                if self.len() == TextSize::from(4) {
+                    2
+                } else {
+                    1
+                }
+            }
+
+            WideEncoding::Utf32 => 1,
         }
     }
 }
 
 impl LineIndex {
     pub fn new(text: &str) -> LineIndex {
-        let mut utf16_lines = NoHashHashMap::default();
-        let mut utf16_chars = Vec::new();
+        let mut line_wide_chars = NoHashHashMap::default();
+        let mut wide_chars = Vec::new();
 
         let mut newlines = Vec::with_capacity(16);
         newlines.push(TextSize::from(0));
@@ -71,8 +87,8 @@ impl LineIndex {
                 newlines.push(curr_row);
 
                 // Save any utf-16 characters seen in the previous line
-                if !utf16_chars.is_empty() {
-                    utf16_lines.insert(line, mem::take(&mut utf16_chars));
+                if !wide_chars.is_empty() {
+                    line_wide_chars.insert(line, mem::take(&mut wide_chars));
                 }
 
                 // Prepare for processing the next line
@@ -82,18 +98,18 @@ impl LineIndex {
             }
 
             if !c.is_ascii() {
-                utf16_chars.push(Utf16Char { start: curr_col, end: curr_col + c_len });
+                wide_chars.push(WideChar { start: curr_col, end: curr_col + c_len });
             }
 
             curr_col += c_len;
         }
 
         // Save any utf-16 characters seen in the last line
-        if !utf16_chars.is_empty() {
-            utf16_lines.insert(line, utf16_chars);
+        if !wide_chars.is_empty() {
+            line_wide_chars.insert(line, wide_chars);
         }
 
-        LineIndex { newlines, utf16_lines }
+        LineIndex { newlines, line_wide_chars }
     }
 
     pub fn line_col(&self, offset: TextSize) -> LineCol {
@@ -109,13 +125,13 @@ impl LineIndex {
             .map(|offset| offset + TextSize::from(line_col.col))
     }
 
-    pub fn to_utf16(&self, line_col: LineCol) -> LineColUtf16 {
-        let col = self.utf8_to_utf16_col(line_col.line, line_col.col.into());
-        LineColUtf16 { line: line_col.line, col: col as u32 }
+    pub fn to_wide(&self, enc: WideEncoding, line_col: LineCol) -> WideLineCol {
+        let col = self.utf8_to_wide_col(enc, line_col.line, line_col.col.into());
+        WideLineCol { line: line_col.line, col: col as u32 }
     }
 
-    pub fn to_utf8(&self, line_col: LineColUtf16) -> LineCol {
-        let col = self.utf16_to_utf8_col(line_col.line, line_col.col);
+    pub fn to_utf8(&self, enc: WideEncoding, line_col: WideLineCol) -> LineCol {
+        let col = self.wide_to_utf8_col(enc, line_col.line, line_col.col);
         LineCol { line: line_col.line, col: col.into() }
     }
 
@@ -132,12 +148,12 @@ impl LineIndex {
             .filter(|it| !it.is_empty())
     }
 
-    fn utf8_to_utf16_col(&self, line: u32, col: TextSize) -> usize {
+    fn utf8_to_wide_col(&self, enc: WideEncoding, line: u32, col: TextSize) -> usize {
         let mut res: usize = col.into();
-        if let Some(utf16_chars) = self.utf16_lines.get(&line) {
-            for c in utf16_chars {
+        if let Some(wide_chars) = self.line_wide_chars.get(&line) {
+            for c in wide_chars {
                 if c.end <= col {
-                    res -= usize::from(c.len()) - c.len_utf16();
+                    res -= usize::from(c.len()) - c.wide_len(enc);
                 } else {
                     // From here on, all utf16 characters come *after* the character we are mapping,
                     // so we don't need to take them into account
@@ -148,11 +164,11 @@ impl LineIndex {
         res
     }
 
-    fn utf16_to_utf8_col(&self, line: u32, mut col: u32) -> TextSize {
-        if let Some(utf16_chars) = self.utf16_lines.get(&line) {
-            for c in utf16_chars {
+    fn wide_to_utf8_col(&self, enc: WideEncoding, line: u32, mut col: u32) -> TextSize {
+        if let Some(wide_chars) = self.line_wide_chars.get(&line) {
+            for c in wide_chars {
                 if col > u32::from(c.start) {
-                    col += u32::from(c.len()) - c.len_utf16() as u32;
+                    col += u32::from(c.len()) - c.wide_len(enc) as u32;
                 } else {
                     // From here on, all utf16 characters come *after* the character we are mapping,
                     // so we don't need to take them into account
@@ -167,6 +183,9 @@ impl LineIndex {
 
 #[cfg(test)]
 mod tests {
+    use test_utils::skip_slow_tests;
+
+    use super::WideEncoding::{Utf16, Utf32};
     use super::*;
 
     #[test]
@@ -210,67 +229,59 @@ mod tests {
 const C: char = 'x';
 ",
         );
-        assert_eq!(col_index.utf16_lines.len(), 0);
+        assert_eq!(col_index.line_wide_chars.len(), 0);
     }
 
     #[test]
-    fn test_single_char() {
-        let col_index = LineIndex::new(
-            "
-const C: char = 'メ';
-",
-        );
-
-        assert_eq!(col_index.utf16_lines.len(), 1);
-        assert_eq!(col_index.utf16_lines[&1].len(), 1);
-        assert_eq!(col_index.utf16_lines[&1][0], Utf16Char { start: 17.into(), end: 20.into() });
-
-        // UTF-8 to UTF-16, no changes
-        assert_eq!(col_index.utf8_to_utf16_col(1, 15.into()), 15);
-
-        // UTF-8 to UTF-16
-        assert_eq!(col_index.utf8_to_utf16_col(1, 22.into()), 20);
-
-        // UTF-16 to UTF-8, no changes
-        assert_eq!(col_index.utf16_to_utf8_col(1, 15), TextSize::from(15));
-
-        // UTF-16 to UTF-8
-        assert_eq!(col_index.utf16_to_utf8_col(1, 19), TextSize::from(21));
-
-        let col_index = LineIndex::new("a𐐏b");
-        assert_eq!(col_index.utf16_to_utf8_col(0, 3), TextSize::from(5));
-    }
-
-    #[test]
-    fn test_string() {
-        let col_index = LineIndex::new(
-            "
-const C: char = \"メ メ\";
-",
-        );
-
-        assert_eq!(col_index.utf16_lines.len(), 1);
-        assert_eq!(col_index.utf16_lines[&1].len(), 2);
-        assert_eq!(col_index.utf16_lines[&1][0], Utf16Char { start: 17.into(), end: 20.into() });
-        assert_eq!(col_index.utf16_lines[&1][1], Utf16Char { start: 21.into(), end: 24.into() });
-
-        // UTF-8 to UTF-16
-        assert_eq!(col_index.utf8_to_utf16_col(1, 15.into()), 15);
-
-        assert_eq!(col_index.utf8_to_utf16_col(1, 21.into()), 19);
-        assert_eq!(col_index.utf8_to_utf16_col(1, 25.into()), 21);
-
-        assert!(col_index.utf8_to_utf16_col(2, 15.into()) == 15);
-
-        // UTF-16 to UTF-8
-        assert_eq!(col_index.utf16_to_utf8_col(1, 15), TextSize::from(15));
+    fn test_every_chars() {
+        if skip_slow_tests() {
+            return;
+        }
 
-        // メ UTF-8: 0xE3 0x83 0xA1, UTF-16: 0x30E1
-        assert_eq!(col_index.utf16_to_utf8_col(1, 17), TextSize::from(17)); // first メ at 17..20
-        assert_eq!(col_index.utf16_to_utf8_col(1, 18), TextSize::from(20)); // space
-        assert_eq!(col_index.utf16_to_utf8_col(1, 19), TextSize::from(21)); // second メ at 21..24
+        let text: String = {
+            let mut chars: Vec<char> = ((0 as char)..char::MAX).collect(); // Neat!
+            chars.extend("\n".repeat(chars.len() / 16).chars());
+            let mut rng = oorandom::Rand32::new(stdx::rand::seed());
+            stdx::rand::shuffle(&mut chars, |i| rng.rand_range(0..i as u32) as usize);
+            chars.into_iter().collect()
+        };
+        assert!(text.contains('💩')); // Sanity check.
+
+        let line_index = LineIndex::new(&text);
+
+        let mut lin_col = LineCol { line: 0, col: 0 };
+        let mut col_utf16 = 0;
+        let mut col_utf32 = 0;
+        for (offset, c) in text.char_indices() {
+            let got_offset = line_index.offset(lin_col).unwrap();
+            assert_eq!(usize::from(got_offset), offset);
+
+            let got_lin_col = line_index.line_col(got_offset);
+            assert_eq!(got_lin_col, lin_col);
+
+            for enc in [Utf16, Utf32] {
+                let wide_lin_col = line_index.to_wide(enc, lin_col);
+                let got_lin_col = line_index.to_utf8(enc, wide_lin_col);
+                assert_eq!(got_lin_col, lin_col);
+
+                let want_col = match enc {
+                    Utf16 => col_utf16,
+                    Utf32 => col_utf32,
+                };
+                assert_eq!(wide_lin_col.col, want_col)
+            }
 
-        assert_eq!(col_index.utf16_to_utf8_col(2, 15), TextSize::from(15));
+            if c == '\n' {
+                lin_col.line += 1;
+                lin_col.col = 0;
+                col_utf16 = 0;
+                col_utf32 = 0;
+            } else {
+                lin_col.col += c.len_utf8() as u32;
+                col_utf16 += c.len_utf16() as u32;
+                col_utf32 += 1;
+            }
+        }
     }
 
     #[test]
diff --git a/crates/ide/src/lib.rs b/crates/ide/src/lib.rs
index 4ead9d4d0a8..f2b535bdc7e 100644
--- a/crates/ide/src/lib.rs
+++ b/crates/ide/src/lib.rs
@@ -115,7 +115,7 @@ pub use ide_db::{
         SourceRoot, SourceRootId,
     },
     label::Label,
-    line_index::{LineCol, LineColUtf16, LineIndex},
+    line_index::{LineCol, LineIndex},
     search::{ReferenceCategory, SearchScope},
     source_change::{FileSystemEdit, SourceChange},
     symbol_index::Query,
diff --git a/crates/ide/src/shuffle_crate_graph.rs b/crates/ide/src/shuffle_crate_graph.rs
index ae539a5d397..e606072a823 100644
--- a/crates/ide/src/shuffle_crate_graph.rs
+++ b/crates/ide/src/shuffle_crate_graph.rs
@@ -18,7 +18,9 @@ pub(crate) fn shuffle_crate_graph(db: &mut RootDatabase) {
     let crate_graph = db.crate_graph();
 
     let mut shuffled_ids = crate_graph.iter().collect::<Vec<_>>();
-    shuffle(&mut shuffled_ids);
+
+    let mut rng = oorandom::Rand32::new(stdx::rand::seed());
+    stdx::rand::shuffle(&mut shuffled_ids, |i| rng.rand_range(0..i as u32) as usize);
 
     let mut new_graph = CrateGraph::default();
 
@@ -52,21 +54,3 @@ pub(crate) fn shuffle_crate_graph(db: &mut RootDatabase) {
 
     db.set_crate_graph_with_durability(Arc::new(new_graph), Durability::HIGH);
 }
-
-fn shuffle<T>(slice: &mut [T]) {
-    let mut rng = oorandom::Rand32::new(seed());
-
-    let mut remaining = slice.len() - 1;
-    while remaining > 0 {
-        let index = rng.rand_range(0..remaining as u32);
-        slice.swap(remaining, index as usize);
-        remaining -= 1;
-    }
-}
-
-fn seed() -> u64 {
-    use std::collections::hash_map::RandomState;
-    use std::hash::{BuildHasher, Hasher};
-
-    RandomState::new().build_hasher().finish()
-}
diff --git a/crates/rust-analyzer/src/caps.rs b/crates/rust-analyzer/src/caps.rs
index 841861635c6..a9ed05021da 100644
--- a/crates/rust-analyzer/src/caps.rs
+++ b/crates/rust-analyzer/src/caps.rs
@@ -1,4 +1,5 @@
 //! Advertises the capabilities of the LSP Server.
+use ide_db::line_index::WideEncoding;
 use lsp_types::{
     CallHierarchyServerCapability, ClientCapabilities, CodeActionKind, CodeActionOptions,
     CodeActionProviderCapability, CodeLensOptions, CompletionOptions,
@@ -16,16 +17,19 @@ use lsp_types::{
 use serde_json::json;
 
 use crate::config::{Config, RustfmtConfig};
-use crate::lsp_ext::supports_utf8;
+use crate::line_index::PositionEncoding;
+use crate::lsp_ext::negotiated_encoding;
 use crate::semantic_tokens;
 
 pub fn server_capabilities(config: &Config) -> ServerCapabilities {
     ServerCapabilities {
-        position_encoding: if supports_utf8(config.caps()) {
-            Some(PositionEncodingKind::UTF8)
-        } else {
-            None
-        },
+        position_encoding: Some(match negotiated_encoding(config.caps()) {
+            PositionEncoding::Utf8 => PositionEncodingKind::UTF8,
+            PositionEncoding::Wide(wide) => match wide {
+                WideEncoding::Utf16 => PositionEncodingKind::UTF16,
+                WideEncoding::Utf32 => PositionEncodingKind::UTF32,
+            },
+        }),
         text_document_sync: Some(TextDocumentSyncCapability::Options(TextDocumentSyncOptions {
             open_close: Some(true),
             change: Some(TextDocumentSyncKind::INCREMENTAL),
diff --git a/crates/rust-analyzer/src/cli/lsif.rs b/crates/rust-analyzer/src/cli/lsif.rs
index 60a7f99ccdb..3fc1aa4eaeb 100644
--- a/crates/rust-analyzer/src/cli/lsif.rs
+++ b/crates/rust-analyzer/src/cli/lsif.rs
@@ -11,6 +11,7 @@ use ide::{
 use ide_db::LineIndexDatabase;
 
 use ide_db::base_db::salsa::{self, ParallelDatabase};
+use ide_db::line_index::WideEncoding;
 use lsp_types::{self, lsif};
 use project_model::{CargoConfig, ProjectManifest, ProjectWorkspace};
 use vfs::{AbsPathBuf, Vfs};
@@ -127,7 +128,7 @@ impl LsifManager<'_> {
         let line_index = self.db.line_index(file_id);
         let line_index = LineIndex {
             index: line_index,
-            encoding: PositionEncoding::Utf16,
+            encoding: PositionEncoding::Wide(WideEncoding::Utf16),
             endings: LineEndings::Unix,
         };
         let range_id = self.add_vertex(lsif::Vertex::Range {
@@ -249,7 +250,7 @@ impl LsifManager<'_> {
         let line_index = self.db.line_index(file_id);
         let line_index = LineIndex {
             index: line_index,
-            encoding: PositionEncoding::Utf16,
+            encoding: PositionEncoding::Wide(WideEncoding::Utf16),
             endings: LineEndings::Unix,
         };
         let result = folds
diff --git a/crates/rust-analyzer/src/config.rs b/crates/rust-analyzer/src/config.rs
index be09938c2c4..f609a50a05f 100644
--- a/crates/rust-analyzer/src/config.rs
+++ b/crates/rust-analyzer/src/config.rs
@@ -33,7 +33,7 @@ use crate::{
     caps::completion_item_edit_resolve,
     diagnostics::DiagnosticsMapConfig,
     line_index::PositionEncoding,
-    lsp_ext::{self, supports_utf8, WorkspaceSymbolSearchKind, WorkspaceSymbolSearchScope},
+    lsp_ext::{self, negotiated_encoding, WorkspaceSymbolSearchKind, WorkspaceSymbolSearchScope},
 };
 
 mod patch_old_style;
@@ -999,11 +999,7 @@ impl Config {
     }
 
     pub fn position_encoding(&self) -> PositionEncoding {
-        if supports_utf8(&self.caps) {
-            PositionEncoding::Utf8
-        } else {
-            PositionEncoding::Utf16
-        }
+        negotiated_encoding(&self.caps)
     }
 
     fn experimental(&self, index: &'static str) -> bool {
diff --git a/crates/rust-analyzer/src/diagnostics/to_proto.rs b/crates/rust-analyzer/src/diagnostics/to_proto.rs
index 55b89019b47..415fa4e02f2 100644
--- a/crates/rust-analyzer/src/diagnostics/to_proto.rs
+++ b/crates/rust-analyzer/src/diagnostics/to_proto.rs
@@ -3,6 +3,7 @@
 use std::collections::HashMap;
 
 use flycheck::{Applicability, DiagnosticLevel, DiagnosticSpan};
+use ide_db::line_index::WideEncoding;
 use itertools::Itertools;
 use stdx::format_to;
 use vfs::{AbsPath, AbsPathBuf};
@@ -95,7 +96,8 @@ fn position(
         let mut char_offset = 0;
         let len_func = match position_encoding {
             PositionEncoding::Utf8 => char::len_utf8,
-            PositionEncoding::Utf16 => char::len_utf16,
+            PositionEncoding::Wide(WideEncoding::Utf16) => char::len_utf16,
+            PositionEncoding::Wide(WideEncoding::Utf32) => |_| 1,
         };
         for c in line.text.chars() {
             char_offset += 1;
diff --git a/crates/rust-analyzer/src/from_proto.rs b/crates/rust-analyzer/src/from_proto.rs
index 2dbb14fcd9a..50af38cd6fe 100644
--- a/crates/rust-analyzer/src/from_proto.rs
+++ b/crates/rust-analyzer/src/from_proto.rs
@@ -1,7 +1,10 @@
 //! Conversion lsp_types types to rust-analyzer specific ones.
 use anyhow::format_err;
-use ide::{Annotation, AnnotationKind, AssistKind, LineCol, LineColUtf16};
-use ide_db::base_db::{FileId, FilePosition, FileRange};
+use ide::{Annotation, AnnotationKind, AssistKind, LineCol};
+use ide_db::{
+    base_db::{FileId, FilePosition, FileRange},
+    line_index::WideLineCol,
+};
 use syntax::{TextRange, TextSize};
 use vfs::AbsPathBuf;
 
@@ -26,9 +29,9 @@ pub(crate) fn vfs_path(url: &lsp_types::Url) -> Result<vfs::VfsPath> {
 pub(crate) fn offset(line_index: &LineIndex, position: lsp_types::Position) -> Result<TextSize> {
     let line_col = match line_index.encoding {
         PositionEncoding::Utf8 => LineCol { line: position.line, col: position.character },
-        PositionEncoding::Utf16 => {
-            let line_col = LineColUtf16 { line: position.line, col: position.character };
-            line_index.index.to_utf8(line_col)
+        PositionEncoding::Wide(enc) => {
+            let line_col = WideLineCol { line: position.line, col: position.character };
+            line_index.index.to_utf8(enc, line_col)
         }
     };
     let text_size =
diff --git a/crates/rust-analyzer/src/line_index.rs b/crates/rust-analyzer/src/line_index.rs
index 2945dba12f2..791cd931d42 100644
--- a/crates/rust-analyzer/src/line_index.rs
+++ b/crates/rust-analyzer/src/line_index.rs
@@ -7,9 +7,12 @@
 
 use std::sync::Arc;
 
+use ide_db::line_index::WideEncoding;
+
+#[derive(Clone, Copy)]
 pub enum PositionEncoding {
     Utf8,
-    Utf16,
+    Wide(WideEncoding),
 }
 
 pub(crate) struct LineIndex {
diff --git a/crates/rust-analyzer/src/lsp_ext.rs b/crates/rust-analyzer/src/lsp_ext.rs
index 08b2c837de3..e33589cc536 100644
--- a/crates/rust-analyzer/src/lsp_ext.rs
+++ b/crates/rust-analyzer/src/lsp_ext.rs
@@ -2,6 +2,7 @@
 
 use std::{collections::HashMap, path::PathBuf};
 
+use ide_db::line_index::WideEncoding;
 use lsp_types::request::Request;
 use lsp_types::PositionEncodingKind;
 use lsp_types::{
@@ -10,6 +11,8 @@ use lsp_types::{
 };
 use serde::{Deserialize, Serialize};
 
+use crate::line_index::PositionEncoding;
+
 pub enum AnalyzerStatus {}
 
 impl Request for AnalyzerStatus {
@@ -481,16 +484,22 @@ pub(crate) enum CodeLensResolveData {
     References(lsp_types::TextDocumentPositionParams),
 }
 
-pub fn supports_utf8(caps: &lsp_types::ClientCapabilities) -> bool {
-    match &caps.general {
-        Some(general) => general
-            .position_encodings
-            .as_deref()
-            .unwrap_or_default()
-            .iter()
-            .any(|it| it == &PositionEncodingKind::UTF8),
-        _ => false,
+pub fn negotiated_encoding(caps: &lsp_types::ClientCapabilities) -> PositionEncoding {
+    let client_encodings = match &caps.general {
+        Some(general) => general.position_encodings.as_deref().unwrap_or_default(),
+        None => &[],
+    };
+
+    for enc in client_encodings {
+        if enc == &PositionEncodingKind::UTF8 {
+            return PositionEncoding::Utf8;
+        } else if enc == &PositionEncodingKind::UTF32 {
+            return PositionEncoding::Wide(WideEncoding::Utf32);
+        }
+        // NB: intentionally prefer just about anything else to utf-16.
     }
+
+    PositionEncoding::Wide(WideEncoding::Utf16)
 }
 
 pub enum MoveItem {}
diff --git a/crates/rust-analyzer/src/lsp_utils.rs b/crates/rust-analyzer/src/lsp_utils.rs
index baa77a005e2..30f1c53c198 100644
--- a/crates/rust-analyzer/src/lsp_utils.rs
+++ b/crates/rust-analyzer/src/lsp_utils.rs
@@ -161,6 +161,7 @@ impl GlobalState {
 }
 
 pub(crate) fn apply_document_changes(
+    encoding: PositionEncoding,
     file_contents: impl FnOnce() -> String,
     mut content_changes: Vec<lsp_types::TextDocumentContentChangeEvent>,
 ) -> String {
@@ -192,9 +193,9 @@ pub(crate) fn apply_document_changes(
     let mut line_index = LineIndex {
         // the index will be overwritten in the bottom loop's first iteration
         index: Arc::new(ide::LineIndex::new(&text)),
-        // We don't care about line endings or offset encoding here.
+        // We don't care about line endings here.
         endings: LineEndings::Unix,
-        encoding: PositionEncoding::Utf16,
+        encoding,
     };
 
     // The changes we got must be applied sequentially, but can cross lines so we
@@ -256,6 +257,7 @@ pub(crate) fn all_edits_are_disjoint(
 
 #[cfg(test)]
 mod tests {
+    use ide_db::line_index::WideEncoding;
     use lsp_types::{
         CompletionItem, CompletionTextEdit, InsertReplaceEdit, Position, Range,
         TextDocumentContentChangeEvent,
@@ -278,9 +280,11 @@ mod tests {
             };
         }
 
-        let text = apply_document_changes(|| String::new(), vec![]);
+        let encoding = PositionEncoding::Wide(WideEncoding::Utf16);
+        let text = apply_document_changes(encoding, || String::new(), vec![]);
         assert_eq!(text, "");
         let text = apply_document_changes(
+            encoding,
             || text,
             vec![TextDocumentContentChangeEvent {
                 range: None,
@@ -289,39 +293,49 @@ mod tests {
             }],
         );
         assert_eq!(text, "the");
-        let text = apply_document_changes(|| text, c![0, 3; 0, 3 => " quick"]);
+        let text = apply_document_changes(encoding, || text, c![0, 3; 0, 3 => " quick"]);
         assert_eq!(text, "the quick");
-        let text = apply_document_changes(|| text, c![0, 0; 0, 4 => "", 0, 5; 0, 5 => " foxes"]);
+        let text =
+            apply_document_changes(encoding, || text, c![0, 0; 0, 4 => "", 0, 5; 0, 5 => " foxes"]);
         assert_eq!(text, "quick foxes");
-        let text = apply_document_changes(|| text, c![0, 11; 0, 11 => "\ndream"]);
+        let text = apply_document_changes(encoding, || text, c![0, 11; 0, 11 => "\ndream"]);
         assert_eq!(text, "quick foxes\ndream");
-        let text = apply_document_changes(|| text, c![1, 0; 1, 0 => "have "]);
+        let text = apply_document_changes(encoding, || text, c![1, 0; 1, 0 => "have "]);
         assert_eq!(text, "quick foxes\nhave dream");
         let text = apply_document_changes(
+            encoding,
             || text,
             c![0, 0; 0, 0 => "the ", 1, 4; 1, 4 => " quiet", 1, 16; 1, 16 => "s\n"],
         );
         assert_eq!(text, "the quick foxes\nhave quiet dreams\n");
-        let text = apply_document_changes(|| text, c![0, 15; 0, 15 => "\n", 2, 17; 2, 17 => "\n"]);
+        let text = apply_document_changes(
+            encoding,
+            || text,
+            c![0, 15; 0, 15 => "\n", 2, 17; 2, 17 => "\n"],
+        );
         assert_eq!(text, "the quick foxes\n\nhave quiet dreams\n\n");
         let text = apply_document_changes(
+            encoding,
             || text,
             c![1, 0; 1, 0 => "DREAM", 2, 0; 2, 0 => "they ", 3, 0; 3, 0 => "DON'T THEY?"],
         );
         assert_eq!(text, "the quick foxes\nDREAM\nthey have quiet dreams\nDON'T THEY?\n");
-        let text = apply_document_changes(|| text, c![0, 10; 1, 5 => "", 2, 0; 2, 12 => ""]);
+        let text =
+            apply_document_changes(encoding, || text, c![0, 10; 1, 5 => "", 2, 0; 2, 12 => ""]);
         assert_eq!(text, "the quick \nthey have quiet dreams\n");
 
         let text = String::from("❤️");
-        let text = apply_document_changes(|| text, c![0, 0; 0, 0 => "a"]);
+        let text = apply_document_changes(encoding, || text, c![0, 0; 0, 0 => "a"]);
         assert_eq!(text, "a❤️");
 
         let text = String::from("a\nb");
-        let text = apply_document_changes(|| text, c![0, 1; 1, 0 => "\nțc", 0, 1; 1, 1 => "d"]);
+        let text =
+            apply_document_changes(encoding, || text, c![0, 1; 1, 0 => "\nțc", 0, 1; 1, 1 => "d"]);
         assert_eq!(text, "adcb");
 
         let text = String::from("a\nb");
-        let text = apply_document_changes(|| text, c![0, 1; 1, 0 => "ț\nc", 0, 2; 0, 2 => "c"]);
+        let text =
+            apply_document_changes(encoding, || text, c![0, 1; 1, 0 => "ț\nc", 0, 2; 0, 2 => "c"]);
         assert_eq!(text, "ațc\ncb");
     }
 
diff --git a/crates/rust-analyzer/src/main_loop.rs b/crates/rust-analyzer/src/main_loop.rs
index 346a74e270f..d1e38b33c7d 100644
--- a/crates/rust-analyzer/src/main_loop.rs
+++ b/crates/rust-analyzer/src/main_loop.rs
@@ -831,6 +831,7 @@ impl GlobalState {
                     let vfs = &mut this.vfs.write().0;
                     let file_id = vfs.file_id(&path).unwrap();
                     let text = apply_document_changes(
+                        this.config.position_encoding(),
                         || std::str::from_utf8(vfs.file_contents(file_id)).unwrap().into(),
                         params.content_changes,
                     );
diff --git a/crates/rust-analyzer/src/to_proto.rs b/crates/rust-analyzer/src/to_proto.rs
index 5bdc1bf8d9b..78cd4b8e2b1 100644
--- a/crates/rust-analyzer/src/to_proto.rs
+++ b/crates/rust-analyzer/src/to_proto.rs
@@ -31,8 +31,8 @@ pub(crate) fn position(line_index: &LineIndex, offset: TextSize) -> lsp_types::P
     let line_col = line_index.index.line_col(offset);
     match line_index.encoding {
         PositionEncoding::Utf8 => lsp_types::Position::new(line_col.line, line_col.col),
-        PositionEncoding::Utf16 => {
-            let line_col = line_index.index.to_utf16(line_col);
+        PositionEncoding::Wide(enc) => {
+            let line_col = line_index.index.to_wide(enc, line_col);
             lsp_types::Position::new(line_col.line, line_col.col)
         }
     }
@@ -1429,7 +1429,7 @@ fn main() {
         let line_index = LineIndex {
             index: Arc::new(ide::LineIndex::new(text)),
             endings: LineEndings::Unix,
-            encoding: PositionEncoding::Utf16,
+            encoding: PositionEncoding::Utf8,
         };
         let converted: Vec<lsp_types::FoldingRange> =
             folds.into_iter().map(|it| folding_range(text, &line_index, true, it)).collect();
diff --git a/crates/stdx/src/lib.rs b/crates/stdx/src/lib.rs
index bd24d7d28ba..5639aaf57cd 100644
--- a/crates/stdx/src/lib.rs
+++ b/crates/stdx/src/lib.rs
@@ -11,6 +11,7 @@ pub mod hash;
 pub mod process;
 pub mod panic_context;
 pub mod non_empty_vec;
+pub mod rand;
 
 pub use always_assert::{always, never};
 
diff --git a/crates/stdx/src/rand.rs b/crates/stdx/src/rand.rs
new file mode 100644
index 00000000000..b38506caef5
--- /dev/null
+++ b/crates/stdx/src/rand.rs
@@ -0,0 +1,21 @@
+//! We don't use `rand`, as that's too many things for us.
+//!
+//! Currently, we use oorandom instead, but it misses these two utilities.
+//! Perhaps we should switch to `fastrand`, or our own small prng, it's not like
+//! we need anything more complicated than xor-shift.
+
+/// Shuffles `slice` in place; `rand_index(i)` picks the swap partner for slot `i`.
+pub fn shuffle<T>(slice: &mut [T], mut rand_index: impl FnMut(usize) -> usize) {
+    // Visit slots from the last down to 1. The range is empty for slices of
+    // length 0 or 1, so `len() - 1` can no longer underflow on an empty slice.
+    for remaining in (1..slice.len()).rev() {
+        slice.swap(remaining, rand_index(remaining));
+    }
+}
+
+/// Produces a `u64` seed by finishing an empty hasher built from a new `RandomState`.
+pub fn seed() -> u64 {
+    use std::collections::hash_map::RandomState;
+    use std::hash::{BuildHasher, Hasher};
+    RandomState::new().build_hasher().finish()
+}
diff --git a/docs/dev/lsp-extensions.md b/docs/dev/lsp-extensions.md
index a794e866181..c3623a5cc46 100644
--- a/docs/dev/lsp-extensions.md
+++ b/docs/dev/lsp-extensions.md
@@ -1,5 +1,5 @@
 <!---
-lsp_ext.rs hash: ec29403e67dfd15b
+lsp_ext.rs hash: d87477896dfe41d4
 
 If you need to change the above hash to make the test pass, please check if you
 need to adjust this doc as well and ping this issue:
author	Aleksey Kladov <aleksey.kladov@gmail.com>	2023-02-14 00:56:28 +0000
committer	Aleksey Kladov <aleksey.kladov@gmail.com>	2023-02-14 01:09:50 +0000
commit	0da27376cf3768d92bcd0c094b52da50146dc70f (patch)
tree	0249938c5a632d60c548a1ca8767c30892804b7f
parent	c97aae38f20f64daede9877212aff83c259a4faa (diff)
download	rust-0da27376cf3768d92bcd0c094b52da50146dc70f.tar.gz rust-0da27376cf3768d92bcd0c094b52da50146dc70f.zip