about summary refs log tree commit diff
path: root/src/libsyntax_pos
diff options
context:
space:
mode:
author Mazdak Farrokhzad <twingoow@gmail.com> 2019-10-25 13:12:45 +0200
committer GitHub <noreply@github.com> 2019-10-25 13:12:45 +0200
commit 1f93be1bb3f89d6b30a3ddc39e8a462924ccd503 (patch)
tree ddc93b333040abfda9dd252818e026760f8e0fa3 /src/libsyntax_pos
parent 959b6e324ce2786a4adade6cef222ffbd20f3791 (diff)
parent ff1860ad763baac652d3a43a93985e29ade805cb (diff)
download rust-1f93be1bb3f89d6b30a3ddc39e8a462924ccd503.tar.gz
download rust-1f93be1bb3f89d6b30a3ddc39e8a462924ccd503.zip
Rollup merge of #65074 - Rantanen:json-byte-pos, r=matklad
Fix the start/end byte positions in the compiler JSON output

Track the changes made during normalization in the `SourceFile` and use this information to correct the `start_byte` and `end_byte` fields in the JSON output.

This should ensure the start/end byte fields can be used to index the original file, even if Rust normalized the source code for parsing purposes. Both CRLF to LF and BOM removal are handled with this one.

The rough plan was discussed with @matklad in rust-lang-nursery/rustfix#176 - although I ended up going with `u32` offset tracking so I wouldn't need to deal with `u32 + i32` arithmetics when applying the offset to the span byte positions.

Fixes #65029
Diffstat (limited to 'src/libsyntax_pos')
-rw-r--r-- src/libsyntax_pos/lib.rs   | 64
-rw-r--r-- src/libsyntax_pos/tests.rs | 31
2 files changed, 78 insertions, 17 deletions
diff --git a/src/libsyntax_pos/lib.rs b/src/libsyntax_pos/lib.rs
index a307a5c0b37..9034f8c1afd 100644
--- a/src/libsyntax_pos/lib.rs
+++ b/src/libsyntax_pos/lib.rs
@@ -855,6 +855,15 @@ impl Sub<BytePos> for NonNarrowChar {
     }
 }
 
+/// Identifies an offset of a character that was normalized away from `SourceFile`.
+#[derive(Copy, Clone, RustcEncodable, RustcDecodable, Eq, PartialEq, Debug)]
+pub struct NormalizedPos {
+    /// The absolute offset of the character in the `SourceMap`.
+    pub pos: BytePos,
+    /// The difference between original and normalized string at position.
+    pub diff: u32,
+}
+
 /// The state of the lazy external source loading mechanism of a `SourceFile`.
 #[derive(PartialEq, Eq, Clone)]
 pub enum ExternalSource {
@@ -918,6 +927,8 @@ pub struct SourceFile {
     pub multibyte_chars: Vec<MultiByteChar>,
     /// Width of characters that are not narrow in the source code.
     pub non_narrow_chars: Vec<NonNarrowChar>,
+    /// Locations of characters removed during normalization.
+    pub normalized_pos: Vec<NormalizedPos>,
     /// A hash of the filename, used for speeding up hashing in incremental compilation.
     pub name_hash: u128,
 }
@@ -984,6 +995,9 @@ impl Encodable for SourceFile {
             })?;
             s.emit_struct_field("name_hash", 8, |s| {
                 self.name_hash.encode(s)
+            })?;
+            s.emit_struct_field("normalized_pos", 9, |s| {
+                self.normalized_pos.encode(s)
             })
         })
     }
@@ -1034,6 +1048,8 @@ impl Decodable for SourceFile {
                 d.read_struct_field("non_narrow_chars", 7, |d| Decodable::decode(d))?;
             let name_hash: u128 =
                 d.read_struct_field("name_hash", 8, |d| Decodable::decode(d))?;
+            let normalized_pos: Vec<NormalizedPos> =
+                d.read_struct_field("normalized_pos", 9, |d| Decodable::decode(d))?;
             Ok(SourceFile {
                 name,
                 name_was_remapped,
@@ -1050,6 +1066,7 @@ impl Decodable for SourceFile {
                 lines,
                 multibyte_chars,
                 non_narrow_chars,
+                normalized_pos,
                 name_hash,
             })
         })
@@ -1068,8 +1085,7 @@ impl SourceFile {
                unmapped_path: FileName,
                mut src: String,
                start_pos: BytePos) -> Result<SourceFile, OffsetOverflowError> {
-        remove_bom(&mut src);
-        normalize_newlines(&mut src);
+        let normalized_pos = normalize_src(&mut src, start_pos);
 
         let src_hash = {
             let mut hasher: StableHasher = StableHasher::new();
@@ -1102,6 +1118,7 @@ impl SourceFile {
             lines,
             multibyte_chars,
             non_narrow_chars,
+            normalized_pos,
             name_hash,
         })
     }
@@ -1228,12 +1245,44 @@ impl SourceFile {
     pub fn contains(&self, byte_pos: BytePos) -> bool {
         byte_pos >= self.start_pos && byte_pos <= self.end_pos
     }
+
+    /// Calculates the original byte position relative to the start of the file
+    /// based on the given byte position.
+    pub fn original_relative_byte_pos(&self, pos: BytePos) -> BytePos {
+
+        // Diff before any records is 0. Otherwise use the previously recorded
+        // diff as that applies to the following characters until a new diff
+        // is recorded.
+        let diff = match self.normalized_pos.binary_search_by(
+                            |np| np.pos.cmp(&pos)) {
+            Ok(i) => self.normalized_pos[i].diff,
+            Err(i) if i == 0 => 0,
+            Err(i) => self.normalized_pos[i-1].diff,
+        };
+
+        BytePos::from_u32(pos.0 - self.start_pos.0 + diff)
+    }
+}
+
+/// Normalizes the source code and records the normalizations.
+fn normalize_src(src: &mut String, start_pos: BytePos) -> Vec<NormalizedPos> {
+    let mut normalized_pos = vec![];
+    remove_bom(src, &mut normalized_pos);
+    normalize_newlines(src, &mut normalized_pos);
+
+    // Offset all the positions by start_pos to match the final file positions.
+    for np in &mut normalized_pos {
+        np.pos.0 += start_pos.0;
+    }
+
+    normalized_pos
 }
 
 /// Removes UTF-8 BOM, if any.
-fn remove_bom(src: &mut String) {
+fn remove_bom(src: &mut String, normalized_pos: &mut Vec<NormalizedPos>) {
     if src.starts_with("\u{feff}") {
         src.drain(..3);
+        normalized_pos.push(NormalizedPos { pos: BytePos(0), diff: 3 });
     }
 }
 
@@ -1241,7 +1290,7 @@ fn remove_bom(src: &mut String) {
 /// Replaces `\r\n` with `\n` in-place in `src`.
 ///
 /// Returns error if there's a lone `\r` in the string
-fn normalize_newlines(src: &mut String) {
+fn normalize_newlines(src: &mut String, normalized_pos: &mut Vec<NormalizedPos>) {
     if !src.as_bytes().contains(&b'\r') {
         return;
     }
@@ -1254,6 +1303,8 @@ fn normalize_newlines(src: &mut String) {
     let mut buf = std::mem::replace(src, String::new()).into_bytes();
     let mut gap_len = 0;
     let mut tail = buf.as_mut_slice();
+    let mut cursor = 0;
+    let original_gap = normalized_pos.last().map_or(0, |l| l.diff);
     loop {
         let idx = match find_crlf(&tail[gap_len..]) {
             None => tail.len(),
@@ -1264,7 +1315,12 @@ fn normalize_newlines(src: &mut String) {
         if tail.len() == gap_len {
             break;
         }
+        cursor += idx - gap_len;
         gap_len += 1;
+        normalized_pos.push(NormalizedPos {
+            pos: BytePos::from_usize(cursor + 1),
+            diff: original_gap + gap_len as u32,
+        });
     }
 
     // Account for removed `\r`.
diff --git a/src/libsyntax_pos/tests.rs b/src/libsyntax_pos/tests.rs
index 6bd6016020a..87cc3505e38 100644
--- a/src/libsyntax_pos/tests.rs
+++ b/src/libsyntax_pos/tests.rs
@@ -19,20 +19,25 @@ fn test_lookup_line() {
 
 #[test]
 fn test_normalize_newlines() {
-    fn check(before: &str, after: &str) {
+    fn check(before: &str, after: &str, expected_positions: &[u32]) {
         let mut actual = before.to_string();
-        normalize_newlines(&mut actual);
+        let mut actual_positions = vec![];
+        normalize_newlines(&mut actual, &mut actual_positions);
+        let actual_positions : Vec<_> = actual_positions
+            .into_iter()
+            .map(|nc| nc.pos.0).collect();
         assert_eq!(actual.as_str(), after);
+        assert_eq!(actual_positions, expected_positions);
     }
-    check("", "");
-    check("\n", "\n");
-    check("\r", "\r");
-    check("\r\r", "\r\r");
-    check("\r\n", "\n");
-    check("hello world", "hello world");
-    check("hello\nworld", "hello\nworld");
-    check("hello\r\nworld", "hello\nworld");
-    check("\r\nhello\r\nworld\r\n", "\nhello\nworld\n");
-    check("\r\r\n", "\r\n");
-    check("hello\rworld", "hello\rworld");
+    check("", "", &[]);
+    check("\n", "\n", &[]);
+    check("\r", "\r", &[]);
+    check("\r\r", "\r\r", &[]);
+    check("\r\n", "\n", &[1]);
+    check("hello world", "hello world", &[]);
+    check("hello\nworld", "hello\nworld", &[]);
+    check("hello\r\nworld", "hello\nworld", &[6]);
+    check("\r\nhello\r\nworld\r\n", "\nhello\nworld\n", &[1, 7, 13]);
+    check("\r\r\n", "\r\n", &[2]);
+    check("hello\rworld", "hello\rworld", &[]);
 }