convert \r\n to \n when loading files

author: Aleksey Kladov <aleksey.kladov@gmail.com> 2019-08-14 15:35:12 +0300
committer: Aleksey Kladov <aleksey.kladov@gmail.com> 2019-08-14 15:35:12 +0300
commit: 004f3acec11fa599d1afe71d85121f1c345d8769 (patch)
tree: f1f89111d2a5aa04612d9a5eb3256144807aa018 /src
parent: 60960a260f7b5c695fd0717311d72ce62dd4eb43 (diff)
download: rust-004f3acec11fa599d1afe71d85121f1c345d8769.tar.gz
rust-004f3acec11fa599d1afe71d85121f1c345d8769.zip
2 files changed, 76 insertions, 0 deletions
diff --git a/src/libsyntax_pos/lib.rs b/src/libsyntax_pos/lib.rs
index 02a7433d946..57a16bef86e 100644
--- a/src/libsyntax_pos/lib.rs
+++ b/src/libsyntax_pos/lib.rs
@@ -1045,6 +1045,7 @@ impl SourceFile {
                mut src: String,
                start_pos: BytePos) -> Result<SourceFile, OffsetOverflowError> {
         remove_bom(&mut src);
+        normalize_newlines(&mut src);
 
         let src_hash = {
             let mut hasher: StableHasher<u128> = StableHasher::new();
@@ -1212,6 +1213,61 @@ fn remove_bom(src: &mut String) {
     }
 }
 
+
+/// Replaces `\r\n` with `\n` in-place in `src`.
+///
+/// A lone `\r` (one not followed by `\n`) is left untouched; nothing is returned.
+fn normalize_newlines(src: &mut String) {
+    if !src.as_bytes().contains(&b'\r') {
+        return;
+    }
+
+    // We replace `\r\n` with `\n` in-place, which doesn't break utf-8 encoding.
+    // While we *can* call `as_mut_vec` and do surgery on the live string
+    // directly, let's rather steal the contents of `src`. This makes the code
+    // safe even if a panic occurs.
+
+    let mut buf = std::mem::replace(src, String::new()).into_bytes();
+    let mut gap_len = 0;
+    let mut tail = buf.as_mut_slice();
+    loop {
+        let idx = match find_crlf(&tail[gap_len..]) {
+            None => tail.len(),
+            Some(idx) => idx + gap_len,
+        };
+        tail.copy_within(gap_len..idx, 0);
+        tail = &mut tail[idx - gap_len..];
+        if tail.len() == gap_len {
+            break;
+        }
+        gap_len += 1;
+    }
+
+    // Account for removed `\r`: the last `gap_len` bytes of `buf` are stale leftovers.
+    // After `set_len` drops them, `buf` is guaranteed to contain utf-8 again.
+    let new_len = buf.len() - gap_len;
+    unsafe {
+        buf.set_len(new_len);
+        *src = String::from_utf8_unchecked(buf);
+    }
+
+    fn find_crlf(src: &[u8]) -> Option<usize> {
+        let mut search_idx = 0;
+        while let Some(idx) = find_cr(&src[search_idx..]) {
+            if src[search_idx..].get(idx + 1) != Some(&b'\n') {
+                search_idx += idx + 1;
+                continue;
+            }
+            return Some(search_idx + idx);
+        }
+        None
+    }
+
+    fn find_cr(src: &[u8]) -> Option<usize> {
+        src.iter().position(|&b| b == b'\r')
+    }
+}
+
 // _____________________________________________________________________________
 // Pos, BytePos, CharPos
 //
diff --git a/src/libsyntax_pos/tests.rs b/src/libsyntax_pos/tests.rs
index 78c4e18e6ae..6bd6016020a 100644
--- a/src/libsyntax_pos/tests.rs
+++ b/src/libsyntax_pos/tests.rs
@@ -16,3 +16,23 @@ fn test_lookup_line() {
     assert_eq!(lookup_line(lines, BytePos(28)), 2);
     assert_eq!(lookup_line(lines, BytePos(29)), 2);
 }
+
+#[test]
+fn test_normalize_newlines() {
+    fn check(before: &str, after: &str) {
+        let mut actual = before.to_string();
+        normalize_newlines(&mut actual);
+        assert_eq!(actual.as_str(), after);
+    }
+    check("", "");
+    check("\n", "\n");
+    check("\r", "\r");
+    check("\r\r", "\r\r");
+    check("\r\n", "\n");
+    check("hello world", "hello world");
+    check("hello\nworld", "hello\nworld");
+    check("hello\r\nworld", "hello\nworld");
+    check("\r\nhello\r\nworld\r\n", "\nhello\nworld\n");
+    check("\r\r\n", "\r\n");
+    check("hello\rworld", "hello\rworld");
+}
author	Aleksey Kladov <aleksey.kladov@gmail.com>	2019-08-14 15:35:12 +0300
committer	Aleksey Kladov <aleksey.kladov@gmail.com>	2019-08-14 15:35:12 +0300
commit	004f3acec11fa599d1afe71d85121f1c345d8769 (patch)
tree	f1f89111d2a5aa04612d9a5eb3256144807aa018 /src
parent	60960a260f7b5c695fd0717311d72ce62dd4eb43 (diff)
download	rust-004f3acec11fa599d1afe71d85121f1c345d8769.tar.gz rust-004f3acec11fa599d1afe71d85121f1c345d8769.zip