about summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
authorHuon Wilson <dbau.pp+github@gmail.com>2014-02-23 16:40:04 +1100
committerAlex Crichton <alex@alexcrichton.com>2014-02-24 21:22:26 -0800
commitff79a4471cbf5fa4e78fcf56be129a3d56690127 (patch)
tree6835775de94115c4f499981198c8b618330dd2fd /src
parentdad52cfcb58cb30170c6247f2053bc0f0d57466a (diff)
downloadrust-ff79a4471cbf5fa4e78fcf56be129a3d56690127.tar.gz
rust-ff79a4471cbf5fa4e78fcf56be129a3d56690127.zip
syntax: record multibyte chars' positions absolutely, not relative to
file.

Previously multibyte UTF-8 chars were being recorded as byte offsets
from the start of the file, and then later compared against global byte
positions, resulting in the compiler possibly thinking it had a byte
position pointing inside a multibyte character, if there were multibyte
characters in any non-crate files. (Sometimes the byte offsets happened to
line up just right so that it did not ICE, but that was a coincidence.)

Fixes #11136.
Fixes #11178.
Diffstat (limited to 'src')
-rw-r--r--src/libsyntax/parse/lexer.rs3
-rw-r--r--src/test/run-make/unicode-input/Makefile6
-rw-r--r--src/test/run-make/unicode-input/multiple_files.rs54
3 files changed, 61 insertions, 2 deletions
diff --git a/src/libsyntax/parse/lexer.rs b/src/libsyntax/parse/lexer.rs
index b711e95bc94..5bace75a5ea 100644
--- a/src/libsyntax/parse/lexer.rs
+++ b/src/libsyntax/parse/lexer.rs
@@ -264,8 +264,7 @@ pub fn bump(rdr: &StringReader) {
         }
 
         if byte_offset_diff > 1 {
-            rdr.filemap.record_multibyte_char(
-                Pos::from_uint(current_byte_offset), byte_offset_diff);
+            rdr.filemap.record_multibyte_char(rdr.last_pos.get(), byte_offset_diff);
         }
     } else {
         rdr.curr.set(None);
diff --git a/src/test/run-make/unicode-input/Makefile b/src/test/run-make/unicode-input/Makefile
new file mode 100644
index 00000000000..1e420bddb77
--- /dev/null
+++ b/src/test/run-make/unicode-input/Makefile
@@ -0,0 +1,6 @@
+-include ../tools.mk
+
+all:
+	# check that we don't ICE on unicode input, issue #11178
+	$(RUSTC) multiple_files.rs
+	$(call RUN,multiple_files)  "$(RUSTC)" "$(TMPDIR)"
diff --git a/src/test/run-make/unicode-input/multiple_files.rs b/src/test/run-make/unicode-input/multiple_files.rs
new file mode 100644
index 00000000000..2758ac12bab
--- /dev/null
+++ b/src/test/run-make/unicode-input/multiple_files.rs
@@ -0,0 +1,54 @@
+use std::{char, os, run, str};
+use std::rand::{task_rng, Rng};
+use std::io::File;
+
+// creates unicode_input_multiple_files_{main,chars}.rs, where the
+// former imports the latter. `_chars` just contains an identifier
+// made up of random characters, because rustc will emit an error message
+// about the ident being in the wrong place, with a span (and creating
+// this span used to upset the compiler).
+
+fn random_char() -> char {
+    let mut rng = task_rng();
+    // a subset of the XID_start unicode table (ensuring that the
+    // compiler doesn't fail with an "unrecognised token" error)
+    let (lo, hi): (u32, u32) = match rng.gen_range(1, 4 + 1) {
+        1 => (0x41, 0x5a),
+        2 => (0xf8, 0x1ba),
+        3 => (0x1401, 0x166c),
+        _ => (0x10400, 0x1044f)
+    };
+
+    char::from_u32(rng.gen_range(lo, hi + 1)).unwrap()
+}
+
+fn main() {
+    let args = os::args();
+    let rustc = args[1].as_slice();
+    let tmpdir = Path::new(args[2].as_slice());
+
+    let main_file = tmpdir.join("unicode_input_multiple_files_main.rs");
+    let main_file_str = main_file.as_str().unwrap();
+    {
+        let _ = File::create(&main_file).unwrap()
+            .write_str("mod unicode_input_multiple_files_chars;");
+    }
+
+    for _ in range(0, 100) {
+        {
+            let mut w = File::create(&tmpdir.join("unicode_input_multiple_files_chars.rs")).unwrap();
+            for _ in range(0, 30) {
+                let _ = w.write_char(random_char());
+            }
+        }
+
+        // rustc is passed to us with --out-dir and -L etc., so we
+        // can't exec it directly
+        let result = run::process_output("sh", [~"-c", rustc + " " + main_file_str]).unwrap();
+        let err = str::from_utf8_lossy(result.error);
+
+        // positive test so that this test will be updated when the
+        // compiler changes.
+        assert!(err.as_slice().contains("expected item but found"))
+    }
+}