about summary refs log tree commit diff
diff options
context:
space:
mode:
authorDylan DPC <dylan.dpc@gmail.com>2021-02-27 21:56:15 +0100
committerGitHub <noreply@github.com>2021-02-27 21:56:15 +0100
commitbe3d1eb3010b48f5b0512fc83cc029bb321fb3ab (patch)
tree6260c72ef61516da8803cb6ca2d1cc56aa34856d
parent94736c434ee154b30e2ec22ec112b79e3f6c5884 (diff)
parented8c68644c9a352f61c3b4591b6fc18653e2ffc2 (diff)
downloadrust-be3d1eb3010b48f5b0512fc83cc029bb321fb3ab.tar.gz
rust-be3d1eb3010b48f5b0512fc83cc029bb321fb3ab.zip
Rollup merge of #81856 - Smittyvb:utf16-warn, r=matthewjasper
Suggest character encoding is incorrect when encountering random null bytes

This adds a note whenever null bytes are seen at the start of a token unexpectedly, since those tend to come from UTF-16 encoded files without a [BOM](https://en.wikipedia.org/wiki/Byte_order_mark) (if a UTF-16 BOM appears, the file won't be valid UTF-8, but if there is no BOM, it can be both valid UTF-16 and valid but garbled UTF-8). This approach was suggested in https://github.com/rust-lang/rust/issues/73979#issuecomment-653976451.

Closes #73979.
-rw-r--r--compiler/rustc_parse/src/lexer/mod.rs3
-rw-r--r--src/test/ui/parser/issue-66473.stderrbin2660 -> 5260 bytes
-rw-r--r--src/test/ui/parser/issue-68629.stderrbin1441 -> 1831 bytes
-rw-r--r--src/test/ui/parser/issue-68730.stderrbin966 -> 1226 bytes
-rw-r--r--src/test/ui/parser/utf16-be-without-bom.rsbin0 -> 125 bytes
-rw-r--r--src/test/ui/parser/utf16-be-without-bom.stderrbin0 -> 3537 bytes
-rw-r--r--src/test/ui/parser/utf16-le-without-bom.rsbin0 -> 126 bytes
-rw-r--r--src/test/ui/parser/utf16-le-without-bom.stderrbin0 -> 3500 bytes
8 files changed, 3 insertions, 0 deletions
diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
index 4a638ec3f80..4bf870eb7ce 100644
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@@ -268,6 +268,9 @@ impl<'a> StringReader<'a> {
                 // tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it,
                 // as there will be less overall work to do this way.
                 let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
+                if c == '\x00' {
+                    err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used");
+                }
                 err.emit();
                 token?
             }
diff --git a/src/test/ui/parser/issue-66473.stderr b/src/test/ui/parser/issue-66473.stderr
index b370b125cfe..8a16d7f9551 100644
--- a/src/test/ui/parser/issue-66473.stderr
+++ b/src/test/ui/parser/issue-66473.stderr
Binary files differ
diff --git a/src/test/ui/parser/issue-68629.stderr b/src/test/ui/parser/issue-68629.stderr
index a7885ecec56..19c9ef30f90 100644
--- a/src/test/ui/parser/issue-68629.stderr
+++ b/src/test/ui/parser/issue-68629.stderr
Binary files differ
diff --git a/src/test/ui/parser/issue-68730.stderr b/src/test/ui/parser/issue-68730.stderr
index 9f8833e17fe..8602abacabd 100644
--- a/src/test/ui/parser/issue-68730.stderr
+++ b/src/test/ui/parser/issue-68730.stderr
Binary files differ
diff --git a/src/test/ui/parser/utf16-be-without-bom.rs b/src/test/ui/parser/utf16-be-without-bom.rs
new file mode 100644
index 00000000000..22aa1971787
--- /dev/null
+++ b/src/test/ui/parser/utf16-be-without-bom.rs
Binary files differ
diff --git a/src/test/ui/parser/utf16-be-without-bom.stderr b/src/test/ui/parser/utf16-be-without-bom.stderr
new file mode 100644
index 00000000000..768d2c53164
--- /dev/null
+++ b/src/test/ui/parser/utf16-be-without-bom.stderr
Binary files differ
diff --git a/src/test/ui/parser/utf16-le-without-bom.rs b/src/test/ui/parser/utf16-le-without-bom.rs
new file mode 100644
index 00000000000..3c1049929e1
--- /dev/null
+++ b/src/test/ui/parser/utf16-le-without-bom.rs
Binary files differ
diff --git a/src/test/ui/parser/utf16-le-without-bom.stderr b/src/test/ui/parser/utf16-le-without-bom.stderr
new file mode 100644
index 00000000000..4f4b91e39ed
--- /dev/null
+++ b/src/test/ui/parser/utf16-le-without-bom.stderr
Binary files differ