Rollup merge of #106566 - clubby789:contiguous-weird-unicode, r=cjgillot

Emit a single error for contiguous sequences of unknown tokens Closes #106101 On encountering a sequence of identical source characters which are unknown tokens, note the amount of subsequent characters and advance past them silently. The old behavior was to emit an error and 'help' note for every single one. `@rustbot` label +A-diagnostics +A-parser
author: Matthias Krüger <matthias.krueger@famsik.de> 2023-01-14 13:04:24 +0100
committer: GitHub <noreply@github.com> 2023-01-14 13:04:24 +0100
commit: 8e0eecdba684b2d01fe4261f8fd367a7b4e913a6 (patch)
tree: 8bc1afa1c48a50b9cb41f1dc0274ba42e3aa5467 /compiler/rustc_parse/src
parent: 27db39b1b30fe808ba725d36f4293a3b61a4b029 (diff)
parent: a3d6bc34684f67b9b5b2a419d57dc0afe5af4a67 (diff)
download: rust-8e0eecdba684b2d01fe4261f8fd367a7b4e913a6.tar.gz
rust-8e0eecdba684b2d01fe4261f8fd367a7b4e913a6.zip
2 files changed, 32 insertions, 6 deletions
diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
index f027843e6b4..8761c23625b 100644
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@@ -79,7 +79,7 @@ impl<'a> StringReader<'a> {
     /// preceded by whitespace.
     fn next_token(&mut self) -> (Token, bool) {
         let mut preceded_by_whitespace = false;
-
+        let mut swallow_next_invalid = 0;
         // Skip trivial (whitespace & comments) tokens
         loop {
             let token = self.cursor.advance_token();
@@ -232,19 +232,34 @@ impl<'a> StringReader<'a> {
                 rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
 
                 rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
-                    let c = self.str_from(start).chars().next().unwrap();
+                    // Don't emit diagnostics for sequences of the same invalid token
+                    if swallow_next_invalid > 0 {
+                        swallow_next_invalid -= 1;
+                        continue;
+                    }
+                    let mut it = self.str_from_to_end(start).chars();
+                    let c = it.next().unwrap();
+                    let repeats = it.take_while(|c1| *c1 == c).count();
                     let mut err =
-                        self.struct_err_span_char(start, self.pos, "unknown start of token", c);
+                        self.struct_err_span_char(start, self.pos + Pos::from_usize(repeats * c.len_utf8()), "unknown start of token", c);
                     // FIXME: the lexer could be used to turn the ASCII version of unicode
                     // homoglyphs, instead of keeping a table in `check_for_substitution`into the
                     // token. Ideally, this should be inside `rustc_lexer`. However, we should
                     // first remove compound tokens like `<<` from `rustc_lexer`, and then add
                     // fancier error recovery to it, as there will be less overall work to do this
                     // way.
-                    let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
+                    let token = unicode_chars::check_for_substitution(self, start, c, &mut err, repeats+1);
                     if c == '\x00' {
                         err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used");
                     }
+                    if repeats > 0 {
+                        if repeats == 1 {
+                            err.note(format!("character appears once more"));
+                        } else {
+                            err.note(format!("character appears {repeats} more times"));
+                        }
+                        swallow_next_invalid = repeats;
+                    }
                     err.emit();
                     if let Some(token) = token {
                         token
@@ -486,6 +501,11 @@ impl<'a> StringReader<'a> {
         &self.src[self.src_index(start)..self.src_index(end)]
     }
 
+    /// Slice of the source text spanning from `start` until the end
+    fn str_from_to_end(&self, start: BytePos) -> &str {
+        &self.src[self.src_index(start)..]
+    }
+
     fn report_raw_str_error(&self, start: BytePos, prefix_len: u32) -> ! {
         match rustc_lexer::validate_raw_str(self.str_from(start), prefix_len) {
             Err(RawStrError::InvalidStarter { bad_char }) => {
diff --git a/compiler/rustc_parse/src/lexer/unicode_chars.rs b/compiler/rustc_parse/src/lexer/unicode_chars.rs
index f1b50296e25..65479b341d7 100644
--- a/compiler/rustc_parse/src/lexer/unicode_chars.rs
+++ b/compiler/rustc_parse/src/lexer/unicode_chars.rs
@@ -337,10 +337,11 @@ pub(super) fn check_for_substitution<'a>(
     pos: BytePos,
     ch: char,
     err: &mut Diagnostic,
+    count: usize,
 ) -> Option<token::TokenKind> {
     let &(_u_char, u_name, ascii_char) = UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch)?;
 
-    let span = Span::with_root_ctxt(pos, pos + Pos::from_usize(ch.len_utf8()));
+    let span = Span::with_root_ctxt(pos, pos + Pos::from_usize(ch.len_utf8() * count));
 
     let Some((_ascii_char, ascii_name, token)) = ASCII_ARRAY.iter().find(|&&(c, _, _)| c == ascii_char) else {
         let msg = format!("substitution character not found for '{}'", ch);
@@ -369,7 +370,12 @@ pub(super) fn check_for_substitution<'a>(
             "Unicode character '{}' ({}) looks like '{}' ({}), but it is not",
             ch, u_name, ascii_char, ascii_name
         );
-        err.span_suggestion(span, &msg, ascii_char, Applicability::MaybeIncorrect);
+        err.span_suggestion(
+            span,
+            &msg,
+            ascii_char.to_string().repeat(count),
+            Applicability::MaybeIncorrect,
+        );
     }
     token.clone()
 }
author	Matthias Krüger <matthias.krueger@famsik.de>	2023-01-14 13:04:24 +0100
committer	GitHub <noreply@github.com>	2023-01-14 13:04:24 +0100
commit	8e0eecdba684b2d01fe4261f8fd367a7b4e913a6 (patch)
tree	8bc1afa1c48a50b9cb41f1dc0274ba42e3aa5467 /compiler/rustc_parse/src
parent	27db39b1b30fe808ba725d36f4293a3b61a4b029 (diff)
parent	a3d6bc34684f67b9b5b2a419d57dc0afe5af4a67 (diff)
download	rust-8e0eecdba684b2d01fe4261f8fd367a7b4e913a6.tar.gz rust-8e0eecdba684b2d01fe4261f8fd367a7b4e913a6.zip