Allow lexer to recover from some homoglyphs

author: Esteban Küber <esteban@kuber.com.ar> 2019-07-24 16:10:42 -0700
committer: Esteban Küber <esteban@kuber.com.ar> 2019-07-24 16:10:42 -0700
commit: 70c817aee3aa204122b64cdfc2db05fa182da1c5 (patch)
tree: 60a0178103feb7ad6c09aa86eb115dfb8eeabc2e /src/libsyntax
parent: 27a6a304e2baaabca88059753f020377f2476978 (diff)
download: rust-70c817aee3aa204122b64cdfc2db05fa182da1c5.tar.gz rust-70c817aee3aa204122b64cdfc2db05fa182da1c5.zip
2 files changed, 37 insertions, 33 deletions
diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs
index b97801a50d4..412ed8f04b3 100644
--- a/src/libsyntax/parse/lexer/mod.rs
+++ b/src/libsyntax/parse/lexer/mod.rs
@@ -389,7 +389,10 @@ impl<'a> StringReader<'a> {
                                                           self.pos,
                                                           "unknown start of token",
                                                           c);
-                unicode_chars::check_for_substitution(self, start, c, &mut err);
+                if let Some(t) = unicode_chars::check_for_substitution(self, start, c, &mut err) {
+                    err.emit();
+                    return Ok(t);
+                }
                 return Err(err)
             }
         };
diff --git a/src/libsyntax/parse/lexer/unicode_chars.rs b/src/libsyntax/parse/lexer/unicode_chars.rs
index b728a9e1988..bfa1606a0d2 100644
--- a/src/libsyntax/parse/lexer/unicode_chars.rs
+++ b/src/libsyntax/parse/lexer/unicode_chars.rs
@@ -4,6 +4,7 @@
 use super::StringReader;
 use errors::{Applicability, DiagnosticBuilder};
 use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION};
+use crate::parse::token;
 
 #[rustfmt::skip] // for line breaks
 const UNICODE_ARRAY: &[(char, &str, char)] = &[
@@ -297,32 +298,32 @@ const UNICODE_ARRAY: &[(char, &str, char)] = &[
     ('＞', "Fullwidth Greater-Than Sign", '>'),
 ];
 
-const ASCII_ARRAY: &[(char, &str)] = &[
-    (' ', "Space"),
-    ('_', "Underscore"),
-    ('-', "Minus/Hyphen"),
-    (',', "Comma"),
-    (';', "Semicolon"),
-    (':', "Colon"),
-    ('!', "Exclamation Mark"),
-    ('?', "Question Mark"),
-    ('.', "Period"),
-    ('\'', "Single Quote"),
-    ('"', "Quotation Mark"),
-    ('(', "Left Parenthesis"),
-    (')', "Right Parenthesis"),
-    ('[', "Left Square Bracket"),
-    (']', "Right Square Bracket"),
-    ('{', "Left Curly Brace"),
-    ('}', "Right Curly Brace"),
-    ('*', "Asterisk"),
-    ('/', "Slash"),
-    ('\\', "Backslash"),
-    ('&', "Ampersand"),
-    ('+', "Plus Sign"),
-    ('<', "Less-Than Sign"),
-    ('=', "Equals Sign"),
-    ('>', "Greater-Than Sign"),
+const ASCII_ARRAY: &[(char, &str, Option<token::TokenKind>)] = &[
+    (' ', "Space", Some(token::Whitespace)),
+    ('_', "Underscore", None),
+    ('-', "Minus/Hyphen", Some(token::BinOp(token::Minus))),
+    (',', "Comma", Some(token::Comma)),
+    (';', "Semicolon", Some(token::Semi)),
+    (':', "Colon", Some(token::Colon)),
+    ('!', "Exclamation Mark", Some(token::Not)),
+    ('?', "Question Mark", Some(token::Question)),
+    ('.', "Period", Some(token::Dot)),
+    ('\'', "Single Quote", None),  // Literals are already lexed by this point, so we can't recover
+    ('"', "Quotation Mark", None), // gracefully just by spitting the correct token out.
+    ('(', "Left Parenthesis", Some(token::OpenDelim(token::Paren))),
+    (')', "Right Parenthesis", Some(token::CloseDelim(token::Paren))),
+    ('[', "Left Square Bracket", Some(token::OpenDelim(token::Bracket))),
+    (']', "Right Square Bracket", Some(token::CloseDelim(token::Bracket))),
+    ('{', "Left Curly Brace", Some(token::OpenDelim(token::Brace))),
+    ('}', "Right Curly Brace", Some(token::CloseDelim(token::Brace))),
+    ('*', "Asterisk", Some(token::BinOp(token::Star))),
+    ('/', "Slash", Some(token::BinOp(token::Slash))),
+    ('\\', "Backslash", None),
+    ('&', "Ampersand", Some(token::BinOp(token::And))),
+    ('+', "Plus Sign", Some(token::BinOp(token::Plus))),
+    ('<', "Less-Than Sign", Some(token::Lt)),
+    ('=', "Equals Sign", Some(token::Eq)),
+    ('>', "Greater-Than Sign", Some(token::Gt)),
 ];
 
 crate fn check_for_substitution<'a>(
@@ -330,20 +331,20 @@ crate fn check_for_substitution<'a>(
     pos: BytePos,
     ch: char,
     err: &mut DiagnosticBuilder<'a>,
-) -> bool {
+) -> Option<token::TokenKind> {
     let (u_name, ascii_char) = match UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch) {
         Some(&(_u_char, u_name, ascii_char)) => (u_name, ascii_char),
-        None => return false,
+        None => return None,
     };
 
     let span = Span::new(pos, pos + Pos::from_usize(ch.len_utf8()), NO_EXPANSION);
 
-    let ascii_name = match ASCII_ARRAY.iter().find(|&&(c, _)| c == ascii_char) {
-        Some((_ascii_char, ascii_name)) => ascii_name,
+    let (ascii_name, token) = match ASCII_ARRAY.iter().find(|&&(c, _, _)| c == ascii_char) {
+        Some((_ascii_char, ascii_name, token)) => (ascii_name, token),
         None => {
             let msg = format!("substitution character not found for '{}'", ch);
             reader.sess.span_diagnostic.span_bug_no_panic(span, &msg);
-            return false;
+            return None;
         }
     };
 
@@ -371,7 +372,7 @@ crate fn check_for_substitution<'a>(
         );
         err.span_suggestion(span, &msg, ascii_char.to_string(), Applicability::MaybeIncorrect);
     }
-    true
+    token.clone()
 }
 
 /// Extract string if found at current position with given delimiters
author	Esteban Küber <esteban@kuber.com.ar>	2019-07-24 16:10:42 -0700
committer	Esteban Küber <esteban@kuber.com.ar>	2019-07-24 16:10:42 -0700
commit	70c817aee3aa204122b64cdfc2db05fa182da1c5 (patch)
tree	60a0178103feb7ad6c09aa86eb115dfb8eeabc2e /src/libsyntax
parent	27a6a304e2baaabca88059753f020377f2476978 (diff)
download	rust-70c817aee3aa204122b64cdfc2db05fa182da1c5.tar.gz rust-70c817aee3aa204122b64cdfc2db05fa182da1c5.zip