about summary refs log tree commit diff
path: root/compiler/rustc_lexer/src/lib.rs
diff options
context:
space:
mode:
Diffstat (limited to 'compiler/rustc_lexer/src/lib.rs')
-rw-r--r--compiler/rustc_lexer/src/lib.rs44
1 files changed, 31 insertions, 13 deletions
diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs
index b64a891cb25..5b8300ab530 100644
--- a/compiler/rustc_lexer/src/lib.rs
+++ b/compiler/rustc_lexer/src/lib.rs
@@ -64,6 +64,8 @@ pub enum TokenKind {
     /// "ident" or "continue"
     /// At this step keywords are also considered identifiers.
     Ident,
+    /// Like the above, but containing invalid unicode codepoints.
+    InvalidIdent,
     /// "r#ident"
     RawIdent,
     /// An unknown prefix like `foo#`, `foo'`, `foo"`. Note that only the
@@ -225,14 +227,15 @@ pub fn first_token(input: &str) -> Token {
 }
 
 /// Creates an iterator that produces tokens from the input string.
-pub fn tokenize(mut input: &str) -> impl Iterator<Item = Token> + '_ {
+pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
+    let mut cursor = Cursor::new(input);
     std::iter::from_fn(move || {
-        if input.is_empty() {
-            return None;
+        if cursor.is_eof() {
+            None
+        } else {
+            cursor.reset_len_consumed();
+            Some(cursor.advance_token())
         }
-        let token = first_token(input);
-        input = &input[token.len..];
-        Some(token)
     })
 }
 
@@ -411,6 +414,10 @@ impl Cursor<'_> {
                 let kind = Str { terminated };
                 Literal { kind, suffix_start }
             }
+            // Identifier starting with an emoji. Only lexed for graceful error recovery.
+            c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
+                self.fake_ident_or_unknown_prefix()
+            }
             _ => Unknown,
         };
         Token::new(token_kind, self.len_consumed())
@@ -492,10 +499,28 @@ impl Cursor<'_> {
         // we see a prefix here, it is definitely an unknown prefix.
         match self.first() {
             '#' | '"' | '\'' => UnknownPrefix,
+            c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
+                self.fake_ident_or_unknown_prefix()
+            }
             _ => Ident,
         }
     }
 
+    fn fake_ident_or_unknown_prefix(&mut self) -> TokenKind {
+        // Start is already eaten, eat the rest of identifier.
+        self.eat_while(|c| {
+            unicode_xid::UnicodeXID::is_xid_continue(c)
+                || (!c.is_ascii() && unic_emoji_char::is_emoji(c))
+                || c == '\u{200d}'
+        });
+        // Known prefixes must have been handled earlier. So if
+        // we see a prefix here, it is definitely an unknown prefix.
+        match self.first() {
+            '#' | '"' | '\'' => UnknownPrefix,
+            _ => InvalidIdent,
+        }
+    }
+
     fn number(&mut self, first_digit: char) -> LiteralKind {
         debug_assert!('0' <= self.prev() && self.prev() <= '9');
         let mut base = Base::Decimal;
@@ -808,11 +833,4 @@ impl Cursor<'_> {
 
         self.eat_while(is_id_continue);
     }
-
-    /// Eats symbols while predicate returns true or until the end of file is reached.
-    fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
-        while predicate(self.first()) && !self.is_eof() {
-            self.bump();
-        }
-    }
 }