diff options
Diffstat (limited to 'compiler/rustc_lexer/src/lib.rs')
| -rw-r--r-- | compiler/rustc_lexer/src/lib.rs | 44 |
1 files changed, 31 insertions, 13 deletions
diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index b64a891cb25..5b8300ab530 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -64,6 +64,8 @@ pub enum TokenKind { /// "ident" or "continue" /// At this step keywords are also considered identifiers. Ident, + /// Like the above, but containing invalid unicode codepoints. + InvalidIdent, /// "r#ident" RawIdent, /// An unknown prefix like `foo#`, `foo'`, `foo"`. Note that only the @@ -225,14 +227,15 @@ pub fn first_token(input: &str) -> Token { } /// Creates an iterator that produces tokens from the input string. -pub fn tokenize(mut input: &str) -> impl Iterator<Item = Token> + '_ { +pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ { + let mut cursor = Cursor::new(input); std::iter::from_fn(move || { - if input.is_empty() { - return None; + if cursor.is_eof() { + None + } else { + cursor.reset_len_consumed(); + Some(cursor.advance_token()) } - let token = first_token(input); - input = &input[token.len..]; - Some(token) }) } @@ -411,6 +414,10 @@ impl Cursor<'_> { let kind = Str { terminated }; Literal { kind, suffix_start } } + // Identifier starting with an emoji. Only lexed for graceful error recovery. + c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => { + self.fake_ident_or_unknown_prefix() + } _ => Unknown, }; Token::new(token_kind, self.len_consumed()) @@ -492,10 +499,28 @@ impl Cursor<'_> { // we see a prefix here, it is definitely an unknown prefix. match self.first() { '#' | '"' | '\'' => UnknownPrefix, + c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => { + self.fake_ident_or_unknown_prefix() + } _ => Ident, } } + fn fake_ident_or_unknown_prefix(&mut self) -> TokenKind { + // Start is already eaten, eat the rest of identifier. + self.eat_while(|c| { + unicode_xid::UnicodeXID::is_xid_continue(c) + || (!c.is_ascii() && unic_emoji_char::is_emoji(c)) + || c == '\u{200d}' + }); + // Known prefixes must have been handled earlier. So if + // we see a prefix here, it is definitely an unknown prefix. + match self.first() { + '#' | '"' | '\'' => UnknownPrefix, + _ => InvalidIdent, + } + } + fn number(&mut self, first_digit: char) -> LiteralKind { debug_assert!('0' <= self.prev() && self.prev() <= '9'); let mut base = Base::Decimal; @@ -808,11 +833,4 @@ impl Cursor<'_> { self.eat_while(is_id_continue); } - - /// Eats symbols while predicate returns true or until the end of file is reached. - fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { - while predicate(self.first()) && !self.is_eof() { - self.bump(); - } - } } |
