diff options
| author | bors <bors@rust-lang.org> | 2021-11-25 08:16:08 +0000 |
|---|---|---|
| committer | bors <bors@rust-lang.org> | 2021-11-25 08:16:08 +0000 |
| commit | 23a436606b118bd2fbb12f64fce21e7f9d355349 (patch) | |
| tree | f7cdcfcb705dc416fd967deb4e89ad3184f282c4 /compiler/rustc_parse/src | |
| parent | c6eda7d8a7af3ef51311d3106874a7d8de994edc (diff) | |
| parent | d92916439c372967e4c12b7ece3c8d7e860a8777 (diff) | |
| download | rust-23a436606b118bd2fbb12f64fce21e7f9d355349.tar.gz rust-23a436606b118bd2fbb12f64fce21e7f9d355349.zip | |
Auto merge of #88781 - estebank:emoji-idents, r=oli-obk
Tokenize emoji as if they were valid identifiers In the lexer, consider emojis to be valid identifiers and reject them later to avoid knock down parse errors. Partially address #86102.
Diffstat (limited to 'compiler/rustc_parse/src')
| -rw-r--r-- | compiler/rustc_parse/src/lexer/mod.rs | 19 | ||||
| -rw-r--r-- | compiler/rustc_parse/src/lexer/unicode_chars.rs | 2 |
2 files changed, 19 insertions, 2 deletions
diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index cf35c3cd53b..1a620968d56 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -1,3 +1,4 @@ +use crate::lexer::unicode_chars::UNICODE_ARRAY; use rustc_ast::ast::{self, AttrStyle}; use rustc_ast::token::{self, CommentKind, Token, TokenKind}; use rustc_ast::tokenstream::{Spacing, TokenStream}; @@ -222,6 +223,22 @@ impl<'a> StringReader<'a> { } token::Ident(sym, is_raw_ident) } + rustc_lexer::TokenKind::InvalidIdent + // Do not recover an identifier with emoji if the codepoint is a confusable + // with a recoverable substitution token, like `➖`. + if UNICODE_ARRAY + .iter() + .find(|&&(c, _, _)| { + let sym = self.str_from(start); + sym.chars().count() == 1 && c == sym.chars().next().unwrap() + }) + .is_none() => + { + let sym = nfc_normalize(self.str_from(start)); + let span = self.mk_sp(start, self.pos); + self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default().push(span); + token::Ident(sym, false) + } rustc_lexer::TokenKind::Literal { kind, suffix_start } => { let suffix_start = start + BytePos(suffix_start as u32); let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind); @@ -293,7 +310,7 @@ impl<'a> StringReader<'a> { rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret), rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent), - rustc_lexer::TokenKind::Unknown => { + rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => { let c = self.str_from(start).chars().next().unwrap(); let mut err = self.struct_fatal_span_char(start, self.pos, "unknown start of token", c); diff --git a/compiler/rustc_parse/src/lexer/unicode_chars.rs b/compiler/rustc_parse/src/lexer/unicode_chars.rs index 3eebc088f3f..ccd11f06bc5 100644 --- a/compiler/rustc_parse/src/lexer/unicode_chars.rs +++ b/compiler/rustc_parse/src/lexer/unicode_chars.rs @@ -7,7 +7,7 @@ use rustc_errors::{Applicability, DiagnosticBuilder}; use rustc_span::{symbol::kw, BytePos, Pos, Span}; #[rustfmt::skip] // for line breaks -const UNICODE_ARRAY: &[(char, &str, char)] = &[ +pub(crate) const UNICODE_ARRAY: &[(char, &str, char)] = &[ (' ', "Line Separator", ' '), (' ', "Paragraph Separator", ' '), (' ', "Ogham Space mark", ' '), |
