Auto merge of #88781 - estebank:emoji-idents, r=oli-obk

Tokenize emoji as if they were valid identifiers. In the lexer, consider emojis to be valid identifiers and reject them later to avoid knock-down parse errors. Partially addresses #86102.
author: bors <bors@rust-lang.org> 2021-11-25 08:16:08 +0000
committer: bors <bors@rust-lang.org> 2021-11-25 08:16:08 +0000
commit: 23a436606b118bd2fbb12f64fce21e7f9d355349 (patch)
tree: f7cdcfcb705dc416fd967deb4e89ad3184f282c4 /compiler/rustc_parse/src
parent: c6eda7d8a7af3ef51311d3106874a7d8de994edc (diff)
parent: d92916439c372967e4c12b7ece3c8d7e860a8777 (diff)
download: rust-23a436606b118bd2fbb12f64fce21e7f9d355349.tar.gz
rust-23a436606b118bd2fbb12f64fce21e7f9d355349.zip
2 files changed, 19 insertions, 2 deletions
diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
index cf35c3cd53b..1a620968d56 100644
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@@ -1,3 +1,4 @@
+use crate::lexer::unicode_chars::UNICODE_ARRAY;
 use rustc_ast::ast::{self, AttrStyle};
 use rustc_ast::token::{self, CommentKind, Token, TokenKind};
 use rustc_ast::tokenstream::{Spacing, TokenStream};
@@ -222,6 +223,22 @@ impl<'a> StringReader<'a> {
                 }
                 token::Ident(sym, is_raw_ident)
             }
+            rustc_lexer::TokenKind::InvalidIdent
+                // Do not recover an identifier with emoji if the codepoint is a confusable
+                // with a recoverable substitution token, like `➖`.
+                if UNICODE_ARRAY
+                    .iter()
+                    .find(|&&(c, _, _)| {
+                        let sym = self.str_from(start);
+                        sym.chars().count() == 1 && c == sym.chars().next().unwrap()
+                    })
+                    .is_none() =>
+            {
+                let sym = nfc_normalize(self.str_from(start));
+                let span = self.mk_sp(start, self.pos);
+                self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default().push(span);
+                token::Ident(sym, false)
+            }
             rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
                 let suffix_start = start + BytePos(suffix_start as u32);
                 let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
@@ -293,7 +310,7 @@ impl<'a> StringReader<'a> {
             rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
             rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
 
-            rustc_lexer::TokenKind::Unknown => {
+            rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
                 let c = self.str_from(start).chars().next().unwrap();
                 let mut err =
                     self.struct_fatal_span_char(start, self.pos, "unknown start of token", c);
diff --git a/compiler/rustc_parse/src/lexer/unicode_chars.rs b/compiler/rustc_parse/src/lexer/unicode_chars.rs
index 3eebc088f3f..ccd11f06bc5 100644
--- a/compiler/rustc_parse/src/lexer/unicode_chars.rs
+++ b/compiler/rustc_parse/src/lexer/unicode_chars.rs
@@ -7,7 +7,7 @@ use rustc_errors::{Applicability, DiagnosticBuilder};
 use rustc_span::{symbol::kw, BytePos, Pos, Span};
 
 #[rustfmt::skip] // for line breaks
-const UNICODE_ARRAY: &[(char, &str, char)] = &[
+pub(crate) const UNICODE_ARRAY: &[(char, &str, char)] = &[
     (' ', "Line Separator", ' '),
     (' ', "Paragraph Separator", ' '),
     (' ', "Ogham Space mark", ' '),
author	bors <bors@rust-lang.org>	2021-11-25 08:16:08 +0000
committer	bors <bors@rust-lang.org>	2021-11-25 08:16:08 +0000
commit	23a436606b118bd2fbb12f64fce21e7f9d355349 (patch)
tree	f7cdcfcb705dc416fd967deb4e89ad3184f282c4 /compiler/rustc_parse/src
parent	c6eda7d8a7af3ef51311d3106874a7d8de994edc (diff)
parent	d92916439c372967e4c12b7ece3c8d7e860a8777 (diff)
download	rust-23a436606b118bd2fbb12f64fce21e7f9d355349.tar.gz rust-23a436606b118bd2fbb12f64fce21e7f9d355349.zip