about summary refs log tree commit diff
diff options
context:
space:
mode:
authorEsteban Kuber <esteban@kuber.com.ar>2021-08-29 08:34:23 +0000
committerEsteban Kuber <esteban@kuber.com.ar>2021-11-23 20:35:07 +0000
commit5a68abb0943e687cc6c8165376b4ed6deda17db3 (patch)
treed3fb75d0d5d4be7be5e4e1cf665c94da12ed647b
parent311fa1f14dd8ffbbe83b229a94b17f7f1ecaf33b (diff)
downloadrust-5a68abb0943e687cc6c8165376b4ed6deda17db3.tar.gz
rust-5a68abb0943e687cc6c8165376b4ed6deda17db3.zip
Tokenize emoji as if they were valid identifiers
In the lexer, consider emojis to be valid identifiers and reject
them later to avoid knock-on parse errors.
-rw-r--r--Cargo.lock42
-rw-r--r--compiler/rustc_interface/src/passes.rs12
-rw-r--r--compiler/rustc_lexer/Cargo.toml1
-rw-r--r--compiler/rustc_lexer/src/lib.rs25
-rw-r--r--compiler/rustc_parse/src/lexer/mod.rs6
-rw-r--r--compiler/rustc_session/src/parse.rs8
-rw-r--r--src/test/ui/parser/emoji-identifiers.rs16
-rw-r--r--src/test/ui/parser/emoji-identifiers.stderr72
8 files changed, 180 insertions, 2 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 51ed441d0db..edc227d9db7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4040,6 +4040,7 @@ name = "rustc_lexer"
 version = "0.1.0"
 dependencies = [
  "expect-test",
+ "unic-emoji-char",
  "unicode-xid",
 ]
 
@@ -5511,6 +5512,47 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c"
 
 [[package]]
+name = "unic-char-property"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a8c57a407d9b6fa02b4795eb81c5b6652060a15a7903ea981f3d723e6c0be221"
+dependencies = [
+ "unic-char-range",
+]
+
+[[package]]
+name = "unic-char-range"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0398022d5f700414f6b899e10b8348231abf9173fa93144cbc1a43b9793c1fbc"
+
+[[package]]
+name = "unic-common"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc"
+
+[[package]]
+name = "unic-emoji-char"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b07221e68897210270a38bde4babb655869637af0f69407f96053a34f76494d"
+dependencies = [
+ "unic-char-property",
+ "unic-char-range",
+ "unic-ucd-version",
+]
+
+[[package]]
+name = "unic-ucd-version"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96bd2f2237fe450fcd0a1d2f5f4e91711124f7857ba2e964247776ebeeb7b0c4"
+dependencies = [
+ "unic-common",
+]
+
+[[package]]
 name = "unicase"
 version = "2.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/compiler/rustc_interface/src/passes.rs b/compiler/rustc_interface/src/passes.rs
index b073ee9682f..7286209040c 100644
--- a/compiler/rustc_interface/src/passes.rs
+++ b/compiler/rustc_interface/src/passes.rs
@@ -35,7 +35,7 @@ use rustc_session::output::{filename_for_input, filename_for_metadata};
 use rustc_session::search_paths::PathKind;
 use rustc_session::{Limit, Session};
 use rustc_span::symbol::{sym, Ident, Symbol};
-use rustc_span::FileName;
+use rustc_span::{FileName, MultiSpan};
 use rustc_trait_selection::traits;
 use rustc_typeck as typeck;
 use tempfile::Builder as TempFileBuilder;
@@ -450,6 +450,16 @@ pub fn configure_and_expand(
         });
     }
 
+    // Gate identifiers containing invalid Unicode codepoints that were recovered during lexing.
+    sess.parse_sess.bad_unicode_identifiers.with_lock(|identifiers| {
+        for (ident, spans) in identifiers.drain() {
+            sess.diagnostic().span_err(
+                MultiSpan::from(spans),
+                &format!("identifiers cannot contain emojis: `{}`", ident),
+            );
+        }
+    });
+
     Ok(krate)
 }
 
diff --git a/compiler/rustc_lexer/Cargo.toml b/compiler/rustc_lexer/Cargo.toml
index 60c146f457b..35af110537d 100644
--- a/compiler/rustc_lexer/Cargo.toml
+++ b/compiler/rustc_lexer/Cargo.toml
@@ -17,6 +17,7 @@ doctest = false
 # Note that this crate purposefully does not depend on other rustc crates
 [dependencies]
 unicode-xid = "0.2.0"
+unic-emoji-char = "0.9.0"
 
 [dev-dependencies]
 expect-test = "1.0"
diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs
index b64a891cb25..a729c0c0bbe 100644
--- a/compiler/rustc_lexer/src/lib.rs
+++ b/compiler/rustc_lexer/src/lib.rs
@@ -64,6 +64,8 @@ pub enum TokenKind {
     /// "ident" or "continue"
     /// At this step keywords are also considered identifiers.
     Ident,
+    /// Like the above, but containing invalid unicode codepoints.
+    InvalidIdent,
     /// "r#ident"
     RawIdent,
     /// An unknown prefix like `foo#`, `foo'`, `foo"`. Note that only the
@@ -411,6 +413,11 @@ impl Cursor<'_> {
                 let kind = Str { terminated };
                 Literal { kind, suffix_start }
             }
+            // Identifier (this should be checked after other variant that can
+            // start as identifier).
+            c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
+                self.fake_ident_or_unknown_prefix()
+            }
             _ => Unknown,
         };
         Token::new(token_kind, self.len_consumed())
@@ -492,10 +499,28 @@ impl Cursor<'_> {
         // we see a prefix here, it is definitely an unknown prefix.
         match self.first() {
             '#' | '"' | '\'' => UnknownPrefix,
+            c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
+                self.fake_ident_or_unknown_prefix()
+            }
             _ => Ident,
         }
     }
 
+    fn fake_ident_or_unknown_prefix(&mut self) -> TokenKind {
+        // Start is already eaten, eat the rest of identifier.
+        self.eat_while(|c| {
+            unicode_xid::UnicodeXID::is_xid_continue(c)
+                || (!c.is_ascii() && unic_emoji_char::is_emoji(c))
+                || c == '\u{200d}'
+        });
+        // Known prefixes must have been handled earlier. So if
+        // we see a prefix here, it is definitely an unknown prefix.
+        match self.first() {
+            '#' | '"' | '\'' => UnknownPrefix,
+            _ => InvalidIdent,
+        }
+    }
+
     fn number(&mut self, first_digit: char) -> LiteralKind {
         debug_assert!('0' <= self.prev() && self.prev() <= '9');
         let mut base = Base::Decimal;
diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
index cf35c3cd53b..c4a3dd9bfda 100644
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@@ -222,6 +222,12 @@ impl<'a> StringReader<'a> {
                 }
                 token::Ident(sym, is_raw_ident)
             }
+            rustc_lexer::TokenKind::InvalidIdent => {
+                let sym = nfc_normalize(self.str_from(start));
+                let span = self.mk_sp(start, self.pos);
+                self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default().push(span);
+                token::Ident(sym, false)
+            }
             rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
                 let suffix_start = start + BytePos(suffix_start as u32);
                 let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
diff --git a/compiler/rustc_session/src/parse.rs b/compiler/rustc_session/src/parse.rs
index f7246641dca..4e0f6c32e57 100644
--- a/compiler/rustc_session/src/parse.rs
+++ b/compiler/rustc_session/src/parse.rs
@@ -119,8 +119,13 @@ pub struct ParseSess {
     pub config: CrateConfig,
     pub edition: Edition,
     pub missing_fragment_specifiers: Lock<FxHashMap<Span, NodeId>>,
-    /// Places where raw identifiers were used. This is used for feature-gating raw identifiers.
+    /// Places where raw identifiers were used. This is used to avoid complaining about idents
+    /// clashing with keywords in new editions.
     pub raw_identifier_spans: Lock<Vec<Span>>,
+    /// Places where identifiers containing invalid Unicode codepoints (but that otherwise look
+    /// like identifiers) were found. Useful to avoid bad tokenization when encountering emojis.
+    /// We group them to provide a single error per unique incorrect identifier.
+    pub bad_unicode_identifiers: Lock<FxHashMap<Symbol, Vec<Span>>>,
     source_map: Lrc<SourceMap>,
     pub buffered_lints: Lock<Vec<BufferedEarlyLint>>,
     /// Contains the spans of block expressions that could have been incomplete based on the
@@ -160,6 +165,7 @@ impl ParseSess {
             edition: ExpnId::root().expn_data().edition,
             missing_fragment_specifiers: Default::default(),
             raw_identifier_spans: Lock::new(Vec::new()),
+            bad_unicode_identifiers: Lock::new(Default::default()),
             source_map,
             buffered_lints: Lock::new(vec![]),
             ambiguous_block_expr_parse: Lock::new(FxHashMap::default()),
diff --git a/src/test/ui/parser/emoji-identifiers.rs b/src/test/ui/parser/emoji-identifiers.rs
new file mode 100644
index 00000000000..2f4df4cf216
--- /dev/null
+++ b/src/test/ui/parser/emoji-identifiers.rs
@@ -0,0 +1,16 @@
+struct ABig👩‍👩‍👧‍👧Family; //~ ERROR identifiers cannot contain emojis
+struct 👀; //~ ERROR identifiers cannot contain emojis
+impl 👀 {
+    fn full_of_✨() -> 👀 { //~ ERROR identifiers cannot contain emojis
+        👀
+    }
+}
+fn i_like_to_😅_a_lot() -> 👀 { //~ ERROR identifiers cannot contain emojis
+    👀::full_of✨() //~ ERROR no function or associated item named `full_of✨` found for struct `👀`
+    //~^ ERROR identifiers cannot contain emojis
+}
+fn main() {
+    let _ = i_like_to_😄_a_lot(); //~ ERROR cannot find function `i_like_to_😄_a_lot` in this scope
+    //~^ ERROR identifiers cannot contain emojis
+}
+
diff --git a/src/test/ui/parser/emoji-identifiers.stderr b/src/test/ui/parser/emoji-identifiers.stderr
new file mode 100644
index 00000000000..3b17bb01de9
--- /dev/null
+++ b/src/test/ui/parser/emoji-identifiers.stderr
@@ -0,0 +1,72 @@
+error[E0425]: cannot find function `i_like_to_😄_a_lot` in this scope
+  --> $DIR/emoji-identifiers.rs:13:13
+   |
+LL | fn i_like_to_😅_a_lot() -> 👀 {
+   | ----------------------------- similarly named function `i_like_to_😅_a_lot` defined here
+...
+LL |     let _ = i_like_to_😄_a_lot();
+   |             ^^^^^^^^^^^^^^^^^^ help: a function with a similar name exists: `i_like_to_😅_a_lot`
+
+error: identifiers cannot contain emojis: `i_like_to_😄_a_lot`
+  --> $DIR/emoji-identifiers.rs:13:13
+   |
+LL |     let _ = i_like_to_😄_a_lot();
+   |             ^^^^^^^^^^^^^^^^^^
+
+error: identifiers cannot contain emojis: `full_of_✨`
+  --> $DIR/emoji-identifiers.rs:4:8
+   |
+LL |     fn full_of_✨() -> 👀 {
+   |        ^^^^^^^^^^
+
+error: identifiers cannot contain emojis: `full_of✨`
+  --> $DIR/emoji-identifiers.rs:9:8
+   |
+LL |     👀::full_of✨()
+   |         ^^^^^^^^^
+
+error: identifiers cannot contain emojis: `👀`
+  --> $DIR/emoji-identifiers.rs:2:8
+   |
+LL | struct 👀;
+   |        ^^
+LL | impl 👀 {
+   |      ^^
+LL |     fn full_of_✨() -> 👀 {
+   |                        ^^
+LL |         👀
+   |         ^^
+...
+LL | fn i_like_to_😅_a_lot() -> 👀 {
+   |                            ^^
+LL |     👀::full_of✨()
+   |     ^^
+
+error: identifiers cannot contain emojis: `i_like_to_😅_a_lot`
+  --> $DIR/emoji-identifiers.rs:8:4
+   |
+LL | fn i_like_to_😅_a_lot() -> 👀 {
+   |    ^^^^^^^^^^^^^^^^^^
+
+error: identifiers cannot contain emojis: `ABig👩‍👩‍👧‍👧Family`
+  --> $DIR/emoji-identifiers.rs:1:8
+   |
+LL | struct ABig👩‍👩‍👧‍👧Family;
+   |        ^^^^^^^^^^^^^^^^^^
+
+error[E0599]: no function or associated item named `full_of✨` found for struct `👀` in the current scope
+  --> $DIR/emoji-identifiers.rs:9:8
+   |
+LL | struct 👀;
+   | ---------- function or associated item `full_of✨` not found for this
+...
+LL |     👀::full_of✨()
+   |         ^^^^^^^^^
+   |         |
+   |         function or associated item not found in `👀`
+   |         help: there is an associated function with a similar name: `full_of_✨`
+
+error: aborting due to 8 previous errors
+
+Some errors have detailed explanations: E0425, E0599.
+For more information about an error, try `rustc --explain E0425`.