about summary refs log tree commit diff
diff options
context:
space:
mode:
author: 许杰友 Jieyou Xu (Joe) <jieyouxu@outlook.com>, 2023-02-14 17:31:58 +0800
committer: 许杰友 Jieyou Xu (Joe) <jieyouxu@outlook.com>, 2023-02-14 17:31:58 +0800
commit: 380fa264132ad481e73cbbf0f3a0feefd99a1d78 (patch)
tree: 337f10e0613cf08057868680ef2798d27089deef
parent: c3c6d73b04a718aceabc314bf231a20c90ccd601 (diff)
download: rust-380fa264132ad481e73cbbf0f3a0feefd99a1d78.tar.gz
download: rust-380fa264132ad481e73cbbf0f3a0feefd99a1d78.zip
Don't recover lifetimes/labels containing emojis as character literals
Note that, at the time of this commit, the data tables in `unic-emoji-char`
appear to cover only up to Unicode 5.0, while the Unicode Standard itself has
since moved on to newer versions.

As a result, a newer emoji such as `🥺` is not recognized as an emoji,
whereas older emojis such as `🐱` are.
-rw-r--r-- compiler/rustc_errors/src/lib.rs | 2
-rw-r--r-- compiler/rustc_lexer/src/lib.rs | 43
-rw-r--r-- compiler/rustc_parse/src/lexer/mod.rs | 9
-rw-r--r-- tests/ui/lexer/issue-108019-bad-emoji-recovery.rs | 45
-rw-r--r-- tests/ui/lexer/issue-108019-bad-emoji-recovery.stderr | 86
5 files changed, 173 insertions, 12 deletions
diff --git a/compiler/rustc_errors/src/lib.rs b/compiler/rustc_errors/src/lib.rs
index 83b733d4c06..b310b191d52 100644
--- a/compiler/rustc_errors/src/lib.rs
+++ b/compiler/rustc_errors/src/lib.rs
@@ -471,6 +471,8 @@ pub enum StashKey {
     /// When an invalid lifetime e.g. `'2` should be reinterpreted
     /// as a char literal in the parser
     LifetimeIsChar,
+    /// When an invalid lifetime e.g. `'🐱` contains emoji.
+    LifetimeContainsEmoji,
     /// Maybe there was a typo where a comma was forgotten before
     /// FRU syntax
     MaybeFruTypo,
diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs
index 6e815863d06..e6f04fe0aaa 100644
--- a/compiler/rustc_lexer/src/lib.rs
+++ b/compiler/rustc_lexer/src/lib.rs
@@ -95,7 +95,7 @@ pub enum TokenKind {
     Literal { kind: LiteralKind, suffix_start: u32 },
 
     /// "'a"
-    Lifetime { starts_with_number: bool },
+    Lifetime { starts_with_number: bool, contains_emoji: bool },
 
     // One-char tokens:
     /// ";"
@@ -630,7 +630,13 @@ impl Cursor<'_> {
             // If the first symbol is valid for identifier, it can be a lifetime.
             // Also check if it's a number for a better error reporting (so '0 will
             // be reported as invalid lifetime and not as unterminated char literal).
-            is_id_start(self.first()) || self.first().is_digit(10)
+            // We also have to account for potential `'🐱` emojis to avoid reporting
+            // it as an unterminated char literal.
+            is_id_start(self.first())
+                || self.first().is_digit(10)
+                // FIXME(#108019): `unic-emoji-char` seems to have data tables only up to Unicode
+                // 5.0, but Unicode is already newer than this.
+                || unic_emoji_char::is_emoji(self.first())
         };
 
         if !can_be_a_lifetime {
@@ -643,16 +649,33 @@ impl Cursor<'_> {
             return Literal { kind, suffix_start };
         }
 
-        // Either a lifetime or a character literal with
-        // length greater than 1.
+        // Either a lifetime or a character literal.
 
         let starts_with_number = self.first().is_digit(10);
+        let mut contains_emoji = false;
 
-        // Skip the literal contents.
-        // First symbol can be a number (which isn't a valid identifier start),
-        // so skip it without any checks.
-        self.bump();
-        self.eat_while(is_id_continue);
+        // FIXME(#108019): `unic-emoji-char` seems to have data tables only up to Unicode
+        // 5.0, but Unicode is already newer than this.
+        if unic_emoji_char::is_emoji(self.first()) {
+            contains_emoji = true;
+        } else {
+            // Skip the literal contents.
+            // First symbol can be a number (which isn't a valid identifier start),
+            // so skip it without any checks.
+            self.bump();
+        }
+        self.eat_while(|c| {
+            if is_id_continue(c) {
+                true
+            // FIXME(#108019): `unic-emoji-char` seems to have data tables only up to Unicode
+            // 5.0, but Unicode is already newer than this.
+            } else if unic_emoji_char::is_emoji(c) {
+                contains_emoji = true;
+                true
+            } else {
+                false
+            }
+        });
 
         // Check if after skipping literal contents we've met a closing
         // single quote (which means that user attempted to create a
@@ -662,7 +685,7 @@ impl Cursor<'_> {
             let kind = Char { terminated: true };
             Literal { kind, suffix_start: self.pos_within_token() }
         } else {
-            Lifetime { starts_with_number }
+            Lifetime { starts_with_number, contains_emoji }
         }
     }
 
diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
index bd998ed91d9..37449aaabed 100644
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@@ -200,16 +200,21 @@ impl<'a> StringReader<'a> {
                     };
                     token::Literal(token::Lit { kind, symbol, suffix })
                 }
-                rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
+                rustc_lexer::TokenKind::Lifetime { starts_with_number, contains_emoji } => {
                     // Include the leading `'` in the real identifier, for macro
                     // expansion purposes. See #12512 for the gory details of why
                     // this is necessary.
                     let lifetime_name = self.str_from(start);
                     if starts_with_number {
                         let span = self.mk_sp(start, self.pos);
-                        let mut diag = self.sess.struct_err("lifetimes cannot start with a number");
+                        let mut diag = self.sess.struct_err("lifetimes or labels cannot start with a number");
                         diag.set_span(span);
                         diag.stash(span, StashKey::LifetimeIsChar);
+                    } else if contains_emoji {
+                        let span = self.mk_sp(start, self.pos);
+                        let mut diag = self.sess.struct_err("lifetimes or labels cannot contain emojis");
+                        diag.set_span(span);
+                        diag.stash(span, StashKey::LifetimeContainsEmoji);
                     }
                     let ident = Symbol::intern(lifetime_name);
                     token::Lifetime(ident)
diff --git a/tests/ui/lexer/issue-108019-bad-emoji-recovery.rs b/tests/ui/lexer/issue-108019-bad-emoji-recovery.rs
new file mode 100644
index 00000000000..f0f86224560
--- /dev/null
+++ b/tests/ui/lexer/issue-108019-bad-emoji-recovery.rs
@@ -0,0 +1,45 @@
+#![allow(unused_labels)]
+
+// FIXME(#108019): outdated Unicode table
+// fn foo() {
+//     '🥺 loop {
+//         break
+//     }
+// }
+
+fn bar() {
+    '🐱 loop {
+    //~^ ERROR labeled expression must be followed by `:`
+    //~| ERROR lifetimes or labels cannot contain emojis
+        break
+    }
+}
+
+fn qux() {
+    'a🐱 loop {
+    //~^ ERROR labeled expression must be followed by `:`
+    //~| ERROR lifetimes or labels cannot contain emojis
+        break
+    }
+}
+
+fn quux() {
+    '1🐱 loop {
+    //~^ ERROR labeled expression must be followed by `:`
+    //~| ERROR lifetimes or labels cannot start with a number
+        break
+    }
+}
+
+fn x<'🐱>() -> &'🐱 () {
+    //~^ ERROR lifetimes or labels cannot contain emojis
+    //~| ERROR lifetimes or labels cannot contain emojis
+    &()
+}
+
+fn y() {
+    'a🐱: loop {}
+    //~^ ERROR lifetimes or labels cannot contain emojis
+}
+
+fn main() {}
diff --git a/tests/ui/lexer/issue-108019-bad-emoji-recovery.stderr b/tests/ui/lexer/issue-108019-bad-emoji-recovery.stderr
new file mode 100644
index 00000000000..be77ffdea34
--- /dev/null
+++ b/tests/ui/lexer/issue-108019-bad-emoji-recovery.stderr
@@ -0,0 +1,86 @@
+error: labeled expression must be followed by `:`
+  --> $DIR/issue-108019-bad-emoji-recovery.rs:11:5
+   |
+LL |       '🐱 loop {
+   |       ^--- help: add `:` after the label
+   |       |
+   |  _____the label
+   | |
+LL | |
+LL | |
+LL | |         break
+LL | |     }
+   | |_____^
+   |
+   = note: labels are used before loops and blocks, allowing e.g., `break 'label` to them
+
+error: labeled expression must be followed by `:`
+  --> $DIR/issue-108019-bad-emoji-recovery.rs:19:5
+   |
+LL |       'a🐱 loop {
+   |       ^---- help: add `:` after the label
+   |       |
+   |  _____the label
+   | |
+LL | |
+LL | |
+LL | |         break
+LL | |     }
+   | |_____^
+   |
+   = note: labels are used before loops and blocks, allowing e.g., `break 'label` to them
+
+error: labeled expression must be followed by `:`
+  --> $DIR/issue-108019-bad-emoji-recovery.rs:27:5
+   |
+LL |       '1🐱 loop {
+   |       ^---- help: add `:` after the label
+   |       |
+   |  _____the label
+   | |
+LL | |
+LL | |
+LL | |         break
+LL | |     }
+   | |_____^
+   |
+   = note: labels are used before loops and blocks, allowing e.g., `break 'label` to them
+
+error: lifetimes or labels cannot contain emojis
+  --> $DIR/issue-108019-bad-emoji-recovery.rs:11:5
+   |
+LL |     '🐱 loop {
+   |     ^^^
+
+error: lifetimes or labels cannot contain emojis
+  --> $DIR/issue-108019-bad-emoji-recovery.rs:19:5
+   |
+LL |     'a🐱 loop {
+   |     ^^^^
+
+error: lifetimes or labels cannot start with a number
+  --> $DIR/issue-108019-bad-emoji-recovery.rs:27:5
+   |
+LL |     '1🐱 loop {
+   |     ^^^^
+
+error: lifetimes or labels cannot contain emojis
+  --> $DIR/issue-108019-bad-emoji-recovery.rs:34:6
+   |
+LL | fn x<'🐱>() -> &'🐱 () {
+   |      ^^^
+
+error: lifetimes or labels cannot contain emojis
+  --> $DIR/issue-108019-bad-emoji-recovery.rs:34:16
+   |
+LL | fn x<'🐱>() -> &'🐱 () {
+   |                 ^^^
+
+error: lifetimes or labels cannot contain emojis
+  --> $DIR/issue-108019-bad-emoji-recovery.rs:41:5
+   |
+LL |     'a🐱: loop {}
+   |     ^^^^
+
+error: aborting due to 9 previous errors
+