author     Jubilee <workingjubilee@gmail.com>  2024-09-23 07:54:44 -0700
committer  GitHub <noreply@github.com>         2024-09-23 07:54:44 -0700
commit     515bdcda01fb56609600d2f6214a382511046341 (patch)
tree       41f42d29a541b649c309c979566c2af04f3ef361
parent     c58e3cb1e2b61be57149c84ca8cce82a9eb37fd4 (diff)
parent     73cc5751773d4c49cc9d938548762520037926ba (diff)
Rollup merge of #130551 - nnethercote:fix-break-last-token, r=petrochenkov
Fix `break_last_token`.

It currently doesn't handle the three-char tokens `>>=` and `<<=` correctly. These can be broken twice, resulting in three individual tokens. The bug is currently latent and causes no problems, but it does cause problems for #124141, because that PR increases the usage of lazy token streams.

r? `@petrochenkov`
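
For context on the failure shape: the lexer emits `Vec<Vec<u32>>=` with a single `>>=` token, but the grammar needs it as `>` (closing the inner generic), `>` (closing the outer generic), and `=`. A minimal standalone illustration of that shape (the macro here is illustrative, not from the PR; it compiles today precisely because the bug is latent):

    macro_rules! m {
        ($t:ty = $e:expr) => { let _x: $t = $e; };
    }

    fn main() {
        // One `>>=` token must be consumed as three grammar-level tokens:
        // `>` + `>=` to finish the type, then that `>=` as `>` + `=`.
        m! { Vec<Vec<u32>>= vec![] }
    }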
 compiler/rustc_ast/src/token.rs                 | 64
 compiler/rustc_parse/src/parser/attr_wrapper.rs | 36
 compiler/rustc_parse/src/parser/mod.rs          | 39
 tests/ui/macros/break-last-token-twice.rs       | 16
 4 files changed, 91 insertions(+), 64 deletions(-)
diff --git a/compiler/rustc_ast/src/token.rs b/compiler/rustc_ast/src/token.rs
index de58df38141..1d6abbef06c 100644
--- a/compiler/rustc_ast/src/token.rs
+++ b/compiler/rustc_ast/src/token.rs
@@ -385,35 +385,41 @@ impl TokenKind {
         Literal(Lit::new(kind, symbol, suffix))
     }
 
-    /// An approximation to proc-macro-style single-character operators used by rustc parser.
-    /// If the operator token can be broken into two tokens, the first of which is single-character,
-    /// then this function performs that operation, otherwise it returns `None`.
-    pub fn break_two_token_op(&self) -> Option<(TokenKind, TokenKind)> {
-        Some(match *self {
-            Le => (Lt, Eq),
-            EqEq => (Eq, Eq),
-            Ne => (Not, Eq),
-            Ge => (Gt, Eq),
-            AndAnd => (BinOp(And), BinOp(And)),
-            OrOr => (BinOp(Or), BinOp(Or)),
-            BinOp(Shl) => (Lt, Lt),
-            BinOp(Shr) => (Gt, Gt),
-            BinOpEq(Plus) => (BinOp(Plus), Eq),
-            BinOpEq(Minus) => (BinOp(Minus), Eq),
-            BinOpEq(Star) => (BinOp(Star), Eq),
-            BinOpEq(Slash) => (BinOp(Slash), Eq),
-            BinOpEq(Percent) => (BinOp(Percent), Eq),
-            BinOpEq(Caret) => (BinOp(Caret), Eq),
-            BinOpEq(And) => (BinOp(And), Eq),
-            BinOpEq(Or) => (BinOp(Or), Eq),
-            BinOpEq(Shl) => (Lt, Le),
-            BinOpEq(Shr) => (Gt, Ge),
-            DotDot => (Dot, Dot),
-            DotDotDot => (Dot, DotDot),
-            PathSep => (Colon, Colon),
-            RArrow => (BinOp(Minus), Gt),
-            LArrow => (Lt, BinOp(Minus)),
-            FatArrow => (Eq, Gt),
+    /// An approximation to proc-macro-style single-character operators used by
+    /// the rustc parser. If the operator token can be broken into two tokens,
+    /// the first of which has `n` (1 or 2) chars, then this function performs
+    /// that operation; otherwise it returns `None`.
+    pub fn break_two_token_op(&self, n: u32) -> Option<(TokenKind, TokenKind)> {
+        assert!(n == 1 || n == 2);
+        Some(match (self, n) {
+            (Le, 1) => (Lt, Eq),
+            (EqEq, 1) => (Eq, Eq),
+            (Ne, 1) => (Not, Eq),
+            (Ge, 1) => (Gt, Eq),
+            (AndAnd, 1) => (BinOp(And), BinOp(And)),
+            (OrOr, 1) => (BinOp(Or), BinOp(Or)),
+            (BinOp(Shl), 1) => (Lt, Lt),
+            (BinOp(Shr), 1) => (Gt, Gt),
+            (BinOpEq(Plus), 1) => (BinOp(Plus), Eq),
+            (BinOpEq(Minus), 1) => (BinOp(Minus), Eq),
+            (BinOpEq(Star), 1) => (BinOp(Star), Eq),
+            (BinOpEq(Slash), 1) => (BinOp(Slash), Eq),
+            (BinOpEq(Percent), 1) => (BinOp(Percent), Eq),
+            (BinOpEq(Caret), 1) => (BinOp(Caret), Eq),
+            (BinOpEq(And), 1) => (BinOp(And), Eq),
+            (BinOpEq(Or), 1) => (BinOp(Or), Eq),
+            (BinOpEq(Shl), 1) => (Lt, Le),         // `<` + `<=`
+            (BinOpEq(Shl), 2) => (BinOp(Shl), Eq), // `<<` + `=`
+            (BinOpEq(Shr), 1) => (Gt, Ge),         // `>` + `>=`
+            (BinOpEq(Shr), 2) => (BinOp(Shr), Eq), // `>>` + `=`
+            (DotDot, 1) => (Dot, Dot),
+            (DotDotDot, 1) => (Dot, DotDot), // `.` + `..`
+            (DotDotDot, 2) => (DotDot, Dot), // `..` + `.`
+            (DotDotEq, 2) => (DotDot, Eq),
+            (PathSep, 1) => (Colon, Colon),
+            (RArrow, 1) => (BinOp(Minus), Gt),
+            (LArrow, 1) => (Lt, BinOp(Minus)),
+            (FatArrow, 1) => (Eq, Gt),
             _ => return None,
         })
     }
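
To see what the new `n` parameter buys: `>>=` can now be broken at width 1 (giving `>` + `>=`) or width 2 (giving `>>` + `=`). A self-contained model of just those arms, with a simplified `Tok` enum standing in for rustc's `TokenKind` (a sketch, not the compiler's actual API):

    // Simplified model of `break_two_token_op` with the two-width behavior.
    #[derive(Clone, Copy, Debug, PartialEq)]
    enum Tok { Gt, Ge, Eq, Shr, ShrEq }

    fn break_two_token_op(tok: Tok, n: u32) -> Option<(Tok, Tok)> {
        assert!(n == 1 || n == 2);
        match (tok, n) {
            (Tok::ShrEq, 1) => Some((Tok::Gt, Tok::Ge)),  // `>` + `>=`
            (Tok::ShrEq, 2) => Some((Tok::Shr, Tok::Eq)), // `>>` + `=`
            (Tok::Ge, 1) => Some((Tok::Gt, Tok::Eq)),     // `>` + `=`
            _ => None, // other arms elided; rustc handles many more tokens
        }
    }

    fn main() {
        // Breaking twice: width 1 first, then break the `>=` remainder.
        assert_eq!(break_two_token_op(Tok::ShrEq, 1), Some((Tok::Gt, Tok::Ge)));
        assert_eq!(break_two_token_op(Tok::Ge, 1), Some((Tok::Gt, Tok::Eq)));
        // Or take the two-char prefix in one step.
        assert_eq!(break_two_token_op(Tok::ShrEq, 2), Some((Tok::Shr, Tok::Eq)));
    }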
diff --git a/compiler/rustc_parse/src/parser/attr_wrapper.rs b/compiler/rustc_parse/src/parser/attr_wrapper.rs
index eb1c1a64578..8bd615e6d79 100644
--- a/compiler/rustc_parse/src/parser/attr_wrapper.rs
+++ b/compiler/rustc_parse/src/parser/attr_wrapper.rs
@@ -108,7 +108,7 @@ struct LazyAttrTokenStreamImpl {
     start_token: (Token, Spacing),
     cursor_snapshot: TokenCursor,
     num_calls: u32,
-    break_last_token: bool,
+    break_last_token: u32,
     node_replacements: Box<[NodeReplacement]>,
 }
 
@@ -339,17 +339,20 @@ impl<'a> Parser<'a> {
         let parser_replacements_end = self.capture_state.parser_replacements.len();
 
         assert!(
-            !(self.break_last_token && matches!(capture_trailing, Trailing::Yes)),
-            "Cannot set break_last_token and have trailing token"
+            !(self.break_last_token > 0 && matches!(capture_trailing, Trailing::Yes)),
+            "Cannot have break_last_token > 0 and have trailing token"
         );
+        assert!(self.break_last_token <= 2, "cannot break token more than twice");
 
         let end_pos = self.num_bump_calls
             + capture_trailing as u32
-            // If we 'broke' the last token (e.g. breaking a '>>' token to two '>' tokens), then
-            // extend the range of captured tokens to include it, since the parser was not actually
-            // bumped past it. When the `LazyAttrTokenStream` gets converted into an
-            // `AttrTokenStream`, we will create the proper token.
-            + self.break_last_token as u32;
+            // If we "broke" the last token (e.g. breaking a `>>` token once into `>` + `>`, or
+            // breaking a `>>=` token twice into `>` + `>` + `=`), then extend the range of
+            // captured tokens to include it, because the parser was not actually bumped past it.
+            // (Even if we broke twice, it was still just one token originally, hence the `1`.)
+            // When the `LazyAttrTokenStream` gets converted into an `AttrTokenStream`, we will
+            // rebreak that final token once or twice.
+            + if self.break_last_token == 0 { 0 } else { 1 };
 
         let num_calls = end_pos - collect_pos.start_pos;
 
@@ -425,7 +428,7 @@ impl<'a> Parser<'a> {
         // for the `#[cfg]` and/or `#[cfg_attr]` attrs. This allows us to run
         // eager cfg-expansion on the captured token stream.
         if definite_capture_mode {
-            assert!(!self.break_last_token, "Should not have unglued last token with cfg attr");
+            assert!(self.break_last_token == 0, "Should not have unglued last token with cfg attr");
 
             // What is the status here when parsing the example code at the top of this method?
             //
@@ -471,7 +474,7 @@ impl<'a> Parser<'a> {
 /// close delims.
 fn make_attr_token_stream(
     iter: impl Iterator<Item = FlatToken>,
-    break_last_token: bool,
+    break_last_token: u32,
 ) -> AttrTokenStream {
     #[derive(Debug)]
     struct FrameData {
@@ -513,18 +516,17 @@ fn make_attr_token_stream(
         }
     }
 
-    if break_last_token {
+    if break_last_token > 0 {
         let last_token = stack_top.inner.pop().unwrap();
         if let AttrTokenTree::Token(last_token, spacing) = last_token {
-            let unglued_first = last_token.kind.break_two_token_op().unwrap().0;
+            let (unglued, _) = last_token.kind.break_two_token_op(break_last_token).unwrap();
 
-            // An 'unglued' token is always two ASCII characters
+            // Tokens are always ASCII chars, so we can use byte arithmetic here.
             let mut first_span = last_token.span.shrink_to_lo();
-            first_span = first_span.with_hi(first_span.lo() + rustc_span::BytePos(1));
+            first_span =
+                first_span.with_hi(first_span.lo() + rustc_span::BytePos(break_last_token));
 
-            stack_top
-                .inner
-                .push(AttrTokenTree::Token(Token::new(unglued_first, first_span), spacing));
+            stack_top.inner.push(AttrTokenTree::Token(Token::new(unglued, first_span), spacing));
         } else {
             panic!("Unexpected last token {last_token:?}")
         }
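
To make the span arithmetic above concrete: the token pushed back into the captured stream keeps exactly `break_last_token` ASCII bytes of the original token (`>` for 1, `>>` for 2). A standalone sketch using plain byte offsets in place of the `rustc_span` types:

    // Width of the retained prefix equals `break_last_token` (1 or 2).
    fn first_token_span(lo: u32, break_last_token: u32) -> (u32, u32) {
        (lo, lo + break_last_token)
    }

    fn main() {
        // Suppose a `>>=` token occupies bytes 10..13 of the source:
        assert_eq!(first_token_span(10, 1), (10, 11)); // `>`  kept after one break
        assert_eq!(first_token_span(10, 2), (10, 12)); // `>>` kept after two breaks
    }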
diff --git a/compiler/rustc_parse/src/parser/mod.rs b/compiler/rustc_parse/src/parser/mod.rs
index 186828baf01..77ad4fdeeb1 100644
--- a/compiler/rustc_parse/src/parser/mod.rs
+++ b/compiler/rustc_parse/src/parser/mod.rs
@@ -146,21 +146,25 @@ pub struct Parser<'a> {
     token_cursor: TokenCursor,
     // The number of calls to `bump`, i.e. the position in the token stream.
     num_bump_calls: u32,
-    // During parsing we may sometimes need to 'unglue' a glued token into two
-    // component tokens (e.g. '>>' into '>' and '>), so the parser can consume
-    // them one at a time. This process bypasses the normal capturing mechanism
-    // (e.g. `num_bump_calls` will not be incremented), since the 'unglued'
-    // tokens due not exist in the original `TokenStream`.
+    // During parsing we may sometimes need to "unglue" a glued token into two
+    // or three component tokens (e.g. `>>` into `>` and `>`, or `>>=` into `>`
+    // and `>` and `=`), so the parser can consume them one at a time. This
+    // process bypasses the normal capturing mechanism (e.g. `num_bump_calls`
+    // will not be incremented), since the "unglued" tokens do not exist in
+    // the original `TokenStream`.
     //
-    // If we end up consuming both unglued tokens, this is not an issue. We'll
-    // end up capturing the single 'glued' token.
+    // If we end up consuming all the component tokens, this is not an issue,
+    // because we'll end up capturing the single "glued" token.
     //
-    // However, sometimes we may want to capture just the first 'unglued'
+    // However, sometimes we may want to capture only part of the original
     // token. For example, capturing the `Vec<u8>` in `Option<Vec<u8>>`
     // requires us to unglue the trailing `>>` token. The `break_last_token`
-    // field is used to track this token. It gets appended to the captured
+    // field is used to track these tokens. They get appended to the captured
     // stream when we evaluate a `LazyAttrTokenStream`.
-    break_last_token: bool,
+    //
+    // This value is always 0, 1, or 2. It can only reach 2 when splitting
+    // `>>=` or `<<=`.
+    break_last_token: u32,
     /// This field is used to keep track of how many left angle brackets we have seen. This is
     /// required in order to detect extra leading left angle brackets (`<` characters) and error
     /// appropriately.
@@ -453,7 +457,7 @@ impl<'a> Parser<'a> {
             expected_tokens: Vec::new(),
             token_cursor: TokenCursor { tree_cursor: stream.into_trees(), stack: Vec::new() },
             num_bump_calls: 0,
-            break_last_token: false,
+            break_last_token: 0,
             unmatched_angle_bracket_count: 0,
             angle_bracket_nesting: 0,
             last_unexpected_token_span: None,
@@ -773,7 +777,7 @@ impl<'a> Parser<'a> {
             self.bump();
             return true;
         }
-        match self.token.kind.break_two_token_op() {
+        match self.token.kind.break_two_token_op(1) {
             Some((first, second)) if first == expected => {
                 let first_span = self.psess.source_map().start_point(self.token.span);
                 let second_span = self.token.span.with_lo(first_span.hi());
@@ -783,8 +787,8 @@ impl<'a> Parser<'a> {
                 //
                 // If we consume any additional tokens, then this token
                 // is not needed (we'll capture the entire 'glued' token),
-                // and `bump` will set this field to `None`
-                self.break_last_token = true;
+                // and `bump` will set this field to 0.
+                self.break_last_token += 1;
                 // Use the spacing of the glued token as the spacing of the
                 // unglued second token.
                 self.bump_with((Token::new(second, second_span), self.token_spacing));
@@ -1148,10 +1152,9 @@ impl<'a> Parser<'a> {
         // than `.0`/`.1` access.
         let mut next = self.token_cursor.inlined_next();
         self.num_bump_calls += 1;
-        // We've retrieved an token from the underlying
-        // cursor, so we no longer need to worry about
-        // an unglued token. See `break_and_eat` for more details
-        self.break_last_token = false;
+        // We got a token from the underlying cursor and no longer need to
+        // worry about an unglued token. See `break_and_eat` for more details.
+        self.break_last_token = 0;
         if next.0.span.is_dummy() {
             // Tweak the location for better diagnostics, but keep syntactic context intact.
             let fallback_span = self.token.span;
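
A standalone trace of how `break_and_eat` drives the counter while the parser eats `>>=` one piece at a time. The types and names here are simplified stand-ins, not the rustc API; the real method also fixes up spans and spacing:

    #[derive(Clone, Copy, Debug, PartialEq)]
    enum Tok { Gt, Ge, Eq, ShrEq }

    struct Parser { token: Tok, break_last_token: u32 }

    impl Parser {
        // Eat `expected` by breaking the current token at width 1.
        fn break_and_eat(&mut self, expected: Tok) {
            let (first, second) = match self.token {
                Tok::ShrEq => (Tok::Gt, Tok::Ge), // `>>=` -> `>` + `>=`
                Tok::Ge => (Tok::Gt, Tok::Eq),    // `>=`  -> `>` + `=`
                t => (t, t),
            };
            assert_eq!(first, expected);
            self.break_last_token += 1; // a real `bump` resets this to 0
            self.token = second;        // the leftover becomes the current token
        }
    }

    fn main() {
        let mut p = Parser { token: Tok::ShrEq, break_last_token: 0 };
        p.break_and_eat(Tok::Gt); // close the inner generic
        p.break_and_eat(Tok::Gt); // close the outer generic
        assert_eq!(p.break_last_token, 2); // the new maximum the asserts allow
        assert_eq!(p.token, Tok::Eq);      // `=` remains for the next rule
    }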
diff --git a/tests/ui/macros/break-last-token-twice.rs b/tests/ui/macros/break-last-token-twice.rs
new file mode 100644
index 00000000000..791f349ab38
--- /dev/null
+++ b/tests/ui/macros/break-last-token-twice.rs
@@ -0,0 +1,16 @@
+//@ check-pass
+
+macro_rules! m {
+    (static $name:ident: $t:ty = $e:expr) => {
+        let $name: $t = $e;
+    }
+}
+
+fn main() {
+    m! {
+        // Tricky: the trailing `>>=` token here is broken twice:
+        // - into `>` and `>=`
+        // - then the `>=` is broken into `>` and `=`
+        static _x: Vec<Vec<u32>>= vec![]
+    }
+}
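
For reference, why this input forces a double break: `$t:ty` must end at the outer `Vec`, so the parser splits the trailing `>>=` into `>` + `>` + `=`, and the `=` then matches the literal `=` in the matcher. Conceptually, `main` ends up containing:

    let _x: Vec<Vec<u32>> = vec![];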