Implement RFC 3503: frontmatters

Supercedes #137193
author: Deadbeef <ent3rm4n@gmail.com> 2025-04-12 15:53:46 +0000
committer: Deadbeef <ent3rm4n@gmail.com> 2025-05-05 23:10:08 +0800
commit: 662182637e100642d1e5de7cb193da837a14ae48 (patch)
tree: 28e183216387d8d8cd939404a0c2fc01efaf450f /compiler/rustc_lexer/src/lib.rs
parent: 0c33fe2c3d3eecadd17a84b110bb067288a64f1c (diff)
download: rust-662182637e100642d1e5de7cb193da837a14ae48.tar.gz
rust-662182637e100642d1e5de7cb193da837a14ae48.zip
1 files changed, 148 insertions, 7 deletions
diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs
index f9c71b2fa65..2374f388250 100644
--- a/compiler/rustc_lexer/src/lib.rs
+++ b/compiler/rustc_lexer/src/lib.rs
@@ -35,8 +35,8 @@ pub use unicode_xid::UNICODE_VERSION as UNICODE_XID_VERSION;
 
 use self::LiteralKind::*;
 use self::TokenKind::*;
-pub use crate::cursor::Cursor;
 use crate::cursor::EOF_CHAR;
+pub use crate::cursor::{Cursor, FrontmatterAllowed};
 
 /// Parsed token.
 /// It doesn't contain information about data that has been parsed,
@@ -57,17 +57,27 @@ impl Token {
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub enum TokenKind {
     /// A line comment, e.g. `// comment`.
-    LineComment { doc_style: Option<DocStyle> },
+    LineComment {
+        doc_style: Option<DocStyle>,
+    },
 
     /// A block comment, e.g. `/* block comment */`.
     ///
     /// Block comments can be recursive, so a sequence like `/* /* */`
     /// will not be considered terminated and will result in a parsing error.
-    BlockComment { doc_style: Option<DocStyle>, terminated: bool },
+    BlockComment {
+        doc_style: Option<DocStyle>,
+        terminated: bool,
+    },
 
     /// Any whitespace character sequence.
     Whitespace,
 
+    Frontmatter {
+        has_invalid_preceding_whitespace: bool,
+        invalid_infostring: bool,
+    },
+
     /// An identifier or keyword, e.g. `ident` or `continue`.
     Ident,
 
@@ -109,10 +119,15 @@ pub enum TokenKind {
     /// this type will need to check for and reject that case.
     ///
     /// See [LiteralKind] for more details.
-    Literal { kind: LiteralKind, suffix_start: u32 },
+    Literal {
+        kind: LiteralKind,
+        suffix_start: u32,
+    },
 
     /// A lifetime, e.g. `'a`.
-    Lifetime { starts_with_number: bool },
+    Lifetime {
+        starts_with_number: bool,
+    },
 
     /// `;`
     Semi,
@@ -280,7 +295,7 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
 #[inline]
 pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError> {
     debug_assert!(!input.is_empty());
-    let mut cursor = Cursor::new(input);
+    let mut cursor = Cursor::new(input, FrontmatterAllowed::No);
     // Move past the leading `r` or `br`.
     for _ in 0..prefix_len {
         cursor.bump().unwrap();
@@ -290,7 +305,7 @@ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError>
 
 /// Creates an iterator that produces tokens from the input string.
 pub fn tokenize(input: &str) -> impl Iterator<Item = Token> {
-    let mut cursor = Cursor::new(input);
+    let mut cursor = Cursor::new(input, FrontmatterAllowed::No);
     std::iter::from_fn(move || {
         let token = cursor.advance_token();
         if token.kind != TokenKind::Eof { Some(token) } else { None }
@@ -361,7 +376,34 @@ impl Cursor<'_> {
             Some(c) => c,
             None => return Token::new(TokenKind::Eof, 0),
         };
+
         let token_kind = match first_char {
+            c if matches!(self.frontmatter_allowed, FrontmatterAllowed::Yes)
+                && is_whitespace(c) =>
+            {
+                let mut last = first_char;
+                while is_whitespace(self.first()) {
+                    let Some(c) = self.bump() else {
+                        break;
+                    };
+                    last = c;
+                }
+                // invalid frontmatter opening as whitespace preceding it isn't newline.
+                // combine the whitespace and the frontmatter to a single token as we shall
+                // error later.
+                if last != '\n' && self.as_str().starts_with("---") {
+                    self.bump();
+                    self.frontmatter(true)
+                } else {
+                    Whitespace
+                }
+            }
+            '-' if matches!(self.frontmatter_allowed, FrontmatterAllowed::Yes)
+                && self.as_str().starts_with("--") =>
+            {
+                // happy path
+                self.frontmatter(false)
+            }
             // Slash, comment or block comment.
             '/' => match self.first() {
                 '/' => self.line_comment(),
@@ -464,11 +506,110 @@ impl Cursor<'_> {
             c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(),
             _ => Unknown,
         };
+        if matches!(self.frontmatter_allowed, FrontmatterAllowed::Yes)
+            && !matches!(token_kind, Whitespace)
+        {
+            // stop allowing frontmatters after first non-whitespace token
+            self.frontmatter_allowed = FrontmatterAllowed::No;
+        }
         let res = Token::new(token_kind, self.pos_within_token());
         self.reset_pos_within_token();
         res
     }
 
+    /// Given that one `-` was eaten, eat the rest of the frontmatter.
+    fn frontmatter(&mut self, has_invalid_preceding_whitespace: bool) -> TokenKind {
+        debug_assert_eq!('-', self.prev());
+
+        let pos = self.pos_within_token();
+        self.eat_while(|c| c == '-');
+
+        // one `-` is eaten by the caller.
+        let length_opening = self.pos_within_token() - pos + 1;
+
+        // must be ensured by the caller
+        debug_assert!(length_opening >= 3);
+
+        // whitespace between the opening and the infostring.
+        self.eat_while(|ch| ch != '\n' && is_whitespace(ch));
+
+        // copied from `eat_identifier`, but allows `.` in infostring to allow something like
+        // `---Cargo.toml` as a valid opener
+        if is_id_start(self.first()) {
+            self.bump();
+            self.eat_while(|c| is_id_continue(c) || c == '.');
+        }
+
+        self.eat_while(|ch| ch != '\n' && is_whitespace(ch));
+        let invalid_infostring = self.first() != '\n';
+
+        let mut s = self.as_str();
+        let mut found = false;
+        while let Some(closing) = s.find(&"-".repeat(length_opening as usize)) {
+            let preceding_chars_start = s[..closing].rfind("\n").map_or(0, |i| i + 1);
+            if s[preceding_chars_start..closing].chars().all(is_whitespace) {
+                // candidate found
+                self.bump_bytes(closing);
+                // in case like
+                // ---cargo
+                // --- blahblah
+                // or
+                // ---cargo
+                // ----
+                // combine those stuff into this frontmatter token such that it gets detected later.
+                self.eat_until(b'\n');
+                found = true;
+                break;
+            } else {
+                s = &s[closing + length_opening as usize..];
+            }
+        }
+
+        if !found {
+            // recovery strategy: a closing statement might have precending whitespace/newline
+            // but not have enough dashes to properly close. In this case, we eat until there,
+            // and report a mismatch in the parser.
+            let mut rest = self.as_str();
+            // We can look for a shorter closing (starting with four dashes but closing with three)
+            // and other indications that Rust has started and the infostring has ended.
+            let mut potential_closing = rest
+                .find("\n---")
+                // n.b. only in the case where there are dashes, we move the index to the line where
+                // the dashes start as we eat to include that line. For other cases those are Rust code
+                // and not included in the frontmatter.
+                .map(|x| x + 1)
+                .or_else(|| rest.find("\nuse "))
+                .or_else(|| rest.find("\n//!"))
+                .or_else(|| rest.find("\n#!["));
+
+            if potential_closing.is_none() {
+                // a less fortunate recovery if all else fails which finds any dashes preceded by whitespace
+                // on a standalone line. Might be wrong.
+                while let Some(closing) = rest.find("---") {
+                    let preceding_chars_start = rest[..closing].rfind("\n").map_or(0, |i| i + 1);
+                    if rest[preceding_chars_start..closing].chars().all(is_whitespace) {
+                        // candidate found
+                        potential_closing = Some(closing);
+                        break;
+                    } else {
+                        rest = &rest[closing + 3..];
+                    }
+                }
+            }
+
+            if let Some(potential_closing) = potential_closing {
+                // bump to the potential closing, and eat everything on that line.
+                self.bump_bytes(potential_closing);
+                self.eat_until(b'\n');
+            } else {
+                // eat everything. this will get reported as an unclosed frontmatter.
+                self.eat_while(|_| true);
+            }
+        }
+
+        Frontmatter { has_invalid_preceding_whitespace, invalid_infostring }
+    }
+
     fn line_comment(&mut self) -> TokenKind {
         debug_assert!(self.prev() == '/' && self.first() == '/');
         self.bump();
author	Deadbeef <ent3rm4n@gmail.com>	2025-04-12 15:53:46 +0000
committer	Deadbeef <ent3rm4n@gmail.com>	2025-05-05 23:10:08 +0800
commit	662182637e100642d1e5de7cb193da837a14ae48 (patch)
tree	28e183216387d8d8cd939404a0c2fc01efaf450f /compiler/rustc_lexer/src/lib.rs
parent	0c33fe2c3d3eecadd17a84b110bb067288a64f1c (diff)
download	rust-662182637e100642d1e5de7cb193da837a14ae48.tar.gz rust-662182637e100642d1e5de7cb193da837a14ae48.zip