author     Ed Page <eopage@gmail.com>  2025-07-09 16:40:22 -0500
committer  Ed Page <eopage@gmail.com>  2025-07-09 16:42:27 -0500
commit     45a1e492b1adfc6f7664d1da736dff147e5c3168
tree       8e7a7a4cd1084e16fae200a80d5bafae5a8c6547
parent     425cd0f571b3888be81ebad295af8e3c903fb244
feat(lexer): Allow including frontmatter with 'tokenize'
Diffstat (limited to 'compiler/rustc_lexer')
 compiler/rustc_lexer/src/lib.rs   | 29
 compiler/rustc_lexer/src/tests.rs |  3
 2 files changed, 21 insertions(+), 11 deletions(-)
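
For context, the "frontmatter" here is the cargo-script metadata block: an optional fenced section at the very top of a Rust file, after any shebang, which `tokenize` previously always lexed with `FrontmatterAllowed::No`. A minimal sketch of a document carrying one, assuming the `---` fence syntax from the frontmatter RFC:

    #!/usr/bin/env cargo
    ---
    [dependencies]
    clap = "4"
    ---

    fn main() {
        println!("hello from a cargo script");
    }
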
diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs
index e30dbe80248..e80196ed567 100644
--- a/compiler/rustc_lexer/src/lib.rs
+++ b/compiler/rustc_lexer/src/lib.rs
@@ -273,14 +273,15 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
     if let Some(input_tail) = input.strip_prefix("#!") {
         // Ok, this is a shebang but if the next non-whitespace token is `[`,
         // then it may be valid Rust code, so consider it Rust code.
-        let next_non_whitespace_token = tokenize(input_tail).map(|tok| tok.kind).find(|tok| {
-            !matches!(
-                tok,
-                TokenKind::Whitespace
-                    | TokenKind::LineComment { doc_style: None }
-                    | TokenKind::BlockComment { doc_style: None, .. }
-            )
-        });
+        let next_non_whitespace_token =
+            tokenize(input_tail, FrontmatterAllowed::No).map(|tok| tok.kind).find(|tok| {
+                !matches!(
+                    tok,
+                    TokenKind::Whitespace
+                        | TokenKind::LineComment { doc_style: None }
+                        | TokenKind::BlockComment { doc_style: None, .. }
+                )
+            });
         if next_non_whitespace_token != Some(TokenKind::OpenBracket) {
             // No other choice than to consider this a shebang.
             return Some(2 + input_tail.lines().next().unwrap_or_default().len());
@@ -303,8 +304,16 @@ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError>
 }
 
 /// Creates an iterator that produces tokens from the input string.
-pub fn tokenize(input: &str) -> impl Iterator<Item = Token> {
-    let mut cursor = Cursor::new(input, FrontmatterAllowed::No);
+///
+/// When parsing a full Rust document,
+/// first [`strip_shebang`] and then allow frontmatters with [`FrontmatterAllowed::Yes`].
+///
+/// When tokenizing a slice of a document, be sure to disallow frontmatters with [`FrontmatterAllowed::No`]
+pub fn tokenize(
+    input: &str,
+    frontmatter_allowed: FrontmatterAllowed,
+) -> impl Iterator<Item = Token> {
+    let mut cursor = Cursor::new(input, frontmatter_allowed);
     std::iter::from_fn(move || {
         let token = cursor.advance_token();
         if token.kind != TokenKind::Eof { Some(token) } else { None }
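
The new doc comment spells out the calling convention for full documents: strip the shebang first, then lex with frontmatter permitted. A minimal sketch of such a caller (the `lex_document` helper is hypothetical, and assumes `FrontmatterAllowed` is exported from the crate root alongside `tokenize`):

    use rustc_lexer::{strip_shebang, tokenize, FrontmatterAllowed, TokenKind};

    // Hypothetical helper: skip the shebang, if any, then lex the rest
    // of the document with frontmatter permitted.
    fn lex_document(src: &str) -> Vec<TokenKind> {
        let body_start = strip_shebang(src).unwrap_or(0);
        tokenize(&src[body_start..], FrontmatterAllowed::Yes)
            .map(|tok| tok.kind)
            .collect()
    }
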
diff --git a/compiler/rustc_lexer/src/tests.rs b/compiler/rustc_lexer/src/tests.rs
index fc8d9b9d57b..a9fcb481759 100644
--- a/compiler/rustc_lexer/src/tests.rs
+++ b/compiler/rustc_lexer/src/tests.rs
@@ -125,7 +125,8 @@ fn test_valid_shebang() {
 }
 
 fn check_lexing(src: &str, expect: Expect) {
-    let actual: String = tokenize(src).map(|token| format!("{:?}\n", token)).collect();
+    let actual: String =
+        tokenize(src, FrontmatterAllowed::No).map(|token| format!("{:?}\n", token)).collect();
     expect.assert_eq(&actual)
 }
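
The updated `check_lexing` helper shows the other half of the convention: when lexing a fragment rather than a whole document, frontmatter stays disallowed. An illustrative call, with the input string invented for the example:

    let kinds: Vec<TokenKind> =
        tokenize("let x = 1;", FrontmatterAllowed::No)
            .map(|tok| tok.kind)
            .collect();
    // Spaces between the tokens lex as `TokenKind::Whitespace`.
    assert!(kinds.contains(&TokenKind::Whitespace));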