diff options
| author | Deadbeef <ent3rm4n@gmail.com> | 2025-04-12 15:53:46 +0000 |
|---|---|---|
| committer | Deadbeef <ent3rm4n@gmail.com> | 2025-05-05 23:10:08 +0800 |
| commit | 662182637e100642d1e5de7cb193da837a14ae48 (patch) | |
| tree | 28e183216387d8d8cd939404a0c2fc01efaf450f /compiler/rustc_lexer/src/lib.rs | |
| parent | 0c33fe2c3d3eecadd17a84b110bb067288a64f1c (diff) | |
| download | rust-662182637e100642d1e5de7cb193da837a14ae48.tar.gz rust-662182637e100642d1e5de7cb193da837a14ae48.zip | |
Implement RFC 3503: frontmatters
Supercedes #137193
Diffstat (limited to 'compiler/rustc_lexer/src/lib.rs')
| -rw-r--r-- | compiler/rustc_lexer/src/lib.rs | 155 |
1 files changed, 148 insertions, 7 deletions
diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index f9c71b2fa65..2374f388250 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -35,8 +35,8 @@ pub use unicode_xid::UNICODE_VERSION as UNICODE_XID_VERSION; use self::LiteralKind::*; use self::TokenKind::*; -pub use crate::cursor::Cursor; use crate::cursor::EOF_CHAR; +pub use crate::cursor::{Cursor, FrontmatterAllowed}; /// Parsed token. /// It doesn't contain information about data that has been parsed, @@ -57,17 +57,27 @@ impl Token { #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum TokenKind { /// A line comment, e.g. `// comment`. - LineComment { doc_style: Option<DocStyle> }, + LineComment { + doc_style: Option<DocStyle>, + }, /// A block comment, e.g. `/* block comment */`. /// /// Block comments can be recursive, so a sequence like `/* /* */` /// will not be considered terminated and will result in a parsing error. - BlockComment { doc_style: Option<DocStyle>, terminated: bool }, + BlockComment { + doc_style: Option<DocStyle>, + terminated: bool, + }, /// Any whitespace character sequence. Whitespace, + Frontmatter { + has_invalid_preceding_whitespace: bool, + invalid_infostring: bool, + }, + /// An identifier or keyword, e.g. `ident` or `continue`. Ident, @@ -109,10 +119,15 @@ pub enum TokenKind { /// this type will need to check for and reject that case. /// /// See [LiteralKind] for more details. - Literal { kind: LiteralKind, suffix_start: u32 }, + Literal { + kind: LiteralKind, + suffix_start: u32, + }, /// A lifetime, e.g. `'a`. - Lifetime { starts_with_number: bool }, + Lifetime { + starts_with_number: bool, + }, /// `;` Semi, @@ -280,7 +295,7 @@ pub fn strip_shebang(input: &str) -> Option<usize> { #[inline] pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError> { debug_assert!(!input.is_empty()); - let mut cursor = Cursor::new(input); + let mut cursor = Cursor::new(input, FrontmatterAllowed::No); // Move past the leading `r` or `br`. for _ in 0..prefix_len { cursor.bump().unwrap(); @@ -290,7 +305,7 @@ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError> /// Creates an iterator that produces tokens from the input string. pub fn tokenize(input: &str) -> impl Iterator<Item = Token> { - let mut cursor = Cursor::new(input); + let mut cursor = Cursor::new(input, FrontmatterAllowed::No); std::iter::from_fn(move || { let token = cursor.advance_token(); if token.kind != TokenKind::Eof { Some(token) } else { None } @@ -361,7 +376,34 @@ impl Cursor<'_> { Some(c) => c, None => return Token::new(TokenKind::Eof, 0), }; + let token_kind = match first_char { + c if matches!(self.frontmatter_allowed, FrontmatterAllowed::Yes) + && is_whitespace(c) => + { + let mut last = first_char; + while is_whitespace(self.first()) { + let Some(c) = self.bump() else { + break; + }; + last = c; + } + // invalid frontmatter opening as whitespace preceding it isn't newline. + // combine the whitespace and the frontmatter to a single token as we shall + // error later. + if last != '\n' && self.as_str().starts_with("---") { + self.bump(); + self.frontmatter(true) + } else { + Whitespace + } + } + '-' if matches!(self.frontmatter_allowed, FrontmatterAllowed::Yes) + && self.as_str().starts_with("--") => + { + // happy path + self.frontmatter(false) + } // Slash, comment or block comment. '/' => match self.first() { '/' => self.line_comment(), @@ -464,11 +506,110 @@ impl Cursor<'_> { c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(), _ => Unknown, }; + if matches!(self.frontmatter_allowed, FrontmatterAllowed::Yes) + && !matches!(token_kind, Whitespace) + { + // stop allowing frontmatters after first non-whitespace token + self.frontmatter_allowed = FrontmatterAllowed::No; + } let res = Token::new(token_kind, self.pos_within_token()); self.reset_pos_within_token(); res } + /// Given that one `-` was eaten, eat the rest of the frontmatter. + fn frontmatter(&mut self, has_invalid_preceding_whitespace: bool) -> TokenKind { + debug_assert_eq!('-', self.prev()); + + let pos = self.pos_within_token(); + self.eat_while(|c| c == '-'); + + // one `-` is eaten by the caller. + let length_opening = self.pos_within_token() - pos + 1; + + // must be ensured by the caller + debug_assert!(length_opening >= 3); + + // whitespace between the opening and the infostring. + self.eat_while(|ch| ch != '\n' && is_whitespace(ch)); + + // copied from `eat_identifier`, but allows `.` in infostring to allow something like + // `---Cargo.toml` as a valid opener + if is_id_start(self.first()) { + self.bump(); + self.eat_while(|c| is_id_continue(c) || c == '.'); + } + + self.eat_while(|ch| ch != '\n' && is_whitespace(ch)); + let invalid_infostring = self.first() != '\n'; + + let mut s = self.as_str(); + let mut found = false; + while let Some(closing) = s.find(&"-".repeat(length_opening as usize)) { + let preceding_chars_start = s[..closing].rfind("\n").map_or(0, |i| i + 1); + if s[preceding_chars_start..closing].chars().all(is_whitespace) { + // candidate found + self.bump_bytes(closing); + // in case like + // ---cargo + // --- blahblah + // or + // ---cargo + // ---- + // combine those stuff into this frontmatter token such that it gets detected later. + self.eat_until(b'\n'); + found = true; + break; + } else { + s = &s[closing + length_opening as usize..]; + } + } + + if !found { + // recovery strategy: a closing statement might have precending whitespace/newline + // but not have enough dashes to properly close. In this case, we eat until there, + // and report a mismatch in the parser. + let mut rest = self.as_str(); + // We can look for a shorter closing (starting with four dashes but closing with three) + // and other indications that Rust has started and the infostring has ended. + let mut potential_closing = rest + .find("\n---") + // n.b. only in the case where there are dashes, we move the index to the line where + // the dashes start as we eat to include that line. For other cases those are Rust code + // and not included in the frontmatter. + .map(|x| x + 1) + .or_else(|| rest.find("\nuse ")) + .or_else(|| rest.find("\n//!")) + .or_else(|| rest.find("\n#![")); + + if potential_closing.is_none() { + // a less fortunate recovery if all else fails which finds any dashes preceded by whitespace + // on a standalone line. Might be wrong. + while let Some(closing) = rest.find("---") { + let preceding_chars_start = rest[..closing].rfind("\n").map_or(0, |i| i + 1); + if rest[preceding_chars_start..closing].chars().all(is_whitespace) { + // candidate found + potential_closing = Some(closing); + break; + } else { + rest = &rest[closing + 3..]; + } + } + } + + if let Some(potential_closing) = potential_closing { + // bump to the potential closing, and eat everything on that line. + self.bump_bytes(potential_closing); + self.eat_until(b'\n'); + } else { + // eat everything. this will get reported as an unclosed frontmatter. + self.eat_while(|_| true); + } + } + + Frontmatter { has_invalid_preceding_whitespace, invalid_infostring } + } + fn line_comment(&mut self) -> TokenKind { debug_assert!(self.prev() == '/' && self.first() == '/'); self.bump(); |
