| author | Peter Jaszkowiak <p.jaszkow@gmail.com> | 2024-04-14 17:37:45 -0600 |
|---|---|---|
| committer | Peter Jaszkowiak <p.jaszkow@gmail.com> | 2024-10-08 18:21:16 -0600 |
| commit | 321a5db7d4026cf703d6deebc413c855ba133142 (patch) | |
| tree | f9760cc3beb7c6854859f1298ff0d6e9f85ac127 /compiler/rustc_lexer | |
| parent | 6f4ae0f34503601e54680a137c1db0b81b56cc3d (diff) | |
Reserve guarded string literals (RFC 3593)
Diffstat (limited to 'compiler/rustc_lexer')
| -rw-r--r-- | compiler/rustc_lexer/src/lib.rs | 93 |
|---|---|---|

1 file changed, 85 insertions(+), 8 deletions(-)
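For context, the point of this patch is that edition 2024 *reserves* the guarded-string forms (`#"..."#`, `##"..."##`, etc.) so they can be given a meaning later, while older editions keep splitting them into the legacy tokens. A minimal, hypothetical illustration of the behavior change (not part of this patch; the macro name is made up):

```rust
// Hypothetical example, not taken from the diff.
// A macro that accepts and discards arbitrary tokens.
macro_rules! ignore_tokens {
    ($($tt:tt)*) => {};
}

fn main() {
    // Edition 2021 and earlier: `#"hello"#` lexes as three tokens
    // (`#`, the string literal `"hello"`, `#`), so this compiles.
    // Edition 2024: `#"` is a reserved guarded-string prefix, so the
    // lexer rejects this input with an error.
    ignore_tokens!(#"hello"#);
}
```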
```diff
diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs
index 60aab668cba..b0ab50dd773 100644
--- a/compiler/rustc_lexer/src/lib.rs
+++ b/compiler/rustc_lexer/src/lib.rs
@@ -104,6 +104,12 @@ pub enum TokenKind {
     /// for emoji identifier recovery, as those are not meant to be ever accepted.
     InvalidPrefix,
 
+    /// Guarded string literal prefix: `#"` or `##`.
+    ///
+    /// Used for reserving "guarded strings" (RFC 3598) in edition 2024.
+    /// Split into the component tokens on older editions.
+    GuardedStrPrefix,
+
     /// Examples: `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid
     /// suffix, but may be present here on string and float literals. Users of
     /// this type will need to check for and reject that case.
@@ -191,30 +197,41 @@ pub enum DocStyle {
 /// `rustc_ast::ast::LitKind`).
 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
 pub enum LiteralKind {
-    /// "12_u8", "0o100", "0b120i99", "1f32".
+    /// `12_u8`, `0o100`, `0b120i99`, `1f32`.
     Int { base: Base, empty_int: bool },
-    /// "12.34f32", "1e3", but not "1f32".
+    /// `12.34f32`, `1e3`, but not `1f32`.
     Float { base: Base, empty_exponent: bool },
-    /// "'a'", "'\\'", "'''", "';"
+    /// `'a'`, `'\\'`, `'''`, `';`
     Char { terminated: bool },
-    /// "b'a'", "b'\\'", "b'''", "b';"
+    /// `b'a'`, `b'\\'`, `b'''`, `b';`
     Byte { terminated: bool },
-    /// ""abc"", ""abc"
+    /// `"abc"`, `"abc`
     Str { terminated: bool },
-    /// "b"abc"", "b"abc"
+    /// `b"abc"`, `b"abc`
     ByteStr { terminated: bool },
     /// `c"abc"`, `c"abc`
     CStr { terminated: bool },
-    /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a". `None` indicates
+    /// `r"abc"`, `r#"abc"#`, `r####"ab"###"c"####`, `r#"a`. `None` indicates
     /// an invalid literal.
     RawStr { n_hashes: Option<u8> },
-    /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a". `None`
+    /// `br"abc"`, `br#"abc"#`, `br####"ab"###"c"####`, `br#"a`. `None`
     /// indicates an invalid literal.
     RawByteStr { n_hashes: Option<u8> },
     /// `cr"abc"`, "cr#"abc"#", `cr#"a`. `None` indicates an invalid literal.
     RawCStr { n_hashes: Option<u8> },
 }
 
+/// `#"abc"#`, `##"a"` (fewer closing), or even `#"a` (unterminated).
+///
+/// Can capture fewer closing hashes than starting hashes,
+/// for more efficient lexing and better backwards diagnostics.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub struct GuardedStr {
+    pub n_hashes: u32,
+    pub terminated: bool,
+    pub token_len: u32,
+}
+
 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
 pub enum RawStrError {
     /// Non `#` characters exist between `r` and `"`, e.g. `r##~"abcde"##`
@@ -403,6 +420,12 @@ impl Cursor<'_> {
                 TokenKind::Literal { kind: literal_kind, suffix_start }
             }
 
+            // Guarded string literal prefix: `#"` or `##`
+            '#' if matches!(self.first(), '"' | '#') => {
+                self.bump();
+                TokenKind::GuardedStrPrefix
+            }
+
             // One-symbol tokens.
             ';' => Semi,
             ',' => Comma,
@@ -780,6 +803,60 @@ impl Cursor<'_> {
         false
     }
 
+    /// Attempt to lex for a guarded string literal.
+    ///
+    /// Used by `rustc_parse::lexer` to lex for guarded strings
+    /// conditionally based on edition.
+    ///
+    /// Note: this will not reset the `Cursor` when a
+    /// guarded string is not found. It is the caller's
+    /// responsibility to do so.
+    pub fn guarded_double_quoted_string(&mut self) -> Option<GuardedStr> {
+        debug_assert!(self.prev() != '#');
+
+        let mut n_start_hashes: u32 = 0;
+        while self.first() == '#' {
+            n_start_hashes += 1;
+            self.bump();
+        }
+
+        if self.first() != '"' {
+            return None;
+        }
+        self.bump();
+        debug_assert!(self.prev() == '"');
+
+        // Lex the string itself as a normal string literal
+        // so we can recover that for older editions later.
+        let terminated = self.double_quoted_string();
+        if !terminated {
+            let token_len = self.pos_within_token();
+            self.reset_pos_within_token();
+
+            return Some(GuardedStr { n_hashes: n_start_hashes, terminated: false, token_len });
+        }
+
+        // Consume closing '#' symbols.
+        // Note that this will not consume extra trailing `#` characters:
+        // `###"abcde"####` is lexed as a `GuardedStr { n_end_hashes: 3, .. }`
+        // followed by a `#` token.
+        let mut n_end_hashes = 0;
+        while self.first() == '#' && n_end_hashes < n_start_hashes {
+            n_end_hashes += 1;
+            self.bump();
+        }
+
+        // Reserved syntax, always an error, so it doesn't matter if
+        // `n_start_hashes != n_end_hashes`.
+
+        self.eat_literal_suffix();
+
+        let token_len = self.pos_within_token();
+        self.reset_pos_within_token();
+
+        Some(GuardedStr { n_hashes: n_start_hashes, terminated: true, token_len })
+    }
+
     /// Eats the double-quoted string and returns `n_hashes` and an error if encountered.
     fn raw_double_quoted_string(&mut self, prefix_len: u32) -> Result<u8, RawStrError> {
         // Wrap the actual function to handle the error with too many hashes.
```
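The doc comment on `guarded_double_quoted_string` says the caller (in `rustc_parse::lexer`) decides, based on edition, whether to attempt guarded-string lexing, and that the cursor is not rewound when nothing is found. Below is a minimal sketch of what such a caller could look like, assuming only the API shown in this diff plus `Cursor::new(&str)`; the function name and surrounding plumbing are hypothetical, not the in-tree implementation.

```rust
// Hedged sketch of a caller, not the actual rustc_parse code.
use rustc_lexer::{Cursor, GuardedStr};

/// Try to lex a guarded string from `src`, which must start at the leading
/// `#`s (so `Cursor::prev()` is not `#`, satisfying the debug assertion).
/// Returns the consumed length in bytes on success.
fn try_guarded_str(src: &str, edition_2024: bool) -> Option<u32> {
    if !edition_2024 {
        // Older editions keep the legacy tokenization (`#` + string + `#`),
        // so the guarded-string lexer is never attempted.
        return None;
    }

    // `guarded_double_quoted_string` does not rewind on failure; lexing on a
    // throwaway cursor and discarding it on `None` is one way for the caller
    // to honor the "reset the cursor" responsibility noted in the docs.
    let mut cursor = Cursor::new(src);
    let g: GuardedStr = cursor.guarded_double_quoted_string()?;

    if !g.terminated {
        // A real caller would emit an "unterminated guarded string" error
        // here, using `g.token_len` to size the span; diagnostics are out of
        // scope for this sketch.
    }
    Some(g.token_len)
}
```

Lexing into a throwaway cursor is only one way to satisfy the reset requirement; the in-tree caller may handle the rewind differently.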
