diff options
Diffstat (limited to 'compiler/rustc_lexer')
| -rw-r--r-- | compiler/rustc_lexer/src/cursor.rs | 4 | ||||
| -rw-r--r-- | compiler/rustc_lexer/src/lib.rs | 88 | ||||
| -rw-r--r-- | compiler/rustc_lexer/src/tests.rs | 50 |
3 files changed, 71 insertions, 71 deletions
diff --git a/compiler/rustc_lexer/src/cursor.rs b/compiler/rustc_lexer/src/cursor.rs index 0ba6c56dbb5..21557a9c854 100644 --- a/compiler/rustc_lexer/src/cursor.rs +++ b/compiler/rustc_lexer/src/cursor.rs @@ -61,8 +61,8 @@ impl<'a> Cursor<'a> { } /// Returns amount of already consumed symbols. - pub(crate) fn len_consumed(&self) -> usize { - self.initial_len - self.chars.as_str().len() + pub(crate) fn len_consumed(&self) -> u32 { + (self.initial_len - self.chars.as_str().len()) as u32 } /// Resets the number of bytes consumed to 0. diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index 32260913491..6d311af9007 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -38,18 +38,17 @@ use std::convert::TryFrom; #[derive(Debug)] pub struct Token { pub kind: TokenKind, - pub len: usize, + pub len: u32, } impl Token { - fn new(kind: TokenKind, len: usize) -> Token { + fn new(kind: TokenKind, len: u32) -> Token { Token { kind, len } } } /// Enum representing common lexeme types. -// perf note: Changing all `usize` to `u32` doesn't change performance. See #77629 -#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum TokenKind { // Multi-char tokens: /// "// comment" @@ -76,7 +75,7 @@ pub enum TokenKind { /// tokens. UnknownPrefix, /// "12_u8", "1.0e-40", "b"123"". See `LiteralKind` for more details. - Literal { kind: LiteralKind, suffix_start: usize }, + Literal { kind: LiteralKind, suffix_start: u32 }, /// "'a" Lifetime { starts_with_number: bool }, @@ -160,26 +159,24 @@ pub enum LiteralKind { Str { terminated: bool }, /// "b"abc"", "b"abc" ByteStr { terminated: bool }, - /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a" - RawStr { n_hashes: u8, err: Option<RawStrError> }, - /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a" - RawByteStr { n_hashes: u8, err: Option<RawStrError> }, + /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a". `None` indicates + /// an invalid literal. + RawStr { n_hashes: Option<u8> }, + /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a". `None` + /// indicates an invalid literal. + RawByteStr { n_hashes: Option<u8> }, } -/// Error produced validating a raw string. Represents cases like: -/// - `r##~"abcde"##`: `InvalidStarter` -/// - `r###"abcde"##`: `NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11)` -/// - Too many `#`s (>255): `TooManyDelimiters` -// perf note: It doesn't matter that this makes `Token` 36 bytes bigger. See #77629 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] pub enum RawStrError { - /// Non `#` characters exist between `r` and `"` eg. `r#~"..` + /// Non `#` characters exist between `r` and `"`, e.g. `r##~"abcde"##` InvalidStarter { bad_char: char }, - /// The string was never terminated. `possible_terminator_offset` is the number of characters after `r` or `br` where they - /// may have intended to terminate it. - NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> }, + /// The string was not terminated, e.g. `r###"abcde"##`. + /// `possible_terminator_offset` is the number of characters after `r` or + /// `br` where they may have intended to terminate it. + NoTerminator { expected: u32, found: u32, possible_terminator_offset: Option<u32> }, /// More than 255 `#`s exist. - TooManyDelimiters { found: usize }, + TooManyDelimiters { found: u32 }, } /// Base of numeric literal encoding according to its prefix. @@ -227,6 +224,19 @@ pub fn first_token(input: &str) -> Token { Cursor::new(input).advance_token() } +/// Validates a raw string literal. Used for getting more information about a +/// problem with a `RawStr`/`RawByteStr` with a `None` field. +#[inline] +pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError> { + debug_assert!(!input.is_empty()); + let mut cursor = Cursor::new(input); + // Move past the leading `r` or `br`. + for _ in 0..prefix_len { + cursor.bump().unwrap(); + } + cursor.raw_double_quoted_string(prefix_len).map(|_| ()) +} + /// Creates an iterator that produces tokens from the input string. pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ { let mut cursor = Cursor::new(input); @@ -316,12 +326,12 @@ impl Cursor<'_> { 'r' => match (self.first(), self.second()) { ('#', c1) if is_id_start(c1) => self.raw_ident(), ('#', _) | ('"', _) => { - let (n_hashes, err) = self.raw_double_quoted_string(1); + let res = self.raw_double_quoted_string(1); let suffix_start = self.len_consumed(); - if err.is_none() { + if res.is_ok() { self.eat_literal_suffix(); } - let kind = RawStr { n_hashes, err }; + let kind = RawStr { n_hashes: res.ok() }; Literal { kind, suffix_start } } _ => self.ident_or_unknown_prefix(), @@ -351,12 +361,12 @@ impl Cursor<'_> { } ('r', '"') | ('r', '#') => { self.bump(); - let (n_hashes, err) = self.raw_double_quoted_string(2); + let res = self.raw_double_quoted_string(2); let suffix_start = self.len_consumed(); - if err.is_none() { + if res.is_ok() { self.eat_literal_suffix(); } - let kind = RawByteStr { n_hashes, err }; + let kind = RawByteStr { n_hashes: res.ok() }; Literal { kind, suffix_start } } _ => self.ident_or_unknown_prefix(), @@ -699,19 +709,18 @@ impl Cursor<'_> { } /// Eats the double-quoted string and returns `n_hashes` and an error if encountered. - fn raw_double_quoted_string(&mut self, prefix_len: usize) -> (u8, Option<RawStrError>) { + fn raw_double_quoted_string(&mut self, prefix_len: u32) -> Result<u8, RawStrError> { // Wrap the actual function to handle the error with too many hashes. // This way, it eats the whole raw string. - let (n_hashes, err) = self.raw_string_unvalidated(prefix_len); + let n_hashes = self.raw_string_unvalidated(prefix_len)?; // Only up to 255 `#`s are allowed in raw strings match u8::try_from(n_hashes) { - Ok(num) => (num, err), - // We lie about the number of hashes here :P - Err(_) => (0, Some(RawStrError::TooManyDelimiters { found: n_hashes })), + Ok(num) => Ok(num), + Err(_) => Err(RawStrError::TooManyDelimiters { found: n_hashes }), } } - fn raw_string_unvalidated(&mut self, prefix_len: usize) -> (usize, Option<RawStrError>) { + fn raw_string_unvalidated(&mut self, prefix_len: u32) -> Result<u32, RawStrError> { debug_assert!(self.prev() == 'r'); let start_pos = self.len_consumed(); let mut possible_terminator_offset = None; @@ -730,7 +739,7 @@ impl Cursor<'_> { Some('"') => (), c => { let c = c.unwrap_or(EOF_CHAR); - return (n_start_hashes, Some(RawStrError::InvalidStarter { bad_char: c })); + return Err(RawStrError::InvalidStarter { bad_char: c }); } } @@ -740,14 +749,11 @@ impl Cursor<'_> { self.eat_while(|c| c != '"'); if self.is_eof() { - return ( - n_start_hashes, - Some(RawStrError::NoTerminator { - expected: n_start_hashes, - found: max_hashes, - possible_terminator_offset, - }), - ); + return Err(RawStrError::NoTerminator { + expected: n_start_hashes, + found: max_hashes, + possible_terminator_offset, + }); } // Eat closing double quote. @@ -765,7 +771,7 @@ impl Cursor<'_> { } if n_end_hashes == n_start_hashes { - return (n_start_hashes, None); + return Ok(n_start_hashes); } else if n_end_hashes > max_hashes { // Keep track of possible terminators to give a hint about // where there might be a missing terminator diff --git a/compiler/rustc_lexer/src/tests.rs b/compiler/rustc_lexer/src/tests.rs index 07daee06f0f..e4c1787f2cc 100644 --- a/compiler/rustc_lexer/src/tests.rs +++ b/compiler/rustc_lexer/src/tests.rs @@ -2,42 +2,39 @@ use super::*; use expect_test::{expect, Expect}; -fn check_raw_str(s: &str, expected_hashes: u8, expected_err: Option<RawStrError>) { +fn check_raw_str(s: &str, expected: Result<u8, RawStrError>) { let s = &format!("r{}", s); let mut cursor = Cursor::new(s); cursor.bump(); - let (n_hashes, err) = cursor.raw_double_quoted_string(0); - assert_eq!(n_hashes, expected_hashes); - assert_eq!(err, expected_err); + let res = cursor.raw_double_quoted_string(0); + assert_eq!(res, expected); } #[test] fn test_naked_raw_str() { - check_raw_str(r#""abc""#, 0, None); + check_raw_str(r#""abc""#, Ok(0)); } #[test] fn test_raw_no_start() { - check_raw_str(r##""abc"#"##, 0, None); + check_raw_str(r##""abc"#"##, Ok(0)); } #[test] fn test_too_many_terminators() { // this error is handled in the parser later - check_raw_str(r###"#"abc"##"###, 1, None); + check_raw_str(r###"#"abc"##"###, Ok(1)); } #[test] fn test_unterminated() { check_raw_str( r#"#"abc"#, - 1, - Some(RawStrError::NoTerminator { expected: 1, found: 0, possible_terminator_offset: None }), + Err(RawStrError::NoTerminator { expected: 1, found: 0, possible_terminator_offset: None }), ); check_raw_str( r###"##"abc"#"###, - 2, - Some(RawStrError::NoTerminator { + Err(RawStrError::NoTerminator { expected: 2, found: 1, possible_terminator_offset: Some(7), @@ -46,14 +43,13 @@ fn test_unterminated() { // We're looking for "# not just any # check_raw_str( r###"##"abc#"###, - 2, - Some(RawStrError::NoTerminator { expected: 2, found: 0, possible_terminator_offset: None }), + Err(RawStrError::NoTerminator { expected: 2, found: 0, possible_terminator_offset: None }), ) } #[test] fn test_invalid_start() { - check_raw_str(r##"#~"abc"#"##, 1, Some(RawStrError::InvalidStarter { bad_char: '~' })); + check_raw_str(r##"#~"abc"#"##, Err(RawStrError::InvalidStarter { bad_char: '~' })); } #[test] @@ -61,26 +57,24 @@ fn test_unterminated_no_pound() { // https://github.com/rust-lang/rust/issues/70677 check_raw_str( r#"""#, - 0, - Some(RawStrError::NoTerminator { expected: 0, found: 0, possible_terminator_offset: None }), + Err(RawStrError::NoTerminator { expected: 0, found: 0, possible_terminator_offset: None }), ); } #[test] fn test_too_many_hashes() { let max_count = u8::MAX; - let mut hashes: String = "#".repeat(max_count.into()); + let hashes1 = "#".repeat(max_count as usize); + let hashes2 = "#".repeat(max_count as usize + 1); + let middle = "\"abc\""; + let s1 = [&hashes1, middle, &hashes1].join(""); + let s2 = [&hashes2, middle, &hashes2].join(""); - // Valid number of hashes (255 = 2^8 - 1 = u8::MAX), but invalid string. - check_raw_str(&hashes, max_count, Some(RawStrError::InvalidStarter { bad_char: '\u{0}' })); + // Valid number of hashes (255 = 2^8 - 1 = u8::MAX). + check_raw_str(&s1, Ok(255)); // One more hash sign (256 = 2^8) becomes too many. - hashes.push('#'); - check_raw_str( - &hashes, - 0, - Some(RawStrError::TooManyDelimiters { found: usize::from(max_count) + 1 }), - ); + check_raw_str(&s2, Err(RawStrError::TooManyDelimiters { found: u32::from(max_count) + 1 })); } #[test] @@ -251,7 +245,7 @@ fn raw_string() { check_lexing( "r###\"\"#a\\b\x00c\"\"###", expect![[r#" - Token { kind: Literal { kind: RawStr { n_hashes: 3, err: None }, suffix_start: 17 }, len: 17 } + Token { kind: Literal { kind: RawStr { n_hashes: Some(3) }, suffix_start: 17 }, len: 17 } "#]], ) } @@ -295,9 +289,9 @@ br###"raw"###suffix Token { kind: Whitespace, len: 1 } Token { kind: Literal { kind: Int { base: Decimal, empty_int: false }, suffix_start: 1 }, len: 3 } Token { kind: Whitespace, len: 1 } - Token { kind: Literal { kind: RawStr { n_hashes: 3, err: None }, suffix_start: 12 }, len: 18 } + Token { kind: Literal { kind: RawStr { n_hashes: Some(3) }, suffix_start: 12 }, len: 18 } Token { kind: Whitespace, len: 1 } - Token { kind: Literal { kind: RawByteStr { n_hashes: 3, err: None }, suffix_start: 13 }, len: 19 } + Token { kind: Literal { kind: RawByteStr { n_hashes: Some(3) }, suffix_start: 13 }, len: 19 } Token { kind: Whitespace, len: 1 } "#]], ) |
