diff options
| author | Nicholas Nethercote <n.nethercote@gmail.com> | 2024-01-23 12:27:56 +1100 |
|---|---|---|
| committer | Nicholas Nethercote <n.nethercote@gmail.com> | 2024-01-25 12:28:11 +1100 |
| commit | a1c07214f0f7988cbc5a645a499bb8f7dd9cbed7 (patch) | |
| tree | ac9f663663ff93eb739fd2e3cfe530c80b39af7e | |
| parent | ef1e2228cfd9df4059aa44740b0659fea7c5a52f (diff) | |
| download | rust-a1c07214f0f7988cbc5a645a499bb8f7dd9cbed7.tar.gz rust-a1c07214f0f7988cbc5a645a499bb8f7dd9cbed7.zip | |
Rework `CStrUnit`.
- Rename it as `MixedUnit`, because it will soon be used in more than just C string literals.
- Change the `Byte` variant to `HighByte` and use it only for `\x80`..`\xff` cases. This fixes the old inexactness where ASCII chars could be encoded with either `Byte` or `Char`.
- Add useful comments.
- Remove `is_ascii`, in favour of `u8::is_ascii`.
| -rw-r--r-- | compiler/rustc_ast/src/util/literal.rs | 6 | ||||
| -rw-r--r-- | compiler/rustc_lexer/src/unescape.rs | 79 | ||||
| -rw-r--r-- | src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs | 9 |
3 files changed, 52 insertions, 42 deletions
diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs index 852d49fc5b6..c3995c7776f 100644 --- a/compiler/rustc_ast/src/util/literal.rs +++ b/compiler/rustc_ast/src/util/literal.rs @@ -3,7 +3,7 @@ use crate::ast::{self, LitKind, MetaItemLit, StrStyle}; use crate::token::{self, Token}; use rustc_lexer::unescape::{ - byte_from_char, unescape_byte, unescape_c_string, unescape_char, unescape_literal, CStrUnit, + byte_from_char, unescape_byte, unescape_c_string, unescape_char, unescape_literal, MixedUnit, Mode, }; use rustc_span::symbol::{kw, sym, Symbol}; @@ -127,10 +127,10 @@ impl LitKind { let s = symbol.as_str(); let mut buf = Vec::with_capacity(s.len()); unescape_c_string(s, Mode::CStr, &mut |_span, c| match c { - Ok(CStrUnit::Byte(b)) => buf.push(b), - Ok(CStrUnit::Char(c)) => { + Ok(MixedUnit::Char(c)) => { buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()) } + Ok(MixedUnit::HighByte(b)) => buf.push(b), Err(err) => { assert!(!err.is_fatal(), "failed to unescape C string literal") } diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index a5ab3fcdd34..3c23af58f37 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -101,32 +101,45 @@ where } } -/// A unit within CStr. Must not be a nul character. -pub enum CStrUnit { - Byte(u8), +/// Used for mixed utf8 string literals, i.e. those that allow both unicode +/// chars and high bytes. +pub enum MixedUnit { + /// Used for ASCII chars (written directly or via `\x00`..`\x7f` escapes) + /// and Unicode chars (written directly or via `\u` escapes). + /// + /// For example, if '¥' appears in a string it is represented here as + /// `MixedUnit::Char('¥')`, and it will be appended to the relevant byte + /// string as the two-byte UTF-8 sequence `[0xc2, 0xa5]` Char(char), + + /// Used for high bytes (`\x80`..`\xff`). 
+ /// + /// For example, if `\xa5` appears in a string it is represented here as + /// `MixedUnit::HighByte(0xa5)`, and it will be appended to the relevant + /// byte string as the single byte `0xa5`. + HighByte(u8), } -impl From<u8> for CStrUnit { - fn from(value: u8) -> Self { - CStrUnit::Byte(value) +impl From<char> for MixedUnit { + fn from(c: char) -> Self { + MixedUnit::Char(c) } } -impl From<char> for CStrUnit { - fn from(value: char) -> Self { - CStrUnit::Char(value) +impl From<u8> for MixedUnit { + fn from(n: u8) -> Self { + if n.is_ascii() { MixedUnit::Char(n as char) } else { MixedUnit::HighByte(n) } } } pub fn unescape_c_string<F>(src: &str, mode: Mode, callback: &mut F) where - F: FnMut(Range<usize>, Result<CStrUnit, EscapeError>), + F: FnMut(Range<usize>, Result<MixedUnit, EscapeError>), { match mode { CStr => { unescape_non_raw_common(src, mode, &mut |r, mut result| { - if let Ok(CStrUnit::Byte(0) | CStrUnit::Char('\0')) = result { + if let Ok(MixedUnit::Char('\0')) = result { result = Err(EscapeError::NulInCStr); } callback(r, result) @@ -137,7 +150,8 @@ where if let Ok('\0') = result { result = Err(EscapeError::NulInCStr); } - callback(r, result.map(CStrUnit::Char)) + // High bytes aren't possible in raw strings. + callback(r, result.map(MixedUnit::Char)) }); } Char | Byte | Str | RawStr | ByteStr | RawByteStr => unreachable!(), @@ -217,20 +231,19 @@ impl Mode { } } -fn scan_escape<T: From<u8> + From<char>>( +fn scan_escape<T: From<char> + From<u8>>( chars: &mut Chars<'_>, mode: Mode, ) -> Result<T, EscapeError> { // Previous character was '\\', unescape what follows. - let res: u8 = match chars.next().ok_or(EscapeError::LoneSlash)? { - '"' => b'"', - 'n' => b'\n', - 'r' => b'\r', - 't' => b'\t', - '\\' => b'\\', - '\'' => b'\'', - '0' => b'\0', - + let res: char = match chars.next().ok_or(EscapeError::LoneSlash)? 
{ + '"' => '"', + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + '\\' => '\\', + '\'' => '\'', + '0' => '\0', 'x' => { // Parse hexadecimal character code. @@ -240,15 +253,17 @@ fn scan_escape<T: From<u8> + From<char>>( let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?; let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; - let value = hi * 16 + lo; - - if mode.ascii_escapes_should_be_ascii() && !is_ascii(value) { - return Err(EscapeError::OutOfRangeHexEscape); - } + let value = (hi * 16 + lo) as u8; - value as u8 + return if mode.ascii_escapes_should_be_ascii() && !value.is_ascii() { + Err(EscapeError::OutOfRangeHexEscape) + } else { + // This may be a high byte, but that will only happen if `T` is + // `MixedUnit`, because of the `ascii_escapes_should_be_ascii` + // check above. + Ok(T::from(value as u8)) + }; } - 'u' => return scan_unicode(chars, mode.is_unicode_escape_disallowed()).map(T::from), _ => return Err(EscapeError::InvalidEscape), }; @@ -336,7 +351,7 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca /// Takes a contents of a string literal (without quotes) and produces a /// sequence of escaped characters or errors. 
-fn unescape_non_raw_common<F, T: From<u8> + From<char>>(src: &str, mode: Mode, callback: &mut F) +fn unescape_non_raw_common<F, T: From<char> + From<u8>>(src: &str, mode: Mode, callback: &mut F) where F: FnMut(Range<usize>, Result<T, EscapeError>), { @@ -430,7 +445,3 @@ pub fn byte_from_char(c: char) -> u8 { debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr"); res as u8 } - -fn is_ascii(x: u32) -> bool { - x <= 0x7F -} diff --git a/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs b/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs index b39006e2ff2..2f75e9677ec 100644 --- a/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs +++ b/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs @@ -6,7 +6,7 @@ use std::{ }; use rustc_lexer::unescape::{ - unescape_byte, unescape_c_string, unescape_char, unescape_literal, CStrUnit, Mode, + unescape_byte, unescape_c_string, unescape_char, unescape_literal, MixedUnit, Mode, }; use crate::{ @@ -336,10 +336,9 @@ impl ast::CString { let mut buf = Vec::new(); let mut prev_end = 0; let mut has_error = false; - let mut char_buf = [0u8; 4]; - let mut extend_unit = |buf: &mut Vec<u8>, unit: CStrUnit| match unit { - CStrUnit::Byte(b) => buf.push(b), - CStrUnit::Char(c) => buf.extend(c.encode_utf8(&mut char_buf).as_bytes()), + let extend_unit = |buf: &mut Vec<u8>, unit: MixedUnit| match unit { + MixedUnit::Char(c) => buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes()), + MixedUnit::HighByte(b) => buf.push(b), }; unescape_c_string(text, Self::MODE, &mut |char_range, unescaped| match ( unescaped, |
