Auto merge of #118897 - nnethercote:more-unescaping-cleanups, r=fee1-dead

More unescaping cleanups More minor improvements I found while working on #118699. r? `@fee1-dead`
author: bors <bors@rust-lang.org> 2023-12-16 08:52:06 +0000
committer: bors <bors@rust-lang.org> 2023-12-16 08:52:06 +0000
commit: 3ad8e2d129a67b0dc450d6ef1100ae41d2cd4b8a (patch)
tree: a63fab27ff21a30183268fcd799e5c20adfe5e64
parent: 1c6a06183a1452c2772db370b572e38bac9c5d86 (diff)
parent: b900eb73173f416dba5619ebfc10e6e0438a4753 (diff)
download: rust-3ad8e2d129a67b0dc450d6ef1100ae41d2cd4b8a.tar.gz
rust-3ad8e2d129a67b0dc450d6ef1100ae41d2cd4b8a.zip
3 files changed, 82 insertions, 88 deletions
diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs
index 50eb92125b9..92b9adf1db7 100644
--- a/compiler/rustc_ast/src/util/literal.rs
+++ b/compiler/rustc_ast/src/util/literal.rs
@@ -77,6 +77,8 @@ impl LitKind {
                 // new symbol because the string in the LitKind is different to the
                 // string in the token.
                 let s = symbol.as_str();
+                // Vanilla strings are so common we optimize for the common case where no chars
+                // requiring special behaviour are present.
                 let symbol = if s.contains(['\\', '\r']) {
                     let mut buf = String::with_capacity(s.len());
                     let mut error = Ok(());
@@ -104,27 +106,20 @@ impl LitKind {
                 LitKind::Str(symbol, ast::StrStyle::Cooked)
             }
             token::StrRaw(n) => {
-                // Ditto.
-                let s = symbol.as_str();
-                let symbol =
-                    if s.contains('\r') {
-                        let mut buf = String::with_capacity(s.len());
-                        let mut error = Ok(());
-                        unescape_literal(s, Mode::RawStr, &mut |_, unescaped_char| {
-                            match unescaped_char {
-                                Ok(c) => buf.push(c),
-                                Err(err) => {
-                                    if err.is_fatal() {
-                                        error = Err(LitError::LexerError);
-                                    }
-                                }
+                // Raw strings have no escapes, so we only need to check for invalid chars, and we
+                // can reuse the symbol on success.
+                let mut error = Ok(());
+                unescape_literal(symbol.as_str(), Mode::RawStr, &mut |_, unescaped_char| {
+                    match unescaped_char {
+                        Ok(_) => {}
+                        Err(err) => {
+                            if err.is_fatal() {
+                                error = Err(LitError::LexerError);
                             }
-                        });
-                        error?;
-                        Symbol::intern(&buf)
-                    } else {
-                        symbol
-                    };
+                        }
+                    }
+                });
+                error?;
                 LitKind::Str(symbol, ast::StrStyle::Raw(n))
             }
             token::ByteStr => {
@@ -143,25 +138,19 @@ impl LitKind {
                 LitKind::ByteStr(buf.into(), StrStyle::Cooked)
             }
             token::ByteStrRaw(n) => {
+                // Raw strings have no escapes, so we only need to check for invalid chars, and we
+                // can convert the symbol directly to a `Lrc<u8>` on success.
                 let s = symbol.as_str();
-                let bytes = if s.contains('\r') {
-                    let mut buf = Vec::with_capacity(s.len());
-                    let mut error = Ok(());
-                    unescape_literal(s, Mode::RawByteStr, &mut |_, c| match c {
-                        Ok(c) => buf.push(byte_from_char(c)),
-                        Err(err) => {
-                            if err.is_fatal() {
-                                error = Err(LitError::LexerError);
-                            }
+                let mut error = Ok(());
+                unescape_literal(s, Mode::RawByteStr, &mut |_, c| match c {
+                    Ok(_) => {}
+                    Err(err) => {
+                        if err.is_fatal() {
+                            error = Err(LitError::LexerError);
                         }
-                    });
-                    error?;
-                    buf
-                } else {
-                    symbol.to_string().into_bytes()
-                };
-
-                LitKind::ByteStr(bytes.into(), StrStyle::Raw(n))
+                    }
+                });
+                LitKind::ByteStr(s.to_owned().into_bytes().into(), StrStyle::Raw(n))
             }
             token::CStr => {
                 let s = symbol.as_str();
@@ -172,7 +161,6 @@ impl LitKind {
                         error = Err(LitError::NulInCStr(span));
                     }
                     Ok(CStrUnit::Byte(b)) => buf.push(b),
-                    Ok(CStrUnit::Char(c)) if c.len_utf8() == 1 => buf.push(c as u8),
                     Ok(CStrUnit::Char(c)) => {
                         buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
                     }
@@ -187,18 +175,15 @@ impl LitKind {
                 LitKind::CStr(buf.into(), StrStyle::Cooked)
             }
             token::CStrRaw(n) => {
+                // Raw strings have no escapes, so we only need to check for invalid chars, and we
+                // can convert the symbol directly to a `Lrc<u8>` on success.
                 let s = symbol.as_str();
-                let mut buf = Vec::with_capacity(s.len());
                 let mut error = Ok(());
                 unescape_c_string(s, Mode::RawCStr, &mut |span, c| match c {
                     Ok(CStrUnit::Byte(0) | CStrUnit::Char('\0')) => {
                         error = Err(LitError::NulInCStr(span));
                     }
-                    Ok(CStrUnit::Byte(b)) => buf.push(b),
-                    Ok(CStrUnit::Char(c)) if c.len_utf8() == 1 => buf.push(c as u8),
-                    Ok(CStrUnit::Char(c)) => {
-                        buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
-                    }
+                    Ok(_) => {}
                     Err(err) => {
                         if err.is_fatal() {
                             error = Err(LitError::LexerError);
@@ -206,6 +191,7 @@ impl LitKind {
                     }
                 });
                 error?;
+                let mut buf = s.to_owned().into_bytes();
                 buf.push(0);
                 LitKind::CStr(buf.into(), StrStyle::Raw(n))
             }
diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs
index 249126a269e..abec12f52a6 100644
--- a/compiler/rustc_lexer/src/unescape.rs
+++ b/compiler/rustc_lexer/src/unescape.rs
@@ -92,8 +92,8 @@ where
             let res = unescape_char_or_byte(&mut chars, mode);
             callback(0..(src.len() - chars.as_str().len()), res);
         }
-        Str | ByteStr => unescape_str_common(src, mode, callback),
-        RawStr | RawByteStr => unescape_raw_str_or_raw_byte_str(src, mode, callback),
+        Str | ByteStr => unescape_non_raw_common(src, mode, callback),
+        RawStr | RawByteStr => check_raw_common(src, mode, callback),
         CStr | RawCStr => unreachable!(),
     }
 }
@@ -122,12 +122,10 @@ where
 {
     match mode {
         CStr => {
-            unescape_str_common(src, mode, callback);
+            unescape_non_raw_common(src, mode, callback);
         }
         RawCStr => {
-            unescape_raw_str_or_raw_byte_str(src, mode, &mut |r, result| {
-                callback(r, result.map(CStrUnit::Char))
-            });
+            check_raw_common(src, mode, &mut |r, result| callback(r, result.map(CStrUnit::Char)));
         }
         Char | Byte | Str | RawStr | ByteStr | RawByteStr => unreachable!(),
     }
@@ -191,8 +189,9 @@ impl Mode {
     /// Byte literals do not allow unicode escape.
     fn is_unicode_escape_disallowed(self) -> bool {
         match self {
-            Byte | ByteStr | RawByteStr => true,
-            Char | Str | RawStr | CStr | RawCStr => false,
+            Byte | ByteStr => true,
+            Char | Str | CStr => false,
+            RawByteStr | RawStr | RawCStr => unreachable!(),
         }
     }
 
@@ -324,7 +323,7 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
 
 /// Takes a contents of a string literal (without quotes) and produces a
 /// sequence of escaped characters or errors.
-fn unescape_str_common<F, T: From<u8> + From<char>>(src: &str, mode: Mode, callback: &mut F)
+fn unescape_non_raw_common<F, T: From<u8> + From<char>>(src: &str, mode: Mode, callback: &mut F)
 where
     F: FnMut(Range<usize>, Result<T, EscapeError>),
 {
@@ -391,7 +390,7 @@ where
 /// sequence of characters or errors.
 /// NOTE: Raw strings do not perform any explicit character escaping, here we
 /// only produce errors on bare CR.
-fn unescape_raw_str_or_raw_byte_str<F>(src: &str, mode: Mode, callback: &mut F)
+fn check_raw_common<F>(src: &str, mode: Mode, callback: &mut F)
 where
     F: FnMut(Range<usize>, Result<char, EscapeError>),
 {
@@ -399,7 +398,7 @@ where
     let chars_should_be_ascii = mode.chars_should_be_ascii(); // get this outside the loop
 
     // The `start` and `end` computation here matches the one in
-    // `unescape_str_common` for consistency, even though this function
+    // `unescape_non_raw_common` for consistency, even though this function
     // doesn't have to worry about skipping any chars.
     while let Some(c) = chars.next() {
         let start = src.len() - chars.as_str().len() - c.len_utf8();
diff --git a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs
index b659c40b233..2b4c2e3c250 100644
--- a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs
+++ b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs
@@ -11,12 +11,12 @@ use crate::errors::{MoreThanOneCharNote, MoreThanOneCharSugg, NoBraceUnicodeSub,
 
 pub(crate) fn emit_unescape_error(
     handler: &Handler,
-    // interior part of the literal, without quotes
+    // interior part of the literal, between quotes
     lit: &str,
-    // full span of the literal, including quotes
-    span_with_quotes: Span,
-    // interior span of the literal, without quotes
-    span: Span,
+    // full span of the literal, including quotes and any prefix
+    full_lit_span: Span,
+    // span of the error part of the literal
+    err_span: Span,
     mode: Mode,
     // range of the error inside `lit`
     range: Range<usize>,
@@ -24,19 +24,21 @@ pub(crate) fn emit_unescape_error(
 ) {
     debug!(
         "emit_unescape_error: {:?}, {:?}, {:?}, {:?}, {:?}",
-        lit, span_with_quotes, mode, range, error
+        lit, full_lit_span, mode, range, error
     );
     let last_char = || {
         let c = lit[range.clone()].chars().next_back().unwrap();
-        let span = span.with_lo(span.hi() - BytePos(c.len_utf8() as u32));
+        let span = err_span.with_lo(err_span.hi() - BytePos(c.len_utf8() as u32));
         (c, span)
     };
     match error {
         EscapeError::LoneSurrogateUnicodeEscape => {
-            handler.emit_err(UnescapeError::InvalidUnicodeEscape { span, surrogate: true });
+            handler
+                .emit_err(UnescapeError::InvalidUnicodeEscape { span: err_span, surrogate: true });
         }
         EscapeError::OutOfRangeUnicodeEscape => {
-            handler.emit_err(UnescapeError::InvalidUnicodeEscape { span, surrogate: false });
+            handler
+                .emit_err(UnescapeError::InvalidUnicodeEscape { span: err_span, surrogate: false });
         }
         EscapeError::MoreThanOneChar => {
             use unicode_normalization::{char::is_combining_mark, UnicodeNormalization};
@@ -49,12 +51,16 @@ pub(crate) fn emit_unescape_error(
                 let normalized = lit.nfc().to_string();
                 if normalized.chars().count() == 1 {
                     let ch = normalized.chars().next().unwrap().escape_default().to_string();
-                    sugg = Some(MoreThanOneCharSugg::NormalizedForm { span, ch, normalized });
+                    sugg = Some(MoreThanOneCharSugg::NormalizedForm {
+                        span: err_span,
+                        ch,
+                        normalized,
+                    });
                 }
                 let escaped_marks =
                     rest.iter().map(|c| c.escape_default().to_string()).collect::<Vec<_>>();
                 note = Some(MoreThanOneCharNote::AllCombining {
-                    span,
+                    span: err_span,
                     chr: format!("{first}"),
                     len: escaped_marks.len(),
                     escaped_marks: escaped_marks.join(""),
@@ -69,10 +75,12 @@ pub(crate) fn emit_unescape_error(
                     .collect();
 
                 if let &[ch] = printable.as_slice() {
-                    sugg =
-                        Some(MoreThanOneCharSugg::RemoveNonPrinting { span, ch: ch.to_string() });
+                    sugg = Some(MoreThanOneCharSugg::RemoveNonPrinting {
+                        span: err_span,
+                        ch: ch.to_string(),
+                    });
                     note = Some(MoreThanOneCharNote::NonPrinting {
-                        span,
+                        span: err_span,
                         escaped: lit.escape_default().to_string(),
                     });
                 }
@@ -91,13 +99,13 @@ pub(crate) fn emit_unescape_error(
                 }
                 let sugg = format!("{prefix}\"{escaped}\"");
                 MoreThanOneCharSugg::Quotes {
-                    span: span_with_quotes,
+                    span: full_lit_span,
                     is_byte: mode == Mode::Byte,
                     sugg,
                 }
             });
             handler.emit_err(UnescapeError::MoreThanOneChar {
-                span: span_with_quotes,
+                span: full_lit_span,
                 note,
                 suggestion: sugg,
             });
@@ -105,7 +113,7 @@ pub(crate) fn emit_unescape_error(
         EscapeError::EscapeOnlyChar => {
             let (c, char_span) = last_char();
             handler.emit_err(UnescapeError::EscapeOnlyChar {
-                span,
+                span: err_span,
                 char_span,
                 escaped_sugg: c.escape_default().to_string(),
                 escaped_msg: escaped_char(c),
@@ -114,11 +122,11 @@ pub(crate) fn emit_unescape_error(
         }
         EscapeError::BareCarriageReturn => {
             let double_quotes = mode.in_double_quotes();
-            handler.emit_err(UnescapeError::BareCr { span, double_quotes });
+            handler.emit_err(UnescapeError::BareCr { span: err_span, double_quotes });
         }
         EscapeError::BareCarriageReturnInRawString => {
             assert!(mode.in_double_quotes());
-            handler.emit_err(UnescapeError::BareCrRawString(span));
+            handler.emit_err(UnescapeError::BareCrRawString(err_span));
         }
         EscapeError::InvalidEscape => {
             let (c, span) = last_char();
@@ -143,7 +151,7 @@ pub(crate) fn emit_unescape_error(
             } else {
                 if mode == Mode::Str || mode == Mode::Char {
                     diag.span_suggestion(
-                        span_with_quotes,
+                        full_lit_span,
                         "if you meant to write a literal backslash (perhaps escaping in a regular expression), consider a raw string literal",
                         format!("r\"{lit}\""),
                         Applicability::MaybeIncorrect,
@@ -158,7 +166,7 @@ pub(crate) fn emit_unescape_error(
             diag.emit();
         }
         EscapeError::TooShortHexEscape => {
-            handler.emit_err(UnescapeError::TooShortHexEscape(span));
+            handler.emit_err(UnescapeError::TooShortHexEscape(err_span));
         }
         EscapeError::InvalidCharInHexEscape | EscapeError::InvalidCharInUnicodeEscape => {
             let (c, span) = last_char();
@@ -210,7 +218,7 @@ pub(crate) fn emit_unescape_error(
             err.emit();
         }
         EscapeError::OutOfRangeHexEscape => {
-            handler.emit_err(UnescapeError::OutOfRangeHexEscape(span));
+            handler.emit_err(UnescapeError::OutOfRangeHexEscape(err_span));
         }
         EscapeError::LeadingUnderscoreUnicodeEscape => {
             let (c, span) = last_char();
@@ -220,10 +228,11 @@ pub(crate) fn emit_unescape_error(
             });
         }
         EscapeError::OverlongUnicodeEscape => {
-            handler.emit_err(UnescapeError::OverlongUnicodeEscape(span));
+            handler.emit_err(UnescapeError::OverlongUnicodeEscape(err_span));
         }
         EscapeError::UnclosedUnicodeEscape => {
-            handler.emit_err(UnescapeError::UnclosedUnicodeEscape(span, span.shrink_to_hi()));
+            handler
+                .emit_err(UnescapeError::UnclosedUnicodeEscape(err_span, err_span.shrink_to_hi()));
         }
         EscapeError::NoBraceInUnicodeEscape => {
             let mut suggestion = "\\u{".to_owned();
@@ -238,34 +247,34 @@ pub(crate) fn emit_unescape_error(
             let (label, sub) = if suggestion_len > 0 {
                 suggestion.push('}');
                 let hi = char_span.lo() + BytePos(suggestion_len as u32);
-                (None, NoBraceUnicodeSub::Suggestion { span: span.with_hi(hi), suggestion })
+                (None, NoBraceUnicodeSub::Suggestion { span: err_span.with_hi(hi), suggestion })
             } else {
-                (Some(span), NoBraceUnicodeSub::Help)
+                (Some(err_span), NoBraceUnicodeSub::Help)
             };
-            handler.emit_err(UnescapeError::NoBraceInUnicodeEscape { span, label, sub });
+            handler.emit_err(UnescapeError::NoBraceInUnicodeEscape { span: err_span, label, sub });
         }
         EscapeError::UnicodeEscapeInByte => {
-            handler.emit_err(UnescapeError::UnicodeEscapeInByte(span));
+            handler.emit_err(UnescapeError::UnicodeEscapeInByte(err_span));
         }
         EscapeError::EmptyUnicodeEscape => {
-            handler.emit_err(UnescapeError::EmptyUnicodeEscape(span));
+            handler.emit_err(UnescapeError::EmptyUnicodeEscape(err_span));
         }
         EscapeError::ZeroChars => {
-            handler.emit_err(UnescapeError::ZeroChars(span));
+            handler.emit_err(UnescapeError::ZeroChars(err_span));
         }
         EscapeError::LoneSlash => {
-            handler.emit_err(UnescapeError::LoneSlash(span));
+            handler.emit_err(UnescapeError::LoneSlash(err_span));
         }
         EscapeError::UnskippedWhitespaceWarning => {
             let (c, char_span) = last_char();
             handler.emit_warning(UnescapeError::UnskippedWhitespace {
-                span,
+                span: err_span,
                 ch: escaped_char(c),
                 char_span,
             });
         }
         EscapeError::MultipleSkippedLinesWarning => {
-            handler.emit_warning(UnescapeError::MultipleSkippedLinesWarning(span));
+            handler.emit_warning(UnescapeError::MultipleSkippedLinesWarning(err_span));
         }
     }
 }
author	bors <bors@rust-lang.org>	2023-12-16 08:52:06 +0000
committer	bors <bors@rust-lang.org>	2023-12-16 08:52:06 +0000
commit	3ad8e2d129a67b0dc450d6ef1100ae41d2cd4b8a (patch)
tree	a63fab27ff21a30183268fcd799e5c20adfe5e64
parent	1c6a06183a1452c2772db370b572e38bac9c5d86 (diff)
parent	b900eb73173f416dba5619ebfc10e6e0438a4753 (diff)
download	rust-3ad8e2d129a67b0dc450d6ef1100ae41d2cd4b8a.tar.gz rust-3ad8e2d129a67b0dc450d6ef1100ae41d2cd4b8a.zip