Introduce `ByteSymbol`.

It's like `Symbol` but for byte strings. The interner is now used for both `Symbol` and `ByteSymbol`. E.g. if you intern `"dog"` and `b"dog"` you'll get a `Symbol` and a `ByteSymbol` with the same index and the characters will only be stored once. The motivation for this is to eliminate the `Arc`s in `ast::LitKind`, to make `ast::LitKind` impl `Copy`, and to avoid the need to arena-allocate `ast::LitKind` in HIR. The latter change reduces peak memory by a non-trivial amount on literal-heavy benchmarks such as `deep-vector` and `tuple-stress`. `Encoder`, `Decoder`, `SpanEncoder`, and `SpanDecoder` all get some changes so that they can handle normal strings and byte strings. This change does slow down compilation of programs that use `include_bytes!` on large files, because the contents of those files are now interned (hashed). This makes `include_bytes!` more similar to `include_str!`, though `include_bytes!` contents still aren't escaped, and hashing is still much cheaper than escaping.
author: Nicholas Nethercote <n.nethercote@gmail.com> 2025-06-02 08:59:29 +1000
committer: Nicholas Nethercote <n.nethercote@gmail.com> 2025-06-30 20:42:27 +1000
commit: 478f8287c0e2c35cda511fd3ac01b7ac78ee7cfe (patch)
tree: 4d8f19b4e4e440ed8a22ee809ce2a565707d4c27 /compiler/rustc_ast/src/util
parent: ed2d759783dc9de134bbb3f01085b1e6dbf539f3 (diff)
download: rust-478f8287c0e2c35cda511fd3ac01b7ac78ee7cfe.tar.gz
rust-478f8287c0e2c35cda511fd3ac01b7ac78ee7cfe.zip
1 files changed, 12 insertions, 13 deletions
diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs
index ad9e5d1468b..fa7878873e5 100644
--- a/compiler/rustc_ast/src/util/literal.rs
+++ b/compiler/rustc_ast/src/util/literal.rs
@@ -5,7 +5,7 @@ use std::{ascii, fmt, str};
 use rustc_literal_escaper::{
     MixedUnit, unescape_byte, unescape_byte_str, unescape_c_str, unescape_char, unescape_str,
 };
-use rustc_span::{Span, Symbol, kw, sym};
+use rustc_span::{ByteSymbol, Span, Symbol, kw, sym};
 use tracing::debug;
 
 use crate::ast::{self, LitKind, MetaItemLit, StrStyle};
@@ -116,13 +116,12 @@ impl LitKind {
                         assert!(!err.is_fatal(), "failed to unescape string literal")
                     }
                 });
-                LitKind::ByteStr(buf.into(), StrStyle::Cooked)
+                LitKind::ByteStr(ByteSymbol::intern(&buf), StrStyle::Cooked)
             }
             token::ByteStrRaw(n) => {
-                // Raw strings have no escapes so we can convert the symbol
-                // directly to a `Arc<u8>`.
+                // Raw byte strings have no escapes so no work is needed here.
                 let buf = symbol.as_str().to_owned().into_bytes();
-                LitKind::ByteStr(buf.into(), StrStyle::Raw(n))
+                LitKind::ByteStr(ByteSymbol::intern(&buf), StrStyle::Raw(n))
             }
             token::CStr => {
                 let s = symbol.as_str();
@@ -137,7 +136,7 @@ impl LitKind {
                     }
                 });
                 buf.push(0);
-                LitKind::CStr(buf.into(), StrStyle::Cooked)
+                LitKind::CStr(ByteSymbol::intern(&buf), StrStyle::Cooked)
             }
             token::CStrRaw(n) => {
                 // Raw strings have no escapes so we can convert the symbol
@@ -145,7 +144,7 @@ impl LitKind {
                 // char.
                 let mut buf = symbol.as_str().to_owned().into_bytes();
                 buf.push(0);
-                LitKind::CStr(buf.into(), StrStyle::Raw(n))
+                LitKind::CStr(ByteSymbol::intern(&buf), StrStyle::Raw(n))
             }
             token::Err(guar) => LitKind::Err(guar),
         })
@@ -167,12 +166,12 @@ impl fmt::Display for LitKind {
                 delim = "#".repeat(n as usize),
                 string = sym
             )?,
-            LitKind::ByteStr(ref bytes, StrStyle::Cooked) => {
-                write!(f, "b\"{}\"", escape_byte_str_symbol(bytes))?
+            LitKind::ByteStr(ref byte_sym, StrStyle::Cooked) => {
+                write!(f, "b\"{}\"", escape_byte_str_symbol(byte_sym.as_byte_str()))?
             }
-            LitKind::ByteStr(ref bytes, StrStyle::Raw(n)) => {
+            LitKind::ByteStr(ref byte_sym, StrStyle::Raw(n)) => {
                 // Unwrap because raw byte string literals can only contain ASCII.
-                let symbol = str::from_utf8(bytes).unwrap();
+                let symbol = str::from_utf8(byte_sym.as_byte_str()).unwrap();
                 write!(
                     f,
                     "br{delim}\"{string}\"{delim}",
@@ -181,11 +180,11 @@ impl fmt::Display for LitKind {
                 )?;
             }
             LitKind::CStr(ref bytes, StrStyle::Cooked) => {
-                write!(f, "c\"{}\"", escape_byte_str_symbol(bytes))?
+                write!(f, "c\"{}\"", escape_byte_str_symbol(bytes.as_byte_str()))?
             }
             LitKind::CStr(ref bytes, StrStyle::Raw(n)) => {
                 // This can only be valid UTF-8.
-                let symbol = str::from_utf8(bytes).unwrap();
+                let symbol = str::from_utf8(bytes.as_byte_str()).unwrap();
                 write!(f, "cr{delim}\"{symbol}\"{delim}", delim = "#".repeat(n as usize),)?;
             }
             LitKind::Int(n, ty) => {
author	Nicholas Nethercote <n.nethercote@gmail.com>	2025-06-02 08:59:29 +1000
committer	Nicholas Nethercote <n.nethercote@gmail.com>	2025-06-30 20:42:27 +1000
commit	478f8287c0e2c35cda511fd3ac01b7ac78ee7cfe (patch)
tree	4d8f19b4e4e440ed8a22ee809ce2a565707d4c27 /compiler/rustc_ast/src/util
parent	ed2d759783dc9de134bbb3f01085b1e6dbf539f3 (diff)
download	rust-478f8287c0e2c35cda511fd3ac01b7ac78ee7cfe.tar.gz rust-478f8287c0e2c35cda511fd3ac01b7ac78ee7cfe.zip