diff options
| author | Nicholas Nethercote <n.nethercote@gmail.com> | 2025-06-02 08:59:29 +1000 |
|---|---|---|
| committer | Nicholas Nethercote <n.nethercote@gmail.com> | 2025-06-30 20:42:27 +1000 |
| commit | 478f8287c0e2c35cda511fd3ac01b7ac78ee7cfe (patch) | |
| tree | 4d8f19b4e4e440ed8a22ee809ce2a565707d4c27 /compiler/rustc_ast/src | |
| parent | ed2d759783dc9de134bbb3f01085b1e6dbf539f3 (diff) | |
| download | rust-478f8287c0e2c35cda511fd3ac01b7ac78ee7cfe.tar.gz rust-478f8287c0e2c35cda511fd3ac01b7ac78ee7cfe.zip | |
Introduce `ByteSymbol`.
It's like `Symbol` but for byte strings. The interner is now used for both `Symbol` and `ByteSymbol`. E.g. if you intern `"dog"` and `b"dog"` you'll get a `Symbol` and a `ByteSymbol` with the same index and the characters will only be stored once.

The motivation for this is to eliminate the `Arc`s in `ast::LitKind`, to make `ast::LitKind` impl `Copy`, and to avoid the need to arena-allocate `ast::LitKind` in HIR. The last of these reduces peak memory by a non-trivial amount on literal-heavy benchmarks such as `deep-vector` and `tuple-stress`.

`Encoder`, `Decoder`, `SpanEncoder`, and `SpanDecoder` all get some changes so that they can handle normal strings and byte strings.

This change does slow down compilation of programs that use `include_bytes!` on large files, because the contents of those files are now interned (hashed). This makes `include_bytes!` more similar to `include_str!`, though `include_bytes!` contents still aren't escaped, and hashing is still much cheaper than escaping.
Diffstat (limited to 'compiler/rustc_ast/src')
| -rw-r--r-- | compiler/rustc_ast/src/ast.rs | 28 |
| -rw-r--r-- | compiler/rustc_ast/src/util/literal.rs | 25 |
2 files changed, 30 insertions, 23 deletions
diff --git a/compiler/rustc_ast/src/ast.rs b/compiler/rustc_ast/src/ast.rs index b2d8881e3f6..f5418402377 100644 --- a/compiler/rustc_ast/src/ast.rs +++ b/compiler/rustc_ast/src/ast.rs @@ -19,7 +19,6 @@ //! - [`UnOp`], [`BinOp`], and [`BinOpKind`]: Unary and binary operators. use std::borrow::Cow; -use std::sync::Arc; use std::{cmp, fmt}; pub use GenericArgs::*; @@ -32,7 +31,7 @@ use rustc_data_structures::tagged_ptr::Tag; use rustc_macros::{Decodable, Encodable, HashStable_Generic}; pub use rustc_span::AttrId; use rustc_span::source_map::{Spanned, respan}; -use rustc_span::{DUMMY_SP, ErrorGuaranteed, Ident, Span, Symbol, kw, sym}; +use rustc_span::{ByteSymbol, DUMMY_SP, ErrorGuaranteed, Ident, Span, Symbol, kw, sym}; use thin_vec::{ThinVec, thin_vec}; pub use crate::format::*; @@ -1805,10 +1804,17 @@ pub enum ExprKind { Become(P<Expr>), /// Bytes included via `include_bytes!` + /// /// Added for optimization purposes to avoid the need to escape /// large binary blobs - should always behave like [`ExprKind::Lit`] /// with a `ByteStr` literal. - IncludedBytes(Arc<[u8]>), + /// + /// The value is stored as a `ByteSymbol`. It's unfortunate that we need to + /// intern (hash) the bytes because they're likely to be large and unique. + /// But it's necessary because this will eventually be lowered to + /// `LitKind::ByteStr`, which needs a `ByteSymbol` to impl `Copy` and avoid + /// arena allocation. + IncludedBytes(ByteSymbol), /// A `format_args!()` expression. FormatArgs(P<FormatArgs>), @@ -2066,7 +2072,7 @@ impl YieldKind { } /// A literal in a meta item. -#[derive(Clone, Encodable, Decodable, Debug, HashStable_Generic)] +#[derive(Clone, Copy, Encodable, Decodable, Debug, HashStable_Generic)] pub struct MetaItemLit { /// The original literal as written in the source code. pub symbol: Symbol, @@ -2129,16 +2135,18 @@ pub enum LitFloatType { /// deciding the `LitKind`. This means that float literals like `1f32` are /// classified by this type as `Float`. 
This is different to `token::LitKind` /// which does *not* consider the suffix. -#[derive(Clone, Encodable, Decodable, Debug, Hash, Eq, PartialEq, HashStable_Generic)] +#[derive(Clone, Copy, Encodable, Decodable, Debug, Hash, Eq, PartialEq, HashStable_Generic)] pub enum LitKind { /// A string literal (`"foo"`). The symbol is unescaped, and so may differ /// from the original token's symbol. Str(Symbol, StrStyle), - /// A byte string (`b"foo"`). Not stored as a symbol because it might be - /// non-utf8, and symbols only allow utf8 strings. - ByteStr(Arc<[u8]>, StrStyle), - /// A C String (`c"foo"`). Guaranteed to only have `\0` at the end. - CStr(Arc<[u8]>, StrStyle), + /// A byte string (`b"foo"`). The symbol is unescaped, and so may differ + /// from the original token's symbol. + ByteStr(ByteSymbol, StrStyle), + /// A C String (`c"foo"`). Guaranteed to only have `\0` at the end. The + /// symbol is unescaped, and so may differ from the original token's + /// symbol. + CStr(ByteSymbol, StrStyle), /// A byte char (`b'f'`). Byte(u8), /// A character literal (`'a'`). diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs index ad9e5d1468b..fa7878873e5 100644 --- a/compiler/rustc_ast/src/util/literal.rs +++ b/compiler/rustc_ast/src/util/literal.rs @@ -5,7 +5,7 @@ use std::{ascii, fmt, str}; use rustc_literal_escaper::{ MixedUnit, unescape_byte, unescape_byte_str, unescape_c_str, unescape_char, unescape_str, }; -use rustc_span::{Span, Symbol, kw, sym}; +use rustc_span::{ByteSymbol, Span, Symbol, kw, sym}; use tracing::debug; use crate::ast::{self, LitKind, MetaItemLit, StrStyle}; @@ -116,13 +116,12 @@ impl LitKind { assert!(!err.is_fatal(), "failed to unescape string literal") } }); - LitKind::ByteStr(buf.into(), StrStyle::Cooked) + LitKind::ByteStr(ByteSymbol::intern(&buf), StrStyle::Cooked) } token::ByteStrRaw(n) => { - // Raw strings have no escapes so we can convert the symbol - // directly to a `Arc<u8>`. 
+ // Raw byte strings have no escapes so no work is needed here. let buf = symbol.as_str().to_owned().into_bytes(); - LitKind::ByteStr(buf.into(), StrStyle::Raw(n)) + LitKind::ByteStr(ByteSymbol::intern(&buf), StrStyle::Raw(n)) } token::CStr => { let s = symbol.as_str(); @@ -137,7 +136,7 @@ impl LitKind { } }); buf.push(0); - LitKind::CStr(buf.into(), StrStyle::Cooked) + LitKind::CStr(ByteSymbol::intern(&buf), StrStyle::Cooked) } token::CStrRaw(n) => { // Raw strings have no escapes so we can convert the symbol @@ -145,7 +144,7 @@ impl LitKind { // char. let mut buf = symbol.as_str().to_owned().into_bytes(); buf.push(0); - LitKind::CStr(buf.into(), StrStyle::Raw(n)) + LitKind::CStr(ByteSymbol::intern(&buf), StrStyle::Raw(n)) } token::Err(guar) => LitKind::Err(guar), }) @@ -167,12 +166,12 @@ impl fmt::Display for LitKind { delim = "#".repeat(n as usize), string = sym )?, - LitKind::ByteStr(ref bytes, StrStyle::Cooked) => { - write!(f, "b\"{}\"", escape_byte_str_symbol(bytes))? + LitKind::ByteStr(ref byte_sym, StrStyle::Cooked) => { + write!(f, "b\"{}\"", escape_byte_str_symbol(byte_sym.as_byte_str()))? } - LitKind::ByteStr(ref bytes, StrStyle::Raw(n)) => { + LitKind::ByteStr(ref byte_sym, StrStyle::Raw(n)) => { // Unwrap because raw byte string literals can only contain ASCII. - let symbol = str::from_utf8(bytes).unwrap(); + let symbol = str::from_utf8(byte_sym.as_byte_str()).unwrap(); write!( f, "br{delim}\"{string}\"{delim}", @@ -181,11 +180,11 @@ impl fmt::Display for LitKind { )?; } LitKind::CStr(ref bytes, StrStyle::Cooked) => { - write!(f, "c\"{}\"", escape_byte_str_symbol(bytes))? + write!(f, "c\"{}\"", escape_byte_str_symbol(bytes.as_byte_str()))? } LitKind::CStr(ref bytes, StrStyle::Raw(n)) => { // This can only be valid UTF-8. - let symbol = str::from_utf8(bytes).unwrap(); + let symbol = str::from_utf8(bytes.as_byte_str()).unwrap(); write!(f, "cr{delim}\"{symbol}\"{delim}", delim = "#".repeat(n as usize),)?; } LitKind::Int(n, ty) => { |
