Introduce `ByteSymbol`.

It's like `Symbol` but for byte strings. The interner is now used for both `Symbol` and `ByteSymbol`. E.g. if you intern `"dog"` and `b"dog"` you'll get a `Symbol` and a `ByteSymbol` with the same index and the characters will only be stored once. The motivation for this is to eliminate the `Arc`s in `ast::LitKind`, to make `ast::LitKind` impl `Copy`, and to avoid the need to arena-allocate `ast::LitKind` in HIR. The latter change reduces peak memory by a non-trivial amount on literal-heavy benchmarks such as `deep-vector` and `tuple-stress`. `Encoder`, `Decoder`, `SpanEncoder`, and `SpanDecoder` all get some changes so that they can handle normal strings and byte strings. This change does slow down compilation of programs that use `include_bytes!` on large files, because the contents of those files are now interned (hashed). This makes `include_bytes!` more similar to `include_str!`, though `include_bytes!` contents still aren't escaped, and hashing is still much cheaper than escaping.
author: Nicholas Nethercote <n.nethercote@gmail.com> 2025-06-02 08:59:29 +1000
committer: Nicholas Nethercote <n.nethercote@gmail.com> 2025-06-30 20:42:27 +1000
commit: 478f8287c0e2c35cda511fd3ac01b7ac78ee7cfe (patch)
tree: 4d8f19b4e4e440ed8a22ee809ce2a565707d4c27 /compiler/rustc_serialize
parent: ed2d759783dc9de134bbb3f01085b1e6dbf539f3 (diff)
download: rust-478f8287c0e2c35cda511fd3ac01b7ac78ee7cfe.tar.gz
rust-478f8287c0e2c35cda511fd3ac01b7ac78ee7cfe.zip
1 files changed, 23 insertions, 1 deletions
diff --git a/compiler/rustc_serialize/src/serialize.rs b/compiler/rustc_serialize/src/serialize.rs
index 8940d10696d..846710c3398 100644
--- a/compiler/rustc_serialize/src/serialize.rs
+++ b/compiler/rustc_serialize/src/serialize.rs
@@ -21,6 +21,11 @@ use thin_vec::ThinVec;
 /// [utf8]: https://en.wikipedia.org/w/index.php?title=UTF-8&oldid=1058865525#Codepage_layout
 const STR_SENTINEL: u8 = 0xC1;
 
+/// For byte strings there are no bytes that cannot occur. Just use this value
+/// as a best-effort sentinel. No validation is skipped, so the potential
+/// for badness is lower than in the `STR_SENTINEL` case.
+const BYTE_STR_SENTINEL: u8 = 0xC2;
+
 /// A note about error handling.
 ///
 /// Encoders may be fallible, but in practice failure is rare and there are so
@@ -72,6 +77,13 @@ pub trait Encoder {
         self.emit_u8(STR_SENTINEL);
     }
 
+    #[inline]
+    fn emit_byte_str(&mut self, v: &[u8]) {
+        self.emit_usize(v.len());
+        self.emit_raw_bytes(v);
+        self.emit_u8(BYTE_STR_SENTINEL);
+    }
+
     fn emit_raw_bytes(&mut self, s: &[u8]);
 }
 
@@ -122,9 +134,19 @@ pub trait Decoder {
         let len = self.read_usize();
         let bytes = self.read_raw_bytes(len + 1);
         assert!(bytes[len] == STR_SENTINEL);
+        // SAFETY: the presence of `STR_SENTINEL` gives us high (but not
+        // perfect) confidence that the bytes we just read truly are UTF-8.
         unsafe { std::str::from_utf8_unchecked(&bytes[..len]) }
     }
 
+    #[inline]
+    fn read_byte_str(&mut self) -> &[u8] {
+        let len = self.read_usize();
+        let bytes = self.read_raw_bytes(len + 1);
+        assert!(bytes[len] == BYTE_STR_SENTINEL);
+        &bytes[..len]
+    }
+
     fn read_raw_bytes(&mut self, len: usize) -> &[u8];
 
     fn peek_byte(&self) -> u8;
@@ -239,7 +261,7 @@ impl<S: Encoder> Encodable<S> for str {
 
 impl<S: Encoder> Encodable<S> for String {
     fn encode(&self, s: &mut S) {
-        s.emit_str(&self[..]);
+        s.emit_str(&self);
     }
 }
author	Nicholas Nethercote <n.nethercote@gmail.com>	2025-06-02 08:59:29 +1000
committer	Nicholas Nethercote <n.nethercote@gmail.com>	2025-06-30 20:42:27 +1000
commit	478f8287c0e2c35cda511fd3ac01b7ac78ee7cfe (patch)
tree	4d8f19b4e4e440ed8a22ee809ce2a565707d4c27 /compiler/rustc_serialize
parent	ed2d759783dc9de134bbb3f01085b1e6dbf539f3 (diff)
download	rust-478f8287c0e2c35cda511fd3ac01b7ac78ee7cfe.tar.gz rust-478f8287c0e2c35cda511fd3ac01b7ac78ee7cfe.zip