summary refs log tree commit diff
path: root/compiler/rustc_span/src
diff options
context:
space:
mode:
Diffstat (limited to 'compiler/rustc_span/src')
-rw-r--r--compiler/rustc_span/src/lib.rs36
-rw-r--r--compiler/rustc_span/src/symbol.rs123
-rw-r--r--compiler/rustc_span/src/symbol/tests.rs12
3 files changed, 137 insertions, 34 deletions
diff --git a/compiler/rustc_span/src/lib.rs b/compiler/rustc_span/src/lib.rs
index c8a29a2f68f..3d3a681c798 100644
--- a/compiler/rustc_span/src/lib.rs
+++ b/compiler/rustc_span/src/lib.rs
@@ -66,7 +66,9 @@ mod span_encoding;
 pub use span_encoding::{DUMMY_SP, Span};
 
 pub mod symbol;
-pub use symbol::{Ident, MacroRulesNormalizedIdent, STDLIB_STABLE_CRATES, Symbol, kw, sym};
+pub use symbol::{
+    ByteSymbol, Ident, MacroRulesNormalizedIdent, STDLIB_STABLE_CRATES, Symbol, kw, sym,
+};
 
 mod analyze_source_file;
 pub mod fatal_error;
@@ -1184,11 +1186,12 @@ rustc_index::newtype_index! {
 /// It is similar to rustc_type_ir's TyEncoder.
 pub trait SpanEncoder: Encoder {
     fn encode_span(&mut self, span: Span);
-    fn encode_symbol(&mut self, symbol: Symbol);
+    fn encode_symbol(&mut self, sym: Symbol);
+    fn encode_byte_symbol(&mut self, byte_sym: ByteSymbol);
     fn encode_expn_id(&mut self, expn_id: ExpnId);
     fn encode_syntax_context(&mut self, syntax_context: SyntaxContext);
-    /// As a local identifier, a `CrateNum` is only meaningful within its context, e.g. within a tcx.
-    /// Therefore, make sure to include the context when encode a `CrateNum`.
+    /// As a local identifier, a `CrateNum` is only meaningful within its context, e.g. within a
+    /// tcx. Therefore, make sure to include the context when encoding a `CrateNum`.
     fn encode_crate_num(&mut self, crate_num: CrateNum);
     fn encode_def_index(&mut self, def_index: DefIndex);
     fn encode_def_id(&mut self, def_id: DefId);
@@ -1201,8 +1204,12 @@ impl SpanEncoder for FileEncoder {
         span.hi.encode(self);
     }
 
-    fn encode_symbol(&mut self, symbol: Symbol) {
-        self.emit_str(symbol.as_str());
+    fn encode_symbol(&mut self, sym: Symbol) {
+        self.emit_str(sym.as_str());
+    }
+
+    fn encode_byte_symbol(&mut self, byte_sym: ByteSymbol) {
+        self.emit_byte_str(byte_sym.as_byte_str());
     }
 
     fn encode_expn_id(&mut self, _expn_id: ExpnId) {
@@ -1239,6 +1246,12 @@ impl<E: SpanEncoder> Encodable<E> for Symbol {
     }
 }
 
+impl<E: SpanEncoder> Encodable<E> for ByteSymbol {
+    fn encode(&self, s: &mut E) {
+        s.encode_byte_symbol(*self);
+    }
+}
+
 impl<E: SpanEncoder> Encodable<E> for ExpnId {
     fn encode(&self, s: &mut E) {
         s.encode_expn_id(*self)
@@ -1280,6 +1293,7 @@ impl<E: SpanEncoder> Encodable<E> for AttrId {
 pub trait SpanDecoder: Decoder {
     fn decode_span(&mut self) -> Span;
     fn decode_symbol(&mut self) -> Symbol;
+    fn decode_byte_symbol(&mut self) -> ByteSymbol;
     fn decode_expn_id(&mut self) -> ExpnId;
     fn decode_syntax_context(&mut self) -> SyntaxContext;
     fn decode_crate_num(&mut self) -> CrateNum;
@@ -1300,6 +1314,10 @@ impl SpanDecoder for MemDecoder<'_> {
         Symbol::intern(self.read_str())
     }
 
+    fn decode_byte_symbol(&mut self) -> ByteSymbol {
+        ByteSymbol::intern(self.read_byte_str())
+    }
+
     fn decode_expn_id(&mut self) -> ExpnId {
         panic!("cannot decode `ExpnId` with `MemDecoder`");
     }
@@ -1337,6 +1355,12 @@ impl<D: SpanDecoder> Decodable<D> for Symbol {
     }
 }
 
+impl<D: SpanDecoder> Decodable<D> for ByteSymbol {
+    fn decode(s: &mut D) -> ByteSymbol {
+        s.decode_byte_symbol()
+    }
+}
+
 impl<D: SpanDecoder> Decodable<D> for ExpnId {
     fn decode(s: &mut D) -> ExpnId {
         s.decode_expn_id()
diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs
index 4b8762d0dd1..34869a38bb4 100644
--- a/compiler/rustc_span/src/symbol.rs
+++ b/compiler/rustc_span/src/symbol.rs
@@ -2583,7 +2583,7 @@ impl fmt::Display for MacroRulesNormalizedIdent {
     }
 }
 
-/// An interned string.
+/// An interned UTF-8 string.
 ///
 /// Internally, a `Symbol` is implemented as an index, and all operations
 /// (including hashing, equality, and ordering) operate on that index. The use
@@ -2595,20 +2595,23 @@ impl fmt::Display for MacroRulesNormalizedIdent {
 #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct Symbol(SymbolIndex);
 
+// Used within both `Symbol` and `ByteSymbol`.
 rustc_index::newtype_index! {
     #[orderable]
     struct SymbolIndex {}
 }
 
 impl Symbol {
+    /// Avoid this except for things like deserialization of previously
+    /// serialized symbols, and testing. Use `intern` instead.
     pub const fn new(n: u32) -> Self {
         Symbol(SymbolIndex::from_u32(n))
     }
 
     /// Maps a string to its interned representation.
     #[rustc_diagnostic_item = "SymbolIntern"]
-    pub fn intern(string: &str) -> Self {
-        with_session_globals(|session_globals| session_globals.symbol_interner.intern(string))
+    pub fn intern(str: &str) -> Self {
+        with_session_globals(|session_globals| session_globals.symbol_interner.intern_str(str))
     }
 
     /// Access the underlying string. This is a slowish operation because it
@@ -2621,7 +2624,7 @@ impl Symbol {
     /// it works out ok.
     pub fn as_str(&self) -> &str {
         with_session_globals(|session_globals| unsafe {
-            std::mem::transmute::<&str, &str>(session_globals.symbol_interner.get(*self))
+            std::mem::transmute::<&str, &str>(session_globals.symbol_interner.get_str(*self))
         })
     }
 
@@ -2678,56 +2681,130 @@ impl StableCompare for Symbol {
     }
 }
 
+/// Like `Symbol`, but for byte strings. `ByteSymbol` is used less widely, so
+/// it has fewer operations defined than `Symbol`.
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct ByteSymbol(SymbolIndex);
+
+impl ByteSymbol {
+    /// Avoid this except for things like deserialization of previously
+    /// serialized symbols, and testing. Use `intern` instead.
+    pub const fn new(n: u32) -> Self {
+        ByteSymbol(SymbolIndex::from_u32(n))
+    }
+
+    /// Maps a byte string to its interned representation.
+    pub fn intern(byte_str: &[u8]) -> Self {
+        with_session_globals(|session_globals| {
+            session_globals.symbol_interner.intern_byte_str(byte_str)
+        })
+    }
+
+    /// Like `Symbol::as_str`.
+    pub fn as_byte_str(&self) -> &[u8] {
+        with_session_globals(|session_globals| unsafe {
+            std::mem::transmute::<&[u8], &[u8]>(session_globals.symbol_interner.get_byte_str(*self))
+        })
+    }
+
+    pub fn as_u32(self) -> u32 {
+        self.0.as_u32()
+    }
+}
+
+impl fmt::Debug for ByteSymbol {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        fmt::Debug::fmt(self.as_byte_str(), f)
+    }
+}
+
+impl<CTX> HashStable<CTX> for ByteSymbol {
+    #[inline]
+    fn hash_stable(&self, hcx: &mut CTX, hasher: &mut StableHasher) {
+        self.as_byte_str().hash_stable(hcx, hasher);
+    }
+}
+
+// Interner used for both `Symbol`s and `ByteSymbol`s. If a string and a byte
+// string with identical contents (e.g. "foo" and b"foo") are both interned,
+// only one copy will be stored and the resulting `Symbol` and `ByteSymbol`
+// will have the same index.
 pub(crate) struct Interner(Lock<InternerInner>);
 
-// The `&'static str`s in this type actually point into the arena.
+// The `&'static [u8]`s in this type actually point into the arena.
 //
 // This type is private to prevent accidentally constructing more than one
 // `Interner` on the same thread, which makes it easy to mix up `Symbol`s
 // between `Interner`s.
 struct InternerInner {
     arena: DroplessArena,
-    strings: FxIndexSet<&'static str>,
+    byte_strs: FxIndexSet<&'static [u8]>,
 }
 
 impl Interner {
+    // These arguments are `&str`, but because of the sharing, we are
+    // effectively pre-interning all these strings for both `Symbol` and
+    // `ByteSymbol`.
     fn prefill(init: &[&'static str], extra: &[&'static str]) -> Self {
-        let strings = FxIndexSet::from_iter(init.iter().copied().chain(extra.iter().copied()));
+        let byte_strs = FxIndexSet::from_iter(
+            init.iter().copied().chain(extra.iter().copied()).map(|str| str.as_bytes()),
+        );
         assert_eq!(
-            strings.len(),
+            byte_strs.len(),
             init.len() + extra.len(),
-            "there are duplicate symbols in the rustc symbol list and the extra symbols added by the driver",
+            "duplicate symbols in the rustc symbol list and the extra symbols added by the driver",
         );
-        Interner(Lock::new(InternerInner { arena: Default::default(), strings }))
+        Interner(Lock::new(InternerInner { arena: Default::default(), byte_strs }))
+    }
+
+    fn intern_str(&self, str: &str) -> Symbol {
+        Symbol::new(self.intern_inner(str.as_bytes()))
+    }
+
+    fn intern_byte_str(&self, byte_str: &[u8]) -> ByteSymbol {
+        ByteSymbol::new(self.intern_inner(byte_str))
     }
 
     #[inline]
-    fn intern(&self, string: &str) -> Symbol {
+    fn intern_inner(&self, byte_str: &[u8]) -> u32 {
         let mut inner = self.0.lock();
-        if let Some(idx) = inner.strings.get_index_of(string) {
-            return Symbol::new(idx as u32);
+        if let Some(idx) = inner.byte_strs.get_index_of(byte_str) {
+            return idx as u32;
         }
 
-        let string: &str = inner.arena.alloc_str(string);
+        let byte_str: &[u8] = inner.arena.alloc_slice(byte_str);
 
         // SAFETY: we can extend the arena allocation to `'static` because we
         // only access these while the arena is still alive.
-        let string: &'static str = unsafe { &*(string as *const str) };
+        let byte_str: &'static [u8] = unsafe { &*(byte_str as *const [u8]) };
 
         // This second hash table lookup can be avoided by using `RawEntryMut`,
         // but this code path isn't hot enough for it to be worth it. See
         // #91445 for details.
-        let (idx, is_new) = inner.strings.insert_full(string);
+        let (idx, is_new) = inner.byte_strs.insert_full(byte_str);
         debug_assert!(is_new); // due to the get_index_of check above
 
-        Symbol::new(idx as u32)
+        idx as u32
     }
 
     /// Get the symbol as a string.
     ///
     /// [`Symbol::as_str()`] should be used in preference to this function.
-    fn get(&self, symbol: Symbol) -> &str {
-        self.0.lock().strings.get_index(symbol.0.as_usize()).unwrap()
+    fn get_str(&self, symbol: Symbol) -> &str {
+        let byte_str = self.get_inner(symbol.0.as_usize());
+        // SAFETY: known to be a UTF-8 string because it's a `Symbol`.
+        unsafe { str::from_utf8_unchecked(byte_str) }
+    }
+
+    /// Get the symbol as a byte string.
+    ///
+    /// [`ByteSymbol::as_byte_str()`] should be used in preference to this function.
+    fn get_byte_str(&self, symbol: ByteSymbol) -> &[u8] {
+        self.get_inner(symbol.0.as_usize())
+    }
+
+    fn get_inner(&self, index: usize) -> &[u8] {
+        self.0.lock().byte_strs.get_index(index).unwrap()
     }
 }
 
@@ -2822,9 +2899,11 @@ impl Symbol {
         self != sym::empty && self != kw::Underscore && !self.is_path_segment_keyword()
     }
 
-    /// Was this symbol predefined in the compiler's `symbols!` macro
-    pub fn is_predefined(self) -> bool {
-        self.as_u32() < PREDEFINED_SYMBOLS_COUNT
+    /// Was this symbol index predefined in the compiler's `symbols!` macro?
+    /// Note: this applies to both `Symbol`s and `ByteSymbol`s, which is why it
+    /// takes a `u32` argument instead of a `&self` argument. Use with care.
+    pub fn is_predefined(index: u32) -> bool {
+        index < PREDEFINED_SYMBOLS_COUNT
     }
 }
 
diff --git a/compiler/rustc_span/src/symbol/tests.rs b/compiler/rustc_span/src/symbol/tests.rs
index 660d0d7179a..bf0660aa510 100644
--- a/compiler/rustc_span/src/symbol/tests.rs
+++ b/compiler/rustc_span/src/symbol/tests.rs
@@ -5,14 +5,14 @@ use crate::create_default_session_globals_then;
 fn interner_tests() {
     let i = Interner::prefill(&[], &[]);
     // first one is zero:
-    assert_eq!(i.intern("dog"), Symbol::new(0));
-    // re-use gets the same entry:
-    assert_eq!(i.intern("dog"), Symbol::new(0));
+    assert_eq!(i.intern_str("dog"), Symbol::new(0));
+    // re-use gets the same entry, even with a `ByteSymbol`
+    assert_eq!(i.intern_byte_str(b"dog"), ByteSymbol::new(0));
     // different string gets a different #:
-    assert_eq!(i.intern("cat"), Symbol::new(1));
-    assert_eq!(i.intern("cat"), Symbol::new(1));
+    assert_eq!(i.intern_byte_str(b"cat"), ByteSymbol::new(1));
+    assert_eq!(i.intern_str("cat"), Symbol::new(1));
     // dog is still at zero
-    assert_eq!(i.intern("dog"), Symbol::new(0));
+    assert_eq!(i.intern_str("dog"), Symbol::new(0));
 }
 
 #[test]