about summary refs log tree commit diff
diff options
context:
space:
mode:
author bors <bors@rust-lang.org> 2021-12-07 21:50:46 +0000
committer bors <bors@rust-lang.org> 2021-12-07 21:50:46 +0000
commit 477fd7038c235689913abf9208dfa9371cbacd88 (patch)
tree dcc5d4942973e667bf2d8b6f3a843f5225663107
parent 0b6f079e4987ded15c13a15b734e7cfb8176839f (diff)
parent c640f31c9f4b2159ebd2817904fcbc7077cec57a (diff)
download rust-477fd7038c235689913abf9208dfa9371cbacd88.tar.gz
rust-477fd7038c235689913abf9208dfa9371cbacd88.zip
Auto merge of #91407 - the8472:deserialize-unchecked-utf8, r=michaelwoerister
Avoid string validation in rustc_serialize, check a marker byte instead

Since the serialization format isn't self-describing, we need a way to detect when the encoder and decoder don't match up. For strings, however, that check doesn't have to be UTF-8 validation, which currently costs a few percent of performance.
Instead, we can use a marker byte at the end of each string to be reasonably sure that we're dealing with a string and that it wasn't overwritten in some way.
-rw-r--r-- compiler/rustc_serialize/src/opaque.rs 21
1 file changed, 17 insertions, 4 deletions
diff --git a/compiler/rustc_serialize/src/opaque.rs b/compiler/rustc_serialize/src/opaque.rs
index 6e36184aff0..cc1216418ae 100644
--- a/compiler/rustc_serialize/src/opaque.rs
+++ b/compiler/rustc_serialize/src/opaque.rs
@@ -55,6 +55,13 @@ macro_rules! write_leb128 {
     }};
 }
 
+/// A byte that [cannot occur in UTF8 sequences][utf8]. Used to mark the end of a string.
+/// This way we can skip validation and still be relatively sure that deserialization
+/// did not desynchronize.
+///
+/// [utf8]: https://en.wikipedia.org/w/index.php?title=UTF-8&oldid=1058865525#Codepage_layout
+const STR_SENTINEL: u8 = 0xC1;
+
 impl serialize::Encoder for Encoder {
     type Error = !;
 
@@ -150,7 +157,8 @@ impl serialize::Encoder for Encoder {
     #[inline]
     fn emit_str(&mut self, v: &str) -> EncodeResult {
         self.emit_usize(v.len())?;
-        self.emit_raw_bytes(v.as_bytes())
+        self.emit_raw_bytes(v.as_bytes())?;
+        self.emit_u8(STR_SENTINEL)
     }
 
     #[inline]
@@ -502,7 +510,8 @@ impl serialize::Encoder for FileEncoder {
     #[inline]
     fn emit_str(&mut self, v: &str) -> FileEncodeResult {
         self.emit_usize(v.len())?;
-        self.emit_raw_bytes(v.as_bytes())
+        self.emit_raw_bytes(v.as_bytes())?;
+        self.emit_u8(STR_SENTINEL)
     }
 
     #[inline]
@@ -656,8 +665,12 @@ impl<'a> serialize::Decoder for Decoder<'a> {
     #[inline]
     fn read_str(&mut self) -> Result<Cow<'_, str>, Self::Error> {
         let len = self.read_usize()?;
-        let s = std::str::from_utf8(&self.data[self.position..self.position + len]).unwrap();
-        self.position += len;
+        let sentinel = self.data[self.position + len];
+        assert!(sentinel == STR_SENTINEL);
+        let s = unsafe {
+            std::str::from_utf8_unchecked(&self.data[self.position..self.position + len])
+        };
+        self.position += len + 1;
         Ok(Cow::Borrowed(s))
     }