diff options
| author | The 8472 <git@infinite-source.de> | 2021-12-01 00:31:46 +0100 |
|---|---|---|
| committer | The 8472 <git@infinite-source.de> | 2021-12-06 18:43:01 +0100 |
| commit | c640f31c9f4b2159ebd2817904fcbc7077cec57a (patch) | |
| tree | a373660033e46bafcf70d6890b173ba7891a2efa | |
| parent | 207c80f105282245d93024c95ac408c622f70114 (diff) | |
| download | rust-c640f31c9f4b2159ebd2817904fcbc7077cec57a.tar.gz rust-c640f31c9f4b2159ebd2817904fcbc7077cec57a.zip | |
avoid string validation in rustc_serialize, check a marker byte instead
Since the serialization format isn't self-describing, we need a way to detect when the encoder and decoder don't match up. That check doesn't have to be UTF-8 validation for strings, though, which costs a few percent of performance. Instead, we can use a marker byte at the end to be reasonably sure that we're dealing with a string and that it wasn't overwritten in some way.
| -rw-r--r-- | compiler/rustc_serialize/src/opaque.rs | 21 |
1 file changed, 17 insertions, 4 deletions
diff --git a/compiler/rustc_serialize/src/opaque.rs b/compiler/rustc_serialize/src/opaque.rs index 6e36184aff0..cc1216418ae 100644 --- a/compiler/rustc_serialize/src/opaque.rs +++ b/compiler/rustc_serialize/src/opaque.rs @@ -55,6 +55,13 @@ macro_rules! write_leb128 { }}; } +/// A byte that [cannot occur in UTF8 sequences][utf8]. Used to mark the end of a string. +/// This way we can skip validation and still be relatively sure that deserialization +/// did not desynchronize. +/// +/// [utf8]: https://en.wikipedia.org/w/index.php?title=UTF-8&oldid=1058865525#Codepage_layout +const STR_SENTINEL: u8 = 0xC1; + impl serialize::Encoder for Encoder { type Error = !; @@ -150,7 +157,8 @@ impl serialize::Encoder for Encoder { #[inline] fn emit_str(&mut self, v: &str) -> EncodeResult { self.emit_usize(v.len())?; - self.emit_raw_bytes(v.as_bytes()) + self.emit_raw_bytes(v.as_bytes())?; + self.emit_u8(STR_SENTINEL) } #[inline] @@ -502,7 +510,8 @@ impl serialize::Encoder for FileEncoder { #[inline] fn emit_str(&mut self, v: &str) -> FileEncodeResult { self.emit_usize(v.len())?; - self.emit_raw_bytes(v.as_bytes()) + self.emit_raw_bytes(v.as_bytes())?; + self.emit_u8(STR_SENTINEL) } #[inline] @@ -656,8 +665,12 @@ impl<'a> serialize::Decoder for Decoder<'a> { #[inline] fn read_str(&mut self) -> Result<Cow<'_, str>, Self::Error> { let len = self.read_usize()?; - let s = std::str::from_utf8(&self.data[self.position..self.position + len]).unwrap(); - self.position += len; + let sentinel = self.data[self.position + len]; + assert!(sentinel == STR_SENTINEL); + let s = unsafe { + std::str::from_utf8_unchecked(&self.data[self.position..self.position + len]) + }; + self.position += len + 1; Ok(Cow::Borrowed(s)) } |
