diff options
| author | bors <bors@rust-lang.org> | 2022-05-06 09:43:57 +0000 |
|---|---|---|
| committer | bors <bors@rust-lang.org> | 2022-05-06 09:43:57 +0000 |
| commit | 8c4fc9d9a45b0577f583abda870386f1f940c706 (patch) | |
| tree | 83d34377b91bfa28ca3fe9d27448f50eceb4cf13 | |
| parent | 7f9e013ba62e7bf77a95ba29bcb2c5740b7b2059 (diff) | |
| parent | ebdcb08abf00f193b3787dc32cea97ce74a14963 (diff) | |
| download | rust-8c4fc9d9a45b0577f583abda870386f1f940c706.tar.gz rust-8c4fc9d9a45b0577f583abda870386f1f940c706.zip | |
Auto merge of #94598 - scottmcm:prefix-free-hasher-methods, r=Amanieu
Add a dedicated length-prefixing method to `Hasher`
This accomplishes two main goals:
- Make it clear who is responsible for prefix-freedom, including how they should do it
- Make it feasible for a `Hasher` that *doesn't* care about Hash-DoS resistance to get better performance by not hashing lengths
This does not change rustc-hash, since that's in an external crate, but that could potentially use it in future.
Fixes #94026
r? rust-lang/libs
---
The core of this change is the following two new methods on `Hasher`:
```rust
pub trait Hasher {
/// Writes a length prefix into this hasher, as part of being prefix-free.
///
/// If you're implementing [`Hash`] for a custom collection, call this before
/// writing its contents to this `Hasher`. That way
/// `(collection![1, 2, 3], collection![4, 5])` and
/// `(collection![1, 2], collection![3, 4, 5])` will provide different
/// sequences of values to the `Hasher`
///
/// The `impl<T> Hash for [T]` includes a call to this method, so if you're
/// hashing a slice (or array or vector) via its `Hash::hash` method,
/// you should **not** call this yourself.
///
/// This method is only for providing domain separation. If you want to
/// hash a `usize` that represents part of the *data*, then it's important
/// that you pass it to [`Hasher::write_usize`] instead of to this method.
///
/// # Examples
///
/// ```
/// #![feature(hasher_prefixfree_extras)]
/// # // Stubs to make the `impl` below pass the compiler
/// # struct MyCollection<T>(Option<T>);
/// # impl<T> MyCollection<T> {
/// # fn len(&self) -> usize { todo!() }
/// # }
/// # impl<'a, T> IntoIterator for &'a MyCollection<T> {
/// # type Item = T;
/// # type IntoIter = std::iter::Empty<T>;
/// # fn into_iter(self) -> Self::IntoIter { todo!() }
/// # }
///
/// use std::hash::{Hash, Hasher};
/// impl<T: Hash> Hash for MyCollection<T> {
/// fn hash<H: Hasher>(&self, state: &mut H) {
/// state.write_length_prefix(self.len());
/// for elt in self {
/// elt.hash(state);
/// }
/// }
/// }
/// ```
///
/// # Note to Implementers
///
/// If you've decided that your `Hasher` is willing to be susceptible to
/// Hash-DoS attacks, then you might consider skipping hashing some or all
/// of the `len` provided in the name of increased performance.
#[inline]
#[unstable(feature = "hasher_prefixfree_extras", issue = "88888888")]
fn write_length_prefix(&mut self, len: usize) {
self.write_usize(len);
}
/// Writes a single `str` into this hasher.
///
/// If you're implementing [`Hash`], you generally do not need to call this,
/// as the `impl Hash for str` does, so you can just use that.
///
/// This includes the domain separator for prefix-freedom, so you should
/// **not** call `Self::write_length_prefix` before calling this.
///
/// # Note to Implementers
///
/// The default implementation of this method includes a call to
/// [`Self::write_length_prefix`], so if your implementation of `Hasher`
/// doesn't care about prefix-freedom and you've thus overridden
/// that method to do nothing, there's no need to override this one.
///
/// This method is available to be overridden separately from the others
/// as `str` being UTF-8 means that it never contains `0xFF` bytes, which
/// can be used to provide prefix-freedom cheaper than hashing a length.
///
/// For example, if your `Hasher` works byte-by-byte (perhaps by accumulating
/// them into a buffer), then you can hash the bytes of the `str` followed
/// by a single `0xFF` byte.
///
/// If your `Hasher` works in chunks, you can also do this by being careful
/// about how you pad partial chunks. If the chunks are padded with `0x00`
/// bytes then just hashing an extra `0xFF` byte doesn't necessarily
/// provide prefix-freedom, as `"ab"` and `"ab\u{0}"` would likely hash
/// the same sequence of chunks. But if you pad with `0xFF` bytes instead,
/// ensuring at least one padding byte, then it can often provide
/// prefix-freedom cheaper than hashing the length would.
#[inline]
#[unstable(feature = "hasher_prefixfree_extras", issue = "88888888")]
fn write_str(&mut self, s: &str) {
self.write_length_prefix(s.len());
self.write(s.as_bytes());
}
}
```
With updates to the `Hash` implementations for slices and containers to call `write_length_prefix` instead of `write_usize`.
`write_str` defaults to using `write_length_prefix` since, as was pointed out in the issue, the `write_u8(0xFF)` approach is insufficient for hashers that work in chunks, as those would hash `"a\u{0}"` and `"a"` to the same thing. But since `SipHash` works byte-wise (there's an internal buffer to accumulate bytes until a full chunk is available) it overrides `write_str` to continue to use the add-non-UTF-8-byte approach.
---
Compatibility:
Because the default implementation of `write_length_prefix` calls `write_usize`, the changed hash implementation for slices will do the same thing the old one did on existing `Hasher`s.
| -rw-r--r-- | compiler/rustc_data_structures/src/lib.rs | 1 | ||||
| -rw-r--r-- | compiler/rustc_data_structures/src/sip128.rs | 8 | ||||
| -rw-r--r-- | compiler/rustc_data_structures/src/stable_hasher.rs | 11 | ||||
| -rw-r--r-- | library/alloc/src/boxed.rs | 6 | ||||
| -rw-r--r-- | library/alloc/src/collections/btree/map.rs | 2 | ||||
| -rw-r--r-- | library/alloc/src/collections/linked_list.rs | 2 | ||||
| -rw-r--r-- | library/alloc/src/collections/vec_deque/mod.rs | 2 | ||||
| -rw-r--r-- | library/alloc/src/lib.rs | 1 | ||||
| -rw-r--r-- | library/core/src/hash/mod.rs | 138 | ||||
| -rw-r--r-- | library/core/src/hash/sip.rs | 18 | ||||
| -rw-r--r-- | library/core/tests/hash/mod.rs | 4 | ||||
| -rw-r--r-- | library/core/tests/lib.rs | 1 | ||||
| -rw-r--r-- | library/std/src/collections/hash/map.rs | 8 | ||||
| -rw-r--r-- | library/std/src/lib.rs | 1 |
14 files changed, 197 insertions, 6 deletions
diff --git a/compiler/rustc_data_structures/src/lib.rs b/compiler/rustc_data_structures/src/lib.rs index 5d42f8c9306..76ae17f28c6 100644 --- a/compiler/rustc_data_structures/src/lib.rs +++ b/compiler/rustc_data_structures/src/lib.rs @@ -17,6 +17,7 @@ #![feature(generators)] #![feature(let_else)] #![feature(hash_raw_entry)] +#![feature(hasher_prefixfree_extras)] #![feature(maybe_uninit_uninit_array)] #![feature(min_specialization)] #![feature(never_type)] diff --git a/compiler/rustc_data_structures/src/sip128.rs b/compiler/rustc_data_structures/src/sip128.rs index 6e5c0617bf3..abd25f46ad5 100644 --- a/compiler/rustc_data_structures/src/sip128.rs +++ b/compiler/rustc_data_structures/src/sip128.rs @@ -462,6 +462,14 @@ impl Hasher for SipHasher128 { self.slice_write(msg); } + #[inline] + fn write_str(&mut self, s: &str) { + // This hasher works byte-wise, and `0xFF` cannot show up in a `str`, + // so just hashing the one extra byte is enough to be prefix-free. + self.write(s.as_bytes()); + self.write_u8(0xFF); + } + fn finish(&self) -> u64 { panic!("SipHasher128 cannot provide valid 64 bit hashes") } diff --git a/compiler/rustc_data_structures/src/stable_hasher.rs b/compiler/rustc_data_structures/src/stable_hasher.rs index 25353290fd5..c8bb4fc5e6a 100644 --- a/compiler/rustc_data_structures/src/stable_hasher.rs +++ b/compiler/rustc_data_structures/src/stable_hasher.rs @@ -74,6 +74,17 @@ impl Hasher for StableHasher { } #[inline] + fn write_str(&mut self, s: &str) { + self.state.write_str(s); + } + + #[inline] + fn write_length_prefix(&mut self, len: usize) { + // Our impl for `usize` will extend it if needed. + self.write_usize(len); + } + + #[inline] fn write_u8(&mut self, i: u8) { self.state.write_u8(i); } diff --git a/library/alloc/src/boxed.rs b/library/alloc/src/boxed.rs index 639e7f213ea..c07536f0d0c 100644 --- a/library/alloc/src/boxed.rs +++ b/library/alloc/src/boxed.rs @@ -1369,6 +1369,12 @@ impl<T: ?Sized + Hasher, A: Allocator> Hasher for Box<T, A> { fn write_isize(&mut self, i: isize) { (**self).write_isize(i) } + fn write_length_prefix(&mut self, len: usize) { + (**self).write_length_prefix(len) + } + fn write_str(&mut self, s: &str) { + (**self).write_str(s) + } } #[cfg(not(no_global_oom_handling))] diff --git a/library/alloc/src/collections/btree/map.rs b/library/alloc/src/collections/btree/map.rs index c178d3e3b03..264c217c9ef 100644 --- a/library/alloc/src/collections/btree/map.rs +++ b/library/alloc/src/collections/btree/map.rs @@ -1990,7 +1990,7 @@ impl<'a, K: Ord + Copy, V: Copy> Extend<(&'a K, &'a V)> for BTreeMap<K, V> { #[stable(feature = "rust1", since = "1.0.0")] impl<K: Hash, V: Hash> Hash for BTreeMap<K, V> { fn hash<H: Hasher>(&self, state: &mut H) { - self.len().hash(state); + state.write_length_prefix(self.len()); for elt in self { elt.hash(state); } diff --git a/library/alloc/src/collections/linked_list.rs b/library/alloc/src/collections/linked_list.rs index 736b38370ab..67dc4f30f31 100644 --- a/library/alloc/src/collections/linked_list.rs +++ b/library/alloc/src/collections/linked_list.rs @@ -1944,7 +1944,7 @@ impl<T: fmt::Debug> fmt::Debug for LinkedList<T> { #[stable(feature = "rust1", since = "1.0.0")] impl<T: Hash> Hash for LinkedList<T> { fn hash<H: Hasher>(&self, state: &mut H) { - self.len().hash(state); + state.write_length_prefix(self.len()); for elt in self { elt.hash(state); } diff --git a/library/alloc/src/collections/vec_deque/mod.rs b/library/alloc/src/collections/vec_deque/mod.rs index 5f1a6848ae6..04900ead579 100644 --- a/library/alloc/src/collections/vec_deque/mod.rs +++ b/library/alloc/src/collections/vec_deque/mod.rs @@ -2899,7 +2899,7 @@ impl<T: Ord, A: Allocator> Ord for VecDeque<T, A> { #[stable(feature = "rust1", since = "1.0.0")] impl<T: Hash, A: Allocator> Hash for VecDeque<T, A> { fn hash<H: Hasher>(&self, state: &mut H) { - self.len().hash(state); + state.write_length_prefix(self.len()); // It's not possible to use Hash::hash_slice on slices // returned by as_slices method as their length can vary // in otherwise identical deques. diff --git a/library/alloc/src/lib.rs b/library/alloc/src/lib.rs index 4d2dc4ecee0..ecebc7ed9ac 100644 --- a/library/alloc/src/lib.rs +++ b/library/alloc/src/lib.rs @@ -117,6 +117,7 @@ #![feature(extend_one)] #![feature(fmt_internals)] #![feature(fn_traits)] +#![feature(hasher_prefixfree_extras)] #![feature(inplace_iteration)] #![feature(iter_advance_by)] #![feature(layout_for_ptr)] diff --git a/library/core/src/hash/mod.rs b/library/core/src/hash/mod.rs index 45c9df0c930..9d64c786d67 100644 --- a/library/core/src/hash/mod.rs +++ b/library/core/src/hash/mod.rs @@ -333,6 +333,12 @@ pub trait Hasher { /// /// println!("Hash is {:x}!", hasher.finish()); /// ``` + /// + /// # Note to Implementers + /// + /// You generally should not do length-prefixing as part of implementing + /// this method. It's up to the [`Hash`] implementation to call + /// [`Hasher::write_length_prefix`] before sequences that need it. #[stable(feature = "rust1", since = "1.0.0")] fn write(&mut self, bytes: &[u8]); @@ -409,6 +415,127 @@ pub trait Hasher { fn write_isize(&mut self, i: isize) { self.write_usize(i as usize) } + + /// Writes a length prefix into this hasher, as part of being prefix-free. + /// + /// If you're implementing [`Hash`] for a custom collection, call this before + /// writing its contents to this `Hasher`. That way + /// `(collection![1, 2, 3], collection![4, 5])` and + /// `(collection![1, 2], collection![3, 4, 5])` will provide different + /// sequences of values to the `Hasher` + /// + /// The `impl<T> Hash for [T]` includes a call to this method, so if you're + /// hashing a slice (or array or vector) via its `Hash::hash` method, + /// you should **not** call this yourself. + /// + /// This method is only for providing domain separation. If you want to + /// hash a `usize` that represents part of the *data*, then it's important + /// that you pass it to [`Hasher::write_usize`] instead of to this method. + /// + /// # Examples + /// + /// ``` + /// #![feature(hasher_prefixfree_extras)] + /// # // Stubs to make the `impl` below pass the compiler + /// # struct MyCollection<T>(Option<T>); + /// # impl<T> MyCollection<T> { + /// # fn len(&self) -> usize { todo!() } + /// # } + /// # impl<'a, T> IntoIterator for &'a MyCollection<T> { + /// # type Item = T; + /// # type IntoIter = std::iter::Empty<T>; + /// # fn into_iter(self) -> Self::IntoIter { todo!() } + /// # } + /// + /// use std::hash::{Hash, Hasher}; + /// impl<T: Hash> Hash for MyCollection<T> { + /// fn hash<H: Hasher>(&self, state: &mut H) { + /// state.write_length_prefix(self.len()); + /// for elt in self { + /// elt.hash(state); + /// } + /// } + /// } + /// ``` + /// + /// # Note to Implementers + /// + /// If you've decided that your `Hasher` is willing to be susceptible to + /// Hash-DoS attacks, then you might consider skipping hashing some or all + /// of the `len` provided in the name of increased performance. + #[inline] + #[unstable(feature = "hasher_prefixfree_extras", issue = "96762")] + fn write_length_prefix(&mut self, len: usize) { + self.write_usize(len); + } + + /// Writes a single `str` into this hasher. + /// + /// If you're implementing [`Hash`], you generally do not need to call this, + /// as the `impl Hash for str` does, so you should prefer that instead. + /// + /// This includes the domain separator for prefix-freedom, so you should + /// **not** call `Self::write_length_prefix` before calling this. + /// + /// # Note to Implementers + /// + /// There are at least two reasonable default ways to implement this. + /// Which one will be the default is not yet decided, so for now + /// you probably want to override it specifically. + /// + /// ## The general answer + /// + /// It's always correct to implement this with a length prefix: + /// + /// ``` + /// # #![feature(hasher_prefixfree_extras)] + /// # struct Foo; + /// # impl std::hash::Hasher for Foo { + /// # fn finish(&self) -> u64 { unimplemented!() } + /// # fn write(&mut self, _bytes: &[u8]) { unimplemented!() } + /// fn write_str(&mut self, s: &str) { + /// self.write_length_prefix(s.len()); + /// self.write(s.as_bytes()); + /// } + /// # } + /// ``` + /// + /// And, if your `Hasher` works in `usize` chunks, this is likely a very + /// efficient way to do it, as anything more complicated may well end up + /// slower than just running the round with the length. + /// + /// ## If your `Hasher` works byte-wise + /// + /// One nice thing about `str` being UTF-8 is that the `b'\xFF'` byte + /// never happens. That means that you can append that to the byte stream + /// being hashed and maintain prefix-freedom: + /// + /// ``` + /// # #![feature(hasher_prefixfree_extras)] + /// # struct Foo; + /// # impl std::hash::Hasher for Foo { + /// # fn finish(&self) -> u64 { unimplemented!() } + /// # fn write(&mut self, _bytes: &[u8]) { unimplemented!() } + /// fn write_str(&mut self, s: &str) { + /// self.write(s.as_bytes()); + /// self.write_u8(0xff); + /// } + /// # } + /// ``` + /// + /// This does require that your implementation not add extra padding, and + /// thus generally requires that you maintain a buffer, running a round + /// only once that buffer is full (or `finish` is called). + /// + /// That's because if `write` pads data out to a fixed chunk size, it's + /// likely that it does it in such a way that `"a"` and `"a\x00"` would + /// end up hashing the same sequence of things, introducing conflicts. + #[inline] + #[unstable(feature = "hasher_prefixfree_extras", issue = "96762")] + fn write_str(&mut self, s: &str) { + self.write(s.as_bytes()); + self.write_u8(0xff); + } } #[stable(feature = "indirect_hasher_impl", since = "1.22.0")] @@ -455,6 +582,12 @@ impl<H: Hasher + ?Sized> Hasher for &mut H { fn write_isize(&mut self, i: isize) { (**self).write_isize(i) } + fn write_length_prefix(&mut self, len: usize) { + (**self).write_length_prefix(len) + } + fn write_str(&mut self, s: &str) { + (**self).write_str(s) + } } /// A trait for creating instances of [`Hasher`]. @@ -709,8 +842,7 @@ mod impls { impl Hash for str { #[inline] fn hash<H: Hasher>(&self, state: &mut H) { - state.write(self.as_bytes()); - state.write_u8(0xff) + state.write_str(self); } } @@ -767,7 +899,7 @@ mod impls { impl<T: Hash> Hash for [T] { #[inline] fn hash<H: Hasher>(&self, state: &mut H) { - self.len().hash(state); + state.write_length_prefix(self.len()); Hash::hash_slice(self, state) } } diff --git a/library/core/src/hash/sip.rs b/library/core/src/hash/sip.rs index b9443e30074..9d7daf1f1a0 100644 --- a/library/core/src/hash/sip.rs +++ b/library/core/src/hash/sip.rs @@ -234,6 +234,11 @@ impl super::Hasher for SipHasher { } #[inline] + fn write_str(&mut self, s: &str) { + self.0.hasher.write_str(s); + } + + #[inline] fn finish(&self) -> u64 { self.0.hasher.finish() } @@ -247,6 +252,11 @@ impl super::Hasher for SipHasher13 { } #[inline] + fn write_str(&mut self, s: &str) { + self.hasher.write_str(s); + } + + #[inline] fn finish(&self) -> u64 { self.hasher.finish() } @@ -308,6 +318,14 @@ impl<S: Sip> super::Hasher for Hasher<S> { } #[inline] + fn write_str(&mut self, s: &str) { + // This hasher works byte-wise, and `0xFF` cannot show up in a `str`, + // so just hashing the one extra byte is enough to be prefix-free. + self.write(s.as_bytes()); + self.write_u8(0xFF); + } + + #[inline] fn finish(&self) -> u64 { let mut state = self.state; diff --git a/library/core/tests/hash/mod.rs b/library/core/tests/hash/mod.rs index a173e461c60..5bc6aac1778 100644 --- a/library/core/tests/hash/mod.rs +++ b/library/core/tests/hash/mod.rs @@ -20,6 +20,10 @@ impl Hasher for MyHasher { self.hash += *byte as u64; } } + fn write_str(&mut self, s: &str) { + self.write(s.as_bytes()); + self.write_u8(0xFF); + } fn finish(&self) -> u64 { self.hash } diff --git a/library/core/tests/lib.rs b/library/core/tests/lib.rs index 21d600ac557..31b764a6e2d 100644 --- a/library/core/tests/lib.rs +++ b/library/core/tests/lib.rs @@ -36,6 +36,7 @@ #![feature(future_join)] #![feature(future_poll_fn)] #![feature(array_from_fn)] +#![feature(hasher_prefixfree_extras)] #![feature(hashmap_internals)] #![feature(try_find)] #![feature(inline_const)] diff --git a/library/std/src/collections/hash/map.rs b/library/std/src/collections/hash/map.rs index 0c638192264..e38368790e6 100644 --- a/library/std/src/collections/hash/map.rs +++ b/library/std/src/collections/hash/map.rs @@ -3006,12 +3006,20 @@ impl Default for DefaultHasher { #[stable(feature = "hashmap_default_hasher", since = "1.13.0")] impl Hasher for DefaultHasher { + // The underlying `SipHasher13` doesn't override the other + // `write_*` methods, so it's ok not to forward them here. + #[inline] fn write(&mut self, msg: &[u8]) { self.0.write(msg) } #[inline] + fn write_str(&mut self, s: &str) { + self.0.write_str(s); + } + + #[inline] fn finish(&self) -> u64 { self.0.finish() } diff --git a/library/std/src/lib.rs b/library/std/src/lib.rs index d70befa9d20..c394865d886 100644 --- a/library/std/src/lib.rs +++ b/library/std/src/lib.rs @@ -271,6 +271,7 @@ #![feature(exact_size_is_empty)] #![feature(extend_one)] #![feature(float_minimum_maximum)] +#![feature(hasher_prefixfree_extras)] #![feature(hashmap_internals)] #![feature(int_error_internals)] #![feature(maybe_uninit_slice)] |
