| field | value | date |
|---|---|---|
| author | Ulrik Sverdrup <bluss@users.noreply.github.com> | 2016-09-20 09:40:25 +0200 |
| committer | Ulrik Sverdrup <bluss@users.noreply.github.com> | 2016-10-17 15:54:09 +0200 |
| commit | 13a1f21371efa5be7a7d8b26bde19fb7da5bd967 (patch) | |
| tree | a1d169fb94a0d21abc2041684666de714724815d /src/libstd | |
| parent | da7f8c540b47c5bb063356bf5ad05a6a49ed0ff1 (diff) | |
hashmap: Store hashes as usize internally
We can't use more than usize's bits of a hash to select a bucket anyway, so we only need to store that part in the table. This should shrink the data structure on 32-bit platforms; smaller data means better cache utilization and, hopefully, better performance.
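To make the idea concrete, here is a minimal, self-contained sketch of the tagging scheme described above (the helper name `to_stored_hash` is illustrative, not part of the patch): the 64-bit hasher output is truncated to the platform's `usize`, and its most significant bit is forced on so that a stored hash can never equal the 0 that marks an empty bucket.

```rust
use std::mem::size_of;

// Stored hash type, as introduced by this commit.
type HashUint = usize;

// A stored value of 0 is reserved to mean "empty bucket".
const EMPTY_BUCKET: HashUint = 0;

// Illustrative helper mirroring what `SafeHash::new` does in the patch:
// keep only usize's worth of bits and set the MSB as a non-zero tag.
fn to_stored_hash(full_hash: u64) -> HashUint {
    let hash_bits = size_of::<HashUint>() * 8;
    (1 << (hash_bits - 1)) | (full_hash as HashUint)
}

fn main() {
    // Even an all-zero hasher output maps to a non-zero stored hash,
    // so it cannot collide with EMPTY_BUCKET.
    let stored = to_stored_hash(0);
    assert_ne!(stored, EMPTY_BUCKET);
    println!("stored = {:#x}", stored);
}
```

On a 64-bit target this keeps 63 usable hash bits, the same as before; on a 32-bit target it keeps 31 bits, which is all that could ever be used to pick a bucket anyway.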
Diffstat (limited to 'src/libstd')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | src/libstd/collections/hash/table.rs | 69 |

1 file changed, 44 insertions(+), 25 deletions(-)
```diff
diff --git a/src/libstd/collections/hash/table.rs b/src/libstd/collections/hash/table.rs
index b357bc3552a..ec0e457dc6a 100644
--- a/src/libstd/collections/hash/table.rs
+++ b/src/libstd/collections/hash/table.rs
@@ -21,7 +21,18 @@ use ptr::{self, Unique, Shared};
 
 use self::BucketState::*;
 
-const EMPTY_BUCKET: u64 = 0;
+/// Integer type used for stored hash values.
+///
+/// No more than bit_width(usize) bits are needed to select a bucket.
+///
+/// The most significant bit is ours to use for tagging `SafeHash`.
+///
+/// (Even if we could have usize::MAX bytes allocated for buckets,
+/// each bucket stores at least a `HashUint`, so there can be no more than
+/// usize::MAX / size_of(usize) buckets.)
+type HashUint = usize;
+
+const EMPTY_BUCKET: HashUint = 0;
 
 /// The raw hashtable, providing safe-ish access to the unzipped and highly
 /// optimized arrays of hashes, and key-value pairs.
@@ -64,7 +75,7 @@ const EMPTY_BUCKET: u64 = 0;
 pub struct RawTable<K, V> {
     capacity: usize,
     size: usize,
-    hashes: Unique<u64>,
+    hashes: Unique<HashUint>,
 
     // Because K/V do not appear directly in any of the types in the struct,
     // inform rustc that in fact instances of K and V are reachable from here.
@@ -75,7 +86,7 @@ unsafe impl<K: Send, V: Send> Send for RawTable<K, V> {}
 unsafe impl<K: Sync, V: Sync> Sync for RawTable<K, V> {}
 
 struct RawBucket<K, V> {
-    hash: *mut u64,
+    hash: *mut HashUint,
     // We use *const to ensure covariance with respect to K and V
     pair: *const (K, V),
     _marker: marker::PhantomData<(K, V)>,
@@ -136,15 +147,27 @@ pub struct GapThenFull<K, V, M> {
 /// buckets.
 #[derive(PartialEq, Copy, Clone)]
 pub struct SafeHash {
-    hash: u64,
+    hash: HashUint,
 }
 
 impl SafeHash {
     /// Peek at the hash value, which is guaranteed to be non-zero.
     #[inline(always)]
-    pub fn inspect(&self) -> u64 {
+    pub fn inspect(&self) -> HashUint {
         self.hash
     }
+
+    #[inline(always)]
+    pub fn new(hash: u64) -> Self {
+        // We need to avoid 0 in order to prevent collisions with
+        // EMPTY_HASH. We can maintain our precious uniform distribution
+        // of initial indexes by unconditionally setting the MSB,
+        // effectively reducing the hashes by one bit.
+        //
+        // Truncate hash to fit in `HashUint`.
+        let hash_bits = size_of::<HashUint>() * 8;
+        SafeHash { hash: (1 << (hash_bits - 1)) | (hash as HashUint) }
+    }
 }
 
 /// We need to remove hashes of 0. That's reserved for empty buckets.
@@ -156,25 +179,21 @@ pub fn make_hash<T: ?Sized, S>(hash_state: &S, t: &T) -> SafeHash
 {
     let mut state = hash_state.build_hasher();
     t.hash(&mut state);
-    // We need to avoid 0 in order to prevent collisions with
-    // EMPTY_HASH. We can maintain our precious uniform distribution
-    // of initial indexes by unconditionally setting the MSB,
-    // effectively reducing 64-bits hashes to 63 bits.
-    SafeHash { hash: 0x8000_0000_0000_0000 | state.finish() }
+    SafeHash::new(state.finish())
 }
 
-// `replace` casts a `*u64` to a `*SafeHash`. Since we statically
+// `replace` casts a `*HashUint` to a `*SafeHash`. Since we statically
 // ensure that a `FullBucket` points to an index with a non-zero hash,
-// and a `SafeHash` is just a `u64` with a different name, this is
+// and a `SafeHash` is just a `HashUint` with a different name, this is
 // safe.
 //
 // This test ensures that a `SafeHash` really IS the same size as a
-// `u64`. If you need to change the size of `SafeHash` (and
+// `HashUint`. If you need to change the size of `SafeHash` (and
 // consequently made this test fail), `replace` needs to be
 // modified to no longer assume this.
 #[test]
-fn can_alias_safehash_as_u64() {
-    assert_eq!(size_of::<SafeHash>(), size_of::<u64>())
+fn can_alias_safehash_as_hash() {
+    assert_eq!(size_of::<SafeHash>(), size_of::<HashUint>())
 }
 
 impl<K, V> RawBucket<K, V> {
@@ -605,14 +624,14 @@ impl<K, V> RawTable<K, V> {
             return RawTable {
                 size: 0,
                 capacity: 0,
-                hashes: Unique::new(EMPTY as *mut u64),
+                hashes: Unique::new(EMPTY as *mut HashUint),
                 marker: marker::PhantomData,
             };
         }
 
         // No need for `checked_mul` before a more restrictive check performed
         // later in this method.
-        let hashes_size = capacity.wrapping_mul(size_of::<u64>());
+        let hashes_size = capacity.wrapping_mul(size_of::<HashUint>());
         let pairs_size = capacity.wrapping_mul(size_of::<(K, V)>());
 
         // Allocating hashmaps is a little tricky. We need to allocate two
@@ -624,13 +643,13 @@ impl<K, V> RawTable<K, V> {
         // right is a little subtle. Therefore, calculating offsets has been
         // factored out into a different function.
         let (alignment, hash_offset, size, oflo) = calculate_allocation(hashes_size,
-                                                                        align_of::<u64>(),
+                                                                        align_of::<HashUint>(),
                                                                         pairs_size,
                                                                         align_of::<(K, V)>());
         assert!(!oflo, "capacity overflow");
 
         // One check for overflow that covers calculation and rounding of size.
-        let size_of_bucket = size_of::<u64>().checked_add(size_of::<(K, V)>()).unwrap();
+        let size_of_bucket = size_of::<HashUint>().checked_add(size_of::<(K, V)>()).unwrap();
         assert!(size >= capacity.checked_mul(size_of_bucket)
                             .expect("capacity overflow"),
@@ -641,7 +660,7 @@ impl<K, V> RawTable<K, V> {
             ::alloc::oom()
         }
 
-        let hashes = buffer.offset(hash_offset as isize) as *mut u64;
+        let hashes = buffer.offset(hash_offset as isize) as *mut HashUint;
 
         RawTable {
             capacity: capacity,
@@ -652,7 +671,7 @@ impl<K, V> RawTable<K, V> {
     }
 
     fn first_bucket_raw(&self) -> RawBucket<K, V> {
-        let hashes_size = self.capacity * size_of::<u64>();
+        let hashes_size = self.capacity * size_of::<HashUint>();
         let pairs_size = self.capacity * size_of::<(K, V)>();
 
         let buffer = *self.hashes as *mut u8;
@@ -756,7 +775,7 @@ impl<K, V> RawTable<K, V> {
 /// this interface is safe, it's not used outside this module.
 struct RawBuckets<'a, K, V> {
     raw: RawBucket<K, V>,
-    hashes_end: *mut u64,
+    hashes_end: *mut HashUint,
 
     // Strictly speaking, this should be &'a (K,V), but that would
    // require that K:'a, and we often use RawBuckets<'static...> for
@@ -802,7 +821,7 @@ impl<'a, K, V> Iterator for RawBuckets<'a, K, V> {
 /// the table's remaining entries. It's used in the implementation of Drop.
 struct RevMoveBuckets<'a, K, V> {
     raw: RawBucket<K, V>,
-    hashes_end: *mut u64,
+    hashes_end: *mut HashUint,
     elems_left: usize,
 
     // As above, `&'a (K,V)` would seem better, but we often use
@@ -1036,10 +1055,10 @@ impl<K, V> Drop for RawTable<K, V> {
             }
         }
 
-        let hashes_size = self.capacity * size_of::<u64>();
+        let hashes_size = self.capacity * size_of::<HashUint>();
         let pairs_size = self.capacity * size_of::<(K, V)>();
         let (align, _, size, oflo) = calculate_allocation(hashes_size,
-                                                          align_of::<u64>(),
+                                                          align_of::<HashUint>(),
                                                           pairs_size,
                                                           align_of::<(K, V)>());
```
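For context on the `calculate_allocation` call sites touched above, here is a rough, assumption-laden sketch of how a combined hashes-plus-pairs buffer could be sized with checked arithmetic. It is not the actual `calculate_allocation` from table.rs (that function also reports an overflow flag and chooses its own field ordering); it only illustrates why the hash-array size now scales with `size_of::<HashUint>()`.

```rust
use std::mem::{align_of, size_of};

type HashUint = usize;

/// Illustrative layout helper (not the std implementation): place `capacity`
/// hashes first, then `capacity` `(K, V)` pairs, in one allocation.
/// Returns (allocation alignment, byte offset of the pairs, total size),
/// or `None` if any intermediate computation overflows.
fn layout<K, V>(capacity: usize) -> Option<(usize, usize, usize)> {
    let hashes_size = capacity.checked_mul(size_of::<HashUint>())?;
    let pairs_size = capacity.checked_mul(size_of::<(K, V)>())?;
    let pair_align = align_of::<(K, V)>();

    // Round the end of the hash array up to the alignment of (K, V);
    // alignments are powers of two, so the mask trick is valid.
    let pairs_offset = hashes_size.checked_add(pair_align - 1)? & !(pair_align - 1);
    let total = pairs_offset.checked_add(pairs_size)?;
    let align = align_of::<HashUint>().max(pair_align);
    Some((align, pairs_offset, total))
}

fn main() {
    // On a 32-bit target the hash array is now half as large as it was
    // with u64 hashes; the pair array is unchanged.
    if let Some((align, pairs_offset, total)) = layout::<u32, u64>(8) {
        println!("align {}, pairs at byte {}, {} bytes total", align, pairs_offset, total);
    }
}
```

With `HashUint = usize`, the hash array costs 4 bytes per bucket on 32-bit targets instead of 8, which is the size win the commit message refers to.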
