about summary refs log tree commit diff
diff options
context:
space:
mode:
author	bors <bors@rust-lang.org>	2016-11-05 01:10:57 -0700
committer	GitHub <noreply@github.com>	2016-11-05 01:10:57 -0700
commit	08839965f96ea9947afe936f13a66a0f4f93db73 (patch)
tree	06e3ec079876527a8c6ddfff5f44482cc2022000
parent	e96b9d2bb48b7a886a4d2b9df716265af5d15646 (diff)
parent	d73c68ceef6984b8e762a173b96279c42d1fe851 (diff)
download	rust-08839965f96ea9947afe936f13a66a0f4f93db73.tar.gz
	rust-08839965f96ea9947afe936f13a66a0f4f93db73.zip
Auto merge of #37427 - nnethercote:opt-IchHasher, r=michaelwoerister
Reduce the number of bytes hashed by IchHasher.

IchHasher uses blake2b hashing, which is expensive, so the fewer bytes hashed
the better. There are two big ways to reduce the number of bytes hashed.
- Filenames in spans account for ~66% of all bytes (for builds with debuginfo).
  The vast majority of spans have the same filename for the start of the span
  and the end of the span, so hashing the filename just once in those cases is
  a big win.
- u32 and u64 and usize values account for ~25%--33% of all bytes (for builds
  with debuginfo). The vast majority of these are small, i.e. fit in a u8, so
  shrinking them down before hashing is also a big win.

This PR implements these two optimizations. I'm certain the first one is safe.
I'm about 90% sure that the second one is safe.

Here are measurements of the number of bytes hashed when doing
debuginfo-enabled builds of stdlib and
rustc-benchmarks/syntex-0.42.2-incr-clean.

```
                    stdlib   syntex-incr
                    ------   -----------
original       156,781,386   255,095,596
half-SawSpan   106,744,403   176,345,419
short-ints      45,890,534   118,014,227
no-SawSpan[*]    6,831,874    45,875,714

[*] don't hash the SawSpan at all. Not part of this PR, just implemented for
    comparison's sake.
```

For debug builds of syntex-0.42.2-incr-clean, the two changes give a 1--2%
speed-up.
-rw-r--r--	src/librustc_incremental/calculate_svh/hasher.rs	38
-rw-r--r--	src/librustc_incremental/calculate_svh/svh_visitor.rs	42
2 files changed, 68 insertions, 12 deletions
diff --git a/src/librustc_incremental/calculate_svh/hasher.rs b/src/librustc_incremental/calculate_svh/hasher.rs
index 49683a81227..d7d9c231a91 100644
--- a/src/librustc_incremental/calculate_svh/hasher.rs
+++ b/src/librustc_incremental/calculate_svh/hasher.rs
@@ -9,13 +9,16 @@
 // except according to those terms.
 
 use std::mem;
+use std::hash::Hasher;
 use rustc_data_structures::blake2b::Blake2bHasher;
 use rustc::ty::util::ArchIndependentHasher;
 use ich::Fingerprint;
+use rustc_serialize::leb128::write_unsigned_leb128;
 
 #[derive(Debug)]
 pub struct IchHasher {
     state: ArchIndependentHasher<Blake2bHasher>,
+    leb128_helper: Vec<u8>,
     bytes_hashed: u64,
 }
 
@@ -24,6 +27,7 @@ impl IchHasher {
         let hash_size = mem::size_of::<Fingerprint>();
         IchHasher {
             state: ArchIndependentHasher::new(Blake2bHasher::new(hash_size, &[])),
+            leb128_helper: vec![],
             bytes_hashed: 0
         }
     }
@@ -37,9 +41,19 @@ impl IchHasher {
         fingerprint.0.copy_from_slice(self.state.into_inner().finalize());
         fingerprint
     }
+
+    #[inline]
+    fn write_uleb128(&mut self, value: u64) {
+        let len = write_unsigned_leb128(&mut self.leb128_helper, 0, value);
+        self.state.write(&self.leb128_helper[0..len]);
+        self.bytes_hashed += len as u64;
+    }
 }
 
-impl ::std::hash::Hasher for IchHasher {
+// For the non-u8 integer cases we leb128 encode them first. Because small
+// integers dominate, this significantly and cheaply reduces the number of
+// bytes hashed, which is good because blake2b is expensive.
+impl Hasher for IchHasher {
     fn finish(&self) -> u64 {
         bug!("Use other finish() implementation to get the full 128-bit hash.");
     }
@@ -49,4 +63,26 @@ impl ::std::hash::Hasher for IchHasher {
         self.state.write(bytes);
         self.bytes_hashed += bytes.len() as u64;
     }
+
+    // There is no need to leb128-encode u8 values.
+
+    #[inline]
+    fn write_u16(&mut self, i: u16) {
+        self.write_uleb128(i as u64);
+    }
+
+    #[inline]
+    fn write_u32(&mut self, i: u32) {
+        self.write_uleb128(i as u64);
+    }
+
+    #[inline]
+    fn write_u64(&mut self, i: u64) {
+        self.write_uleb128(i);
+    }
+
+    #[inline]
+    fn write_usize(&mut self, i: usize) {
+        self.write_uleb128(i as u64);
+    }
 }
diff --git a/src/librustc_incremental/calculate_svh/svh_visitor.rs b/src/librustc_incremental/calculate_svh/svh_visitor.rs
index 80c41f855ba..2358d60d0de 100644
--- a/src/librustc_incremental/calculate_svh/svh_visitor.rs
+++ b/src/librustc_incremental/calculate_svh/svh_visitor.rs
@@ -88,6 +88,8 @@ impl<'a, 'hash, 'tcx> StrictVersionHashVisitor<'a, 'hash, 'tcx> {
     // within the CodeMap.
     // Also note that we are hashing byte offsets for the column, not unicode
     // codepoint offsets. For the purpose of the hash that's sufficient.
+    // Also, hashing filenames is expensive so we avoid doing it twice when the
+    // span starts and ends in the same file, which is almost always the case.
     fn hash_span(&mut self, span: Span) {
         debug!("hash_span: st={:?}", self.st);
 
@@ -103,21 +105,35 @@ impl<'a, 'hash, 'tcx> StrictVersionHashVisitor<'a, 'hash, 'tcx> {
             span.hi
         };
 
-        let loc1 = self.codemap.byte_pos_to_line_and_col(span.lo);
-        let loc2 = self.codemap.byte_pos_to_line_and_col(span_hi);
-
-        let expansion_kind = match span.expn_id {
+        let expn_kind = match span.expn_id {
             NO_EXPANSION => SawSpanExpnKind::NoExpansion,
             COMMAND_LINE_EXPN => SawSpanExpnKind::CommandLine,
             _ => SawSpanExpnKind::SomeExpansion,
         };
 
-        SawSpan(loc1.as_ref().map(|&(ref fm, line, col)| (&fm.name[..], line, col)),
-                loc2.as_ref().map(|&(ref fm, line, col)| (&fm.name[..], line, col)),
-                expansion_kind)
-            .hash(self.st);
+        let loc1 = self.codemap.byte_pos_to_line_and_col(span.lo);
+        let loc1 = loc1.as_ref()
+                       .map(|&(ref fm, line, col)| (&fm.name[..], line, col))
+                       .unwrap_or(("???", 0, BytePos(0)));
+
+        let loc2 = self.codemap.byte_pos_to_line_and_col(span_hi);
+        let loc2 = loc2.as_ref()
+                       .map(|&(ref fm, line, col)| (&fm.name[..], line, col))
+                       .unwrap_or(("???", 0, BytePos(0)));
+
+        let saw = if loc1.0 == loc2.0 {
+            SawSpan(loc1.0,
+                    loc1.1, loc1.2,
+                    loc2.1, loc2.2,
+                    expn_kind)
+        } else {
+            SawSpanTwoFiles(loc1.0, loc1.1, loc1.2,
+                            loc2.0, loc2.1, loc2.2,
+                            expn_kind)
+        };
+        saw.hash(self.st);
 
-        if expansion_kind == SawSpanExpnKind::SomeExpansion {
+        if expn_kind == SawSpanExpnKind::SomeExpansion {
             let call_site = self.codemap.codemap().source_callsite(span);
             self.hash_span(call_site);
         }
@@ -189,9 +205,13 @@ enum SawAbiComponent<'a> {
     SawAssocTypeBinding,
     SawAttribute(ast::AttrStyle),
     SawMacroDef,
-    SawSpan(Option<(&'a str, usize, BytePos)>,
-            Option<(&'a str, usize, BytePos)>,
+    SawSpan(&'a str,
+            usize, BytePos,
+            usize, BytePos,
             SawSpanExpnKind),
+    SawSpanTwoFiles(&'a str, usize, BytePos,
+                    &'a str, usize, BytePos,
+                    SawSpanExpnKind),
 }
 
 /// SawExprComponent carries all of the information that we want