Pre-pop zero chunks before mapping LAST_CHUNK_MAP

This avoids wasting a small amount of space for some of the data sets.

The chunk resizing is caused by, but not directly related to, changes in this commit.

Alphabetic     : 3036 bytes
Case_Ignorable : 2133 bytes (- 3 bytes)
Cased          :  934 bytes
Cc             :   32 bytes
Grapheme_Extend: 1760 bytes (-14 bytes)
Lowercase      :  985 bytes
N              : 1220 bytes (- 5 bytes)
Uppercase      :  934 bytes
White_Space    :   97 bytes
Total table sizes: 11131 bytes (-22 bytes)
author: Mark Rousskov <mark.simulacrum@gmail.com> 2020-03-20 18:38:08 -0400
committer: Mark Rousskov <mark.simulacrum@gmail.com> 2020-03-20 18:38:08 -0400
commit: 6c7691a37bf485b28fecb6856e6ede8fa952f99e (patch)
tree: cd2e5a10ff2983623e49ca808f44c273b8992c36 /src/tools/unicode-table-generator
parent: 580a6342ef9d435d241b74e86b99dc1131a526f8 (diff)
download: rust-6c7691a37bf485b28fecb6856e6ede8fa952f99e.tar.gz rust-6c7691a37bf485b28fecb6856e6ede8fa952f99e.zip
1 files changed, 16 insertions, 8 deletions
diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs
index 5d4a4c0e044..5f66bcbebaf 100644
--- a/src/tools/unicode-table-generator/src/raw_emitter.rs
+++ b/src/tools/unicode-table-generator/src/raw_emitter.rs
@@ -67,7 +67,7 @@ impl RawEmitter {
             panic!("cannot pack {} into 8 bits", unique_words.len());
         }
         // needed for the chunk mapping to work
-        assert_eq!(unique_words[0], 0, "first word is all zeros");
+        assert_eq!(unique_words[0], 0, "has a zero word");
 
         let word_indices = unique_words
             .iter()
@@ -80,7 +80,7 @@ impl RawEmitter {
         let mut best = None;
         for length in 1..=64 {
             let mut temp = self.clone();
-            temp.emit_chunk_map(&compressed_words, length);
+            temp.emit_chunk_map(word_indices[&0], &compressed_words, length);
             if let Some((_, size)) = best {
                 if temp.bytes_used < size {
                     best = Some((length, temp.bytes_used));
@@ -89,7 +89,7 @@ impl RawEmitter {
                 best = Some((length, temp.bytes_used));
             }
         }
-        self.emit_chunk_map(&compressed_words, best.unwrap().0);
+        self.emit_chunk_map(word_indices[&0], &compressed_words, best.unwrap().0);
 
         writeln!(
             &mut self.file,
@@ -101,12 +101,12 @@ impl RawEmitter {
         self.bytes_used += 8 * unique_words.len();
     }
 
-    fn emit_chunk_map(&mut self, compressed_words: &[u8], chunk_length: usize) {
+    fn emit_chunk_map(&mut self, zero_at: u8, compressed_words: &[u8], chunk_length: usize) {
         let mut compressed_words = compressed_words.to_vec();
         for _ in 0..(chunk_length - (compressed_words.len() % chunk_length)) {
             // pad out bitset index with zero words so we have all chunks of
             // chunkchunk_length
-            compressed_words.push(0);
+            compressed_words.push(zero_at);
         }
 
         let mut chunks = BTreeSet::new();
@@ -123,6 +123,14 @@ impl RawEmitter {
         for chunk in compressed_words.chunks(chunk_length) {
             chunk_indices.push(chunk_map[chunk]);
         }
+
+        // If one of the chunks has all of the entries point to the bitset
+        // word filled with zeros, then pop those off the end -- we know they
+        // are useless.
+        let zero_chunk_idx = chunks.iter().position(|chunk| chunk.iter().all(|e| *e == zero_at));
+        while zero_chunk_idx.is_some() && chunk_indices.last().cloned() == zero_chunk_idx {
+            chunk_indices.pop();
+        }
         writeln!(
             &mut self.file,
             "static BITSET_LAST_CHUNK_MAP: (u16, u8) = ({}, {});",
@@ -131,9 +139,9 @@ impl RawEmitter {
         )
         .unwrap();
         self.bytes_used += 3;
-        // Strip out the empty pieces, presuming our above pop() made us now
-        // have some trailing zeros.
-        while let Some(0) = chunk_indices.last() {
+        // Try to pop again, now that we've recorded a non-zero pointing index
+        // into the LAST_CHUNK_MAP.
+        while zero_chunk_idx.is_some() && chunk_indices.last().cloned() == zero_chunk_idx {
             chunk_indices.pop();
         }
         writeln!(
author	Mark Rousskov <mark.simulacrum@gmail.com>	2020-03-20 18:38:08 -0400
committer	Mark Rousskov <mark.simulacrum@gmail.com>	2020-03-20 18:38:08 -0400
commit	6c7691a37bf485b28fecb6856e6ede8fa952f99e (patch)
tree	cd2e5a10ff2983623e49ca808f44c273b8992c36 /src/tools/unicode-table-generator
parent	580a6342ef9d435d241b74e86b99dc1131a526f8 (diff)
download	rust-6c7691a37bf485b28fecb6856e6ede8fa952f99e.tar.gz rust-6c7691a37bf485b28fecb6856e6ede8fa952f99e.zip