diff options
| author | Mark Rousskov <mark.simulacrum@gmail.com> | 2020-03-20 18:38:08 -0400 |
|---|---|---|
| committer | Mark Rousskov <mark.simulacrum@gmail.com> | 2020-03-20 18:38:08 -0400 |
| commit | 6c7691a37bf485b28fecb6856e6ede8fa952f99e (patch) | |
| tree | cd2e5a10ff2983623e49ca808f44c273b8992c36 /src/tools/unicode-table-generator | |
| parent | 580a6342ef9d435d241b74e86b99dc1131a526f8 (diff) | |
| download | rust-6c7691a37bf485b28fecb6856e6ede8fa952f99e.tar.gz rust-6c7691a37bf485b28fecb6856e6ede8fa952f99e.zip | |
Pre-pop zero chunks before mapping LAST_CHUNK_MAP
This avoids wasting a small amount of space for some of the data sets. The chunk resizing is caused by but not directly related to changes in this commit. Alphabetic : 3036 bytes Case_Ignorable : 2133 bytes (- 3 bytes) Cased : 934 bytes Cc : 32 bytes Grapheme_Extend: 1760 bytes (-14 bytes) Lowercase : 985 bytes N : 1220 bytes (- 5 bytes) Uppercase : 934 bytes White_Space : 97 bytes Total table sizes: 11131 bytes (-22 bytes)
Diffstat (limited to 'src/tools/unicode-table-generator')
| -rw-r--r-- | src/tools/unicode-table-generator/src/raw_emitter.rs | 24 |
1 file changed, 16 insertions, 8 deletions
diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs index 5d4a4c0e044..5f66bcbebaf 100644 --- a/src/tools/unicode-table-generator/src/raw_emitter.rs +++ b/src/tools/unicode-table-generator/src/raw_emitter.rs @@ -67,7 +67,7 @@ impl RawEmitter { panic!("cannot pack {} into 8 bits", unique_words.len()); } // needed for the chunk mapping to work - assert_eq!(unique_words[0], 0, "first word is all zeros"); + assert_eq!(unique_words[0], 0, "has a zero word"); let word_indices = unique_words .iter() @@ -80,7 +80,7 @@ impl RawEmitter { let mut best = None; for length in 1..=64 { let mut temp = self.clone(); - temp.emit_chunk_map(&compressed_words, length); + temp.emit_chunk_map(word_indices[&0], &compressed_words, length); if let Some((_, size)) = best { if temp.bytes_used < size { best = Some((length, temp.bytes_used)); @@ -89,7 +89,7 @@ impl RawEmitter { best = Some((length, temp.bytes_used)); } } - self.emit_chunk_map(&compressed_words, best.unwrap().0); + self.emit_chunk_map(word_indices[&0], &compressed_words, best.unwrap().0); writeln!( &mut self.file, @@ -101,12 +101,12 @@ impl RawEmitter { self.bytes_used += 8 * unique_words.len(); } - fn emit_chunk_map(&mut self, compressed_words: &[u8], chunk_length: usize) { + fn emit_chunk_map(&mut self, zero_at: u8, compressed_words: &[u8], chunk_length: usize) { let mut compressed_words = compressed_words.to_vec(); for _ in 0..(chunk_length - (compressed_words.len() % chunk_length)) { // pad out bitset index with zero words so we have all chunks of // chunkchunk_length - compressed_words.push(0); + compressed_words.push(zero_at); } let mut chunks = BTreeSet::new(); @@ -123,6 +123,14 @@ impl RawEmitter { for chunk in compressed_words.chunks(chunk_length) { chunk_indices.push(chunk_map[chunk]); } + + // If one of the chunks has all of the entries point to the bitset + // word filled with zeros, then pop those off the end -- we know they + // are useless. 
+ let zero_chunk_idx = chunks.iter().position(|chunk| chunk.iter().all(|e| *e == zero_at)); + while zero_chunk_idx.is_some() && chunk_indices.last().cloned() == zero_chunk_idx { + chunk_indices.pop(); + } writeln!( &mut self.file, "static BITSET_LAST_CHUNK_MAP: (u16, u8) = ({}, {});", @@ -131,9 +139,9 @@ impl RawEmitter { ) .unwrap(); self.bytes_used += 3; - // Strip out the empty pieces, presuming our above pop() made us now - // have some trailing zeros. - while let Some(0) = chunk_indices.last() { + // Try to pop again, now that we've recorded a non-zero pointing index + // into the LAST_CHUNK_MAP. + while zero_chunk_idx.is_some() && chunk_indices.last().cloned() == zero_chunk_idx { chunk_indices.pop(); } writeln!( |
