| | | |
|---|---|---|
| author | Mark Rousskov <mark.simulacrum@gmail.com> | 2020-03-21 10:16:01 -0400 |
| committer | Mark Rousskov <mark.simulacrum@gmail.com> | 2020-03-21 11:22:00 -0400 |
| commit | b0e121d9d588b334eaa1b68a127f5ee0fcda4296 | |
| tree | d3bc693f5f4e5d894ccfcf1173052c381b8ff0f5 | src/tools/unicode-table-generator |
| parent | 6c7691a37bf485b28fecb6856e6ede8fa952f99e | |
Shrink bitset words through functional mapping
Previously, all words in the (deduplicated) bitset would be stored raw -- a full 64 bits (8 bytes). Now, words that are equivalent to others through a specific mapping are stored separately and "mapped" back to the original when loading; this shrinks the table sizes significantly, as each mapped word is stored in 2 bytes (a 4x decrease from the previous 8 bytes).

The new encoding is also potentially non-optimal: the "mapped" byte is frequently repeated, as in practice many mapped words use the same base word.

Currently we only support two forms of mapping: rotation and inversion. Note that these are both guaranteed to map transitively if they map at all, and supporting mappings for which this is not true may require a more interesting algorithm for choosing the optimal pairing.

Updated sizes:

Alphabetic     : 2622 bytes  (- 414 bytes)
Case_Ignorable : 1803 bytes  (- 330 bytes)
Cased          :  808 bytes  (- 126 bytes)
Cc             :   32 bytes
Grapheme_Extend: 1508 bytes  (- 252 bytes)
Lowercase      :  901 bytes  (-  84 bytes)
N              : 1064 bytes  (- 156 bytes)
Uppercase      :  838 bytes  (-  96 bytes)
White_Space    :   91 bytes  (-   6 bytes)

Total table sizes: 9667 bytes (- 1464 bytes)
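The decode path for a mapped entry is small: the mapping byte's high bit says whether to invert the base word, and the low 7 bits give the left-rotation amount. The sketch below mirrors the logic added to `range_search` in the diff; the standalone helper name and signature are illustrative, not part of the generated code.

```rust
/// Minimal sketch: decode one 2-byte (base index, mapping) entry back into a
/// raw 64-bit bitset word, given the table of canonical (raw) words.
fn decode_mapped_word(canonical: &[u64], entry: (u8, u8)) -> u64 {
    let (base_idx, mapping) = entry;
    let mut word = canonical[base_idx as usize];
    // High bit of the mapping byte: invert the base word first.
    if mapping & (1 << 7) != 0 {
        word = !word;
    }
    // Remaining 7 bits: rotate the (possibly inverted) word left.
    let rotate_by = mapping & !(1 << 7);
    word.rotate_left(rotate_by as u32)
}
```

Storing a `(u8, u8)` pair per mapped word is where the 4x saving over a raw `u64` comes from; the canonical words themselves are still emitted as full 8-byte values.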
Diffstat (limited to 'src/tools/unicode-table-generator')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | src/tools/unicode-table-generator/src/main.rs | 28 |
| -rw-r--r-- | src/tools/unicode-table-generator/src/raw_emitter.rs | 240 |
2 files changed, 249 insertions, 19 deletions
diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs
index 39c288dfc61..5e8865fc9e3 100644
--- a/src/tools/unicode-table-generator/src/main.rs
+++ b/src/tools/unicode-table-generator/src/main.rs
@@ -254,12 +254,19 @@ fn generate_tests(data_path: &str, ranges: &[(&str, Vec<Range<u32>>)]) -> String
     s.push_str(
         "
 #[inline(always)]
-fn range_search<const N: usize, const CHUNK_SIZE: usize, const N1: usize, const N2: usize>(
+fn range_search<
+    const N: usize,
+    const CHUNK_SIZE: usize,
+    const N1: usize,
+    const CANONICAL: usize,
+    const CANONICALIZED: usize,
+>(
     needle: u32,
     chunk_idx_map: &[u8; N],
     (last_chunk_idx, last_chunk_mapping): (u16, u8),
     bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1],
-    bitset: &[u64; N2],
+    bitset_canonical: &[u64; CANONICAL],
+    bitset_canonicalized: &[(u8, u8); CANONICALIZED],
 ) -> bool {
     let bucket_idx = (needle / 64) as usize;
     let chunk_map_idx = bucket_idx / CHUNK_SIZE;
@@ -273,8 +280,21 @@ fn range_search<const N: usize, const CHUNK_SIZE: usize, const N1: usize, const
     } else {
         chunk_idx_map[chunk_map_idx]
     };
-    let idx = bitset_chunk_idx[(chunk_idx as usize)][chunk_piece];
-    let word = bitset[(idx as usize)];
+    let idx = bitset_chunk_idx[(chunk_idx as usize)][chunk_piece] as usize;
+    let word = if idx < CANONICAL {
+        bitset_canonical[idx]
+    } else {
+        let (real_idx, mapping) = bitset_canonicalized[idx - CANONICAL];
+        let mut word = bitset_canonical[real_idx as usize];
+        let should_invert = mapping & (1 << 7) != 0;
+        if should_invert {
+            word = !word;
+        }
+        // Unset the inversion bit
+        let rotate_by = mapping & !(1 << 7);
+        word = word.rotate_left(rotate_by as u32);
+        word
+    };
     (word & (1 << (needle % 64) as u64)) != 0
 }
 ",
diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs
index 5f66bcbebaf..38b36c34042 100644
--- a/src/tools/unicode-table-generator/src/raw_emitter.rs
+++ b/src/tools/unicode-table-generator/src/raw_emitter.rs
@@ -22,8 +22,9 @@
 //! mapping into two separate sets; currently this is not dealt with).
 //!
 //! With that scheme, we now have a single byte for every 64 codepoints. We
-//! further group these by 16 (arbitrarily chosen), and again deduplicate and
-//! store in an array (u8 -> [u8; 16]).
+//! further group these by some constant N (between 1 and 64 per group), and
+//! again deduplicate and store in an array (u8 -> [u8; N]). The constant is
+//! chosen to be optimal in bytes-in-memory for the given dataset.
 //!
 //! The indices into this array represent ranges of 64*16 = 1024 codepoints.
 //!
@@ -37,9 +38,9 @@
 //! down considerably.
 
 use crate::fmt_list;
-use std::collections::{BTreeSet, HashMap};
+use std::collections::{BTreeMap, BTreeSet, HashMap};
 use std::convert::TryFrom;
-use std::fmt::Write;
+use std::fmt::{self, Write};
 use std::ops::Range;
 
 #[derive(Clone)]
@@ -61,6 +62,10 @@ impl RawEmitter {
     }
 
     fn emit_bitset(&mut self, words: &[u64]) {
+        let mut words = words.to_vec();
+        // Ensure that there's a zero word in the dataset, used for padding and
+        // such.
+        words.push(0);
         let unique_words =
             words.iter().cloned().collect::<BTreeSet<_>>().into_iter().collect::<Vec<_>>();
         if unique_words.len() > u8::max_value() as usize {
@@ -68,13 +73,9 @@ impl RawEmitter {
         }
         // needed for the chunk mapping to work
         assert_eq!(unique_words[0], 0, "has a zero word");
+        let canonicalized = Canonicalized::canonicalize(&unique_words);
 
-        let word_indices = unique_words
-            .iter()
-            .cloned()
-            .enumerate()
-            .map(|(idx, word)| (word, u8::try_from(idx).unwrap()))
-            .collect::<HashMap<_, _>>();
+        let word_indices = canonicalized.unique_mapping.clone();
 
         let compressed_words = words.iter().map(|w| word_indices[w]).collect::<Vec<u8>>();
         let mut best = None;
@@ -91,14 +92,32 @@ impl RawEmitter {
         }
         self.emit_chunk_map(word_indices[&0], &compressed_words, best.unwrap().0);
 
+        struct Bits(u64);
+        impl fmt::Debug for Bits {
+            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                write!(f, "0b{:064b}", self.0)
+            }
+        }
+
+        writeln!(
+            &mut self.file,
+            "static BITSET_CANONICAL: [u64; {}] = [{}];",
+            canonicalized.canonical_words.len(),
+            fmt_list(canonicalized.canonical_words.iter().map(|v| Bits(*v))),
+        )
+        .unwrap();
+        self.bytes_used += 8 * canonicalized.canonical_words.len();
         writeln!(
             &mut self.file,
-            "static BITSET: [u64; {}] = [{}];",
-            unique_words.len(),
-            fmt_list(&unique_words),
+            "static BITSET_MAPPING: [(u8, u8); {}] = [{}];",
+            canonicalized.canonicalized_words.len(),
+            fmt_list(&canonicalized.canonicalized_words),
         )
         .unwrap();
-        self.bytes_used += 8 * unique_words.len();
+        // 8 bit index into shifted words, 7 bits for shift + optional flip
+        // We only need it for the words that we removed by applying a shift and
+        // flip to them.
+        self.bytes_used += 2 * canonicalized.canonicalized_words.len();
     }
 
     fn emit_chunk_map(&mut self, zero_at: u8, compressed_words: &[u8], chunk_length: usize) {
@@ -170,7 +189,8 @@ impl RawEmitter {
         writeln!(&mut self.file, "        &BITSET_CHUNKS_MAP,").unwrap();
         writeln!(&mut self.file, "        BITSET_LAST_CHUNK_MAP,").unwrap();
         writeln!(&mut self.file, "        &BITSET_INDEX_CHUNKS,").unwrap();
-        writeln!(&mut self.file, "        &BITSET,").unwrap();
+        writeln!(&mut self.file, "        &BITSET_CANONICAL,").unwrap();
+        writeln!(&mut self.file, "        &BITSET_MAPPING,").unwrap();
         writeln!(&mut self.file, "    )").unwrap();
         writeln!(&mut self.file, "}}").unwrap();
     }
@@ -196,3 +216,193 @@ pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
     emitter.blank_line();
     emitter.emit_lookup();
 }
+
+struct Canonicalized {
+    canonical_words: Vec<u64>,
+    canonicalized_words: Vec<(u8, u8)>,
+
+    /// Maps an input unique word to the associated index (u8) which is into
+    /// canonical_words or canonicalized_words (in order).
+    unique_mapping: HashMap<u64, u8>,
+}
+
+impl Canonicalized {
+    fn canonicalize(unique_words: &[u64]) -> Self {
+        #[derive(Copy, Clone, Debug)]
+        enum Mapping {
+            Rotate(u32),
+            Invert,
+            RotateAndInvert(u32),
+        }
+
+        // key is the word being mapped to
+        let mut mappings: BTreeMap<u64, Vec<(u64, Mapping)>> = BTreeMap::new();
+        for &a in unique_words {
+            'b: for &b in unique_words {
+                // skip self
+                if a == b {
+                    continue;
+                }
+
+                // All possible distinct rotations
+                for rotation in 1..64 {
+                    if a.rotate_right(rotation) == b {
+                        mappings.entry(b).or_default().push((a, Mapping::Rotate(rotation)));
+                        // We're not interested in further mappings between a and b
+                        continue 'b;
+                    }
+                }
+
+                if (!a) == b {
+                    mappings.entry(b).or_default().push((a, Mapping::Invert));
+                    // We're not interested in further mappings between a and b
+                    continue 'b;
+                }
+
+                // All possible distinct rotations, inverted
+                for rotation in 1..64 {
+                    if (!a.rotate_right(rotation)) == b {
+                        mappings
+                            .entry(b)
+                            .or_default()
+                            .push((a, Mapping::RotateAndInvert(rotation)));
+                        // We're not interested in further mappings between a and b
+                        continue 'b;
+                    }
+                }
+            }
+        }
+        // These are the bitset words which will be represented "raw" (as a u64)
+        let mut canonical_words = Vec::new();
+        // These are mapped words, which will be represented by an index into
+        // the canonical_words and a Mapping; u16 when encoded.
+        let mut canonicalized_words = Vec::new();
+        let mut unique_mapping = HashMap::new();
+
+        #[derive(Debug, PartialEq, Eq)]
+        enum UniqueMapping {
+            Canonical(usize),
+            Canonicalized(usize),
+        }
+
+        while let Some((&to, _)) = mappings.iter().max_by_key(|m| m.1.len()) {
+            // Get the mapping with the most entries. Currently, no mapping can
+            // only exist transitively (i.e., there is no A, B, C such that A
+            // does not map to C, but A maps to B and B maps to C), so this is
+            // guaranteed to be acceptable.
+            //
+            // In the future, we may need a more sophisticated algorithm to
+            // identify which keys to prefer as canonical.
+            let mapped_from = mappings.remove(&to).unwrap();
+            for (from, how) in &mapped_from {
+                // Remove the entries which mapped to this one.
+                // Noting that it should be associated with the Nth canonical word.
+                //
+                // We do not assert that this is present, because there may be
+                // no mappings to the `from` word; that's fine.
+                mappings.remove(from);
+                assert_eq!(
+                    unique_mapping
+                        .insert(*from, UniqueMapping::Canonicalized(canonicalized_words.len())),
+                    None
+                );
+                canonicalized_words.push((canonical_words.len(), *how));
+
+                // Remove the now-canonicalized word from other mappings,
+                // to ensure that we deprioritize them in the next iteration of
+                // the while loop.
+                for (_, mapped) in &mut mappings {
+                    let mut i = 0;
+                    while i != mapped.len() {
+                        if mapped[i].0 == *from {
+                            mapped.remove(i);
+                        } else {
+                            i += 1;
+                        }
+                    }
+                }
+            }
+            assert!(
+                unique_mapping
+                    .insert(to, UniqueMapping::Canonical(canonical_words.len()))
+                    .is_none()
+            );
+            canonical_words.push(to);
+
+            // Remove the now-canonical word from other mappings, to ensure that
+            // we deprioritize them in the next iteration of the while loop.
+            for (_, mapped) in &mut mappings {
+                let mut i = 0;
+                while i != mapped.len() {
+                    if mapped[i].0 == to {
+                        mapped.remove(i);
+                    } else {
+                        i += 1;
+                    }
+                }
+            }
+        }
+
+        // Any words which we couldn't shrink, just stick into the canonical
+        // words.
+        //
+        // FIXME: work harder -- there are more possibilities for mapping
+        // functions (e.g., multiplication, shifting instead of rotation, etc.)
+        // We'll probably always have some slack though so this loop will still
+        // be needed.
+        for &w in unique_words {
+            if !unique_mapping.contains_key(&w) {
+                assert!(
+                    unique_mapping
+                        .insert(w, UniqueMapping::Canonical(canonical_words.len()))
+                        .is_none()
+                );
+                canonical_words.push(w);
+            }
+        }
+        assert_eq!(canonicalized_words.len() + canonical_words.len(), unique_words.len());
+        assert_eq!(unique_mapping.len(), unique_words.len());
+
+        let unique_mapping = unique_mapping
+            .into_iter()
+            .map(|(key, value)| {
+                (
+                    key,
+                    match value {
+                        UniqueMapping::Canonicalized(idx) => {
+                            u8::try_from(canonical_words.len() + idx).unwrap()
+                        }
+                        UniqueMapping::Canonical(idx) => u8::try_from(idx).unwrap(),
+                    },
+                )
+            })
+            .collect::<HashMap<_, _>>();
+
+        let mut distinct_indices = BTreeSet::new();
+        for &w in unique_words {
+            let idx = unique_mapping.get(&w).unwrap();
+            assert!(distinct_indices.insert(idx));
+        }
+
+        let canonicalized_words = canonicalized_words
+            .into_iter()
+            .map(|v| {
+                (
+                    u8::try_from(v.0).unwrap(),
+                    match v.1 {
+                        Mapping::RotateAndInvert(amount) => {
+                            assert!(amount < (1 << 7));
+                            1 << 7 | (amount as u8)
+                        }
+                        Mapping::Rotate(amount) => {
+                            assert!(amount < (1 << 7));
+                            amount as u8
+                        }
+                        Mapping::Invert => 1 << 7,
+                    },
+                )
+            })
+            .collect::<Vec<(u8, u8)>>();
+        Canonicalized { unique_mapping, canonical_words, canonicalized_words }
+    }
+}
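On the generator side, `Canonicalized::canonicalize` above decides whether a word can be shrunk by searching for a base word it maps onto. A distilled, standalone view of that check is sketched below; the function name and the free-standing `Mapping` enum are illustrative only, and the real code instead builds a `BTreeMap` of all such pairs and then greedily picks canonical words.

```rust
/// The mapping forms the generator currently recognises: the stored word is
/// recovered from the base word by an optional inversion followed by a left
/// rotation (see the decode path in `range_search`).
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum Mapping {
    Rotate(u32),
    Invert,
    RotateAndInvert(u32),
}

/// Returns how `a` could be stored as a 2-byte mapping onto base word `b`, if at all.
fn find_mapping(a: u64, b: u64) -> Option<Mapping> {
    if a == b {
        return None; // identical words are simply deduplicated, not mapped
    }
    // b == a rotated right by r  <=>  a is recovered by rotating b left by r.
    for r in 1..64 {
        if a.rotate_right(r) == b {
            return Some(Mapping::Rotate(r));
        }
    }
    if (!a) == b {
        return Some(Mapping::Invert);
    }
    for r in 1..64 {
        if (!a.rotate_right(r)) == b {
            return Some(Mapping::RotateAndInvert(r));
        }
    }
    None
}

fn main() {
    let w = 0x0000_0000_ffff_00ffu64;
    // A word and its right-rotation by 3 are related by Rotate(3).
    assert_eq!(find_mapping(w, w.rotate_right(3)), Some(Mapping::Rotate(3)));
    // A word and its bitwise complement are related by Invert.
    assert_eq!(find_mapping(w, !w), Some(Mapping::Invert));
}
```

Because rotations and inversions compose into other rotations and inversions, any chain of such mappings collapses into a single one, which is why the greedy "pick the most-mapped-onto word" loop in the diff is sufficient for now.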
