about summary refs log tree commit diff
path: root/src/tools
diff options
context:
space:
mode:
author	bors <bors@rust-lang.org>	2020-03-28 17:15:32 +0000
committer	bors <bors@rust-lang.org>	2020-03-28 17:15:32 +0000
commitc52cee172fcd2e223100d8bdd5e105dc37aaca23 (patch)
tree1dbbe3d2d019f59bf1c2fae6b674096effae9670 /src/tools
parente768d6f0bc6db7a46c9ef08254a944ba100bc5dd (diff)
parente3ccd5ba49e9b3811dda14f66e5ca8417b3aa4fb (diff)
downloadrust-c52cee172fcd2e223100d8bdd5e105dc37aaca23.tar.gz
rust-c52cee172fcd2e223100d8bdd5e105dc37aaca23.zip
Auto merge of #70499 - Dylan-DPC:rollup-f9je1l8, r=Dylan-DPC
Rollup of 5 pull requests

Successful merges:

 - #70418 (Add long error explanation for E0703)
 - #70448 (Create output dir in rustdoc markdown render)
 - #70486 (Shrink Unicode tables (even more))
 - #70493 (Fix rustdoc.css CSS tab-size property)
 - #70495 (Replace last mention of IRC with Discord)

Failed merges:

r? @ghost
Diffstat (limited to 'src/tools')
-rw-r--r--src/tools/unicode-table-generator/src/main.rs186
-rw-r--r--src/tools/unicode-table-generator/src/range_search.rs93
-rw-r--r--src/tools/unicode-table-generator/src/raw_emitter.rs434
-rw-r--r--src/tools/unicode-table-generator/src/skiplist.rs98
-rw-r--r--src/tools/unicode-table-generator/src/unicode_download.rs11
5 files changed, 709 insertions, 113 deletions
diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs
index 839d914baa9..d5562ff91df 100644
--- a/src/tools/unicode-table-generator/src/main.rs
+++ b/src/tools/unicode-table-generator/src/main.rs
@@ -1,9 +1,83 @@
+//! This implements the core logic of the compression scheme used to compactly
+//! encode Unicode properties.
+//!
+//! We have two primary goals with the encoding: we want to be compact, because
+//! these tables often end up in ~every Rust program (especially the
+//! grapheme_extend table, used for str debugging), including those for embedded
+//! targets (where space is important). We also want to be relatively fast,
+//! though this is more of a nice to have rather than a key design constraint.
+//! It is expected that libraries/applications which are performance-sensitive
+//! to Unicode property lookups are extremely rare, and those that care may find
+//! the tradeoff of the raw bitsets worth it. For most applications, a
+//! relatively fast but much smaller (and as such less cache-impacting, etc.)
+//! data set is likely preferable.
+//!
+//! We have two separate encoding schemes: a skiplist-like approach, and a
+//! compressed bitset. The datasets we consider mostly use the skiplist (it's
+//! smaller) but the lowercase and uppercase sets are sufficiently sparse for
+//! the bitset to be worthwhile -- for those sets the bitset is a 2x size win.
+//! Since the bitset is also faster, this seems an obvious choice. (As a
+//! historical note, the bitset was also the prior implementation, so its
+//! relative complexity had already been paid).
+//!
+//! ## The bitset
+//!
+//! The primary idea is that we 'flatten' the Unicode ranges into an enormous
+//! bitset. To represent any arbitrary codepoint in a raw bitset, we would need
+//! over 17 kilobytes of data per character set -- way too much for our
+//! purposes.
+//!
+//! First, the raw bitset (one bit for every valid `char`, from 0 to 0x10FFFF,
+//! not skipping the small 'gap') is associated into words (u64) and
+//! deduplicated. On random data, this would be useless; on our data, this is
+//! incredibly beneficial -- our data sets have (far) less than 256 unique
+//! words.
+//!
+//! This gives us an array that maps `u8 -> word`; the current algorithm does
+//! not handle the case of more than 256 unique words, but we are relatively far
+//! from coming that close.
+//!
+//! With that scheme, we now have a single byte for every 64 codepoints.
+//!
+//! We further chunk these by some constant N (between 1 and 64 per group,
+//! dynamically chosen for smallest size), and again deduplicate and store in an
+//! array (u8 -> [u8; N]).
+//!
+//! The bytes of this array map into the words from the bitset above, but we
+//! apply another trick here: some of these words are similar enough that they
+//! can be represented by some function of another word. The particular
+//! functions chosen are rotation, inversion, and shifting (right).
+//!
+//! ## The skiplist
+//!
+//! The skip list arose out of the desire for an even smaller encoding than the
+//! bitset -- and was the answer to the question "what is the smallest
+//! representation we can imagine?". However, it is not necessarily the
+//! smallest, and if you have a better proposal, please do suggest it!
+//!
+//! This is a relatively straightforward encoding. First, we break up all the
+//! ranges in the input data into offsets from each other, essentially a gap
+//! encoding. In practice, most gaps are small -- less than u8::MAX -- so we
+//! store those directly. We make use of the larger gaps (which are nicely
+//! interspersed already) throughout the dataset to index this data set.
+//!
+//! In particular, each run of small gaps (terminating in a large gap) is
+//! indexed in a separate dataset. That data set stores an index into the
+//! primary offset list and a prefix sum of that offset list. These are packed
+//! into a single u32 (11 bits for the offset, 21 bits for the prefix sum).
+//!
+//! Lookup proceeds via a binary search in the index and then a straightforward
+//! linear scan (adding up the offsets) until we reach the needle, and then the
+//! index of that offset is utilized as the answer to whether we're in the set
+//! or not.
+
 use std::collections::{BTreeMap, HashMap};
 use std::ops::Range;
 use ucd_parse::Codepoints;
 
 mod case_mapping;
 mod raw_emitter;
+mod skiplist;
 mod unicode_download;
 
 use raw_emitter::{emit_codepoints, RawEmitter};
@@ -152,9 +226,17 @@ fn main() {
         std::process::exit(1);
     });
 
+    // Optional test path, which is a Rust source file testing that the unicode
+    // property lookups are correct.
+    let test_path = std::env::args().nth(2);
+
     let unicode_data = load_data();
     let ranges_by_property = &unicode_data.ranges;
 
+    if let Some(path) = test_path {
+        std::fs::write(&path, generate_tests(&write_location, &ranges_by_property)).unwrap();
+    }
+
     let mut total_bytes = 0;
     let mut modules = Vec::new();
     for (property, ranges) in ranges_by_property {
@@ -163,7 +245,16 @@ fn main() {
         emit_codepoints(&mut emitter, &ranges);
 
         modules.push((property.to_lowercase().to_string(), emitter.file));
-        println!("{:15}: {} bytes, {} codepoints", property, emitter.bytes_used, datapoints,);
+        println!(
+            "{:15}: {} bytes, {} codepoints in {} ranges ({} - {}) using {}",
+            property,
+            emitter.bytes_used,
+            datapoints,
+            ranges.len(),
+            ranges.first().unwrap().start,
+            ranges.last().unwrap().end,
+            emitter.desc,
+        );
         total_bytes += emitter.bytes_used;
     }
 
@@ -173,7 +264,10 @@ fn main() {
         "///! This file is generated by src/tools/unicode-table-generator; do not edit manually!\n",
     );
 
-    table_file.push_str("use super::range_search;\n\n");
+    // Include the range search function
+    table_file.push('\n');
+    table_file.push_str(include_str!("range_search.rs"));
+    table_file.push('\n');
 
     table_file.push_str(&version());
 
@@ -236,21 +330,97 @@ fn fmt_list<V: std::fmt::Debug>(values: impl IntoIterator<Item = V>) -> String {
     out
 }
 
+fn generate_tests(data_path: &str, ranges: &[(&str, Vec<Range<u32>>)]) -> String {
+    let mut s = String::new();
+    s.push_str("#![allow(incomplete_features, unused)]\n");
+    s.push_str("#![feature(const_generics)]\n\n");
+    s.push_str("\n#[allow(unused)]\nuse std::hint;\n");
+    s.push_str(&format!("#[path = \"{}\"]\n", data_path));
+    s.push_str("mod unicode_data;\n\n");
+
+    s.push_str("\nfn main() {\n");
+
+    for (property, ranges) in ranges {
+        s.push_str(&format!(r#"    println!("Testing {}");"#, property));
+        s.push('\n');
+        s.push_str(&format!("    {}_true();\n", property.to_lowercase()));
+        s.push_str(&format!("    {}_false();\n", property.to_lowercase()));
+        let mut is_true = Vec::new();
+        let mut is_false = Vec::new();
+        for ch_num in 0..(std::char::MAX as u32) {
+            if std::char::from_u32(ch_num).is_none() {
+                continue;
+            }
+            if ranges.iter().any(|r| r.contains(&ch_num)) {
+                is_true.push(ch_num);
+            } else {
+                is_false.push(ch_num);
+            }
+        }
+
+        s.push_str(&format!("    fn {}_true() {{\n", property.to_lowercase()));
+        generate_asserts(&mut s, property, &is_true, true);
+        s.push_str("    }\n\n");
+        s.push_str(&format!("    fn {}_false() {{\n", property.to_lowercase()));
+        generate_asserts(&mut s, property, &is_false, false);
+        s.push_str("    }\n\n");
+    }
+
+    s.push_str("}");
+    s
+}
+
+fn generate_asserts(s: &mut String, property: &str, points: &[u32], truthy: bool) {
+    for range in ranges_from_set(points) {
+        if range.end == range.start + 1 {
+            s.push_str(&format!(
+                "        assert!({}unicode_data::{}::lookup({:?}), \"{}\");\n",
+                if truthy { "" } else { "!" },
+                property.to_lowercase(),
+                std::char::from_u32(range.start).unwrap(),
+                range.start,
+            ));
+        } else {
+            s.push_str(&format!("        for chn in {:?}u32 {{\n", range));
+            s.push_str(&format!(
+                "            assert!({}unicode_data::{}::lookup(std::char::from_u32(chn).unwrap()), \"{{:?}}\", chn);\n",
+                if truthy { "" } else { "!" },
+                property.to_lowercase(),
+            ));
+            s.push_str("        }\n");
+        }
+    }
+}
+
+fn ranges_from_set(set: &[u32]) -> Vec<Range<u32>> {
+    let mut ranges = set.iter().map(|e| (*e)..(*e + 1)).collect::<Vec<Range<u32>>>();
+    merge_ranges(&mut ranges);
+    ranges
+}
+
 fn merge_ranges(ranges: &mut Vec<Range<u32>>) {
     loop {
         let mut new_ranges = Vec::new();
         let mut idx_iter = 0..(ranges.len() - 1);
+        let mut should_insert_last = true;
         while let Some(idx) = idx_iter.next() {
             let cur = ranges[idx].clone();
             let next = ranges[idx + 1].clone();
             if cur.end == next.start {
-                let _ = idx_iter.next(); // skip next as we're merging it in
+                if idx_iter.next().is_none() {
+                    // We're merging the last element
+                    should_insert_last = false;
+                }
                 new_ranges.push(cur.start..next.end);
             } else {
+                // We're *not* merging the last element
+                should_insert_last = true;
                 new_ranges.push(cur);
             }
         }
-        new_ranges.push(ranges.last().unwrap().clone());
+        if should_insert_last {
+            new_ranges.push(ranges.last().unwrap().clone());
+        }
         if new_ranges.len() == ranges.len() {
             *ranges = new_ranges;
             break;
@@ -258,4 +428,12 @@ fn merge_ranges(ranges: &mut Vec<Range<u32>>) {
             *ranges = new_ranges;
         }
     }
+
+    let mut last_end = None;
+    for range in ranges {
+        if let Some(last) = last_end {
+            assert!(range.start > last, "{:?}", range);
+        }
+        last_end = Some(range.end);
+    }
 }
diff --git a/src/tools/unicode-table-generator/src/range_search.rs b/src/tools/unicode-table-generator/src/range_search.rs
new file mode 100644
index 00000000000..39b47ce703f
--- /dev/null
+++ b/src/tools/unicode-table-generator/src/range_search.rs
@@ -0,0 +1,93 @@
+#[inline(always)]
+fn bitset_search<
+    const N: usize,
+    const CHUNK_SIZE: usize,
+    const N1: usize,
+    const CANONICAL: usize,
+    const CANONICALIZED: usize,
+>(
+    needle: u32,
+    chunk_idx_map: &[u8; N],
+    bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1],
+    bitset_canonical: &[u64; CANONICAL],
+    bitset_canonicalized: &[(u8, u8); CANONICALIZED],
+) -> bool {
+    let bucket_idx = (needle / 64) as usize;
+    let chunk_map_idx = bucket_idx / CHUNK_SIZE;
+    let chunk_piece = bucket_idx % CHUNK_SIZE;
+    let chunk_idx = if let Some(&v) = chunk_idx_map.get(chunk_map_idx) {
+        v
+    } else {
+        return false;
+    };
+    let idx = bitset_chunk_idx[chunk_idx as usize][chunk_piece] as usize;
+    let word = if let Some(word) = bitset_canonical.get(idx) {
+        *word
+    } else {
+        let (real_idx, mapping) = bitset_canonicalized[idx - bitset_canonical.len()];
+        let mut word = bitset_canonical[real_idx as usize];
+        let should_invert = mapping & (1 << 6) != 0;
+        if should_invert {
+            word = !word;
+        }
+        // Lower 6 bits
+        let quantity = mapping & ((1 << 6) - 1);
+        if mapping & (1 << 7) != 0 {
+            // shift
+            word >>= quantity as u64;
+        } else {
+            word = word.rotate_left(quantity as u32);
+        }
+        word
+    };
+    (word & (1 << (needle % 64) as u64)) != 0
+}
+
+fn decode_prefix_sum(short_offset_run_header: u32) -> u32 {
+    short_offset_run_header & ((1 << 21) - 1)
+}
+
+fn decode_length(short_offset_run_header: u32) -> usize {
+    (short_offset_run_header >> 21) as usize
+}
+
+#[inline(always)]
+fn skip_search<const SOR: usize, const OFFSETS: usize>(
+    needle: u32,
+    short_offset_runs: &[u32; SOR],
+    offsets: &[u8; OFFSETS],
+) -> bool {
+    // Note that this *cannot* be past the end of the array, as the last
+    // element is greater than std::char::MAX (the largest possible needle).
+    //
+    // So, we cannot have found it (i.e. Ok(idx) + 1 != length) and the correct
+    // location cannot be past it, so Err(idx) != length either.
+    //
+    // This means that we can avoid bounds checking for the accesses below, too.
+    let last_idx =
+        match short_offset_runs.binary_search_by_key(&(needle << 11), |header| header << 11) {
+            Ok(idx) => idx + 1,
+            Err(idx) => idx,
+        };
+
+    let mut offset_idx = decode_length(short_offset_runs[last_idx]);
+    let length = if let Some(next) = short_offset_runs.get(last_idx + 1) {
+        decode_length(*next) - offset_idx
+    } else {
+        offsets.len() - offset_idx
+    };
+    let prev =
+        last_idx.checked_sub(1).map(|prev| decode_prefix_sum(short_offset_runs[prev])).unwrap_or(0);
+
+    let total = needle - prev;
+    let mut prefix_sum = 0;
+    for _ in 0..(length - 1) {
+        let offset = offsets[offset_idx];
+        prefix_sum += offset as u32;
+        if prefix_sum > total {
+            break;
+        }
+        offset_idx += 1;
+    }
+    offset_idx % 2 == 1
+}
diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs
index 3e60ce13f92..95b63aca154 100644
--- a/src/tools/unicode-table-generator/src/raw_emitter.rs
+++ b/src/tools/unicode-table-generator/src/raw_emitter.rs
@@ -1,55 +1,19 @@
-//! This implements the core logic of the compression scheme used to compactly
-//! encode the Unicode character classes.
-//!
-//! The primary idea is that we 'flatten' the Unicode ranges into an enormous
-//! bitset. To represent any arbitrary codepoint in a raw bitset, we would need
-//! over 17 kilobytes of data per character set -- way too much for our
-//! purposes.
-//!
-//! We have two primary goals with the encoding: we want to be compact, because
-//! these tables often end up in ~every Rust program (especially the
-//! grapheme_extend table, used for str debugging), including those for embedded
-//! targets (where space is important). We also want to be relatively fast,
-//! though this is more of a nice to have rather than a key design constraint.
-//! In practice, due to modern processor design these two are closely related.
-//!
-//! The encoding scheme here compresses the bitset by first deduplicating the
-//! "words" (64 bits on all platforms). In practice very few words are present
-//! in most data sets.
-//!
-//! This gives us an array that maps `u8 -> word` (if we ever went beyond 256
-//! words, we could go to u16 -> word or have some dual compression scheme
-//! mapping into two separate sets; currently this is not dealt with).
-//!
-//! With that scheme, we now have a single byte for every 64 codepoints. We
-//! further group these by 16 (arbitrarily chosen), and again deduplicate and
-//! store in an array (u8 -> [u8; 16]).
-//!
-//! The indices into this array represent ranges of 64*16 = 1024 codepoints.
-//!
-//! This already reduces the top-level array to at most 1,086 bytes, but in
-//! practice we usually can encode in far fewer (the first couple Unicode planes
-//! are dense).
-//!
-//! The last byte of this top-level array is pulled out to a separate static
-//! and trailing zeros are dropped; this is simply because grapheme_extend and
-//! case_ignorable have a single entry in the 896th entry, so this shrinks them
-//! down considerably.
-
 use crate::fmt_list;
-use std::collections::{BTreeSet, HashMap};
+use std::collections::{BTreeMap, BTreeSet, HashMap};
 use std::convert::TryFrom;
-use std::fmt::Write;
+use std::fmt::{self, Write};
 use std::ops::Range;
 
+#[derive(Clone)]
 pub struct RawEmitter {
     pub file: String,
+    pub desc: String,
     pub bytes_used: usize,
 }
 
 impl RawEmitter {
     pub fn new() -> RawEmitter {
-        RawEmitter { file: String::new(), bytes_used: 0 }
+        RawEmitter { file: String::new(), bytes_used: 0, desc: String::new() }
     }
 
     fn blank_line(&mut self) {
@@ -59,30 +23,100 @@ impl RawEmitter {
         writeln!(&mut self.file, "").unwrap();
     }
 
-    fn emit_bitset(&mut self, words: &[u64]) {
+    fn emit_bitset(&mut self, ranges: &[Range<u32>]) {
+        let last_code_point = ranges.last().unwrap().end;
+        // bitset for every bit in the codepoint range
+        //
+        // + 2 to ensure an all zero word to use for padding
+        let mut buckets = vec![0u64; (last_code_point as usize / 64) + 2];
+        for range in ranges {
+            for codepoint in range.clone() {
+                let bucket = codepoint as usize / 64;
+                let bit = codepoint as u64 % 64;
+                buckets[bucket] |= 1 << bit;
+            }
+        }
+
+        let mut words = buckets;
+        // Ensure that there's a zero word in the dataset, used for padding and
+        // such.
+        words.push(0);
         let unique_words =
             words.iter().cloned().collect::<BTreeSet<_>>().into_iter().collect::<Vec<_>>();
         if unique_words.len() > u8::max_value() as usize {
             panic!("cannot pack {} into 8 bits", unique_words.len());
         }
+        // needed for the chunk mapping to work
+        assert_eq!(unique_words[0], 0, "has a zero word");
+        let canonicalized = Canonicalized::canonicalize(&unique_words);
 
-        let word_indices = unique_words
-            .iter()
-            .cloned()
-            .enumerate()
-            .map(|(idx, word)| (word, u8::try_from(idx).unwrap()))
-            .collect::<HashMap<_, _>>();
+        let word_indices = canonicalized.unique_mapping.clone();
+        let compressed_words = words.iter().map(|w| word_indices[w]).collect::<Vec<u8>>();
+
+        let mut best = None;
+        for length in 1..=64 {
+            let mut temp = self.clone();
+            temp.emit_chunk_map(word_indices[&0], &compressed_words, length);
+            if let Some((_, size)) = best {
+                if temp.bytes_used < size {
+                    best = Some((length, temp.bytes_used));
+                }
+            } else {
+                best = Some((length, temp.bytes_used));
+            }
+        }
+        self.emit_chunk_map(word_indices[&0], &compressed_words, best.unwrap().0);
+
+        struct Bits(u64);
+        impl fmt::Debug for Bits {
+            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                write!(f, "0b{:064b}", self.0)
+            }
+        }
+
+        writeln!(
+            &mut self.file,
+            "static BITSET_CANONICAL: [u64; {}] = [{}];",
+            canonicalized.canonical_words.len(),
+            fmt_list(canonicalized.canonical_words.iter().map(|v| Bits(*v))),
+        )
+        .unwrap();
+        self.bytes_used += 8 * canonicalized.canonical_words.len();
+        writeln!(
+            &mut self.file,
+            "static BITSET_MAPPING: [(u8, u8); {}] = [{}];",
+            canonicalized.canonicalized_words.len(),
+            fmt_list(&canonicalized.canonicalized_words),
+        )
+        .unwrap();
+        // 8 bit index into shifted words, 7 bits for shift + optional flip
+        // We only need it for the words that we removed by applying a shift and
+        // flip to them.
+        self.bytes_used += 2 * canonicalized.canonicalized_words.len();
+
+        self.blank_line();
 
-        let mut idx = words.iter().map(|w| word_indices[w]).collect::<Vec<u8>>();
-        let chunk_length = 16;
-        for _ in 0..(chunk_length - (idx.len() % chunk_length)) {
-            assert_eq!(unique_words[0], 0, "first word is all zeros");
-            // pad out bitset index with zero words so we have all chunks of 16
-            idx.push(0);
+        writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
+        writeln!(&mut self.file, "    super::bitset_search(",).unwrap();
+        writeln!(&mut self.file, "        c as u32,").unwrap();
+        writeln!(&mut self.file, "        &BITSET_CHUNKS_MAP,").unwrap();
+        writeln!(&mut self.file, "        &BITSET_INDEX_CHUNKS,").unwrap();
+        writeln!(&mut self.file, "        &BITSET_CANONICAL,").unwrap();
+        writeln!(&mut self.file, "        &BITSET_MAPPING,").unwrap();
+        writeln!(&mut self.file, "    )").unwrap();
+        writeln!(&mut self.file, "}}").unwrap();
+    }
+
+    fn emit_chunk_map(&mut self, zero_at: u8, compressed_words: &[u8], chunk_length: usize) {
+        let mut compressed_words = compressed_words.to_vec();
+        for _ in 0..(chunk_length - (compressed_words.len() % chunk_length)) {
+            // pad out bitset index with zero words so we have all chunks of
+            // chunk_length
+            compressed_words.push(zero_at);
         }
 
         let mut chunks = BTreeSet::new();
-        for chunk in idx.chunks(chunk_length) {
+        for chunk in compressed_words.chunks(chunk_length) {
             chunks.insert(chunk);
         }
         let chunk_map = chunks
@@ -92,23 +126,10 @@ impl RawEmitter {
             .map(|(idx, chunk)| (chunk, idx))
             .collect::<HashMap<_, _>>();
         let mut chunk_indices = Vec::new();
-        for chunk in idx.chunks(chunk_length) {
+        for chunk in compressed_words.chunks(chunk_length) {
             chunk_indices.push(chunk_map[chunk]);
         }
-        writeln!(
-            &mut self.file,
-            "static BITSET_LAST_CHUNK_MAP: (u16, u8) = ({}, {});",
-            chunk_indices.len() - 1,
-            chunk_indices.pop().unwrap(),
-        )
-        .unwrap();
-        self.bytes_used += 3;
-        // Strip out the empty pieces, presuming our above pop() made us now
-        // have some trailing zeros.
-        assert_eq!(unique_words[0], 0, "first word is all zeros");
-        while let Some(0) = chunk_indices.last() {
-            chunk_indices.pop();
-        }
+
         writeln!(
             &mut self.file,
             "static BITSET_CHUNKS_MAP: [u8; {}] = [{}];",
@@ -119,52 +140,253 @@ impl RawEmitter {
         self.bytes_used += chunk_indices.len();
         writeln!(
             &mut self.file,
-            "static BITSET_INDEX_CHUNKS: [[u8; 16]; {}] = [{}];",
+            "static BITSET_INDEX_CHUNKS: [[u8; {}]; {}] = [{}];",
+            chunk_length,
             chunks.len(),
             fmt_list(chunks.iter()),
         )
         .unwrap();
-        self.bytes_used += 16 * chunks.len();
-        writeln!(
-            &mut self.file,
-            "static BITSET: [u64; {}] = [{}];",
-            unique_words.len(),
-            fmt_list(&unique_words),
-        )
-        .unwrap();
-        self.bytes_used += 8 * unique_words.len();
-    }
-
-    pub fn emit_lookup(&mut self) {
-        writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
-        writeln!(&mut self.file, "    super::range_search(",).unwrap();
-        writeln!(&mut self.file, "        c as u32,").unwrap();
-        writeln!(&mut self.file, "        &BITSET_CHUNKS_MAP,").unwrap();
-        writeln!(&mut self.file, "        BITSET_LAST_CHUNK_MAP,").unwrap();
-        writeln!(&mut self.file, "        &BITSET_INDEX_CHUNKS,").unwrap();
-        writeln!(&mut self.file, "        &BITSET,").unwrap();
-        writeln!(&mut self.file, "    )").unwrap();
-        writeln!(&mut self.file, "}}").unwrap();
+        self.bytes_used += chunk_length * chunks.len();
     }
 }
 
 pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
     emitter.blank_line();
 
-    let last_code_point = ranges.last().unwrap().end;
-    // bitset for every bit in the codepoint range
-    //
-    // + 2 to ensure an all zero word to use for padding
-    let mut buckets = vec![0u64; (last_code_point as usize / 64) + 2];
-    for range in ranges {
-        for codepoint in range.clone() {
-            let bucket = codepoint as usize / 64;
-            let bit = codepoint as u64 % 64;
-            buckets[bucket] |= 1 << bit;
-        }
+    let mut bitset = emitter.clone();
+    bitset.emit_bitset(&ranges);
+
+    let mut skiplist = emitter.clone();
+    skiplist.emit_skiplist(&ranges);
+
+    if bitset.bytes_used <= skiplist.bytes_used {
+        *emitter = bitset;
+        emitter.desc = format!("bitset");
+    } else {
+        *emitter = skiplist;
+        emitter.desc = format!("skiplist");
     }
+}
 
-    emitter.emit_bitset(&buckets);
-    emitter.blank_line();
-    emitter.emit_lookup();
+struct Canonicalized {
+    canonical_words: Vec<u64>,
+    canonicalized_words: Vec<(u8, u8)>,
+
+    /// Maps an input unique word to the associated index (u8) which is into
+    /// canonical_words or canonicalized_words (in order).
+    unique_mapping: HashMap<u64, u8>,
+}
+
+impl Canonicalized {
+    fn canonicalize(unique_words: &[u64]) -> Self {
+        #[derive(Copy, Clone, Debug)]
+        enum Mapping {
+            Rotate(u32),
+            Invert,
+            RotateAndInvert(u32),
+            ShiftRight(u32),
+        }
+
+        // key is the word being mapped to
+        let mut mappings: BTreeMap<u64, Vec<(u64, Mapping)>> = BTreeMap::new();
+        for &a in unique_words {
+            'b: for &b in unique_words {
+                // skip self
+                if a == b {
+                    continue;
+                }
+
+                // All possible distinct rotations
+                for rotation in 1..64 {
+                    if a.rotate_right(rotation) == b {
+                        mappings.entry(b).or_default().push((a, Mapping::Rotate(rotation)));
+                        // We're not interested in further mappings between a and b
+                        continue 'b;
+                    }
+                }
+
+                if (!a) == b {
+                    mappings.entry(b).or_default().push((a, Mapping::Invert));
+                    // We're not interested in further mappings between a and b
+                    continue 'b;
+                }
+
+                // All possible distinct rotations, inverted
+                for rotation in 1..64 {
+                    if (!a.rotate_right(rotation)) == b {
+                        mappings
+                            .entry(b)
+                            .or_default()
+                            .push((a, Mapping::RotateAndInvert(rotation)));
+                        // We're not interested in further mappings between a and b
+                        continue 'b;
+                    }
+                }
+
+                // All possible shifts
+                for shift_by in 1..64 {
+                    if a == (b >> shift_by) {
+                        mappings
+                            .entry(b)
+                            .or_default()
+                            .push((a, Mapping::ShiftRight(shift_by as u32)));
+                        // We're not interested in further mappings between a and b
+                        continue 'b;
+                    }
+                }
+            }
+        }
+        // These are the bitset words which will be represented "raw" (as a u64)
+        let mut canonical_words = Vec::new();
+        // These are mapped words, which will be represented by an index into
+        // the canonical_words and a Mapping; u16 when encoded.
+        let mut canonicalized_words = Vec::new();
+        let mut unique_mapping = HashMap::new();
+
+        #[derive(Debug, PartialEq, Eq)]
+        enum UniqueMapping {
+            Canonical(usize),
+            Canonicalized(usize),
+        }
+
+        // Map 0 first, so that it is the first canonical word.
+        // This is realistically not inefficient because 0 is not mapped to by
+        // anything else (a shift pattern could do it, but would be wasteful).
+        //
+        // However, 0s are quite common in the overall dataset, and it is quite
+        // wasteful to have to go through a mapping function to determine that
+        // we have a zero.
+        //
+        // FIXME: Experiment with choosing most common words in overall data set
+        // for canonical when possible.
+        while let Some((&to, _)) = mappings
+            .iter()
+            .find(|(&to, _)| to == 0)
+            .or_else(|| mappings.iter().max_by_key(|m| m.1.len()))
+        {
+            // Get the mapping with the most entries. Currently, no mapping can
+            // only exist transitively (i.e., there is no A, B, C such that A
+            // does not map to C and but A maps to B maps to C), so this is
+            // guaranteed to be acceptable.
+            //
+            // In the future, we may need a more sophisticated algorithm to
+            // identify which keys to prefer as canonical.
+            let mapped_from = mappings.remove(&to).unwrap();
+            for (from, how) in &mapped_from {
+                // Remove the entries which mapped to this one.
+                // Noting that it should be associated with the Nth canonical word.
+                //
+                // We do not assert that this is present, because there may be
+                // no mappings to the `from` word; that's fine.
+                mappings.remove(from);
+                assert_eq!(
+                    unique_mapping
+                        .insert(*from, UniqueMapping::Canonicalized(canonicalized_words.len())),
+                    None
+                );
+                canonicalized_words.push((canonical_words.len(), *how));
+
+                // Remove the now-canonicalized word from other mappings,
+                // to ensure that we deprioritize them in the next iteration of
+                // the while loop.
+                for (_, mapped) in &mut mappings {
+                    let mut i = 0;
+                    while i != mapped.len() {
+                        if mapped[i].0 == *from {
+                            mapped.remove(i);
+                        } else {
+                            i += 1;
+                        }
+                    }
+                }
+            }
+            assert!(
+                unique_mapping
+                    .insert(to, UniqueMapping::Canonical(canonical_words.len()))
+                    .is_none()
+            );
+            canonical_words.push(to);
+
+            // Remove the now-canonical word from other mappings, to ensure that
+            // we deprioritize them in the next iteration of the while loop.
+            for (_, mapped) in &mut mappings {
+                let mut i = 0;
+                while i != mapped.len() {
+                    if mapped[i].0 == to {
+                        mapped.remove(i);
+                    } else {
+                        i += 1;
+                    }
+                }
+            }
+        }
+
+        // Any words which we couldn't shrink, just stick into the canonical
+        // words.
+        //
+        // FIXME: work harder -- there are more possibilities for mapping
+        // functions (e.g., multiplication, shifting instead of rotation, etc.)
+        // We'll probably always have some slack though so this loop will still
+        // be needed.
+        for &w in unique_words {
+            if !unique_mapping.contains_key(&w) {
+                assert!(
+                    unique_mapping
+                        .insert(w, UniqueMapping::Canonical(canonical_words.len()))
+                        .is_none()
+                );
+                canonical_words.push(w);
+            }
+        }
+        assert_eq!(canonicalized_words.len() + canonical_words.len(), unique_words.len());
+        assert_eq!(unique_mapping.len(), unique_words.len());
+
+        let unique_mapping = unique_mapping
+            .into_iter()
+            .map(|(key, value)| {
+                (
+                    key,
+                    match value {
+                        UniqueMapping::Canonicalized(idx) => {
+                            u8::try_from(canonical_words.len() + idx).unwrap()
+                        }
+                        UniqueMapping::Canonical(idx) => u8::try_from(idx).unwrap(),
+                    },
+                )
+            })
+            .collect::<HashMap<_, _>>();
+
+        let mut distinct_indices = BTreeSet::new();
+        for &w in unique_words {
+            let idx = unique_mapping.get(&w).unwrap();
+            assert!(distinct_indices.insert(idx));
+        }
+
+        const LOWER_6: u32 = (1 << 6) - 1;
+
+        let canonicalized_words = canonicalized_words
+            .into_iter()
+            .map(|v| {
+                (
+                    u8::try_from(v.0).unwrap(),
+                    match v.1 {
+                        Mapping::RotateAndInvert(amount) => {
+                            assert_eq!(amount, amount & LOWER_6);
+                            1 << 6 | (amount as u8)
+                        }
+                        Mapping::Rotate(amount) => {
+                            assert_eq!(amount, amount & LOWER_6);
+                            amount as u8
+                        }
+                        Mapping::Invert => 1 << 6,
+                        Mapping::ShiftRight(shift_by) => {
+                            assert_eq!(shift_by, shift_by & LOWER_6);
+                            1 << 7 | (shift_by as u8)
+                        }
+                    },
+                )
+            })
+            .collect::<Vec<(u8, u8)>>();
+        Canonicalized { unique_mapping, canonical_words, canonicalized_words }
+    }
 }
diff --git a/src/tools/unicode-table-generator/src/skiplist.rs b/src/tools/unicode-table-generator/src/skiplist.rs
new file mode 100644
index 00000000000..6e439968c3b
--- /dev/null
+++ b/src/tools/unicode-table-generator/src/skiplist.rs
@@ -0,0 +1,98 @@
+use crate::fmt_list;
+use crate::raw_emitter::RawEmitter;
+use std::convert::TryInto;
+use std::fmt::Write as _;
+use std::ops::Range;
+
+/// This will get packed into a single u32 (via `pack`) before inserting into the data set.
+#[derive(Debug, PartialEq)]
+struct ShortOffsetRunHeader {
+    /// Running sum of all offsets consumed up to (and including) the offset that starts this run. Note, we only allow for 21 bits here (asserted in `pack`).
+    prefix_sum: u32,
+
+    /// Index into the emitted OFFSETS table where this run begins. Note, we actually only allow for 11 bits here (asserted in `pack`). This should be enough --
+    /// our largest sets are around ~1400 offsets long.
+    start_idx: u16,
+}
+
+impl ShortOffsetRunHeader {
+    fn pack(&self) -> u32 { // Pack into one u32: `start_idx` in the high 11 bits, `prefix_sum` in the low 21.
+        assert!(self.start_idx < (1 << 11)); // must fit in 11 bits (bits 21..=31)
+        assert!(self.prefix_sum < (1 << 21)); // must fit in 21 bits (bits 0..=20)
+
+        (self.start_idx as u32) << 21 | self.prefix_sum
+    }
+}
+
+impl RawEmitter {
+    pub fn emit_skiplist(&mut self, ranges: &[Range<u32>]) { // Emit `ranges` as delta-encoded u8 offsets plus packed run headers, and a `lookup` entry point.
+        let mut offsets = Vec::<u32>::new();
+        let points = ranges.iter().flat_map(|r| vec![r.start, r.end]).collect::<Vec<u32>>(); // range boundaries, flattened in order
+        let mut offset = 0;
+        for pt in points {
+            let delta = pt - offset; // delta-encode: gap since the previous boundary
+            offsets.push(delta);
+            offset = pt;
+        }
+        // Guaranteed to terminate, as it's impossible to subtract a value this
+        // large from a valid char (char::MAX is 0x10FFFF; this sentinel is 0x110000).
+        offsets.push(std::char::MAX as u32 + 1);
+        let mut coded_offsets: Vec<u8> = Vec::new(); // u8-sized deltas: becomes the OFFSETS table
+        let mut short_offset_runs: Vec<ShortOffsetRunHeader> = vec![]; // one header per run of u8-sized deltas
+        let mut iter = offsets.iter().cloned();
+        let mut prefix_sum = 0; // running sum of all offsets consumed so far
+        loop {
+            let mut any_elements = false;
+            let mut inserted = false;
+            let start = coded_offsets.len(); // where this run begins in OFFSETS
+            for offset in iter.by_ref() {
+                any_elements = true;
+                prefix_sum += offset;
+                if let Ok(offset) = offset.try_into() {
+                    coded_offsets.push(offset); // fits in a u8: stays in the current run
+                } else { // too large for a u8: close this run with a header
+                    short_offset_runs.push(ShortOffsetRunHeader {
+                        start_idx: start.try_into().unwrap(),
+                        prefix_sum,
+                    });
+                    // This zero placeholder is just needed to maintain indices even/odd
+                    // correctly (offsets alternate between range-start and range-end deltas).
+                    coded_offsets.push(0);
+                    inserted = true;
+                    break;
+                }
+            }
+            if !any_elements {
+                break; // iterator exhausted: every offset has been emitted
+            }
+            // We always append the huge char::MAX offset to the end which
+            // should never be able to fit into the u8 offsets.
+            assert!(inserted);
+        }
+
+        writeln!(
+            &mut self.file,
+            "static SHORT_OFFSET_RUNS: [u32; {}] = [{}];",
+            short_offset_runs.len(),
+            fmt_list(short_offset_runs.iter().map(|v| v.pack()))
+        )
+        .unwrap();
+        self.bytes_used += 4 * short_offset_runs.len(); // 4 bytes per packed u32 header
+        writeln!(
+            &mut self.file,
+            "static OFFSETS: [u8; {}] = [{}];",
+            coded_offsets.len(),
+            fmt_list(&coded_offsets)
+        )
+        .unwrap();
+        self.bytes_used += coded_offsets.len(); // 1 byte per coded offset
+
+        writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); // emit the public lookup entry point delegating to skip_search
+        writeln!(&mut self.file, "    super::skip_search(",).unwrap();
+        writeln!(&mut self.file, "        c as u32,").unwrap();
+        writeln!(&mut self.file, "        &SHORT_OFFSET_RUNS,").unwrap();
+        writeln!(&mut self.file, "        &OFFSETS,").unwrap();
+        writeln!(&mut self.file, "    )").unwrap();
+        writeln!(&mut self.file, "}}").unwrap();
+    }
+}
diff --git a/src/tools/unicode-table-generator/src/unicode_download.rs b/src/tools/unicode-table-generator/src/unicode_download.rs
index 3f6de9ea3bb..fa57f650ac0 100644
--- a/src/tools/unicode-table-generator/src/unicode_download.rs
+++ b/src/tools/unicode-table-generator/src/unicode_download.rs
@@ -11,10 +11,15 @@ static RESOURCES: &[&str] =
 
 pub fn fetch_latest() {
     let directory = Path::new(UNICODE_DIRECTORY);
+    if directory.exists() {
+        eprintln!(
+            "Not refetching unicode data, already exists, please delete {:?} to regenerate",
+            directory
+        );
+        return;
+    }
     if let Err(e) = std::fs::create_dir_all(directory) {
-        if e.kind() != std::io::ErrorKind::AlreadyExists {
-            panic!("Failed to create {:?}: {}", UNICODE_DIRECTORY, e);
-        }
+        panic!("Failed to create {:?}: {}", UNICODE_DIRECTORY, e);
     }
     let output = Command::new("curl").arg(URL_PREFIX.to_owned() + README).output().unwrap();
     if !output.status.success() {