about summary refs log tree commit diff
path: root/src/etc/unicode.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/etc/unicode.py')
-rwxr-xr-xsrc/etc/unicode.py39
1 files changed, 33 insertions, 6 deletions
diff --git a/src/etc/unicode.py b/src/etc/unicode.py
index 2b0f8d971f5..a99770f2261 100755
--- a/src/etc/unicode.py
+++ b/src/etc/unicode.py
@@ -25,6 +25,9 @@
 
 import fileinput, re, os, sys, operator
 
+bytes_old = 0
+bytes_new = 0
+
 preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
 // file at the top-level directory of this distribution and at
 // http://rust-lang.org/COPYRIGHT.
@@ -309,16 +312,36 @@ def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
 
 def emit_trie_lookup_range_table(f):
     f.write("""
+
+// BoolTrie is a trie for representing a set of Unicode codepoints. It is
+// implemented with postfix compression (sharing of identical child nodes),
+// which gives both compact size and fast lookup.
+//
+// The space of Unicode codepoints is divided into 3 subareas, each
+// represented by a trie with different depth. In the first (0..0x800), there
+// is no trie structure at all; each u64 entry corresponds to a bitvector
+// effectively holding 64 bool values.
+//
+// In the second (0x800..0x10000), each child of the root node represents a
+// 64-wide subrange, but instead of storing the full 64-bit value of the leaf,
+// the trie stores an 8-bit index into a shared table of leaf values. This
+// exploits the fact that in reasonable sets, many such leaves can be shared.
+//
+// In the third (0x10000..0x110000), each child of the root node represents a
+// 4096-wide subrange, and the trie stores an 8-bit index into a 64-byte slice
+// of a child tree. Each of these 64 bytes represents an index into the table
+// of shared 64-bit leaf values. This exploits the sparse structure in the
+// non-BMP range of most Unicode sets.
 pub struct BoolTrie {
     // 0..0x800 (corresponding to 1 and 2 byte utf-8 sequences)
     r1: [u64; 32],   // leaves
 
     // 0x800..0x10000 (corresponding to 3 byte utf-8 sequences)
-    r2: [u8; 1024],      // first level
+    r2: [u8; 992],      // first level
     r3: &'static [u64],  // leaves
 
     // 0x10000..0x110000 (corresponding to 4 byte utf-8 sequences)
-    r4: [u8; 272],       // first level
+    r4: [u8; 256],       // first level
     r5: &'static [u8],   // second level
     r6: &'static [u64],  // leaves
 }
@@ -332,10 +355,10 @@ fn trie_lookup_range_table(c: char, r: &'static BoolTrie) -> bool {
     if c < 0x800 {
         trie_range_leaf(c, r.r1[c >> 6])
     } else if c < 0x10000 {
-        let child = r.r2[c >> 6];
+        let child = r.r2[(c >> 6) - 0x20];
         trie_range_leaf(c, r.r3[child as usize])
     } else {
-        let child = r.r4[c >> 12];
+        let child = r.r4[(c >> 12) - 0x10];
         let leaf = r.r5[((child as usize) << 6) + ((c >> 6) & 0x3f)];
         trie_range_leaf(c, r.r6[leaf as usize])
     }
@@ -356,6 +379,8 @@ def compute_trie(rawdata, chunksize):
     return (root, child_data)
 
 def emit_bool_trie(f, name, t_data, is_pub=True):
+    global bytes_old, bytes_new
+    bytes_old += 8 * len(t_data)
     CHUNK = 64
     rawdata = [False] * 0x110000;
     for (lo, hi) in t_data:
@@ -383,7 +408,7 @@ def emit_bool_trie(f, name, t_data, is_pub=True):
     # 0x800..0x10000 trie
     (r2, r3) = compute_trie(chunks[0x800 / CHUNK : 0x10000 / CHUNK], 64 / CHUNK)
     f.write("        r2: [\n")
-    data = ','.join(str(node) for node in [255] * 32 + r2)
+    data = ','.join(str(node) for node in r2)
     format_table_content(f, data, 12)
     f.write("\n        ],\n")
     f.write("        r3: &[\n")
@@ -395,7 +420,7 @@ def emit_bool_trie(f, name, t_data, is_pub=True):
     (mid, r6) = compute_trie(chunks[0x10000 / CHUNK : 0x110000 / CHUNK], 64 / CHUNK)
     (r4, r5) = compute_trie(mid, 64)
     f.write("        r4: [\n")
-    data = ','.join(str(node) for node in [255] * 16 + r4)
+    data = ','.join(str(node) for node in r4)
     format_table_content(f, data, 12)
     f.write("\n        ],\n")
     f.write("        r5: &[\n")
@@ -408,6 +433,7 @@ def emit_bool_trie(f, name, t_data, is_pub=True):
     f.write("\n        ],\n")
 
     f.write("    };\n\n")
+    bytes_new += 256 + 992 + 256 + 8 * len(r3) + len(r5) + 8 * len(r6)
 
 def emit_property_module(f, mod, tbl, emit):
     f.write("pub mod %s {\n" % mod)
@@ -517,3 +543,4 @@ pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
         # normalizations and conversions module
         emit_norm_module(rf, canon_decomp, compat_decomp, combines, norm_props)
         emit_conversions_module(rf, to_upper, to_lower, to_title)
+    #print 'bytes before = %d, bytes after = %d' % (bytes_old, bytes_new)