diff options
Diffstat (limited to 'src/etc/unicode.py')
| -rwxr-xr-x | src/etc/unicode.py | 39 |
1 files changed, 33 insertions, 6 deletions
diff --git a/src/etc/unicode.py b/src/etc/unicode.py index 2b0f8d971f5..a99770f2261 100755 --- a/src/etc/unicode.py +++ b/src/etc/unicode.py @@ -25,6 +25,9 @@ import fileinput, re, os, sys, operator +bytes_old = 0 +bytes_new = 0 + preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at // http://rust-lang.org/COPYRIGHT. @@ -309,16 +312,36 @@ def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True, def emit_trie_lookup_range_table(f): f.write(""" + +// BoolTrie is a trie for representing a set of Unicode codepoints. It is +// implemented with postfix compression (sharing of identical child nodes), +// which gives both compact size and fast lookup. +// +// The space of Unicode codepoints is divided into 3 subareas, each +// represented by a trie with different depth. In the first (0..0x800), there +// is no trie structure at all; each u64 entry corresponds to a bitvector +// effectively holding 64 bool values. +// +// In the second (0x800..0x10000), each child of the root node represents a +// 64-wide subrange, but instead of storing the full 64-bit value of the leaf, +// the trie stores an 8-bit index into a shared table of leaf values. This +// exploits the fact that in reasonable sets, many such leaves can be shared. +// +// In the third (0x10000..0x110000), each child of the root node represents a +// 4096-wide subrange, and the trie stores an 8-bit index into a 64-byte slice +// of a child tree. Each of these 64 bytes represents an index into the table +// of shared 64-bit leaf values. This exploits the sparse structure in the +// non-BMP range of most Unicode sets. pub struct BoolTrie { // 0..0x800 (corresponding to 1 and 2 byte utf-8 sequences) r1: [u64; 32], // leaves // 0x800..0x10000 (corresponding to 3 byte utf-8 sequences) - r2: [u8; 1024], // first level + r2: [u8; 992], // first level r3: &'static [u64], // leaves // 0x10000..0x110000 (corresponding to 4 byte utf-8 sequences) - r4: [u8; 272], // first level + r4: [u8; 256], // first level r5: &'static [u8], // second level r6: &'static [u64], // leaves } @@ -332,10 +355,10 @@ fn trie_lookup_range_table(c: char, r: &'static BoolTrie) -> bool { if c < 0x800 { trie_range_leaf(c, r.r1[c >> 6]) } else if c < 0x10000 { - let child = r.r2[c >> 6]; + let child = r.r2[(c >> 6) - 0x20]; trie_range_leaf(c, r.r3[child as usize]) } else { - let child = r.r4[c >> 12]; + let child = r.r4[(c >> 12) - 0x10]; let leaf = r.r5[((child as usize) << 6) + ((c >> 6) & 0x3f)]; trie_range_leaf(c, r.r6[leaf as usize]) } @@ -356,6 +379,8 @@ def compute_trie(rawdata, chunksize): return (root, child_data) def emit_bool_trie(f, name, t_data, is_pub=True): + global bytes_old, bytes_new + bytes_old += 8 * len(t_data) CHUNK = 64 rawdata = [False] * 0x110000; for (lo, hi) in t_data: @@ -383,7 +408,7 @@ def emit_bool_trie(f, name, t_data, is_pub=True): # 0x800..0x10000 trie (r2, r3) = compute_trie(chunks[0x800 / CHUNK : 0x10000 / CHUNK], 64 / CHUNK) f.write(" r2: [\n") - data = ','.join(str(node) for node in [255] * 32 + r2) + data = ','.join(str(node) for node in r2) format_table_content(f, data, 12) f.write("\n ],\n") f.write(" r3: &[\n") @@ -395,7 +420,7 @@ def emit_bool_trie(f, name, t_data, is_pub=True): (mid, r6) = compute_trie(chunks[0x10000 / CHUNK : 0x110000 / CHUNK], 64 / CHUNK) (r4, r5) = compute_trie(mid, 64) f.write(" r4: [\n") - data = ','.join(str(node) for node in [255] * 16 + r4) + data = ','.join(str(node) for node in r4) format_table_content(f, data, 12) f.write("\n ],\n") f.write(" r5: &[\n") @@ -408,6 +433,7 @@ def emit_bool_trie(f, name, t_data, is_pub=True): f.write("\n ],\n") f.write(" };\n\n") + bytes_new += 256 + 992 + 256 + 8 * len(r3) + len(r5) + 8 * len(r6) def emit_property_module(f, mod, tbl, emit): f.write("pub mod %s {\n" % mod) @@ -517,3 +543,4 @@ pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s); # normalizations and conversions module emit_norm_module(rf, canon_decomp, compat_decomp, combines, norm_props) emit_conversions_module(rf, to_upper, to_lower, to_title) + #print 'bytes before = %d, bytes after = %d' % (bytes_old, bytes_new) |
