about summary refs log tree commit diff
path: root/src/etc
diff options
context:
space:
mode:
author bors <bors@rust-lang.org> 2014-08-04 17:06:19 +0000
committer bors <bors@rust-lang.org> 2014-08-04 17:06:19 +0000
commit efe1f7ee9efb5da5613f2cff4f9b810d2d5992d4 (patch)
tree 67297aa25589ccc189255bc45ed3ec371b7c2b36 /src/etc
parent 31590bd34900403a18079bf4623cd35f9da0c100 (diff)
parent 7ece0abe64bf7c5bdd03e4cbecdb914f470eb846 (diff)
download rust-efe1f7ee9efb5da5613f2cff4f9b810d2d5992d4.tar.gz
rust-efe1f7ee9efb5da5613f2cff4f9b810d2d5992d4.zip
auto merge of #15986 : Florob/rust/nfKc-new, r=alexcrichton
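This adds a new `Recompositions` iterator, which performs canonical composition on the output of the `Decompositions` iterator (which itself performs either canonical or compatibility decomposition). In effect, this implements Unicode normalization forms C (canonical decomposition followed by canonical composition) and KC (compatibility decomposition followed by canonical composition).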
Diffstat (limited to 'src/etc')
-rwxr-xr-x src/etc/unicode.py 35
1 files changed, 33 insertions, 2 deletions
diff --git a/src/etc/unicode.py b/src/etc/unicode.py
index f1761c5719a..5424cd3b3ab 100755
--- a/src/etc/unicode.py
+++ b/src/etc/unicode.py
@@ -464,13 +464,26 @@ def emit_charwidth_module(f, width_table):
             pfun=lambda x: "(%s,%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2], x[3]))
     f.write("}\n\n")
 
-def emit_norm_module(f, canon, compat, combine):
+def emit_norm_module(f, canon, compat, combine, norm_props):
     canon_keys = canon.keys()
     canon_keys.sort()
 
     compat_keys = compat.keys()
     compat_keys.sort()
 
+    canon_comp = {}
+    comp_exclusions = norm_props["Full_Composition_Exclusion"]
+    for char in canon_keys:
+        if True in map(lambda (lo, hi): lo <= char <= hi, comp_exclusions):
+            continue
+        decomp = canon[char]
+        if len(decomp) == 2:
+            if not canon_comp.has_key(decomp[0]):
+                canon_comp[decomp[0]] = []
+            canon_comp[decomp[0]].append( (decomp[1], char) )
+    canon_comp_keys = canon_comp.keys()
+    canon_comp_keys.sort()
+
     f.write("pub mod normalization {\n")
 
     def mkdata_fun(table):
@@ -494,6 +507,22 @@ def emit_norm_module(f, canon, compat, combine):
     emit_table(f, "compatibility_table", compat_keys, "&'static [(char, &'static [char])]",
         pfun=mkdata_fun(compat))
 
+    def comp_pfun(char):
+        data = "(%s,&[" % escape_char(char)
+        canon_comp[char].sort(lambda x, y: x[0] - y[0])
+        first = True
+        for pair in canon_comp[char]:
+            if not first:
+                data += ","
+            first = False
+            data += "(%s,%s)" % (escape_char(pair[0]), escape_char(pair[1]))
+        data += "])"
+        return data
+
+    f.write("    // Canonical compositions\n")
+    emit_table(f, "composition_table", canon_comp_keys,
+        "&'static [(char, &'static [(char, char)])]", pfun=comp_pfun)
+
     f.write("""
     fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 {
         use core::option::{Some, None};
@@ -579,6 +608,8 @@ if __name__ == "__main__":
         scripts = load_properties("Scripts.txt", [])
         props = load_properties("PropList.txt",
                 ["White_Space", "Join_Control", "Noncharacter_Code_Point"])
+        norm_props = load_properties("DerivedNormalizationProps.txt",
+                     ["Full_Composition_Exclusion"])
 
         # grapheme cluster category from DerivedCoreProperties
         # the rest are defined below
@@ -612,7 +643,7 @@ if __name__ == "__main__":
         emit_regex_module(rf, allcats, perl_words)
 
         # normalizations and conversions module
-        emit_norm_module(rf, canon_decomp, compat_decomp, combines)
+        emit_norm_module(rf, canon_decomp, compat_decomp, combines, norm_props)
         emit_conversions_module(rf, lowerupper, upperlower)
 
         ### character width module