Rollup merge of #23000 - Florob:unicode-FL, r=brson

This handles the code point ranges contained in UnicodeData.txt. Counterintuitively, this actually makes the tables shorter.
author: Manish Goregaokar <manishsmail@gmail.com> 2015-03-04 15:46:17 +0530
committer: Manish Goregaokar <manishsmail@gmail.com> 2015-03-05 12:37:48 +0530
commit: bb459bf95b67c482a9ddcba4bca5f8b433444e60 (patch)
tree: a3d6e7959ff8d8997fa63f34cab8f01213ae1208 /src/etc
parent: 478c396b7a9e2d12ad1d15d13126ecf52f333086 (diff)
parent: c9e2de42b590c6d294afd1db44334c5168a694bb (diff)
download: rust-bb459bf95b67c482a9ddcba4bca5f8b433444e60.tar.gz
rust-bb459bf95b67c482a9ddcba4bca5f8b433444e60.zip
1 file changed, 21 insertions, 12 deletions
diff --git a/src/etc/unicode.py b/src/etc/unicode.py
index 5472ba3c7ed..312076b1b13 100755
--- a/src/etc/unicode.py
+++ b/src/etc/unicode.py
@@ -84,8 +84,8 @@ def fetch(f):
         sys.stderr.write("cannot load %s" % f)
         exit(1)
 
-def is_valid_unicode(n):
-    return 0 <= n <= 0xD7FF or 0xE000 <= n <= 0x10FFFF
+def is_surrogate(n):
+    return 0xD800 <= n <= 0xDFFF
 
 def load_unicode_data(f):
     fetch(f)
@@ -96,19 +96,28 @@ def load_unicode_data(f):
     canon_decomp = {}
     compat_decomp = {}
 
+    udict = {};
+    range_start = -1;
     for line in fileinput.input(f):
-        fields = line.split(";")
-        if len(fields) != 15:
+        data = line.split(';');
+        if len(data) != 15:
             continue
-        [code, name, gencat, combine, bidi,
-         decomp, deci, digit, num, mirror,
-         old, iso, upcase, lowcase, titlecase ] = fields
-
-        code_org = code
-        code     = int(code, 16)
-
-        if not is_valid_unicode(code):
+        cp = int(data[0], 16);
+        if is_surrogate(cp):
             continue
+        if range_start >= 0:
+            for i in xrange(range_start, cp):
+                udict[i] = data;
+            range_start = -1;
+        if data[1].endswith(", First>"):
+            range_start = cp;
+            continue;
+        udict[cp] = data;
+
+    for code in udict:
+        [code_org, name, gencat, combine, bidi,
+         decomp, deci, digit, num, mirror,
+         old, iso, upcase, lowcase, titlecase ] = udict[code];
 
         # generate char to char direct common and simple conversions
         # uppercase to lowercase
author	Manish Goregaokar <manishsmail@gmail.com>	2015-03-04 15:46:17 +0530
committer	Manish Goregaokar <manishsmail@gmail.com>	2015-03-05 12:37:48 +0530
commit	bb459bf95b67c482a9ddcba4bca5f8b433444e60 (patch)
tree	a3d6e7959ff8d8997fa63f34cab8f01213ae1208 /src/etc
parent	478c396b7a9e2d12ad1d15d13126ecf52f333086 (diff)
parent	c9e2de42b590c6d294afd1db44334c5168a694bb (diff)
download	rust-bb459bf95b67c482a9ddcba4bca5f8b433444e60.tar.gz rust-bb459bf95b67c482a9ddcba4bca5f8b433444e60.zip