diff options
| author | Manish Goregaokar <manishsmail@gmail.com> | 2015-03-04 15:46:17 +0530 |
|---|---|---|
| committer | Manish Goregaokar <manishsmail@gmail.com> | 2015-03-05 12:37:48 +0530 |
| commit | bb459bf95b67c482a9ddcba4bca5f8b433444e60 (patch) | |
| tree | a3d6e7959ff8d8997fa63f34cab8f01213ae1208 /src/etc | |
| parent | 478c396b7a9e2d12ad1d15d13126ecf52f333086 (diff) | |
| parent | c9e2de42b590c6d294afd1db44334c5168a694bb (diff) | |
| download | rust-bb459bf95b67c482a9ddcba4bca5f8b433444e60.tar.gz rust-bb459bf95b67c482a9ddcba4bca5f8b433444e60.zip | |
Rollup merge of #23000 - Florob:unicode-FL, r=brson
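This handles the code point ranges contained in UnicodeData.txt (an entry whose name ends in `, First>` opens a range that is closed by the next entry). Counterintuitively, this actually makes the tables shorter.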
Diffstat (limited to 'src/etc')
| -rwxr-xr-x | src/etc/unicode.py | 33 |
1 file changed, 21 insertions, 12 deletions
```diff
diff --git a/src/etc/unicode.py b/src/etc/unicode.py
index 5472ba3c7ed..312076b1b13 100755
--- a/src/etc/unicode.py
+++ b/src/etc/unicode.py
@@ -84,8 +84,8 @@ def fetch(f):
         sys.stderr.write("cannot load %s" % f)
         exit(1)
 
-def is_valid_unicode(n):
-    return 0 <= n <= 0xD7FF or 0xE000 <= n <= 0x10FFFF
+def is_surrogate(n):
+    return 0xD800 <= n <= 0xDFFF
 
 def load_unicode_data(f):
     fetch(f)
@@ -96,19 +96,28 @@ def load_unicode_data(f):
     canon_decomp = {}
     compat_decomp = {}
 
+    udict = {};
+    range_start = -1;
     for line in fileinput.input(f):
-        fields = line.split(";")
-        if len(fields) != 15:
+        data = line.split(';');
+        if len(data) != 15:
             continue
-        [code, name, gencat, combine, bidi,
-         decomp, deci, digit, num, mirror,
-         old, iso, upcase, lowcase, titlecase ] = fields
-
-        code_org = code
-        code = int(code, 16)
-
-        if not is_valid_unicode(code):
+        cp = int(data[0], 16);
+        if is_surrogate(cp):
             continue
+        if range_start >= 0:
+            for i in xrange(range_start, cp):
+                udict[i] = data;
+            range_start = -1;
+        if data[1].endswith(", First>"):
+            range_start = cp;
+            continue;
+        udict[cp] = data;
+
+    for code in udict:
+        [code_org, name, gencat, combine, bidi,
+         decomp, deci, digit, num, mirror,
+         old, iso, upcase, lowcase, titlecase ] = udict[code];
 
         # generate char to char direct common and simple conversions
         # uppercase to lowercase
```
