about summary refs log tree commit diff
path: root/src/libcore/unicode/unicode.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/libcore/unicode/unicode.py')
-rwxr-xr-xsrc/libcore/unicode/unicode.py878
1 files changed, 0 insertions, 878 deletions
diff --git a/src/libcore/unicode/unicode.py b/src/libcore/unicode/unicode.py
deleted file mode 100755
index 97df92a56da..00000000000
--- a/src/libcore/unicode/unicode.py
+++ /dev/null
@@ -1,878 +0,0 @@
-#!/usr/bin/env python
-
-"""
-Regenerate Unicode tables (tables.rs).
-"""
-
-# This script uses the Unicode tables as defined
-# in the UnicodeFiles class.
-
-# Since this should not require frequent updates, we just store this
-# out-of-line and check the tables.rs file into git.
-
-# Note that the "curl" program is required for operation.
-# This script is compatible with Python 2.7 and 3.x.
-
-import argparse
-import datetime
-import fileinput
-import itertools
-import os
-import re
-import textwrap
-import subprocess
-
-from collections import defaultdict, namedtuple
-
-try:
-    # Python 3
-    from itertools import zip_longest
-    from io import StringIO
-except ImportError:
-    # Python 2 compatibility
-    zip_longest = itertools.izip_longest
-    from StringIO import StringIO
-
-try:
-    # Completely optional type hinting
-    # (Python 2 compatible using comments,
-    # see: https://mypy.readthedocs.io/en/latest/python2.html)
-    # This is very helpful in typing-aware IDE like PyCharm.
-    from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Set, Tuple
-except ImportError:
-    pass
-
-
# We don't use enum.Enum because of Python 2.7 compatibility.
class UnicodeFiles(object):
    """
    Names of the Unicode data files this script downloads and parses.

    Each attribute value is a file name exactly as it appears on the
    unicode.org server; they are aggregated into ALL_UNICODE_FILES below.
    """
    # ReadMe does not contain any Unicode data, we
    # only use it to extract versions.
    README = "ReadMe.txt"

    DERIVED_CORE_PROPERTIES = "DerivedCoreProperties.txt"
    DERIVED_NORMALIZATION_PROPS = "DerivedNormalizationProps.txt"
    PROPS = "PropList.txt"
    SCRIPTS = "Scripts.txt"
    SPECIAL_CASING = "SpecialCasing.txt"
    UNICODE_DATA = "UnicodeData.txt"
-
-
# Aggregate every public attribute of UnicodeFiles.  The order doesn't
# really matter (Python < 3.6 won't preserve it anyway); we only need
# the complete collection of file names.
ALL_UNICODE_FILES = tuple(
    attr_value
    for attr_name, attr_value in vars(UnicodeFiles).items()
    if not attr_name.startswith("_")
)

# Sanity check: catches an attribute being accidentally added/removed.
assert len(ALL_UNICODE_FILES) == 7, "Unexpected number of unicode files"
-
# The directory this file is located in (absolute, symlinks resolved).
THIS_DIR = os.path.dirname(os.path.realpath(__file__))

# Where to download the Unicode data.  The downloaded files
# will be placed in sub-directories named after Unicode version.
FETCH_DIR = os.path.join(THIS_DIR, "downloaded")
-
# URL templates for the Unicode data files.  Call sites fill in
# `filename` (and, when a release is pinned, `version`) via str.format()
# -- see fetch_files().  The `{filename}` placeholder is required; a
# template without it would make every download hit a bogus fixed path.
FETCH_URL_LATEST = "ftp://ftp.unicode.org/Public/UNIDATA/{filename}"
FETCH_URL_VERSION = "ftp://ftp.unicode.org/Public/{version}/ucd/{filename}"
-
# Header emitted verbatim at the top of the generated tables.rs.
# This used to be run through str.format() (to substitute a copyright
# year), which forced doubled `{{ }}` brace escapes even though no
# `{year}` placeholder remained.  A plain literal with single braces
# produces the exact same text without the dead formatting step.
PREAMBLE = """\
// NOTE: The following code was generated by "./unicode.py", do not edit directly

#![allow(missing_docs, non_upper_case_globals, non_snake_case, clippy::unreadable_literal)]

use crate::unicode::bool_trie::{BoolTrie, SmallBoolTrie};
use crate::unicode::version::UnicodeVersion;
"""
-
# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
# Maps each concrete general category to the grouped (derived)
# categories it also belongs to -- e.g. "Lu" is also "LC" and "L".
# load_unicode_data() uses this to populate the grouped buckets.
EXPANDED_CATEGORIES = {
    "Lu": ["LC", "L"], "Ll": ["LC", "L"], "Lt": ["LC", "L"],
    "Lm": ["L"], "Lo": ["L"],
    "Mn": ["M"], "Mc": ["M"], "Me": ["M"],
    "Nd": ["N"], "Nl": ["N"], "No": ["N"],
    "Pc": ["P"], "Pd": ["P"], "Ps": ["P"], "Pe": ["P"],
    "Pi": ["P"], "Pf": ["P"], "Po": ["P"],
    "Sm": ["S"], "Sc": ["S"], "Sk": ["S"], "So": ["S"],
    "Zs": ["Z"], "Zl": ["Z"], "Zp": ["Z"],
    "Cc": ["C"], "Cf": ["C"], "Cs": ["C"], "Co": ["C"], "Cn": ["C"],
}
-
# This is the (inclusive) range of surrogate codepoints.
# These are not valid Rust characters.
SURROGATE_CODEPOINTS_RANGE = (0xd800, 0xdfff)

# Aggregated, parsed Unicode data, as produced by load_unicode_data()
# and enriched in-place by load_special_casing().
UnicodeData = namedtuple(
    "UnicodeData", (
        # Conversions: maps of codepoint -> zero-padded 3-codepoint mapping
        "to_upper", "to_lower", "to_title",

        # Decompositions: canonical decompositions, compatibility decomp
        "canon_decomp", "compat_decomp",

        # Grouped: general categories and combining characters
        "general_categories", "combines",
    )
)

# A parsed Unicode version, e.g. (12, 1, 0, "12.1.0").
UnicodeVersion = namedtuple(
    "UnicodeVersion", ("major", "minor", "micro", "as_str")
)
-
-
def fetch_files(version=None):
    # type: (str) -> UnicodeVersion
    """
    Fetch all the Unicode files from unicode.org.

    Cached copies (stored under `FETCH_DIR`) are reused when present;
    anything missing is downloaded with curl.  In any case, the Unicode
    version the files belong to is returned.

    :param version: The desired Unicode version, as string.
        (If None, defaults to latest final release available,
         querying the unicode.org service).
    """
    cached_version = check_stored_version(version)
    if cached_version:
        return cached_version

    if version:
        # A pinned release lives under its own versioned directory.
        def get_fetch_url(name):
            return FETCH_URL_VERSION.format(version=version, filename=name)
    else:
        # No version pinned: use the "latest" area of the server.
        def get_fetch_url(name):
            return FETCH_URL_LATEST.format(filename=name)

    # The ReadMe tells us which version we are actually looking at.
    readme_url = get_fetch_url(UnicodeFiles.README)
    print("Fetching: {}".format(readme_url))
    readme_content = subprocess.check_output(("curl", readme_url))

    unicode_version = parse_readme_unicode_version(readme_content.decode("utf8"))

    download_dir = get_unicode_dir(unicode_version)
    if not os.path.exists(download_dir):
        # For 2.7 compat, we don't use `exist_ok=True`.
        os.makedirs(download_dir)

    for filename in ALL_UNICODE_FILES:
        file_path = get_unicode_file_path(unicode_version, filename)

        if os.path.exists(file_path):
            # Assume file on the server didn't change if it's been saved before.
            continue

        if filename == UnicodeFiles.README:
            # Already downloaded above; just persist the bytes we have.
            with open(file_path, "wb") as fd:
                fd.write(readme_content)
        else:
            url = get_fetch_url(filename)
            print("Fetching: {}".format(url))
            subprocess.check_call(("curl", "-o", file_path, url))

    return unicode_version
-
-
def check_stored_version(version):
    # type: (Optional[str]) -> Optional[UnicodeVersion]
    """
    Return the `UnicodeVersion` for `version` if every data file is
    already cached locally, and `None` otherwise.
    """
    if not version:
        # Without a pinned version we must ask the server what the
        # latest release is, so the cache cannot be trusted up front.
        return None

    fetch_dir = os.path.join(FETCH_DIR, version)

    all_present = all(
        os.path.exists(os.path.join(fetch_dir, filename))
        for filename in ALL_UNICODE_FILES
    )
    if not all_present:
        return None

    # Re-parse the cached ReadMe for the authoritative version triple.
    with open(os.path.join(fetch_dir, UnicodeFiles.README)) as fd:
        return parse_readme_unicode_version(fd.read())
-
-
def parse_readme_unicode_version(readme_content):
    # type: (str) -> UnicodeVersion
    """
    Parse the Unicode version contained in their `ReadMe.txt` file.
    """
    # Raw string so `\d` is not treated as a string escape (deprecated
    # behavior in future Python versions).
    match = re.search(r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode",
                      readme_content)
    major, minor, micro = match.groups()

    return UnicodeVersion(int(major), int(minor), int(micro),
                          as_str="{}.{}.{}".format(major, minor, micro))
-
-
def get_unicode_dir(unicode_version):
    # type: (UnicodeVersion) -> str
    """
    Indicate in which parent dir the Unicode data files should be stored.

    This returns a full, absolute path (a version-named subdirectory
    of `FETCH_DIR`).
    """
    version_dir = unicode_version.as_str
    return os.path.join(FETCH_DIR, version_dir)
-
-
def get_unicode_file_path(unicode_version, filename):
    # type: (UnicodeVersion, str) -> str
    """
    Indicate where the Unicode data file should be stored.
    """
    parent = get_unicode_dir(unicode_version)
    return os.path.join(parent, filename)
-
-
def is_surrogate(n):
    # type: (int) -> bool
    """
    Tell if given codepoint is a surrogate (not a valid Rust character).
    """
    low, high = SURROGATE_CODEPOINTS_RANGE
    return low <= n <= high
-
-
def load_unicode_data(file_path):
    # type: (str) -> UnicodeData
    """
    Load main Unicode data.

    Parses `UnicodeData.txt` (15 semicolon-separated fields per line)
    into a `UnicodeData` namedtuple: simple case conversions,
    decompositions, grouped general categories, and combining classes.
    """
    # Conversions: codepoint -> zero-padded 3-codepoint mapping.
    to_lower = {}   # type: Dict[int, Tuple[int, int, int]]
    to_upper = {}   # type: Dict[int, Tuple[int, int, int]]
    to_title = {}   # type: Dict[int, Tuple[int, int, int]]

    # Decompositions
    compat_decomp = {}   # type: Dict[int, List[int]]
    canon_decomp = {}    # type: Dict[int, List[int]]

    # Combining characters
    # FIXME: combines are not used
    combines = defaultdict(set)   # type: Dict[str, Set[int]]

    # Categories
    general_categories = defaultdict(set)   # type: Dict[str, Set[int]]
    category_assigned_codepoints = set()    # type: Set[int]

    all_codepoints = {}

    # Start codepoint of a pending `<..., First>` range, or -1 if none.
    range_start = -1

    for line in fileinput.input(file_path):
        data = line.split(";")
        if len(data) != 15:
            # Not a data row (blank/comment line).
            continue
        codepoint = int(data[0], 16)
        if is_surrogate(codepoint):
            continue
        if range_start >= 0:
            # This row is the `<..., Last>` end of a range: every
            # codepoint in between shares this row's field data.
            # (The end codepoint itself is stored below.)
            for i in range(range_start, codepoint):
                all_codepoints[i] = data
            range_start = -1
        if data[1].endswith(", First>"):
            range_start = codepoint
            continue
        all_codepoints[codepoint] = data

    for code, data in all_codepoints.items():
        # Field layout per UAX #44, UnicodeData.txt.
        (code_org, name, gencat, combine, bidi,
         decomp, deci, digit, num, mirror,
         old, iso, upcase, lowcase, titlecase) = data

        # Generate char to char direct common and simple conversions:

        # Uppercase to lowercase
        if lowcase != "" and code_org != lowcase:
            to_lower[code] = (int(lowcase, 16), 0, 0)

        # Lowercase to uppercase
        if upcase != "" and code_org != upcase:
            to_upper[code] = (int(upcase, 16), 0, 0)

        # Title case
        if titlecase.strip() != "" and code_org != titlecase:
            to_title[code] = (int(titlecase, 16), 0, 0)

        # Store decomposition, if given
        if decomp:
            # NOTE(review): `[1:]` drops the leading `<tag>` of a
            # compatibility decomposition -- but canonical
            # decompositions have no tag, so their *first codepoint*
            # is dropped too.  canon_decomp is not consumed by the
            # generated output below; confirm before relying on it.
            decompositions = decomp.split()[1:]
            decomp_code_points = [int(i, 16) for i in decompositions]

            if decomp.startswith("<"):
                # Compatibility decomposition
                compat_decomp[code] = decomp_code_points
            else:
                # Canonical decomposition
                canon_decomp[code] = decomp_code_points

        # Place letter in categories as appropriate.
        for cat in itertools.chain((gencat, ), EXPANDED_CATEGORIES.get(gencat, [])):
            general_categories[cat].add(code)
            category_assigned_codepoints.add(code)

        # Record combining class, if any.
        if combine != "0":
            combines[combine].add(code)

    # Generate Not_Assigned from Assigned.
    general_categories["Cn"] = get_unassigned_codepoints(category_assigned_codepoints)

    # Other contains Not_Assigned
    general_categories["C"].update(general_categories["Cn"])

    grouped_categories = group_categories(general_categories)

    # FIXME: combines are not used
    return UnicodeData(
        to_lower=to_lower, to_upper=to_upper, to_title=to_title,
        compat_decomp=compat_decomp, canon_decomp=canon_decomp,
        general_categories=grouped_categories, combines=combines,
    )
-
-
def load_special_casing(file_path, unicode_data):
    # type: (str, UnicodeData) -> None
    """
    Load special casing data and enrich given Unicode data.

    Parses `SpecialCasing.txt` lines of the form
    ``code; lower; title; upper; [condition;] # comment`` and, for each
    unconditional mapping that differs from the codepoint itself,
    overwrites (in place) the corresponding entry in
    `unicode_data.to_lower` / `.to_upper` / `.to_title` with a
    zero-padded 3-codepoint tuple.
    """
    for line in fileinput.input(file_path):
        data = line.split("#")[0].split(";")
        if len(data) == 5:
            code, lower, title, upper, _comment = data
        elif len(data) == 6:
            code, lower, title, upper, condition, _comment = data
            if condition.strip():  # Only keep unconditional mappings
                continue
        else:
            continue
        code = code.strip()
        lower = lower.strip()
        title = title.strip()
        upper = upper.strip()
        key = int(code, 16)
        for (map_, values) in ((unicode_data.to_lower, lower),
                               (unicode_data.to_upper, upper),
                               (unicode_data.to_title, title)):
            if values != code:
                split = values.split()

                # Pad the mapping with zeros to exactly 3 codepoints.
                # Stored as a tuple, consistently with the (cp, 0, 0)
                # tuples produced by load_unicode_data() for the
                # simple conversions.
                codepoints = tuple(itertools.chain(
                    (int(i, 16) for i in split),
                    (0 for _ in range(len(split), 3))
                ))

                assert len(codepoints) == 3
                map_[key] = codepoints
-
-
def group_categories(mapping):
    # type: (Dict[Any, Iterable[int]]) -> Dict[str, List[Tuple[int, int]]]
    """
    Group codepoints mapped in "categories".
    """
    grouped = {}
    for category, codepoints in mapping.items():
        grouped[category] = group_codepoints(codepoints)
    return grouped
-
-
def group_codepoints(codepoints):
    # type: (Iterable[int]) -> List[Tuple[int, int]]
    """
    Group integral values into continuous, disjoint value ranges.

    Performs value deduplication.

    :return: sorted list of pairs denoting start and end of codepoint
        group values, both ends inclusive.

    >>> group_codepoints([1, 2, 10, 11, 12, 3, 4])
    [(1, 4), (10, 12)]
    >>> group_codepoints([1])
    [(1, 1)]
    >>> group_codepoints([1, 5, 6])
    [(1, 1), (5, 6)]
    >>> group_codepoints([])
    []
    """
    ordered = sorted(set(codepoints))
    if not ordered:
        return []

    ranges = []     # type: List[Tuple[int, int]]
    start = prev = ordered[0]
    for code in ordered[1:]:
        if code - prev != 1:
            # Gap found: close the current run and open a new one.
            ranges.append((start, prev))
            start = code
        prev = code
    # Close the final run.
    ranges.append((start, prev))
    return ranges
-
-
def ungroup_codepoints(codepoint_pairs):
    # type: (Iterable[Tuple[int, int]]) -> List[int]
    """
    The inverse of group_codepoints -- produce a flat list of values
    from value range pairs.

    >>> ungroup_codepoints([(1, 4), (10, 12)])
    [1, 2, 3, 4, 10, 11, 12]
    >>> ungroup_codepoints([(1, 1), (5, 6)])
    [1, 5, 6]
    >>> ungroup_codepoints([])
    []
    """
    flat = []   # type: List[int]
    for lo, hi in codepoint_pairs:
        # Both range ends are inclusive.
        flat.extend(range(lo, hi + 1))
    return flat
-
-
def get_unassigned_codepoints(assigned_codepoints):
    # type: (Set[int]) -> Set[int]
    """
    Given a set of "assigned" codepoints, return the set of codepoints
    in the full Unicode range that are neither assigned nor surrogate.
    """
    unassigned = set()
    for cp in range(0, 0x110000):
        if cp in assigned_codepoints or is_surrogate(cp):
            continue
        unassigned.add(cp)
    return unassigned
-
-
def generate_table_lines(items, indent, wrap=98):
    # type: (Iterable[str], int, int) -> Iterator[str]
    """
    Given table items, generate wrapped lines of text with comma-separated items.

    This is a generator function.

    :param wrap: soft wrap limit (characters per line), integer.
    """
    prefix = " " * indent
    current = prefix
    is_first = True
    for item in items:
        if len(current) + len(item) < wrap:
            # Item fits on the current line (separator unless first).
            current = current + item if is_first else current + ", " + item
            is_first = False
        else:
            # Overflow: flush the pending line and start a fresh one.
            yield current + ",\n"
            current = prefix + item
    # The last line carries no trailing comma/newline; callers append
    # their own terminator.
    yield current
-
-
def load_properties(file_path, interesting_props):
    # type: (str, Iterable[str]) -> Dict[str, List[Tuple[int, int]]]
    """
    Load properties data and return in grouped form.
    """
    props = defaultdict(list)   # type: Dict[str, List[Tuple[int, int]]]

    # Raw strings so `\.`/`\w` are not treated as string escapes
    # (deprecated in future Python versions).  Data lines look like:
    #   0041..005A ; Property # comment       (range form)
    #   00AA       ; Property # comment       (single form)
    range_re = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
    single_re = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")

    for line in fileinput.input(file_path):
        match = range_re.match(line)
        if match:
            d_lo, d_hi, prop = match.groups()
        else:
            match = single_re.match(line)
            if not match:
                continue
            d_lo, prop = match.groups()
            # A single codepoint is a degenerate one-element range.
            d_hi = d_lo

        # An empty `interesting_props` means "keep everything".
        if interesting_props and prop not in interesting_props:
            continue

        props[prop].append((int(d_lo, 16), int(d_hi, 16)))

    # Normalize: merge adjacent/overlapping ranges and deduplicate.
    for prop in props:
        props[prop] = group_codepoints(ungroup_codepoints(props[prop]))

    return props
-
-
def escape_char(c):
    # type: (int) -> str
    r"""
    Escape a codepoint for use as Rust char literal.

    Outputs are OK to use as Rust source code as char literals
    and they also include necessary quotes.

    >>> escape_char(97)
    "'\\u{61}'"
    >>> escape_char(0)
    "'\\0'"
    """
    # NUL gets the dedicated short escape; everything else uses \u{...}.
    if c == 0:
        return r"'\0'"
    return r"'\u{%x}'" % c
-
-
def format_char_pair(pair):
    # type: (Tuple[int, int]) -> str
    """
    Format a pair of two Rust chars.
    """
    first, second = pair
    return "({},{})".format(escape_char(first), escape_char(second))
-
-
def generate_table(
    name,   # type: str
    items,  # type: List[Tuple[int, int]]
    decl_type="&[(char, char)]",    # type: str
    is_pub=True,                    # type: bool
    format_item=format_char_pair,   # type: Callable[[Tuple[int, int]], str]
):
    # type: (...) -> Iterator[str]
    """
    Generate a nicely formatted Rust constant "table" array.

    This generates actual Rust code.

    Yields string fragments that the caller joins/writes in order.
    """
    pub_string = ""
    if is_pub:
        pub_string = "pub "

    yield "\n"
    yield "    #[rustfmt::skip]\n"
    yield "    %sconst %s: %s = &[\n" % (pub_string, name, decl_type)

    data = []
    first = True
    for item in items:
        if not first:
            data.append(",")
        first = False
        # NOTE: `extend` (not `append`) adds the formatted item
        # character by character; the text is re-joined below.
        data.extend(format_item(item))

    # Joining and then splitting on "," re-tokenizes the text at every
    # comma -- including commas *inside* a formatted item -- so that
    # generate_table_lines() re-inserts a uniform ", " separator
    # everywhere (e.g. `(a,b)` is emitted as `(a, b)`).
    for table_line in generate_table_lines("".join(data).split(","), 8):
        yield table_line

    yield "\n    ];\n"
-
-
def compute_trie(raw_data, chunk_size):
    # type: (List[int], int) -> Tuple[List[int], List[int]]
    """
    Compute postfix-compressed trie.

    See: bool_trie.rs for more details.

    >>> compute_trie([1, 2, 3, 1, 2, 3, 4, 5, 6], 3)
    ([0, 0, 1], [1, 2, 3, 4, 5, 6])
    >>> compute_trie([1, 2, 3, 1, 2, 4, 4, 5, 6], 3)
    ([0, 1, 2], [1, 2, 3, 1, 2, 4, 4, 5, 6])
    """
    assert len(raw_data) % chunk_size == 0, "Chunks must be equally sized"

    root = []           # per-chunk index into the deduplicated children
    child_data = []     # concatenation of the unique chunks, in order
    chunk_indices = {}  # type: Dict[Tuple[int, ...], int]

    for offset in range(0, len(raw_data), chunk_size):
        # A tuple is hashable, so identical chunks collapse to one
        # dictionary entry (postfix compression).
        chunk = tuple(raw_data[offset:offset + chunk_size])
        if chunk not in chunk_indices:
            chunk_indices[chunk] = len(chunk_indices)
            child_data.extend(chunk)
        root.append(chunk_indices[chunk])

    return root, child_data
-
-
def generate_bool_trie(name, codepoint_ranges, is_pub=False):
    # type: (str, List[Tuple[int, int]], bool) -> Iterator[str]
    """
    Generate Rust code for BoolTrie struct.

    This yields string fragments that should be joined to produce
    the final string.

    See: `bool_trie.rs`.
    """
    chunk_size = 64
    # One membership flag per codepoint over the whole Unicode range.
    rawdata = [False] * 0x110000
    for (lo, hi) in codepoint_ranges:
        for cp in range(lo, hi + 1):
            rawdata[cp] = True

    # Convert to bitmap chunks of `chunk_size` bits each.
    chunks = []
    for i in range(0x110000 // chunk_size):
        chunk = 0
        for j in range(chunk_size):
            if rawdata[i * chunk_size + j]:
                chunk |= 1 << j
        chunks.append(chunk)

    pub_string = ""
    if is_pub:
        pub_string = "pub "

    yield "\n"
    yield "    #[rustfmt::skip]\n"
    yield "    %sconst %s: &super::BoolTrie = &super::BoolTrie {\n" % (pub_string, name)
    # 0x0..0x800: stored directly as bitmap words (no trie level).
    yield "        r1: [\n"
    data = ("0x%016x" % chunk for chunk in chunks[:0x800 // chunk_size])
    for fragment in generate_table_lines(data, 12):
        yield fragment
    yield "\n        ],\n"

    # 0x800..0x10000 trie
    (r2, r3) = compute_trie(chunks[0x800 // chunk_size : 0x10000 // chunk_size], 64 // chunk_size)
    yield "        r2: [\n"
    data = map(str, r2)
    for fragment in generate_table_lines(data, 12):
        yield fragment
    yield "\n        ],\n"

    yield "        r3: &[\n"
    data = ("0x%016x" % node for node in r3)
    for fragment in generate_table_lines(data, 12):
        yield fragment
    yield "\n        ],\n"

    # 0x10000..0x110000 trie (two levels: r4 -> r5 -> r6).
    (mid, r6) = compute_trie(chunks[0x10000 // chunk_size : 0x110000 // chunk_size],
                             64 // chunk_size)
    (r4, r5) = compute_trie(mid, 64)

    yield "        r4: [\n"
    data = map(str, r4)
    for fragment in generate_table_lines(data, 12):
        yield fragment
    yield "\n        ],\n"

    yield "        r5: &[\n"
    data = map(str, r5)
    for fragment in generate_table_lines(data, 12):
        yield fragment
    yield "\n        ],\n"

    yield "        r6: &[\n"
    data = ("0x%016x" % node for node in r6)
    for fragment in generate_table_lines(data, 12):
        yield fragment
    yield "\n        ],\n"

    yield "    };\n"
-
-
def generate_small_bool_trie(name, codepoint_ranges, is_pub=False):
    # type: (str, List[Tuple[int, int]], bool) -> Iterator[str]
    """
    Generate Rust code for `SmallBoolTrie` struct.

    See: `bool_trie.rs`.
    """
    # Only allocate 64-bit chunks up to the highest codepoint used
    # (unlike generate_bool_trie, which covers the full Unicode range).
    last_chunk = max(hi // 64 for (lo, hi) in codepoint_ranges)
    n_chunks = last_chunk + 1
    chunks = [0] * n_chunks
    for (lo, hi) in codepoint_ranges:
        for cp in range(lo, hi + 1):
            assert cp // 64 < len(chunks)
            chunks[cp // 64] |= 1 << (cp & 63)

    pub_string = ""
    if is_pub:
        pub_string = "pub "

    yield "\n"
    yield "    #[rustfmt::skip]\n"
    yield ("    %sconst %s: &super::SmallBoolTrie = &super::SmallBoolTrie {\n"
           % (pub_string, name))

    # Chunk size 1: r1 maps each chunk to the index of its unique value in r2.
    (r1, r2) = compute_trie(chunks, 1)

    yield "        r1: &[\n"
    data = (str(node) for node in r1)
    for fragment in generate_table_lines(data, 12):
        yield fragment
    yield "\n        ],\n"

    yield "        r2: &[\n"
    data = ("0x%016x" % node for node in r2)
    for fragment in generate_table_lines(data, 12):
        yield fragment
    yield "\n        ],\n"

    yield "    };\n"
-
-
def generate_property_module(mod, grouped_categories, category_subset):
    # type: (str, Dict[str, List[Tuple[int, int]]], Iterable[str]) -> Iterator[str]
    """
    Generate Rust code for module defining properties.

    Emits one lookup table plus one `fn <cat>(c: char) -> bool`
    accessor per category in `category_subset`.
    """

    yield "pub(crate) mod %s {" % mod
    for cat in sorted(category_subset):
        # Small, low-codepoint categories fit the compact trie variant.
        make_trie = (generate_small_bool_trie
                     if cat in ("Cc", "White_Space")
                     else generate_bool_trie)

        for fragment in make_trie("%s_table" % cat, grouped_categories[cat]):
            yield fragment

        yield "\n"
        yield "    pub fn %s(c: char) -> bool {\n" % cat
        yield "        %s_table.lookup(c)\n" % cat
        yield "    }\n"

    yield "}\n\n"
-
-
def generate_conversions_module(unicode_data):
    # type: (UnicodeData) -> Iterator[str]
    """
    Generate Rust code for module defining conversions.
    """

    yield "pub(crate) mod conversions {"
    # NOTE: `\\0` below is deliberate -- it must land in the generated
    # Rust source as the two characters `\0` (a Rust NUL char escape).
    yield """
    pub fn to_lower(c: char) -> [char; 3] {
        match bsearch_case_table(c, to_lowercase_table) {
            None => [c, '\\0', '\\0'],
            Some(index) => to_lowercase_table[index].1,
        }
    }

    pub fn to_upper(c: char) -> [char; 3] {
        match bsearch_case_table(c, to_uppercase_table) {
            None => [c, '\\0', '\\0'],
            Some(index) => to_uppercase_table[index].1,
        }
    }

    fn bsearch_case_table(c: char, table: &[(char, [char; 3])]) -> Option<usize> {
        table.binary_search_by(|&(key, _)| key.cmp(&c)).ok()
    }\n"""

    decl_type = "&[(char, [char; 3])]"
    # Formats one mapping as `(key,[a,b,c])` with escaped char literals.
    format_conversion = lambda x: "({},[{},{},{}])".format(*(
        escape_char(c) for c in (x[0], x[1][0], x[1][1], x[1][2])
    ))

    # Tables must be sorted by key: bsearch_case_table relies on it.
    for fragment in generate_table(
        name="to_lowercase_table",
        items=sorted(unicode_data.to_lower.items(), key=lambda x: x[0]),
        decl_type=decl_type,
        is_pub=False,
        format_item=format_conversion
    ):
        yield fragment

    for fragment in generate_table(
        name="to_uppercase_table",
        items=sorted(unicode_data.to_upper.items(), key=lambda x: x[0]),
        decl_type=decl_type,
        is_pub=False,
        format_item=format_conversion
    ):
        yield fragment

    yield "}\n"
-
-
def parse_args():
    # type: () -> argparse.Namespace
    """
    Parse command line arguments.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "-v", "--version",
        type=str,
        default=None,
        help="Unicode version to use (if not specified,"
             " defaults to latest release).",
    )
    return parser.parse_args()
-
-
def main():
    # type: () -> None
    """
    Script entry point.

    Fetches (or reuses cached) Unicode data files, parses them, and
    regenerates `tables.rs` next to this script.
    """
    args = parse_args()

    unicode_version = fetch_files(args.version)
    print("Using Unicode version: {}".format(unicode_version.as_str))

    # All the writing happens entirely in memory, we only write to file
    # once we have generated the file content (it's not very large, <1 MB).
    buf = StringIO()
    buf.write(PREAMBLE)

    # The doubled `{{ }}` braces are unescaped by `.format()` below.
    unicode_version_notice = textwrap.dedent("""
    /// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of
    /// `char` and `str` methods are based on.
    #[unstable(feature = "unicode_version", issue = "49726")]
    pub const UNICODE_VERSION: UnicodeVersion =
        UnicodeVersion {{ major: {v.major}, minor: {v.minor}, micro: {v.micro}, _priv: () }};
    """).format(v=unicode_version)
    buf.write(unicode_version_notice)

    get_path = lambda f: get_unicode_file_path(unicode_version, f)

    unicode_data = load_unicode_data(get_path(UnicodeFiles.UNICODE_DATA))
    load_special_casing(get_path(UnicodeFiles.SPECIAL_CASING), unicode_data)

    want_derived = {"Alphabetic", "Lowercase", "Uppercase",
                    "Cased", "Case_Ignorable", "Grapheme_Extend"}
    derived = load_properties(get_path(UnicodeFiles.DERIVED_CORE_PROPERTIES), want_derived)

    props = load_properties(get_path(UnicodeFiles.PROPS),
                            {"White_Space", "Join_Control", "Noncharacter_Code_Point"})

    # Category tables
    for (name, categories, category_subset) in (
            ("general_category", unicode_data.general_categories, ["N", "Cc"]),
            ("derived_property", derived, want_derived),
            ("property", props, ["White_Space"])
    ):
        for fragment in generate_property_module(name, categories, category_subset):
            buf.write(fragment)

    for fragment in generate_conversions_module(unicode_data):
        buf.write(fragment)

    tables_rs_path = os.path.join(THIS_DIR, "tables.rs")

    # Actually write out the file content.
    # Will overwrite the file if it exists.
    with open(tables_rs_path, "w") as fd:
        fd.write(buf.getvalue())

    print("Regenerated tables.rs.")


if __name__ == "__main__":
    main()