about summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
authorKarl Meakin <karl.meakin@arm.com>2025-08-02 23:32:10 +0100
committerKarl Meakin <karl.meakin@arm.com>2025-09-07 15:21:24 +0200
commita8c669461f0c71985c72dd5b05f70b8d4d149e3b (patch)
tree9319ed6acf4501829107836c660d74bf32dfa6a4 /src
parentfbd8f95118fff54a2402983d3f446cad9b2f30c5 (diff)
downloadrust-a8c669461f0c71985c72dd5b05f70b8d4d149e3b.tar.gz
rust-a8c669461f0c71985c72dd5b05f70b8d4d149e3b.zip
optimization: Don't include ASCII characters in Unicode tables
The ASCII subset of Unicode is fixed and will never change, so we don't
need to generate tables for it with every new Unicode version. This
saves a few bytes of static data and speeds up `char::is_control` and
`char::is_grapheme_extended` on ASCII inputs.

Since the table lookup functions exported from the `unicode` module will
give nonsensical errors on ASCII input (and in fact will panic in debug
mode), I had to add some private wrapper methods to `char` which check
for ASCII-ness first.
Diffstat (limited to 'src')
-rw-r--r--src/tools/unicode-table-generator/src/cascading_map.rs1
-rw-r--r--src/tools/unicode-table-generator/src/main.rs1
-rw-r--r--src/tools/unicode-table-generator/src/raw_emitter.rs1
-rw-r--r--src/tools/unicode-table-generator/src/skiplist.rs2
4 files changed, 5 insertions, 0 deletions
diff --git a/src/tools/unicode-table-generator/src/cascading_map.rs b/src/tools/unicode-table-generator/src/cascading_map.rs
index 78a7bba3208..56e6401908d 100644
--- a/src/tools/unicode-table-generator/src/cascading_map.rs
+++ b/src/tools/unicode-table-generator/src/cascading_map.rs
@@ -64,6 +64,7 @@ impl RawEmitter {
 
         writeln!(&mut self.file, "#[inline]").unwrap();
         writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
+        writeln!(&mut self.file, "    debug_assert!(!c.is_ascii());").unwrap();
         writeln!(&mut self.file, "    match c as u32 >> 8 {{").unwrap();
         for arm in arms {
             writeln!(&mut self.file, "        {arm},").unwrap();
diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs
index aa7d97f7f3d..ded9205ffc4 100644
--- a/src/tools/unicode-table-generator/src/main.rs
+++ b/src/tools/unicode-table-generator/src/main.rs
@@ -195,6 +195,7 @@ fn load_data() -> UnicodeData {
                 .into_iter()
                 .flatten()
                 .flat_map(|cp| cp.scalar())
+                .filter(|c| !c.is_ascii())
                 .map(u32::from)
                 .collect::<Vec<_>>();
             (prop, ranges_from_set(&codepoints))
diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs
index 03ed9499e26..297965615c1 100644
--- a/src/tools/unicode-table-generator/src/raw_emitter.rs
+++ b/src/tools/unicode-table-generator/src/raw_emitter.rs
@@ -98,6 +98,7 @@ impl RawEmitter {
         self.blank_line();
 
         writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
+        writeln!(&mut self.file, "    debug_assert!(!c.is_ascii());").unwrap();
         if first_code_point > 0x7f {
             writeln!(&mut self.file, "    (c as u32) >= {first_code_point:#04x} &&").unwrap();
         }
diff --git a/src/tools/unicode-table-generator/src/skiplist.rs b/src/tools/unicode-table-generator/src/skiplist.rs
index 34c9802e122..660a8f342f7 100644
--- a/src/tools/unicode-table-generator/src/skiplist.rs
+++ b/src/tools/unicode-table-generator/src/skiplist.rs
@@ -99,6 +99,7 @@ impl RawEmitter {
         if first_code_point > 0x7f {
             writeln!(&mut self.file, "#[inline]").unwrap();
             writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
+            writeln!(&mut self.file, "    debug_assert!(!c.is_ascii());").unwrap();
             writeln!(&mut self.file, "    (c as u32) >= {first_code_point:#04x} && lookup_slow(c)")
                 .unwrap();
             writeln!(&mut self.file, "}}").unwrap();
@@ -107,6 +108,7 @@ impl RawEmitter {
             writeln!(&mut self.file, "fn lookup_slow(c: char) -> bool {{").unwrap();
         } else {
             writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
+            writeln!(&mut self.file, "    debug_assert!(!c.is_ascii());").unwrap();
         }
         writeln!(&mut self.file, "    const {{").unwrap();
         writeln!(