diff options
| author | Karl Meakin <karl.meakin@arm.com> | 2025-08-02 23:32:10 +0100 |
|---|---|---|
| committer | Karl Meakin <karl.meakin@arm.com> | 2025-09-07 15:21:24 +0200 |
| commit | a8c669461f0c71985c72dd5b05f70b8d4d149e3b (patch) | |
| tree | 9319ed6acf4501829107836c660d74bf32dfa6a4 /src | |
| parent | fbd8f95118fff54a2402983d3f446cad9b2f30c5 (diff) | |
| download | rust-a8c669461f0c71985c72dd5b05f70b8d4d149e3b.tar.gz rust-a8c669461f0c71985c72dd5b05f70b8d4d149e3b.zip | |
optimization: Don't include ASCII characters in Unicode tables
The ASCII subset of Unicode is fixed and will never change, so we don't need to generate tables for it with every new Unicode version. This saves a few bytes of static data and speeds up `char::is_control` and `char::is_grapheme_extended` on ASCII inputs. Since the table lookup functions exported from the `unicode` module will now give nonsensical results on ASCII input (and in fact will panic when debug assertions are enabled), I had to add some private wrapper methods to `char` which check for ASCII-ness first.
Diffstat (limited to 'src')
4 files changed, 5 insertions, 0 deletions
diff --git a/src/tools/unicode-table-generator/src/cascading_map.rs b/src/tools/unicode-table-generator/src/cascading_map.rs index 78a7bba3208..56e6401908d 100644 --- a/src/tools/unicode-table-generator/src/cascading_map.rs +++ b/src/tools/unicode-table-generator/src/cascading_map.rs @@ -64,6 +64,7 @@ impl RawEmitter { writeln!(&mut self.file, "#[inline]").unwrap(); writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap(); + writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap(); writeln!(&mut self.file, " match c as u32 >> 8 {{").unwrap(); for arm in arms { writeln!(&mut self.file, " {arm},").unwrap(); diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index aa7d97f7f3d..ded9205ffc4 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -195,6 +195,7 @@ fn load_data() -> UnicodeData { .into_iter() .flatten() .flat_map(|cp| cp.scalar()) + .filter(|c| !c.is_ascii()) .map(u32::from) .collect::<Vec<_>>(); (prop, ranges_from_set(&codepoints)) diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs index 03ed9499e26..297965615c1 100644 --- a/src/tools/unicode-table-generator/src/raw_emitter.rs +++ b/src/tools/unicode-table-generator/src/raw_emitter.rs @@ -98,6 +98,7 @@ impl RawEmitter { self.blank_line(); writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap(); + writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap(); if first_code_point > 0x7f { writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} &&").unwrap(); } diff --git a/src/tools/unicode-table-generator/src/skiplist.rs b/src/tools/unicode-table-generator/src/skiplist.rs index 34c9802e122..660a8f342f7 100644 --- a/src/tools/unicode-table-generator/src/skiplist.rs +++ b/src/tools/unicode-table-generator/src/skiplist.rs @@ -99,6 +99,7 @@ impl RawEmitter { if 
first_code_point > 0x7f { writeln!(&mut self.file, "#[inline]").unwrap(); writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); + writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap(); writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} && lookup_slow(c)") .unwrap(); writeln!(&mut self.file, "}}").unwrap(); @@ -107,6 +108,7 @@ impl RawEmitter { writeln!(&mut self.file, "fn lookup_slow(c: char) -> bool {{").unwrap(); } else { writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); + writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap(); } writeln!(&mut self.file, " const {{").unwrap(); writeln!( |
