about summary refs log tree commit diff
diff options
context:
space:
mode:
author bors <bors@rust-lang.org> 2024-04-20 20:33:25 +0000
committer bors <bors@rust-lang.org> 2024-04-20 20:33:25 +0000
commit dbce3b43b6cb34dd3ba12c3ec6f708fe68e9c3df (patch)
tree 180ec8f1f702659975d5ccf7fc6d55084186c4d9
parent 54692c3d0b873a5dc055f2f35081d0d6a2410a49 (diff)
parent 488598c183ac55f6970bef34a1ed5404ae1d5088 (diff)
download rust-dbce3b43b6cb34dd3ba12c3ec6f708fe68e9c3df.tar.gz
rust-dbce3b43b6cb34dd3ba12c3ec6f708fe68e9c3df.zip
Auto merge of #122013 - Swatinem:unicode-gen-fastpath, r=scottmcm
Add a lower bound check to `unicode-table-generator` output

This adds a dedicated check for the lower bound
(if it is outside of ASCII range) to the output of the `unicode-table-generator` tool.

This generalizes the ASCII-only fast-path, but only for the `Grapheme_Extend` property for now, as that is the only one with a lower bound outside of ASCII.
-rw-r--r-- library/core/src/char/methods.rs 2
-rw-r--r-- library/core/src/unicode/unicode_data.rs 4
-rw-r--r-- src/tools/unicode-table-generator/src/raw_emitter.rs 6
-rw-r--r-- src/tools/unicode-table-generator/src/skiplist.rs 24
4 files changed, 32 insertions, 4 deletions
diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs
index 65ae4831839..a93b94867ce 100644
--- a/library/core/src/char/methods.rs
+++ b/library/core/src/char/methods.rs
@@ -927,7 +927,7 @@ impl char {
     #[must_use]
     #[inline]
     pub(crate) fn is_grapheme_extended(self) -> bool {
-        self > '\x7f' && unicode::Grapheme_Extend(self)
+        unicode::Grapheme_Extend(self)
     }
 
     /// Returns `true` if this `char` has one of the general categories for numbers.
diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs
index dd2ad9a58f6..1b3d6729663 100644
--- a/library/core/src/unicode/unicode_data.rs
+++ b/library/core/src/unicode/unicode_data.rs
@@ -315,7 +315,11 @@ pub mod grapheme_extend {
         15, 0, 7, 1, 17, 2, 7, 1, 2, 1, 5, 100, 1, 160, 7, 0, 1, 61, 4, 0, 4, 0, 7, 109, 7, 0, 96,
         128, 240, 0,
     ];
+    #[inline]
     pub fn lookup(c: char) -> bool {
+        (c as u32) >= 0x300 && lookup_slow(c)
+    }
+    fn lookup_slow(c: char) -> bool {
         super::skip_search(
             c as u32,
             &SHORT_OFFSET_RUNS,
diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs
index 7547b49ab2a..ef5cea18ea2 100644
--- a/src/tools/unicode-table-generator/src/raw_emitter.rs
+++ b/src/tools/unicode-table-generator/src/raw_emitter.rs
@@ -23,6 +23,7 @@ impl RawEmitter {
     }
 
     fn emit_bitset(&mut self, ranges: &[Range<u32>]) -> Result<(), String> {
+        let first_code_point = ranges.first().unwrap().start;
         let last_code_point = ranges.last().unwrap().end;
         // bitset for every bit in the codepoint range
         //
@@ -101,7 +102,10 @@ impl RawEmitter {
         )
         .unwrap();
         writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
-        writeln!(&mut self.file, "    super::bitset_search(",).unwrap();
+        if first_code_point > 0x7f {
+            writeln!(&mut self.file, "    (c as u32) >= {first_code_point:#04x} &&").unwrap();
+        }
+        writeln!(&mut self.file, "    super::bitset_search(").unwrap();
         writeln!(&mut self.file, "        c as u32,").unwrap();
         writeln!(&mut self.file, "        &BITSET_CHUNKS_MAP,").unwrap();
         writeln!(&mut self.file, "        &BITSET_INDEX_CHUNKS,").unwrap();
diff --git a/src/tools/unicode-table-generator/src/skiplist.rs b/src/tools/unicode-table-generator/src/skiplist.rs
index 9b613a94c57..8fae8289e25 100644
--- a/src/tools/unicode-table-generator/src/skiplist.rs
+++ b/src/tools/unicode-table-generator/src/skiplist.rs
@@ -25,8 +25,9 @@ impl ShortOffsetRunHeader {
 
 impl RawEmitter {
     pub fn emit_skiplist(&mut self, ranges: &[Range<u32>]) {
+        let first_code_point = ranges.first().unwrap().start;
         let mut offsets = Vec::<u32>::new();
-        let points = ranges.iter().flat_map(|r| vec![r.start, r.end]).collect::<Vec<u32>>();
+        let points = ranges.iter().flat_map(|r| [r.start, r.end]).collect::<Vec<u32>>();
         let mut offset = 0;
         for pt in points {
             let delta = pt - offset;
@@ -86,7 +87,26 @@ impl RawEmitter {
         .unwrap();
         self.bytes_used += coded_offsets.len();
 
-        writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
+        // The inlining in this code works like the following:
+        //
+        // The `skip_search` function is always inlined into the parent `lookup` fn,
+        // thus the compiler can generate optimal code based on the referenced `static`s.
+        //
+        // In the case of ASCII optimization, the lower-bounds check is inlined into
+        // the caller, and slower-path `skip_search` is outlined into a separate `lookup_slow` fn.
+        //
+        // Thus, in both cases, the `skip_search` function is specialized for the `static`s,
+        // and outlined into the prebuilt `std`.
+        if first_code_point > 0x7f {
+            writeln!(&mut self.file, "#[inline]").unwrap();
+            writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
+            writeln!(&mut self.file, "    (c as u32) >= {first_code_point:#04x} && lookup_slow(c)")
+                .unwrap();
+            writeln!(&mut self.file, "}}").unwrap();
+            writeln!(&mut self.file, "fn lookup_slow(c: char) -> bool {{").unwrap();
+        } else {
+            writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
+        }
         writeln!(&mut self.file, "    super::skip_search(",).unwrap();
         writeln!(&mut self.file, "        c as u32,").unwrap();
         writeln!(&mut self.file, "        &SHORT_OFFSET_RUNS,").unwrap();