about summary refs log tree commit diff
diff options
context:
space:
mode:
authorMarkus Reiter <me@reitermark.us>2025-03-05 01:04:10 +0100
committerMarkus Reiter <me@reitermark.us>2025-03-06 21:38:39 +0100
commit222adac953d1e745763850f75f4afac69b1d4321 (patch)
tree1cbf4599974bb9ea3be2b4fd331bff6ef27d4dca
parente6af292f91f21f12ac1aab6825efb7e1e3381cbb (diff)
downloadrust-222adac953d1e745763850f75f4afac69b1d4321.tar.gz
rust-222adac953d1e745763850f75f4afac69b1d4321.zip
Allow optimizing out `panic_bounds_check` in Unicode checks.
-rw-r--r--library/core/src/unicode/unicode_data.rs73
-rw-r--r--src/tools/unicode-table-generator/src/range_search.rs25
-rw-r--r--src/tools/unicode-table-generator/src/skiplist.rs20
3 files changed, 65 insertions, 53 deletions
diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs
index 4655d35e9c4..e103d09d9ed 100644
--- a/library/core/src/unicode/unicode_data.rs
+++ b/library/core/src/unicode/unicode_data.rs
@@ -55,24 +55,31 @@ fn decode_length(short_offset_run_header: u32) -> usize {
     (short_offset_run_header >> 21) as usize
 }
 
+/// # Safety
+///
+/// The last element of `short_offset_runs` must be greater than `std::char::MAX`.
 #[inline(always)]
-fn skip_search<const SOR: usize, const OFFSETS: usize>(
-    needle: u32,
+unsafe fn skip_search<const SOR: usize, const OFFSETS: usize>(
+    needle: char,
     short_offset_runs: &[u32; SOR],
     offsets: &[u8; OFFSETS],
 ) -> bool {
-    // Note that this *cannot* be past the end of the array, as the last
-    // element is greater than std::char::MAX (the largest possible needle).
-    //
-    // So, we cannot have found it (i.e. Ok(idx) + 1 != length) and the correct
-    // location cannot be past it, so Err(idx) != length either.
-    //
-    // This means that we can avoid bounds checking for the accesses below, too.
+    let needle = needle as u32;
+
     let last_idx =
         match short_offset_runs.binary_search_by_key(&(needle << 11), |header| header << 11) {
             Ok(idx) => idx + 1,
             Err(idx) => idx,
         };
+    // SAFETY: `last_idx` *cannot* be past the end of the array, as the last
+    // element is greater than `std::char::MAX` (the largest possible needle)
+    // as guaranteed by the caller.
+    //
+    // So, we cannot have found it (i.e. `Ok(idx) => idx + 1 != length`) and the
+    // correct location cannot be past it, so `Err(idx) => idx != length` either.
+    //
+    // This means that we can avoid bounds checking for the accesses below, too.
+    unsafe { crate::hint::assert_unchecked(last_idx < SOR) };
 
     let mut offset_idx = decode_length(short_offset_runs[last_idx]);
     let length = if let Some(next) = short_offset_runs.get(last_idx + 1) {
@@ -169,11 +176,9 @@ pub mod alphabetic {
         0, 0, 0, 0, 5, 0, 0,
     ];
     pub fn lookup(c: char) -> bool {
-        super::skip_search(
-            c as u32,
-            &SHORT_OFFSET_RUNS,
-            &OFFSETS,
-        )
+        const { assert!(*SHORT_OFFSET_RUNS.last().unwrap() > (char::MAX as u32)); }
+        // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`.
+        unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
     }
 }
 
@@ -222,11 +227,9 @@ pub mod case_ignorable {
         1, 61, 4, 0, 5, 254, 2, 0, 7, 109, 8, 0, 5, 0, 1, 30, 96, 128, 240, 0,
     ];
     pub fn lookup(c: char) -> bool {
-        super::skip_search(
-            c as u32,
-            &SHORT_OFFSET_RUNS,
-            &OFFSETS,
-        )
+        const { assert!(*SHORT_OFFSET_RUNS.last().unwrap() > (char::MAX as u32)); }
+        // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`.
+        unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
     }
 }
 
@@ -252,11 +255,9 @@ pub mod cased {
         8, 0, 10, 1, 20, 6, 6, 0, 62, 0, 68, 0, 26, 6, 26, 6, 26, 0,
     ];
     pub fn lookup(c: char) -> bool {
-        super::skip_search(
-            c as u32,
-            &SHORT_OFFSET_RUNS,
-            &OFFSETS,
-        )
+        const { assert!(*SHORT_OFFSET_RUNS.last().unwrap() > (char::MAX as u32)); }
+        // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`.
+        unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
     }
 }
 
@@ -269,11 +270,9 @@ pub mod cc {
         0, 32, 95, 33, 0,
     ];
     pub fn lookup(c: char) -> bool {
-        super::skip_search(
-            c as u32,
-            &SHORT_OFFSET_RUNS,
-            &OFFSETS,
-        )
+        const { assert!(*SHORT_OFFSET_RUNS.last().unwrap() > (char::MAX as u32)); }
+        // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`.
+        unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
     }
 }
 
@@ -320,11 +319,9 @@ pub mod grapheme_extend {
         (c as u32) >= 0x300 && lookup_slow(c)
     }
     fn lookup_slow(c: char) -> bool {
-        super::skip_search(
-            c as u32,
-            &SHORT_OFFSET_RUNS,
-            &OFFSETS,
-        )
+        const { assert!(*SHORT_OFFSET_RUNS.last().unwrap() > (char::MAX as u32)); }
+        // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`.
+        unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
     }
 }
 
@@ -459,11 +456,9 @@ pub mod n {
         10, 247, 10, 0, 9, 128, 10, 0, 59, 1, 3, 1, 4, 76, 45, 1, 15, 0, 13, 0, 10, 0,
     ];
     pub fn lookup(c: char) -> bool {
-        super::skip_search(
-            c as u32,
-            &SHORT_OFFSET_RUNS,
-            &OFFSETS,
-        )
+        const { assert!(*SHORT_OFFSET_RUNS.last().unwrap() > (char::MAX as u32)); }
+        // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`.
+        unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
     }
 }
 
diff --git a/src/tools/unicode-table-generator/src/range_search.rs b/src/tools/unicode-table-generator/src/range_search.rs
index 9a51979a2f0..7376bbdc9ae 100644
--- a/src/tools/unicode-table-generator/src/range_search.rs
+++ b/src/tools/unicode-table-generator/src/range_search.rs
@@ -53,24 +53,31 @@ fn decode_length(short_offset_run_header: u32) -> usize {
     (short_offset_run_header >> 21) as usize
 }
 
+/// # Safety
+///
+/// The last element of `short_offset_runs` must be greater than `std::char::MAX`.
 #[inline(always)]
-fn skip_search<const SOR: usize, const OFFSETS: usize>(
-    needle: u32,
+unsafe fn skip_search<const SOR: usize, const OFFSETS: usize>(
+    needle: char,
     short_offset_runs: &[u32; SOR],
     offsets: &[u8; OFFSETS],
 ) -> bool {
-    // Note that this *cannot* be past the end of the array, as the last
-    // element is greater than std::char::MAX (the largest possible needle).
-    //
-    // So, we cannot have found it (i.e. Ok(idx) + 1 != length) and the correct
-    // location cannot be past it, so Err(idx) != length either.
-    //
-    // This means that we can avoid bounds checking for the accesses below, too.
+    let needle = needle as u32;
+
     let last_idx =
         match short_offset_runs.binary_search_by_key(&(needle << 11), |header| header << 11) {
             Ok(idx) => idx + 1,
             Err(idx) => idx,
         };
+    // SAFETY: `last_idx` *cannot* be past the end of the array, as the last
+    // element is greater than `std::char::MAX` (the largest possible needle)
+    // as guaranteed by the caller.
+    //
+    // So, we cannot have found it (i.e. `Ok(idx) => idx + 1 != length`) and the
+    // correct location cannot be past it, so `Err(idx) => idx != length` either.
+    //
+    // This means that we can avoid bounds checking for the accesses below, too.
+    unsafe { crate::hint::assert_unchecked(last_idx < SOR) };
 
     let mut offset_idx = decode_length(short_offset_runs[last_idx]);
     let length = if let Some(next) = short_offset_runs.get(last_idx + 1) {
diff --git a/src/tools/unicode-table-generator/src/skiplist.rs b/src/tools/unicode-table-generator/src/skiplist.rs
index 8ca18ddc91a..8dd3eeb8735 100644
--- a/src/tools/unicode-table-generator/src/skiplist.rs
+++ b/src/tools/unicode-table-generator/src/skiplist.rs
@@ -108,11 +108,21 @@ impl RawEmitter {
         } else {
             writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
         }
-        writeln!(&mut self.file, "    super::skip_search(",).unwrap();
-        writeln!(&mut self.file, "        c as u32,").unwrap();
-        writeln!(&mut self.file, "        &SHORT_OFFSET_RUNS,").unwrap();
-        writeln!(&mut self.file, "        &OFFSETS,").unwrap();
-        writeln!(&mut self.file, "    )").unwrap();
+        writeln!(
+            &mut self.file,
+            "    const {{ assert!(*SHORT_OFFSET_RUNS.last().unwrap() > (char::MAX as u32)); }}",
+        )
+        .unwrap();
+        writeln!(
+            &mut self.file,
+            "    // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`.",
+        )
+        .unwrap();
+        writeln!(
+            &mut self.file,
+            "    unsafe {{ super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }}"
+        )
+        .unwrap();
         writeln!(&mut self.file, "}}").unwrap();
     }
 }