about summary refs log tree commit diff
diff options
context:
space:
mode:
authorLeón Orell Valerian Liehr <me@fmease.dev>2024-10-23 22:11:02 +0200
committerGitHub <noreply@github.com>2024-10-23 22:11:02 +0200
commitaf2c7dffda76492ff677e3c0a43774edac5eaa6e (patch)
tree61f55d5f0ebdda2595f32adac060bdf8367088de
parentbe01dabfefd2daa4574b974f571c7852085d60cb (diff)
parentc11bfd828ee4bace0b547c361c1088e03844af83 (diff)
downloadrust-af2c7dffda76492ff677e3c0a43774edac5eaa6e.tar.gz
rust-af2c7dffda76492ff677e3c0a43774edac5eaa6e.zip
Rollup merge of #130991 - LaihoE:vectorized_slice_contains, r=Noratrieb
Vectorized SliceContains

Godbolt for the u32 case: https://rust.godbolt.org/z/exT9xYWGs

Unsure about:
- Should align_to be used? It didn't seem to matter in my benchmark but maybe I was lucky with alignment?
- Should u8/i8 also be implemented? Currently uses memchr (SWAR)

Some benchmarks on x86 (contains called on an array with no matches, worst case may be slightly worse):

## Large N
![large_n_contains](https://github.com/user-attachments/assets/5be79072-970b-44be-a56c-16dc677dee46)

## Small N
![small_n_contains](https://github.com/user-attachments/assets/b8a33790-c176-459f-84f4-05feee893cd0)
-rw-r--r--library/core/src/slice/cmp.rs26
1 files changed, 26 insertions, 0 deletions
diff --git a/library/core/src/slice/cmp.rs b/library/core/src/slice/cmp.rs
index 1769612def0..9cb00644e64 100644
--- a/library/core/src/slice/cmp.rs
+++ b/library/core/src/slice/cmp.rs
@@ -257,3 +257,29 @@ impl SliceContains for i8 {
         memchr::memchr(byte, bytes).is_some()
     }
 }
+
+macro_rules! impl_slice_contains {
+    ($($t:ty),*) => {
+        $(
+            impl SliceContains for $t {
+                #[inline]
+                fn slice_contains(&self, arr: &[$t]) -> bool {
+                    // Make our LANE_COUNT 4x the normal lane count (aiming for 128 bit vectors).
+                    // The compiler will nicely unroll it.
+                    const LANE_COUNT: usize = 4 * (128 / (mem::size_of::<$t>() * 8));
+                    // SIMD
+                    let mut chunks = arr.chunks_exact(LANE_COUNT);
+                    for chunk in &mut chunks {
+                        if chunk.iter().fold(false, |acc, x| acc | (*x == *self)) {
+                            return true;
+                        }
+                    }
+                    // Scalar remainder
+                    return chunks.remainder().iter().any(|x| *x == *self);
+                }
+            }
+        )*
+    };
+}
+
+impl_slice_contains!(u16, u32, u64, i16, i32, i64, f32, f64, usize, isize);