diff options
| author | León Orell Valerian Liehr <me@fmease.dev> | 2024-10-23 22:11:02 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-10-23 22:11:02 +0200 |
| commit | af2c7dffda76492ff677e3c0a43774edac5eaa6e (patch) | |
| tree | 61f55d5f0ebdda2595f32adac060bdf8367088de | |
| parent | be01dabfefd2daa4574b974f571c7852085d60cb (diff) | |
| parent | c11bfd828ee4bace0b547c361c1088e03844af83 (diff) | |
| download | rust-af2c7dffda76492ff677e3c0a43774edac5eaa6e.tar.gz rust-af2c7dffda76492ff677e3c0a43774edac5eaa6e.zip | |
Rollup merge of #130991 - LaihoE:vectorized_slice_contains, r=Noratrieb
Vectorized SliceContains Godbolt for the u32 case: https://rust.godbolt.org/z/exT9xYWGs Unsure about: - Should align_to be used? It didn't seem to matter in my benchmark but maybe I was lucky with alignment? - Should u8/i8 also be implemented? Currently uses memchr (SWAR) Some benchmarks on x86 (contains called on an array with no matches, worst case may be slightly worse): ## Large N  ## Small N 
| -rw-r--r-- | library/core/src/slice/cmp.rs | 26 |
1 files changed, 26 insertions, 0 deletions
diff --git a/library/core/src/slice/cmp.rs b/library/core/src/slice/cmp.rs index 1769612def0..9cb00644e64 100644 --- a/library/core/src/slice/cmp.rs +++ b/library/core/src/slice/cmp.rs @@ -257,3 +257,29 @@ impl SliceContains for i8 { memchr::memchr(byte, bytes).is_some() } } + +macro_rules! impl_slice_contains { + ($($t:ty),*) => { + $( + impl SliceContains for $t { + #[inline] + fn slice_contains(&self, arr: &[$t]) -> bool { + // Make our LANE_COUNT 4x the normal lane count (aiming for 128 bit vectors). + // The compiler will nicely unroll it. + const LANE_COUNT: usize = 4 * (128 / (mem::size_of::<$t>() * 8)); + // SIMD + let mut chunks = arr.chunks_exact(LANE_COUNT); + for chunk in &mut chunks { + if chunk.iter().fold(false, |acc, x| acc | (*x == *self)) { + return true; + } + } + // Scalar remainder + return chunks.remainder().iter().any(|x| *x == *self); + } + } + )* + }; +} + +impl_slice_contains!(u16, u32, u64, i16, i32, i64, f32, f64, usize, isize); |
