about summary refs log tree commit diff
diff options
context:
space:
mode:
author Caleb Zulawski <caleb.zulawski@gmail.com> 2024-08-26 21:30:39 -0400
committer GitHub <noreply@github.com> 2024-08-26 21:30:39 -0400
commit f6519c5d70666bb654515aec637c5e5048745ad0 (patch)
tree 8e1ce12c4df379708303495894b42baec0e40d2a
parent 4697d394138fddbf062dec154270f2a43c4cf64e (diff)
parent d5abbfa9786552ca516574bd4aa44a39665919ee (diff)
download rust-f6519c5d70666bb654515aec637c5e5048745ad0.tar.gz
rust-f6519c5d70666bb654515aec637c5e5048745ad0.zip
Merge pull request #431 from cvijdea-bd/fix-swizzle-dyn-vbmi
Fix avx512vbmi swizzle_dyn implementation
-rw-r--r-- crates/core_simd/src/swizzle_dyn.rs 30
1 files changed, 24 insertions, 6 deletions
diff --git a/crates/core_simd/src/swizzle_dyn.rs b/crates/core_simd/src/swizzle_dyn.rs
index 8a1079042f0..eaf297ba3e3 100644
--- a/crates/core_simd/src/swizzle_dyn.rs
+++ b/crates/core_simd/src/swizzle_dyn.rs
@@ -60,12 +60,30 @@ where
                 #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
                 32 => transize(avx2_pshufb, self, idxs),
                 #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
-                32 => transize(x86::_mm256_permutexvar_epi8, zeroing_idxs(idxs), self),
-                // Notable absence: avx512bw shuffle
-                // If avx512bw is available, odds of avx512vbmi are good
-                // FIXME: initial AVX512VBMI variant didn't actually pass muster
-                // #[cfg(target_feature = "avx512vbmi")]
-                // 64 => transize(x86::_mm512_permutexvar_epi8, self, idxs),
+                32 => {
+                    // Unlike vpshufb, vpermb doesn't zero out values in the result based on the index high bit
+                    let swizzler = |bytes, idxs| {
+                        let mask = x86::_mm256_cmp_epu8_mask::<{ x86::_MM_CMPINT_LT }>(
+                            idxs,
+                            Simd::<u8, 32>::splat(N as u8).into(),
+                        );
+                        x86::_mm256_maskz_permutexvar_epi8(mask, idxs, bytes)
+                    };
+                    transize(swizzler, self, idxs)
+                }
+                // Notable absence: avx512bw pshufb shuffle
+                #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
+                64 => {
+                    // Unlike vpshufb, vpermb doesn't zero out values in the result based on the index high bit
+                    let swizzler = |bytes, idxs| {
+                        let mask = x86::_mm512_cmp_epu8_mask::<{ x86::_MM_CMPINT_LT }>(
+                            idxs,
+                            Simd::<u8, 64>::splat(N as u8).into(),
+                        );
+                        x86::_mm512_maskz_permutexvar_epi8(mask, idxs, bytes)
+                    };
+                    transize(swizzler, self, idxs)
+                }
                 _ => {
                     let mut array = [0; N];
                     for (i, k) in idxs.to_array().into_iter().enumerate() {