about summary refs log tree commit diff
diff options
context:
space:
mode:
authorHans Kratz <hans@appfour.com>2024-10-23 00:42:29 +0200
committerGitHub <noreply@github.com>2024-10-22 15:42:29 -0700
commit7e162d19dd2e245dbba0e37fd12fe2cdaafdfed8 (patch)
tree6deba2eccab0d851f6f5f7d4a61516e1315e3ee0
parent158e2409fe479cd509d1549ce629a40a1588f1bf (diff)
downloadrust-7e162d19dd2e245dbba0e37fd12fe2cdaafdfed8.tar.gz
rust-7e162d19dd2e245dbba0e37fd12fe2cdaafdfed8.zip
rust-lang/portable-simd#443: Add armv7 neon implementation for `Simd<u8, 16>::swizzle_dyn`
Use arm neon intrinsics to swizzle two u8x8 blocks with a u8x8x2 lookup table.
-rw-r--r--crates/core_simd/src/swizzle_dyn.rs29
1 files changed, 29 insertions, 0 deletions
diff --git a/crates/core_simd/src/swizzle_dyn.rs b/crates/core_simd/src/swizzle_dyn.rs
index eaf297ba3e3..0619404e5f7 100644
--- a/crates/core_simd/src/swizzle_dyn.rs
+++ b/crates/core_simd/src/swizzle_dyn.rs
@@ -57,6 +57,13 @@ where
                     target_endian = "little"
                 ))]
                 16 => transize(vqtbl1q_u8, self, idxs),
+                #[cfg(all(
+                    target_arch = "arm",
+                    target_feature = "v7",
+                    target_feature = "neon",
+                    target_endian = "little"
+                ))]
+                16 => transize(armv7_neon_swizzle_u8x16, self, idxs),
                 #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
                 32 => transize(avx2_pshufb, self, idxs),
                 #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
@@ -98,6 +105,28 @@ where
     }
 }
 
+/// Swizzles a `u8x16` on armv7 neon: each `u8x8` half of `idxs` is looked up
+/// in a `u8x8x2` table built from the low and high halves of `bytes`.
+///
+/// # Safety
+/// The caller must ensure that armv7 neon is available.
+#[cfg(all(
+    target_arch = "arm",
+    target_feature = "v7",
+    target_feature = "neon",
+    target_endian = "little"
+))]
+unsafe fn armv7_neon_swizzle_u8x16(bytes: Simd<u8, 16>, idxs: Simd<u8, 16>) -> Simd<u8, 16> {
+    use core::arch::arm::{uint8x8x2_t, vcombine_u8, vget_high_u8, vget_low_u8, vtbl2_u8};
+    // SAFETY: the caller guarantees arm neon support (see `# Safety` above)
+    unsafe {
+        let bytes = uint8x8x2_t(vget_low_u8(bytes.into()), vget_high_u8(bytes.into()));
+        let lo = vtbl2_u8(bytes, vget_low_u8(idxs.into()));
+        let hi = vtbl2_u8(bytes, vget_high_u8(idxs.into()));
+        vcombine_u8(lo, hi).into()
+    }
+}
+
 /// "vpshufb like it was meant to be" on AVX2
 ///
 /// # Safety