diff options
| author | Jacob Bramley <jacob.bramley@arm.com> | 2023-05-31 15:08:51 +0100 |
|---|---|---|
| committer | Amanieu d'Antras <amanieu@gmail.com> | 2023-06-21 18:52:21 +0200 |
| commit | a9fecd8456bdfe1f1234dc26534fe0059cd24862 (patch) | |
| tree | b69e0ff6f8051631710f0b86a6c84f8085b33da1 | |
| parent | 1e15fa3f0a492be3666cc6b9bfe4d82b2efa2c5c (diff) | |
| download | rust-a9fecd8456bdfe1f1234dc26534fe0059cd24862.tar.gz rust-a9fecd8456bdfe1f1234dc26534fe0059cd24862.zip | |
Support AArch32 Neon dotprod intrinsics.
Note that the runtime feature detection requires a recent Linux kernel (v6.2 or later).
7 files changed, 305 insertions, 236 deletions
diff --git a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs index 25c119cbe34..72fdceb77b1 100644 --- a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs +++ b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs @@ -10557,80 +10557,7 @@ pub unsafe fn vcmlaq_rot270_laneq_f32<const LANE: i32>(a: float32x4_t, b: float3 vcmlaq_rot270_f32(a, b, c) } -/// Dot product arithmetic -/// -/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_s32) -#[inline] -#[target_feature(enable = "neon,dotprod")] -#[cfg_attr(test, assert_instr(sdot))] -pub unsafe fn vdot_s32(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sdot.v2i32.v8i8")] - fn vdot_s32_(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t; - } - vdot_s32_(a, b, c) -} - -/// Dot product arithmetic -/// -/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_s32) -#[inline] -#[target_feature(enable = "neon,dotprod")] -#[cfg_attr(test, assert_instr(sdot))] -pub unsafe fn vdotq_s32(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sdot.v4i32.v16i8")] - fn vdotq_s32_(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t; - } - vdotq_s32_(a, b, c) -} - -/// Dot product arithmetic -/// -/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_u32) -#[inline] -#[target_feature(enable = "neon,dotprod")] -#[cfg_attr(test, assert_instr(udot))] -pub unsafe fn vdot_u32(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "aarch64", link_name = 
"llvm.aarch64.neon.udot.v2i32.v8i8")] - fn vdot_u32_(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t; - } - vdot_u32_(a, b, c) -} - -/// Dot product arithmetic -/// -/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_u32) -#[inline] -#[target_feature(enable = "neon,dotprod")] -#[cfg_attr(test, assert_instr(udot))] -pub unsafe fn vdotq_u32(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.udot.v4i32.v16i8")] - fn vdotq_u32_(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t; - } - vdotq_u32_(a, b, c) -} - -/// Dot product arithmetic -/// -/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_lane_s32) -#[inline] -#[target_feature(enable = "neon,dotprod")] -#[cfg_attr(test, assert_instr(sdot, LANE = 0))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vdot_lane_s32<const LANE: i32>(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t { - static_assert_uimm_bits!(LANE, 1); - let c: int8x8_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]); - vdot_s32(a, b, c) -} - -/// Dot product arithmetic +/// Dot product arithmetic (indexed) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_laneq_s32) #[inline] @@ -10639,24 +10566,12 @@ pub unsafe fn vdot_lane_s32<const LANE: i32>(a: int32x2_t, b: int8x8_t, c: int8x #[rustc_legacy_const_generics(3)] pub unsafe fn vdot_laneq_s32<const LANE: i32>(a: int32x2_t, b: int8x8_t, c: int8x16_t) -> int32x2_t { static_assert_uimm_bits!(LANE, 2); - let c: int8x8_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 
+ 2, 4 * LANE as u32 + 3]); - vdot_s32(a, b, c) -} - -/// Dot product arithmetic -/// -/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_lane_s32) -#[inline] -#[target_feature(enable = "neon,dotprod")] -#[cfg_attr(test, assert_instr(sdot, LANE = 0))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vdotq_lane_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: int8x8_t) -> int32x4_t { - static_assert_uimm_bits!(LANE, 1); - let c: int8x16_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]); - vdotq_s32(a, b, c) + let c: int32x4_t = transmute(c); + let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vdot_s32(a, b, transmute(c)) } -/// Dot product arithmetic +/// Dot product arithmetic (indexed) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_laneq_s32) #[inline] @@ -10665,24 +10580,12 @@ pub unsafe fn vdotq_lane_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: int #[rustc_legacy_const_generics(3)] pub unsafe fn vdotq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 2); - let c: int8x16_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]); - vdotq_s32(a, b, c) -} - -/// Dot product arithmetic -/// -/// [Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_lane_u32) -#[inline] -#[target_feature(enable = "neon,dotprod")] -#[cfg_attr(test, assert_instr(udot, LANE = 0))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vdot_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t { - static_assert_uimm_bits!(LANE, 1); - let c: uint8x8_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]); - vdot_u32(a, b, c) + let c: int32x4_t = transmute(c); + let c: int32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vdotq_s32(a, b, transmute(c)) } -/// Dot product arithmetic +/// Dot product arithmetic (indexed) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_laneq_u32) #[inline] @@ -10691,24 +10594,12 @@ pub unsafe fn vdot_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint8x8_t, c: uin #[rustc_legacy_const_generics(3)] pub unsafe fn vdot_laneq_u32<const LANE: i32>(a: uint32x2_t, b: uint8x8_t, c: uint8x16_t) -> uint32x2_t { static_assert_uimm_bits!(LANE, 2); - let c: uint8x8_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]); - vdot_u32(a, b, c) -} - -/// Dot product arithmetic -/// -/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_lane_u32) -#[inline] -#[target_feature(enable = "neon,dotprod")] -#[cfg_attr(test, assert_instr(udot, LANE = 0))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vdotq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint8x16_t, c: uint8x8_t) -> uint32x4_t { - static_assert_uimm_bits!(LANE, 1); - let c: uint8x16_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 
* LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]); - vdotq_u32(a, b, c) + let c: uint32x4_t = transmute(c); + let c: uint32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vdot_u32(a, b, transmute(c)) } -/// Dot product arithmetic +/// Dot product arithmetic (indexed) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_laneq_u32) #[inline] @@ -10717,8 +10608,9 @@ pub unsafe fn vdotq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint8x16_t, c: u #[rustc_legacy_const_generics(3)] pub unsafe fn vdotq_laneq_u32<const LANE: i32>(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t { static_assert_uimm_bits!(LANE, 2); - let c: uint8x16_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]); - vdotq_u32(a, b, c) + let c: uint32x4_t = transmute(c); + let c: uint32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vdotq_u32(a, b, transmute(c)) } /// Maximum (vector) @@ -23760,121 +23652,41 @@ mod test { } #[simd_test(enable = "neon,dotprod")] - unsafe fn test_vdot_s32() { - let a: i32x2 = i32x2::new(1, 2); - let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let c: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let e: i32x2 = i32x2::new(31, 176); - let r: i32x2 = transmute(vdot_s32(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon,dotprod")] - unsafe fn test_vdotq_s32() { - let a: i32x4 = i32x4::new(1, 2, 1, 2); - let b: i8x16 = 
i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); - let c: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); - let e: i32x4 = i32x4::new(31, 176, 31, 176); - let r: i32x4 = transmute(vdotq_s32(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon,dotprod")] - unsafe fn test_vdot_u32() { - let a: u32x2 = u32x2::new(1, 2); - let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let c: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let e: u32x2 = u32x2::new(31, 176); - let r: u32x2 = transmute(vdot_u32(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon,dotprod")] - unsafe fn test_vdotq_u32() { - let a: u32x4 = u32x4::new(1, 2, 1, 2); - let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); - let c: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); - let e: u32x4 = u32x4::new(31, 176, 31, 176); - let r: u32x4 = transmute(vdotq_u32(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon,dotprod")] - unsafe fn test_vdot_lane_s32() { - let a: i32x2 = i32x2::new(1, 2); - let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let c: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let e: i32x2 = i32x2::new(31, 72); - let r: i32x2 = transmute(vdot_lane_s32::<0>(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon,dotprod")] unsafe fn test_vdot_laneq_s32() { let a: i32x2 = i32x2::new(1, 2); - let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let b: i8x8 = i8x8::new(-1, 2, 3, 4, 5, 6, 7, 8); let c: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); - let e: i32x2 = i32x2::new(31, 72); + let e: i32x2 = i32x2::new(29, 72); let r: i32x2 = transmute(vdot_laneq_s32::<0>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon,dotprod")] - unsafe fn test_vdotq_lane_s32() { - let a: i32x4 = i32x4::new(1, 
2, 1, 2); - let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); - let c: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let e: i32x4 = i32x4::new(31, 72, 31, 72); - let r: i32x4 = transmute(vdotq_lane_s32::<0>(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon,dotprod")] unsafe fn test_vdotq_laneq_s32() { let a: i32x4 = i32x4::new(1, 2, 1, 2); - let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); + let b: i8x16 = i8x16::new(-1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); let c: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); - let e: i32x4 = i32x4::new(31, 72, 31, 72); + let e: i32x4 = i32x4::new(29, 72, 31, 72); let r: i32x4 = transmute(vdotq_laneq_s32::<0>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon,dotprod")] - unsafe fn test_vdot_lane_u32() { - let a: u32x2 = u32x2::new(1, 2); - let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let c: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let e: u32x2 = u32x2::new(31, 72); - let r: u32x2 = transmute(vdot_lane_u32::<0>(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon,dotprod")] unsafe fn test_vdot_laneq_u32() { let a: u32x2 = u32x2::new(1, 2); - let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let b: u8x8 = u8x8::new(255, 2, 3, 4, 5, 6, 7, 8); let c: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); - let e: u32x2 = u32x2::new(31, 72); + let e: u32x2 = u32x2::new(285, 72); let r: u32x2 = transmute(vdot_laneq_u32::<0>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon,dotprod")] - unsafe fn test_vdotq_lane_u32() { - let a: u32x4 = u32x4::new(1, 2, 1, 2); - let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); - let c: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let e: u32x4 = u32x4::new(31, 72, 31, 72); - let r: u32x4 = 
transmute(vdotq_lane_u32::<0>(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon,dotprod")] unsafe fn test_vdotq_laneq_u32() { let a: u32x4 = u32x4::new(1, 2, 1, 2); - let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); + let b: u8x16 = u8x16::new(255, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); let c: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); - let e: u32x4 = u32x4::new(31, 72, 31, 72); + let e: u32x4 = u32x4::new(285, 72, 31, 72); let r: u32x4 = transmute(vdotq_laneq_u32::<0>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs index 6382607f990..2f3be778aee 100644 --- a/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs @@ -18837,6 +18837,142 @@ pub unsafe fn vsubl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t { simd_sub(c, d) } +/// Dot product arithmetic (vector) +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_s32) +#[inline] +#[target_feature(enable = "neon,dotprod")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsdot))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sdot))] +pub unsafe fn vdot_s32(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sdot.v2i32.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sdot.v2i32.v8i8")] + fn vdot_s32_(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t; + } +vdot_s32_(a, b, c) +} + +/// Dot product arithmetic (vector) +/// +/// [Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_s32) +#[inline] +#[target_feature(enable = "neon,dotprod")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsdot))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sdot))] +pub unsafe fn vdotq_s32(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.sdot.v4i32.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sdot.v4i32.v16i8")] + fn vdotq_s32_(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t; + } +vdotq_s32_(a, b, c) +} + +/// Dot product arithmetic (vector) +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_u32) +#[inline] +#[target_feature(enable = "neon,dotprod")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vudot))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(udot))] +pub unsafe fn vdot_u32(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.udot.v2i32.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.udot.v2i32.v8i8")] + fn vdot_u32_(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t; + } +vdot_u32_(a, b, c) +} + +/// Dot product arithmetic (vector) +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_u32) +#[inline] +#[target_feature(enable = "neon,dotprod")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vudot))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(udot))] +pub unsafe fn vdotq_u32(a: 
uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.udot.v4i32.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.udot.v4i32.v16i8")] + fn vdotq_u32_(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t; + } +vdotq_u32_(a, b, c) +} + +/// Dot product arithmetic (indexed) +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_lane_s32) +#[inline] +#[target_feature(enable = "neon,dotprod")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsdot, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sdot, LANE = 0))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vdot_lane_s32<const LANE: i32>(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t { + static_assert_uimm_bits!(LANE, 1); + let c: int32x2_t = transmute(c); + let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vdot_s32(a, b, transmute(c)) +} + +/// Dot product arithmetic (indexed) +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_lane_s32) +#[inline] +#[target_feature(enable = "neon,dotprod")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsdot, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sdot, LANE = 0))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vdotq_lane_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: int8x8_t) -> int32x4_t { + static_assert_uimm_bits!(LANE, 1); + let c: int32x2_t = transmute(c); + let c: int32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vdotq_s32(a, b, transmute(c)) +} + +/// Dot product arithmetic (indexed) +/// +/// [Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_lane_u32) +#[inline] +#[target_feature(enable = "neon,dotprod")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vudot, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(udot, LANE = 0))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vdot_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t { + static_assert_uimm_bits!(LANE, 1); + let c: uint32x2_t = transmute(c); + let c: uint32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); + vdot_u32(a, b, transmute(c)) +} + +/// Dot product arithmetic (indexed) +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_lane_u32) +#[inline] +#[target_feature(enable = "neon,dotprod")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vudot, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(udot, LANE = 0))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vdotq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint8x16_t, c: uint8x8_t) -> uint32x4_t { + static_assert_uimm_bits!(LANE, 1); + let c: uint32x2_t = transmute(c); + let c: uint32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); + vdotq_u32(a, b, transmute(c)) +} + /// Maximum (vector) /// /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmax_s8) @@ -39239,6 +39375,86 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon,dotprod")] + unsafe fn test_vdot_s32() { + let a: i32x2 = i32x2::new(1, 2); + let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let c: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: i32x2 = i32x2::new(31, 176); + let r: i32x2 = transmute(vdot_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); 
+ } + + #[simd_test(enable = "neon,dotprod")] + unsafe fn test_vdotq_s32() { + let a: i32x4 = i32x4::new(1, 2, 1, 2); + let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); + let c: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); + let e: i32x4 = i32x4::new(31, 176, 31, 176); + let r: i32x4 = transmute(vdotq_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,dotprod")] + unsafe fn test_vdot_u32() { + let a: u32x2 = u32x2::new(1, 2); + let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let c: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: u32x2 = u32x2::new(31, 176); + let r: u32x2 = transmute(vdot_u32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,dotprod")] + unsafe fn test_vdotq_u32() { + let a: u32x4 = u32x4::new(1, 2, 1, 2); + let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); + let c: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); + let e: u32x4 = u32x4::new(31, 176, 31, 176); + let r: u32x4 = transmute(vdotq_u32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,dotprod")] + unsafe fn test_vdot_lane_s32() { + let a: i32x2 = i32x2::new(1, 2); + let b: i8x8 = i8x8::new(-1, 2, 3, 4, 5, 6, 7, 8); + let c: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: i32x2 = i32x2::new(29, 72); + let r: i32x2 = transmute(vdot_lane_s32::<0>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,dotprod")] + unsafe fn test_vdotq_lane_s32() { + let a: i32x4 = i32x4::new(1, 2, 1, 2); + let b: i8x16 = i8x16::new(-1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); + let c: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: i32x4 = i32x4::new(29, 72, 31, 72); + let r: i32x4 = transmute(vdotq_lane_s32::<0>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = 
"neon,dotprod")] + unsafe fn test_vdot_lane_u32() { + let a: u32x2 = u32x2::new(1, 2); + let b: u8x8 = u8x8::new(255, 2, 3, 4, 5, 6, 7, 8); + let c: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: u32x2 = u32x2::new(285, 72); + let r: u32x2 = transmute(vdot_lane_u32::<0>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,dotprod")] + unsafe fn test_vdotq_lane_u32() { + let a: u32x4 = u32x4::new(1, 2, 1, 2); + let b: u8x16 = u8x16::new(255, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8); + let c: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: u32x4 = u32x4::new(285, 72, 31, 72); + let r: u32x4 = transmute(vdotq_lane_u32::<0>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vmax_s8() { let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); diff --git a/library/stdarch/crates/intrinsic-test/missing_arm.txt b/library/stdarch/crates/intrinsic-test/missing_arm.txt index 07524b67790..7439cd6e664 100644 --- a/library/stdarch/crates/intrinsic-test/missing_arm.txt +++ b/library/stdarch/crates/intrinsic-test/missing_arm.txt @@ -160,14 +160,6 @@ vcvtpq_s32_f32 vcvtpq_u32_f32 vcvtp_s32_f32 vcvtp_u32_f32 -vdot_lane_s32 -vdot_lane_u32 -vdotq_lane_s32 -vdotq_lane_u32 -vdotq_s32 -vdotq_u32 -vdot_s32 -vdot_u32 vqdmulh_lane_s16 vqdmulh_lane_s32 vqdmulhq_lane_s16 diff --git a/library/stdarch/crates/std_detect/src/detect/arch/arm.rs b/library/stdarch/crates/std_detect/src/detect/arch/arm.rs index a7dea27fb3f..fd332e0b2ca 100644 --- a/library/stdarch/crates/std_detect/src/detect/arch/arm.rs +++ b/library/stdarch/crates/std_detect/src/detect/arch/arm.rs @@ -22,5 +22,7 @@ features! 
{ @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] sha2: "sha2"; /// FEAT_SHA1 & FEAT_SHA256 (SHA1 & SHA2-256 instructions) @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] i8mm: "i8mm"; - /// FEAT_I8MM + /// FEAT_I8MM (integer matrix multiplication, plus ASIMD support) + @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] dotprod: "dotprod"; + /// FEAT_DotProd (Vector Dot-Product - ASIMDDP) } diff --git a/library/stdarch/crates/std_detect/src/detect/os/linux/arm.rs b/library/stdarch/crates/std_detect/src/detect/os/linux/arm.rs index 7601cf0a841..4dc9590e18b 100644 --- a/library/stdarch/crates/std_detect/src/detect/os/linux/arm.rs +++ b/library/stdarch/crates/std_detect/src/detect/os/linux/arm.rs @@ -17,6 +17,8 @@ pub(crate) fn detect_features() -> cache::Initializer { // // [hwcap]: https://github.com/torvalds/linux/blob/master/arch/arm/include/uapi/asm/hwcap.h if let Ok(auxv) = auxvec::auxv() { + enable_feature(&mut value, Feature::i8mm, bit::test(auxv.hwcap, 27)); + enable_feature(&mut value, Feature::dotprod, bit::test(auxv.hwcap, 24)); enable_feature(&mut value, Feature::neon, bit::test(auxv.hwcap, 12)); enable_feature(&mut value, Feature::pmull, bit::test(auxv.hwcap2, 1)); enable_feature(&mut value, Feature::crc, bit::test(auxv.hwcap2, 4)); @@ -37,6 +39,12 @@ pub(crate) fn detect_features() -> cache::Initializer { Feature::neon, c.field("Features").has("neon") && !has_broken_neon(&c), ); + enable_feature(&mut value, Feature::i8mm, c.field("Features").has("i8mm")); + enable_feature( + &mut value, + Feature::dotprod, + c.field("Features").has("asimddp"), + ); enable_feature(&mut value, Feature::pmull, c.field("Features").has("pmull")); enable_feature(&mut value, Feature::crc, c.field("Features").has("crc32")); enable_feature(&mut value, Feature::aes, c.field("Features").has("aes")); diff --git a/library/stdarch/crates/std_detect/tests/cpu-detection.rs b/library/stdarch/crates/std_detect/tests/cpu-detection.rs index 
38bdb5bbd1b..f93212d24f6 100644 --- a/library/stdarch/crates/std_detect/tests/cpu-detection.rs +++ b/library/stdarch/crates/std_detect/tests/cpu-detection.rs @@ -20,16 +20,25 @@ fn all() { } #[test] -#[cfg(all( - target_arch = "arm", - any(target_os = "linux", target_os = "android", target_os = "freebsd"), -))] -fn arm_linux_or_freebsd() { +#[cfg(all(target_arch = "arm", any(target_os = "freebsd"),))] +fn arm_freebsd() { + println!("neon: {}", is_arm_feature_detected!("neon")); + println!("pmull: {}", is_arm_feature_detected!("pmull")); + println!("crc: {}", is_arm_feature_detected!("crc")); + println!("aes: {}", is_arm_feature_detected!("aes")); + println!("sha2: {}", is_arm_feature_detected!("sha2")); +} + +#[test] +#[cfg(all(target_arch = "arm", any(target_os = "linux", target_os = "android"),))] +fn arm_linux() { println!("neon: {}", is_arm_feature_detected!("neon")); println!("pmull: {}", is_arm_feature_detected!("pmull")); println!("crc: {}", is_arm_feature_detected!("crc")); println!("aes: {}", is_arm_feature_detected!("aes")); println!("sha2: {}", is_arm_feature_detected!("sha2")); + println!("dotprod: {}", is_arm_feature_detected!("dotprod")); + println!("i8mm: {}", is_arm_feature_detected!("i8mm")); } #[test] diff --git a/library/stdarch/crates/stdarch-gen/neon.spec b/library/stdarch/crates/stdarch-gen/neon.spec index 5aaa7305b58..b3ca359e6f9 100644 --- a/library/stdarch/crates/stdarch-gen/neon.spec +++ b/library/stdarch/crates/stdarch-gen/neon.spec @@ -4723,7 +4723,7 @@ aarch64 = fcmla generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t generate float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t -/// Dot product arithmetic +/// Dot product arithmetic (vector) name = vdot out-suffix a = 1, 2, 1, 2 @@ -4732,35 +4732,65 @@ c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 validate 31, 176, 31, 176 target = dotprod +arm = vsdot aarch64 = sdot +link-arm = sdot._EXT_._EXT3_ link-aarch64 = sdot._EXT_._EXT3_ generate 
int32x2_t:int8x8_t:int8x8_t:int32x2_t, int32x4_t:int8x16_t:int8x16_t:int32x4_t +arm = vudot aarch64 = udot +link-arm = udot._EXT_._EXT3_ link-aarch64 = udot._EXT_._EXT3_ generate uint32x2_t:uint8x8_t:uint8x8_t:uint32x2_t, uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t -/// Dot product arithmetic +/// Dot product arithmetic (indexed) name = vdot out-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_dot-LANE -multi_fn = simd_shuffle!, c:in_t, c, c, {base-4-LANE} -multi_fn = vdot-out-noext, a, b, c +multi_fn = transmute, c:merge4_t2, c +multi_fn = simd_shuffle!, c:out_t, c, c, {dup-out_len-LANE as u32} +multi_fn = vdot-out-noext, a, b, {transmute, c} a = 1, 2, 1, 2 -b = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 +b = -1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 n = 0 -validate 31, 72, 31, 72 +validate 29, 72, 31, 72 target = dotprod +// Only AArch64 has the laneq forms. aarch64 = sdot -generate int32x2_t:int8x8_t:int8x8_t:int32x2_t, int32x2_t:int8x8_t:int8x16_t:int32x2_t -generate int32x4_t:int8x16_t:int8x8_t:int32x4_t, int32x4_t:int8x16_t:int8x16_t:int32x4_t +generate int32x2_t:int8x8_t:int8x16_t:int32x2_t +generate int32x4_t:int8x16_t:int8x16_t:int32x4_t + +arm = vsdot +generate int32x2_t:int8x8_t:int8x8_t:int32x2_t +generate int32x4_t:int8x16_t:int8x8_t:int32x4_t + +/// Dot product arithmetic (indexed) +name = vdot +out-lane-suffixes +constn = LANE +multi_fn = static_assert_imm-in2_dot-LANE +multi_fn = transmute, c:merge4_t2, c +multi_fn = simd_shuffle!, c:out_t, c, c, {dup-out_len-LANE as u32} +multi_fn = vdot-out-noext, a, b, {transmute, c} +a = 1, 2, 1, 2 +b = 255, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 +c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 +n = 0 +validate 285, 72, 31, 72 +target = dotprod +// Only AArch64 has the laneq forms. 
aarch64 = udot -generate uint32x2_t:uint8x8_t:uint8x8_t:uint32x2_t, uint32x2_t:uint8x8_t:uint8x16_t:uint32x2_t -generate uint32x4_t:uint8x16_t:uint8x8_t:uint32x4_t, uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t +generate uint32x2_t:uint8x8_t:uint8x16_t:uint32x2_t +generate uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t + +arm = vudot +generate uint32x2_t:uint8x8_t:uint8x8_t:uint32x2_t +generate uint32x4_t:uint8x16_t:uint8x8_t:uint32x4_t /// Maximum (vector) name = vmax |
