Diffstat (limited to 'library/stdarch')
-rw-r--r--  library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs | 538
-rw-r--r--  library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs | 8
-rw-r--r--  library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs | 102
-rw-r--r--  library/stdarch/crates/core_arch/src/x86/macros.rs | 10
-rw-r--r--  library/stdarch/crates/core_arch/src/x86_64/macros.rs | 5
-rw-r--r--  library/stdarch/crates/stdarch-gen/neon.spec | 128
6 files changed, 448 insertions, 343 deletions
diff --git a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs
index 791e7707bc7..88fe4cb085c 100644
--- a/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs
+++ b/library/stdarch/crates/core_arch/src/aarch64/neon/generated.rs
@@ -3937,26 +3937,6 @@ pub unsafe fn vqsubh_s16(a: i16, b: i16) -> i16 {
 /// Saturating subtract
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(test, assert_instr(sqsub))]
-pub unsafe fn vqsubs_s32(a: i32, b: i32) -> i32 {
-    let a: int32x2_t = vdup_n_s32(a);
-    let b: int32x2_t = vdup_n_s32(b);
-    simd_extract(vqsub_s32(a, b), 0)
-}
-
-/// Saturating subtract
-#[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(test, assert_instr(sqsub))]
-pub unsafe fn vqsubd_s64(a: i64, b: i64) -> i64 {
-    let a: int64x1_t = vdup_n_s64(a);
-    let b: int64x1_t = vdup_n_s64(b);
-    simd_extract(vqsub_s64(a, b), 0)
-}
-
-/// Saturating subtract
-#[inline]
-#[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uqsub))]
 pub unsafe fn vqsubb_u8(a: u8, b: u8) -> u8 {
     let a: uint8x8_t = vdup_n_u8(a);
@@ -3979,9 +3959,12 @@ pub unsafe fn vqsubh_u16(a: u16, b: u16) -> u16 {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uqsub))]
 pub unsafe fn vqsubs_u32(a: u32, b: u32) -> u32 {
-    let a: uint32x2_t = vdup_n_u32(a);
-    let b: uint32x2_t = vdup_n_u32(b);
-    simd_extract(vqsub_u32(a, b), 0)
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.i32")]
+        fn vqsubs_u32_(a: u32, b: u32) -> u32;
+    }
+    vqsubs_u32_(a, b)
 }
 
 /// Saturating subtract
@@ -3989,9 +3972,38 @@ pub unsafe fn vqsubs_u32(a: u32, b: u32) -> u32 {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uqsub))]
 pub unsafe fn vqsubd_u64(a: u64, b: u64) -> u64 {
-    let a: uint64x1_t = vdup_n_u64(a);
-    let b: uint64x1_t = vdup_n_u64(b);
-    simd_extract(vqsub_u64(a, b), 0)
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.i64")]
+        fn vqsubd_u64_(a: u64, b: u64) -> u64;
+    }
+    vqsubd_u64_(a, b)
+}
+
+/// Saturating subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqsub))]
+pub unsafe fn vqsubs_s32(a: i32, b: i32) -> i32 {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.i32")]
+        fn vqsubs_s32_(a: i32, b: i32) -> i32;
+    }
+    vqsubs_s32_(a, b)
+}
+
+/// Saturating subtract
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqsub))]
+pub unsafe fn vqsubd_s64(a: i64, b: i64) -> i64 {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.i64")]
+        fn vqsubd_s64_(a: i64, b: i64) -> i64;
+    }
+    vqsubd_s64_(a, b)
 }
 
 /// Reverse bit order
@@ -4413,26 +4425,6 @@ pub unsafe fn vqaddh_s16(a: i16, b: i16) -> i16 {
 /// Saturating add
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(test, assert_instr(sqadd))]
-pub unsafe fn vqadds_s32(a: i32, b: i32) -> i32 {
-    let a: int32x2_t = vdup_n_s32(a);
-    let b: int32x2_t = vdup_n_s32(b);
-    simd_extract(vqadd_s32(a, b), 0)
-}
-
-/// Saturating add
-#[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(test, assert_instr(sqadd))]
-pub unsafe fn vqaddd_s64(a: i64, b: i64) -> i64 {
-    let a: int64x1_t = vdup_n_s64(a);
-    let b: int64x1_t = vdup_n_s64(b);
-    simd_extract(vqadd_s64(a, b), 0)
-}
-
-/// Saturating add
-#[inline]
-#[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uqadd))]
 pub unsafe fn vqaddb_u8(a: u8, b: u8) -> u8 {
     let a: uint8x8_t = vdup_n_u8(a);
@@ -4455,9 +4447,12 @@ pub unsafe fn vqaddh_u16(a: u16, b: u16) -> u16 {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uqadd))]
 pub unsafe fn vqadds_u32(a: u32, b: u32) -> u32 {
-    let a: uint32x2_t = vdup_n_u32(a);
-    let b: uint32x2_t = vdup_n_u32(b);
-    simd_extract(vqadd_u32(a, b), 0)
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.i32")]
+        fn vqadds_u32_(a: u32, b: u32) -> u32;
+    }
+    vqadds_u32_(a, b)
 }
 
 /// Saturating add
@@ -4465,9 +4460,38 @@ pub unsafe fn vqadds_u32(a: u32, b: u32) -> u32 {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uqadd))]
 pub unsafe fn vqaddd_u64(a: u64, b: u64) -> u64 {
-    let a: uint64x1_t = vdup_n_u64(a);
-    let b: uint64x1_t = vdup_n_u64(b);
-    simd_extract(vqadd_u64(a, b), 0)
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.i64")]
+        fn vqaddd_u64_(a: u64, b: u64) -> u64;
+    }
+    vqaddd_u64_(a, b)
+}
+
+/// Saturating add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqadd))]
+pub unsafe fn vqadds_s32(a: i32, b: i32) -> i32 {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.i32")]
+        fn vqadds_s32_(a: i32, b: i32) -> i32;
+    }
+    vqadds_s32_(a, b)
+}
+
+/// Saturating add
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqadd))]
+pub unsafe fn vqaddd_s64(a: i64, b: i64) -> i64 {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.i64")]
+        fn vqaddd_s64_(a: i64, b: i64) -> i64;
+    }
+    vqaddd_s64_(a, b)
 }
 
 /// Multiply
@@ -5935,14 +5959,6 @@ pub unsafe fn vqmovns_s32(a: i32) -> i16 {
 /// Saturating extract narrow
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(test, assert_instr(sqxtn))]
-pub unsafe fn vqmovnd_s64(a: i64) -> i32 {
-    simd_extract(vqmovn_s64(vdupq_n_s64(a)), 0)
-}
-
-/// Saturating extract narrow
-#[inline]
-#[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uqxtn))]
 pub unsafe fn vqmovnh_u16(a: u16) -> u8 {
     simd_extract(vqmovn_u16(vdupq_n_u16(a)), 0)
@@ -5959,9 +5975,27 @@ pub unsafe fn vqmovns_u32(a: u32) -> u16 {
 /// Saturating extract narrow
 #[inline]
 #[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqxtn))]
+pub unsafe fn vqmovnd_s64(a: i64) -> i32 {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.scalar.sqxtn.i32.i64")]
+        fn vqmovnd_s64_(a: i64) -> i32;
+    }
+    vqmovnd_s64_(a)
+}
+
+/// Saturating extract narrow
+#[inline]
+#[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uqxtn))]
 pub unsafe fn vqmovnd_u64(a: u64) -> u32 {
-    simd_extract(vqmovn_u64(vdupq_n_u64(a)), 0)
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.scalar.uqxtn.i32.i64")]
+        fn vqmovnd_u64_(a: u64) -> u32;
+    }
+    vqmovnd_u64_(a)
 }
 
 /// Signed saturating extract narrow
@@ -6232,80 +6266,92 @@ pub unsafe fn vqrdmlshs_laneq_s32<const LANE: i32>(a: i32, b: i32, c: int32x4_t)
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(sqrshl))]
-pub unsafe fn vqrshlb_s8(a: i8, b: i8) -> i8 {
-    let a: int8x8_t = vdup_n_s8(a);
-    let b: int8x8_t = vdup_n_s8(b);
-    simd_extract(vqrshl_s8(a, b), 0)
+pub unsafe fn vqrshls_s32(a: i32, b: i32) -> i32 {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.i32")]
+        fn vqrshls_s32_(a: i32, b: i32) -> i32;
+    }
+    vqrshls_s32_(a, b)
 }
 
 /// Signed saturating rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(sqrshl))]
-pub unsafe fn vqrshlh_s16(a: i16, b: i16) -> i16 {
-    let a: int16x4_t = vdup_n_s16(a);
-    let b: int16x4_t = vdup_n_s16(b);
-    simd_extract(vqrshl_s16(a, b), 0)
+pub unsafe fn vqrshld_s64(a: i64, b: i64) -> i64 {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.i64")]
+        fn vqrshld_s64_(a: i64, b: i64) -> i64;
+    }
+    vqrshld_s64_(a, b)
 }
 
 /// Signed saturating rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(sqrshl))]
-pub unsafe fn vqrshls_s32(a: i32, b: i32) -> i32 {
-    let a: int32x2_t = vdup_n_s32(a);
-    let b: int32x2_t = vdup_n_s32(b);
-    simd_extract(vqrshl_s32(a, b), 0)
+pub unsafe fn vqrshlb_s8(a: i8, b: i8) -> i8 {
+    let a: int8x8_t = vdup_n_s8(a);
+    let b: int8x8_t = vdup_n_s8(b);
+    simd_extract(vqrshl_s8(a, b), 0)
 }
 
 /// Signed saturating rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(sqrshl))]
-pub unsafe fn vqrshld_s64(a: i64, b: i64) -> i64 {
-    let a: int64x1_t = vdup_n_s64(a);
-    let b: int64x1_t = vdup_n_s64(b);
-    simd_extract(vqrshl_s64(a, b), 0)
+pub unsafe fn vqrshlh_s16(a: i16, b: i16) -> i16 {
+    let a: int16x4_t = vdup_n_s16(a);
+    let b: int16x4_t = vdup_n_s16(b);
+    simd_extract(vqrshl_s16(a, b), 0)
 }
 
 /// Unsigned signed saturating rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uqrshl))]
-pub unsafe fn vqrshlb_u8(a: u8, b: i8) -> u8 {
-    let a: uint8x8_t = vdup_n_u8(a);
-    let b: int8x8_t = vdup_n_s8(b);
-    simd_extract(vqrshl_u8(a, b), 0)
+pub unsafe fn vqrshls_u32(a: u32, b: i32) -> u32 {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.i32")]
+        fn vqrshls_u32_(a: u32, b: i32) -> u32;
+    }
+    vqrshls_u32_(a, b)
 }
 
 /// Unsigned signed saturating rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uqrshl))]
-pub unsafe fn vqrshlh_u16(a: u16, b: i16) -> u16 {
-    let a: uint16x4_t = vdup_n_u16(a);
-    let b: int16x4_t = vdup_n_s16(b);
-    simd_extract(vqrshl_u16(a, b), 0)
+pub unsafe fn vqrshld_u64(a: u64, b: i64) -> u64 {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.i64")]
+        fn vqrshld_u64_(a: u64, b: i64) -> u64;
+    }
+    vqrshld_u64_(a, b)
 }
 
 /// Unsigned signed saturating rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uqrshl))]
-pub unsafe fn vqrshls_u32(a: u32, b: i32) -> u32 {
-    let a: uint32x2_t = vdup_n_u32(a);
-    let b: int32x2_t = vdup_n_s32(b);
-    simd_extract(vqrshl_u32(a, b), 0)
+pub unsafe fn vqrshlb_u8(a: u8, b: i8) -> u8 {
+    let a: uint8x8_t = vdup_n_u8(a);
+    let b: int8x8_t = vdup_n_s8(b);
+    simd_extract(vqrshl_u8(a, b), 0)
 }
 
 /// Unsigned signed saturating rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uqrshl))]
-pub unsafe fn vqrshld_u64(a: u64, b: i64) -> u64 {
-    let a: uint64x1_t = vdup_n_u64(a);
-    let b: int64x1_t = vdup_n_s64(b);
-    simd_extract(vqrshl_u64(a, b), 0)
+pub unsafe fn vqrshlh_u16(a: u16, b: i16) -> u16 {
+    let a: uint16x4_t = vdup_n_u16(a);
+    let b: int16x4_t = vdup_n_s16(b);
+    simd_extract(vqrshl_u16(a, b), 0)
 }
 
 /// Signed saturating rounded shift right narrow
@@ -6501,6 +6547,19 @@ pub unsafe fn vqrshrun_high_n_s64<const N: i32>(a: uint32x2_t, b: int64x2_t) ->
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(sqshl))]
+pub unsafe fn vqshld_s64(a: i64, b: i64) -> i64 {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.i64")]
+        fn vqshld_s64_(a: i64, b: i64) -> i64;
+    }
+    vqshld_s64_(a, b)
+}
+
+/// Signed saturating shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(sqshl))]
 pub unsafe fn vqshlb_s8(a: i8, b: i8) -> i8 {
     let c: int8x8_t = vqshl_s8(vdup_n_s8(a), vdup_n_s8(b));
     simd_extract(c, 0)
@@ -6524,13 +6583,17 @@ pub unsafe fn vqshls_s32(a: i32, b: i32) -> i32 {
     simd_extract(c, 0)
 }
 
-/// Signed saturating shift left
+/// Unsigned saturating shift left
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(test, assert_instr(sqshl))]
-pub unsafe fn vqshld_s64(a: i64, b: i64) -> i64 {
-    let c: int64x1_t = vqshl_s64(vdup_n_s64(a), vdup_n_s64(b));
-    simd_extract(c, 0)
+#[cfg_attr(test, assert_instr(uqshl))]
+pub unsafe fn vqshld_u64(a: u64, b: i64) -> u64 {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.i64")]
+        fn vqshld_u64_(a: u64, b: i64) -> u64;
+    }
+    vqshld_u64_(a, b)
 }
 
 /// Unsigned saturating shift left
@@ -6560,15 +6623,6 @@ pub unsafe fn vqshls_u32(a: u32, b: i32) -> u32 {
     simd_extract(c, 0)
 }
 
-/// Unsigned saturating shift left
-#[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(test, assert_instr(uqshl))]
-pub unsafe fn vqshld_u64(a: u64, b: i64) -> u64 {
-    let c: uint64x1_t = vqshl_u64(vdup_n_u64(a), vdup_n_s64(b));
-    simd_extract(c, 0)
-}
-
 /// Signed saturating shift left
 #[inline]
 #[target_feature(enable = "neon")]
@@ -6654,9 +6708,14 @@ pub unsafe fn vqshld_n_u64<const N: i32>(a: u64) -> u64 {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(sqshrn, N = 2))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshrnh_n_s16<const N: i32>(a: i16) -> i8 {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    simd_extract(vqshrn_n_s16::<N>(vdupq_n_s16(a)), 0)
+pub unsafe fn vqshrnd_n_s64<const N: i32>(a: i64) -> i32 {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrn.i32")]
+        fn vqshrnd_n_s64_(a: i64, n: i32) -> i32;
+    }
+    vqshrnd_n_s64_(a, N)
 }
 
 /// Signed saturating shift right narrow
@@ -6664,9 +6723,9 @@ pub unsafe fn vqshrnh_n_s16<const N: i32>(a: i16) -> i8 {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(sqshrn, N = 2))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshrns_n_s32<const N: i32>(a: i32) -> i16 {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_extract(vqshrn_n_s32::<N>(vdupq_n_s32(a)), 0)
+pub unsafe fn vqshrnh_n_s16<const N: i32>(a: i16) -> i8 {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    simd_extract(vqshrn_n_s16::<N>(vdupq_n_s16(a)), 0)
 }
 
 /// Signed saturating shift right narrow
@@ -6674,9 +6733,9 @@ pub unsafe fn vqshrns_n_s32<const N: i32>(a: i32) -> i16 {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(sqshrn, N = 2))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshrnd_n_s64<const N: i32>(a: i64) -> i32 {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_extract(vqshrn_n_s64::<N>(vdupq_n_s64(a)), 0)
+pub unsafe fn vqshrns_n_s32<const N: i32>(a: i32) -> i16 {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    simd_extract(vqshrn_n_s32::<N>(vdupq_n_s32(a)), 0)
 }
 
 /// Signed saturating shift right narrow
@@ -6714,9 +6773,14 @@ pub unsafe fn vqshrn_high_n_s64<const N: i32>(a: int32x2_t, b: int64x2_t) -> int
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uqshrn, N = 2))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshrnh_n_u16<const N: i32>(a: u16) -> u8 {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    simd_extract(vqshrn_n_u16::<N>(vdupq_n_u16(a)), 0)
+pub unsafe fn vqshrnd_n_u64<const N: i32>(a: u64) -> u32 {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshrn.i32")]
+        fn vqshrnd_n_u64_(a: u64, n: i32) -> u32;
+    }
+    vqshrnd_n_u64_(a, N)
 }
 
 /// Unsigned saturating shift right narrow
@@ -6724,9 +6788,9 @@ pub unsafe fn vqshrnh_n_u16<const N: i32>(a: u16) -> u8 {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uqshrn, N = 2))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshrns_n_u32<const N: i32>(a: u32) -> u16 {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_extract(vqshrn_n_u32::<N>(vdupq_n_u32(a)), 0)
+pub unsafe fn vqshrnh_n_u16<const N: i32>(a: u16) -> u8 {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    simd_extract(vqshrn_n_u16::<N>(vdupq_n_u16(a)), 0)
 }
 
 /// Unsigned saturating shift right narrow
@@ -6734,9 +6798,9 @@ pub unsafe fn vqshrns_n_u32<const N: i32>(a: u32) -> u16 {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(uqshrn, N = 2))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshrnd_n_u64<const N: i32>(a: u64) -> u32 {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_extract(vqshrn_n_u64::<N>(vdupq_n_u64(a)), 0)
+pub unsafe fn vqshrns_n_u32<const N: i32>(a: u32) -> u16 {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    simd_extract(vqshrn_n_u32::<N>(vdupq_n_u32(a)), 0)
 }
 
 /// Unsigned saturating shift right narrow
@@ -7654,7 +7718,12 @@ pub unsafe fn vreinterpretq_f32_f64(a: float64x2_t) -> float32x4_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(srshl))]
 pub unsafe fn vrshld_s64(a: i64, b: i64) -> i64 {
-    transmute(vrshl_s64(transmute(a), transmute(b)))
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.i64")]
+        fn vrshld_s64_(a: i64, b: i64) -> i64;
+    }
+    vrshld_s64_(a, b)
 }
 
 /// Unsigned rounding shift left
@@ -7662,7 +7731,12 @@ pub unsafe fn vrshld_s64(a: i64, b: i64) -> i64 {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(urshl))]
 pub unsafe fn vrshld_u64(a: u64, b: i64) -> u64 {
-    transmute(vrshl_u64(transmute(a), transmute(b)))
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.i64")]
+        fn vrshld_u64_(a: u64, b: i64) -> u64;
+    }
+    vrshld_u64_(a, b)
 }
 
 /// Signed rounding shift right
@@ -7748,23 +7822,23 @@ pub unsafe fn vrshrn_high_n_u64<const N: i32>(a: uint32x2_t, b: uint64x2_t) -> u
 /// Signed rounding shift right and accumulate.
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(test, assert_instr(nop, N = 2))]
+#[cfg_attr(test, assert_instr(srsra, N = 2))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vrsrad_n_s64<const N: i32>(a: i64, b: i64) -> i64 {
     static_assert!(N : i32 where N >= 1 && N <= 64);
-    let b: int64x1_t = vrshr_n_s64::<N>(transmute(b));
-    transmute(simd_add(transmute(a), b))
+    let b: i64 = vrshrd_n_s64::<N>(b);
+    a + b
 }
 
 /// Unsigned rounding shift right and accumulate.
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(test, assert_instr(nop, N = 2))]
+#[cfg_attr(test, assert_instr(ursra, N = 2))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vrsrad_n_u64<const N: i32>(a: u64, b: u64) -> u64 {
     static_assert!(N : i32 where N >= 1 && N <= 64);
-    let b: uint64x1_t = vrshr_n_u64::<N>(transmute(b));
-    transmute(simd_add(transmute(a), b))
+    let b: u64 = vrshrd_n_u64::<N>(b);
+    a + b
 }
 
 /// Insert vector element from another vector element
@@ -12090,24 +12164,6 @@ mod test {
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vqsubs_s32() {
-        let a: i32 = 42;
-        let b: i32 = 1;
-        let e: i32 = 41;
-        let r: i32 = transmute(vqsubs_s32(transmute(a), transmute(b)));
-        assert_eq!(r, e);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vqsubd_s64() {
-        let a: i64 = 42;
-        let b: i64 = 1;
-        let e: i64 = 41;
-        let r: i64 = transmute(vqsubd_s64(transmute(a), transmute(b)));
-        assert_eq!(r, e);
-    }
-
-    #[simd_test(enable = "neon")]
     unsafe fn test_vqsubb_u8() {
         let a: u8 = 42;
         let b: u8 = 1;
@@ -12144,6 +12200,24 @@ mod test {
     }
 
     #[simd_test(enable = "neon")]
+    unsafe fn test_vqsubs_s32() {
+        let a: i32 = 42;
+        let b: i32 = 1;
+        let e: i32 = 41;
+        let r: i32 = transmute(vqsubs_s32(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vqsubd_s64() {
+        let a: i64 = 42;
+        let b: i64 = 1;
+        let e: i64 = 41;
+        let r: i64 = transmute(vqsubd_s64(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
     unsafe fn test_vrbit_s8() {
         let a: i8x8 = i8x8::new(0, 2, 4, 6, 8, 10, 12, 14);
         let e: i8x8 = i8x8::new(0, 64, 32, 96, 16, 80, 48, 112);
@@ -12418,24 +12492,6 @@ mod test {
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vqadds_s32() {
-        let a: i32 = 42;
-        let b: i32 = 1;
-        let e: i32 = 43;
-        let r: i32 = transmute(vqadds_s32(transmute(a), transmute(b)));
-        assert_eq!(r, e);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vqaddd_s64() {
-        let a: i64 = 42;
-        let b: i64 = 1;
-        let e: i64 = 43;
-        let r: i64 = transmute(vqaddd_s64(transmute(a), transmute(b)));
-        assert_eq!(r, e);
-    }
-
-    #[simd_test(enable = "neon")]
     unsafe fn test_vqaddb_u8() {
         let a: u8 = 42;
         let b: u8 = 1;
@@ -12472,6 +12528,24 @@ mod test {
     }
 
     #[simd_test(enable = "neon")]
+    unsafe fn test_vqadds_s32() {
+        let a: i32 = 42;
+        let b: i32 = 1;
+        let e: i32 = 43;
+        let r: i32 = transmute(vqadds_s32(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vqaddd_s64() {
+        let a: i64 = 42;
+        let b: i64 = 1;
+        let e: i64 = 43;
+        let r: i64 = transmute(vqaddd_s64(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
     unsafe fn test_vmul_f64() {
         let a: f64 = 1.0;
         let b: f64 = 2.0;
@@ -13737,14 +13811,6 @@ mod test {
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vqmovnd_s64() {
-        let a: i64 = 1;
-        let e: i32 = 1;
-        let r: i32 = transmute(vqmovnd_s64(transmute(a)));
-        assert_eq!(r, e);
-    }
-
-    #[simd_test(enable = "neon")]
     unsafe fn test_vqmovnh_u16() {
         let a: u16 = 1;
         let e: u8 = 1;
@@ -13761,6 +13827,14 @@ mod test {
     }
 
     #[simd_test(enable = "neon")]
+    unsafe fn test_vqmovnd_s64() {
+        let a: i64 = 1;
+        let e: i32 = 1;
+        let r: i32 = transmute(vqmovnd_s64(transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
     unsafe fn test_vqmovnd_u64() {
         let a: u64 = 1;
         let e: u32 = 1;
@@ -14048,6 +14122,24 @@ mod test {
     }
 
     #[simd_test(enable = "neon")]
+    unsafe fn test_vqrshls_s32() {
+        let a: i32 = 2;
+        let b: i32 = 2;
+        let e: i32 = 8;
+        let r: i32 = transmute(vqrshls_s32(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vqrshld_s64() {
+        let a: i64 = 2;
+        let b: i64 = 2;
+        let e: i64 = 8;
+        let r: i64 = transmute(vqrshld_s64(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
     unsafe fn test_vqrshlb_s8() {
         let a: i8 = 1;
         let b: i8 = 2;
@@ -14066,20 +14158,20 @@ mod test {
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vqrshls_s32() {
-        let a: i32 = 1;
+    unsafe fn test_vqrshls_u32() {
+        let a: u32 = 2;
         let b: i32 = 2;
-        let e: i32 = 4;
-        let r: i32 = transmute(vqrshls_s32(transmute(a), transmute(b)));
+        let e: u32 = 8;
+        let r: u32 = transmute(vqrshls_u32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vqrshld_s64() {
-        let a: i64 = 1;
+    unsafe fn test_vqrshld_u64() {
+        let a: u64 = 2;
         let b: i64 = 2;
-        let e: i64 = 4;
-        let r: i64 = transmute(vqrshld_s64(transmute(a), transmute(b)));
+        let e: u64 = 8;
+        let r: u64 = transmute(vqrshld_u64(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
@@ -14102,24 +14194,6 @@ mod test {
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vqrshls_u32() {
-        let a: u32 = 1;
-        let b: i32 = 2;
-        let e: u32 = 4;
-        let r: u32 = transmute(vqrshls_u32(transmute(a), transmute(b)));
-        assert_eq!(r, e);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vqrshld_u64() {
-        let a: u64 = 1;
-        let b: i64 = 2;
-        let e: u64 = 4;
-        let r: u64 = transmute(vqrshld_u64(transmute(a), transmute(b)));
-        assert_eq!(r, e);
-    }
-
-    #[simd_test(enable = "neon")]
     unsafe fn test_vqrshrnh_n_s16() {
         let a: i16 = 4;
         let e: i8 = 1;
@@ -14273,6 +14347,15 @@ mod test {
     }
 
     #[simd_test(enable = "neon")]
+    unsafe fn test_vqshld_s64() {
+        let a: i64 = 0;
+        let b: i64 = 2;
+        let e: i64 = 0;
+        let r: i64 = transmute(vqshld_s64(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
     unsafe fn test_vqshlb_s8() {
         let a: i8 = 1;
         let b: i8 = 2;
@@ -14300,11 +14383,11 @@ mod test {
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vqshld_s64() {
-        let a: i64 = 1;
+    unsafe fn test_vqshld_u64() {
+        let a: u64 = 0;
         let b: i64 = 2;
-        let e: i64 = 4;
-        let r: i64 = transmute(vqshld_s64(transmute(a), transmute(b)));
+        let e: u64 = 0;
+        let r: u64 = transmute(vqshld_u64(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
@@ -14336,15 +14419,6 @@ mod test {
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vqshld_u64() {
-        let a: u64 = 1;
-        let b: i64 = 2;
-        let e: u64 = 4;
-        let r: u64 = transmute(vqshld_u64(transmute(a), transmute(b)));
-        assert_eq!(r, e);
-    }
-
-    #[simd_test(enable = "neon")]
     unsafe fn test_vqshlb_n_s8() {
         let a: i8 = 1;
         let e: i8 = 4;
@@ -14409,6 +14483,14 @@ mod test {
     }
 
     #[simd_test(enable = "neon")]
+    unsafe fn test_vqshrnd_n_s64() {
+        let a: i64 = 0;
+        let e: i32 = 0;
+        let r: i32 = transmute(vqshrnd_n_s64::<2>(transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
     unsafe fn test_vqshrnh_n_s16() {
         let a: i16 = 4;
         let e: i8 = 1;
@@ -14425,14 +14507,6 @@ mod test {
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vqshrnd_n_s64() {
-        let a: i64 = 4;
-        let e: i32 = 1;
-        let r: i32 = transmute(vqshrnd_n_s64::<2>(transmute(a)));
-        assert_eq!(r, e);
-    }
-
-    #[simd_test(enable = "neon")]
     unsafe fn test_vqshrn_high_n_s16() {
         let a: i8x8 = i8x8::new(0, 1, 8, 9, 8, 9, 10, 11);
         let b: i16x8 = i16x8::new(32, 36, 40, 44, 48, 52, 56, 60);
@@ -14460,6 +14534,14 @@ mod test {
     }
 
     #[simd_test(enable = "neon")]
+    unsafe fn test_vqshrnd_n_u64() {
+        let a: u64 = 0;
+        let e: u32 = 0;
+        let r: u32 = transmute(vqshrnd_n_u64::<2>(transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
     unsafe fn test_vqshrnh_n_u16() {
         let a: u16 = 4;
         let e: u8 = 1;
@@ -14476,14 +14558,6 @@ mod test {
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vqshrnd_n_u64() {
-        let a: u64 = 4;
-        let e: u32 = 1;
-        let r: u32 = transmute(vqshrnd_n_u64::<2>(transmute(a)));
-        assert_eq!(r, e);
-    }
-
-    #[simd_test(enable = "neon")]
     unsafe fn test_vqshrn_high_n_u16() {
         let a: u8x8 = u8x8::new(0, 1, 8, 9, 8, 9, 10, 11);
         let b: u16x8 = u16x8::new(32, 36, 40, 44, 48, 52, 56, 60);
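
The hunks above all follow one pattern: the scalar saturating intrinsics (vqsub*/vqadd*, vqmovnd_*, vqrshl*, vqshl*, vqshrn*, vrshl*) stop round-tripping through a vector (vdup_n_* plus simd_extract) and instead bind the scalar LLVM intrinsic directly via link_name, so each lowers to the single instruction named in its assert_instr. A minimal usage sketch of the resulting saturating behavior (illustrative values; a nightly aarch64 toolchain with the neon feature is assumed):

    #[cfg(target_arch = "aarch64")]
    #[target_feature(enable = "neon")]
    unsafe fn saturation_demo() {
        use core::arch::aarch64::{vqadds_s32, vqsubs_s32};
        // SQADD/SQSUB clamp at the type bounds instead of wrapping.
        assert_eq!(vqadds_s32(i32::MAX, 1), i32::MAX);
        assert_eq!(vqsubs_s32(i32::MIN, 1), i32::MIN);
        assert_eq!(vqsubs_s32(42, 1), 41); // same vector as the tests above
    }
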
diff --git a/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs b/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs
index e29c1b36d25..9097d269893 100644
--- a/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs
+++ b/library/stdarch/crates/core_arch/src/aarch64/neon/mod.rs
@@ -1184,9 +1184,7 @@ pub unsafe fn vadd_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(add))]
 pub unsafe fn vaddd_s64(a: i64, b: i64) -> i64 {
-    let a: int64x1_t = transmute(a);
-    let b: int64x1_t = transmute(b);
-    simd_extract(simd_add(a, b), 0)
+    a.wrapping_add(b)
 }
 
 /// Vector add.
@@ -1194,9 +1192,7 @@ pub unsafe fn vaddd_s64(a: i64, b: i64) -> i64 {
 #[target_feature(enable = "neon")]
 #[cfg_attr(test, assert_instr(add))]
 pub unsafe fn vaddd_u64(a: u64, b: u64) -> u64 {
-    let a: uint64x1_t = transmute(a);
-    let b: uint64x1_t = transmute(b);
-    simd_extract(simd_add(a, b), 0)
+    a.wrapping_add(b)
 }
 
 /// Horizontal vector max.
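
Here vaddd_s64/vaddd_u64 become plain integer additions, which still compile to a single add; wrapping_add spells out that overflow wraps rather than panicking under debug assertions. A one-line illustration (values are assumptions):

    assert_eq!(i64::MAX.wrapping_add(1), i64::MIN); // two's-complement wrap, like ADD
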
diff --git a/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs b/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs
index 835a3aba749..0387799f6f4 100644
--- a/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs
+++ b/library/stdarch/crates/core_arch/src/arm_shared/neon/generated.rs
@@ -13070,7 +13070,7 @@ pub unsafe fn vset_lane_s8<const LANE: i32>(a: i8, b: int8x8_t) -> int8x8_t {
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vset_lane_s16<const LANE: i32>(a: i16, b: int16x4_t) -> int16x4_t {
-    static_assert_imm4!(LANE);
+    static_assert_imm2!(LANE);
     simd_insert(b, LANE as u32, a)
 }
 
@@ -13082,7 +13082,7 @@ pub unsafe fn vset_lane_s16<const LANE: i32>(a: i16, b: int16x4_t) -> int16x4_t
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vset_lane_s32<const LANE: i32>(a: i32, b: int32x2_t) -> int32x2_t {
-    static_assert_imm5!(LANE);
+    static_assert_imm1!(LANE);
     simd_insert(b, LANE as u32, a)
 }
 
@@ -13094,7 +13094,7 @@ pub unsafe fn vset_lane_s32<const LANE: i32>(a: i32, b: int32x2_t) -> int32x2_t
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vset_lane_s64<const LANE: i32>(a: i64, b: int64x1_t) -> int64x1_t {
-    static_assert_imm6!(LANE);
+    static_assert!(LANE : i32 where LANE == 0);
     simd_insert(b, LANE as u32, a)
 }
 
@@ -13118,7 +13118,7 @@ pub unsafe fn vset_lane_u8<const LANE: i32>(a: u8, b: uint8x8_t) -> uint8x8_t {
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vset_lane_u16<const LANE: i32>(a: u16, b: uint16x4_t) -> uint16x4_t {
-    static_assert_imm4!(LANE);
+    static_assert_imm2!(LANE);
     simd_insert(b, LANE as u32, a)
 }
 
@@ -13130,7 +13130,7 @@ pub unsafe fn vset_lane_u16<const LANE: i32>(a: u16, b: uint16x4_t) -> uint16x4_
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vset_lane_u32<const LANE: i32>(a: u32, b: uint32x2_t) -> uint32x2_t {
-    static_assert_imm5!(LANE);
+    static_assert_imm1!(LANE);
     simd_insert(b, LANE as u32, a)
 }
 
@@ -13142,7 +13142,7 @@ pub unsafe fn vset_lane_u32<const LANE: i32>(a: u32, b: uint32x2_t) -> uint32x2_
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vset_lane_u64<const LANE: i32>(a: u64, b: uint64x1_t) -> uint64x1_t {
-    static_assert_imm6!(LANE);
+    static_assert!(LANE : i32 where LANE == 0);
     simd_insert(b, LANE as u32, a)
 }
 
@@ -13166,7 +13166,7 @@ pub unsafe fn vset_lane_p8<const LANE: i32>(a: p8, b: poly8x8_t) -> poly8x8_t {
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vset_lane_p16<const LANE: i32>(a: p16, b: poly16x4_t) -> poly16x4_t {
-    static_assert_imm4!(LANE);
+    static_assert_imm2!(LANE);
     simd_insert(b, LANE as u32, a)
 }
 
@@ -13178,7 +13178,7 @@ pub unsafe fn vset_lane_p16<const LANE: i32>(a: p16, b: poly16x4_t) -> poly16x4_
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vset_lane_p64<const LANE: i32>(a: p64, b: poly64x1_t) -> poly64x1_t {
-    static_assert_imm6!(LANE);
+    static_assert!(LANE : i32 where LANE == 0);
     simd_insert(b, LANE as u32, a)
 }
 
@@ -13190,7 +13190,7 @@ pub unsafe fn vset_lane_p64<const LANE: i32>(a: p64, b: poly64x1_t) -> poly64x1_
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vsetq_lane_s8<const LANE: i32>(a: i8, b: int8x16_t) -> int8x16_t {
-    static_assert_imm3!(LANE);
+    static_assert_imm4!(LANE);
     simd_insert(b, LANE as u32, a)
 }
 
@@ -13202,7 +13202,7 @@ pub unsafe fn vsetq_lane_s8<const LANE: i32>(a: i8, b: int8x16_t) -> int8x16_t {
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vsetq_lane_s16<const LANE: i32>(a: i16, b: int16x8_t) -> int16x8_t {
-    static_assert_imm4!(LANE);
+    static_assert_imm3!(LANE);
     simd_insert(b, LANE as u32, a)
 }
 
@@ -13214,7 +13214,7 @@ pub unsafe fn vsetq_lane_s16<const LANE: i32>(a: i16, b: int16x8_t) -> int16x8_t
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vsetq_lane_s32<const LANE: i32>(a: i32, b: int32x4_t) -> int32x4_t {
-    static_assert_imm5!(LANE);
+    static_assert_imm2!(LANE);
     simd_insert(b, LANE as u32, a)
 }
 
@@ -13226,7 +13226,7 @@ pub unsafe fn vsetq_lane_s32<const LANE: i32>(a: i32, b: int32x4_t) -> int32x4_t
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vsetq_lane_s64<const LANE: i32>(a: i64, b: int64x2_t) -> int64x2_t {
-    static_assert_imm6!(LANE);
+    static_assert_imm1!(LANE);
     simd_insert(b, LANE as u32, a)
 }
 
@@ -13238,7 +13238,7 @@ pub unsafe fn vsetq_lane_s64<const LANE: i32>(a: i64, b: int64x2_t) -> int64x2_t
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vsetq_lane_u8<const LANE: i32>(a: u8, b: uint8x16_t) -> uint8x16_t {
-    static_assert_imm3!(LANE);
+    static_assert_imm4!(LANE);
     simd_insert(b, LANE as u32, a)
 }
 
@@ -13250,7 +13250,7 @@ pub unsafe fn vsetq_lane_u8<const LANE: i32>(a: u8, b: uint8x16_t) -> uint8x16_t
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vsetq_lane_u16<const LANE: i32>(a: u16, b: uint16x8_t) -> uint16x8_t {
-    static_assert_imm4!(LANE);
+    static_assert_imm3!(LANE);
     simd_insert(b, LANE as u32, a)
 }
 
@@ -13262,7 +13262,7 @@ pub unsafe fn vsetq_lane_u16<const LANE: i32>(a: u16, b: uint16x8_t) -> uint16x8
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vsetq_lane_u32<const LANE: i32>(a: u32, b: uint32x4_t) -> uint32x4_t {
-    static_assert_imm5!(LANE);
+    static_assert_imm2!(LANE);
     simd_insert(b, LANE as u32, a)
 }
 
@@ -13274,7 +13274,7 @@ pub unsafe fn vsetq_lane_u32<const LANE: i32>(a: u32, b: uint32x4_t) -> uint32x4
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vsetq_lane_u64<const LANE: i32>(a: u64, b: uint64x2_t) -> uint64x2_t {
-    static_assert_imm6!(LANE);
+    static_assert_imm1!(LANE);
     simd_insert(b, LANE as u32, a)
 }
 
@@ -13286,7 +13286,7 @@ pub unsafe fn vsetq_lane_u64<const LANE: i32>(a: u64, b: uint64x2_t) -> uint64x2
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vsetq_lane_p8<const LANE: i32>(a: p8, b: poly8x16_t) -> poly8x16_t {
-    static_assert_imm3!(LANE);
+    static_assert_imm4!(LANE);
     simd_insert(b, LANE as u32, a)
 }
 
@@ -13298,7 +13298,7 @@ pub unsafe fn vsetq_lane_p8<const LANE: i32>(a: p8, b: poly8x16_t) -> poly8x16_t
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vsetq_lane_p16<const LANE: i32>(a: p16, b: poly16x8_t) -> poly16x8_t {
-    static_assert_imm4!(LANE);
+    static_assert_imm3!(LANE);
     simd_insert(b, LANE as u32, a)
 }
 
@@ -13310,7 +13310,7 @@ pub unsafe fn vsetq_lane_p16<const LANE: i32>(a: p16, b: poly16x8_t) -> poly16x8
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vsetq_lane_p64<const LANE: i32>(a: p64, b: poly64x2_t) -> poly64x2_t {
-    static_assert_imm6!(LANE);
+    static_assert_imm1!(LANE);
     simd_insert(b, LANE as u32, a)
 }
 
@@ -21006,144 +21006,144 @@ mod test {
 
     #[simd_test(enable = "neon")]
     unsafe fn test_vqrshl_s8() {
-        let a: i8x8 = i8x8::new(-128, 0x7F, 2, 3, 4, 5, 6, 7);
+        let a: i8x8 = i8x8::new(2, -128, 0x7F, 3, 4, 5, 6, 7);
         let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2);
-        let e: i8x8 = i8x8::new(-128, 0x7F, 8, 12, 16, 20, 24, 28);
+        let e: i8x8 = i8x8::new(8, -128, 0x7F, 12, 16, 20, 24, 28);
         let r: i8x8 = transmute(vqrshl_s8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
     unsafe fn test_vqrshlq_s8() {
-        let a: i8x16 = i8x16::new(-128, 0x7F, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let a: i8x16 = i8x16::new(2, -128, 0x7F, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
         let b: i8x16 = i8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
-        let e: i8x16 = i8x16::new(-128, 0x7F, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60);
+        let e: i8x16 = i8x16::new(8, -128, 0x7F, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60);
         let r: i8x16 = transmute(vqrshlq_s8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
     unsafe fn test_vqrshl_s16() {
-        let a: i16x4 = i16x4::new(-32768, 0x7F_FF, 2, 3);
+        let a: i16x4 = i16x4::new(2, -32768, 0x7F_FF, 3);
         let b: i16x4 = i16x4::new(2, 2, 2, 2);
-        let e: i16x4 = i16x4::new(-32768, 0x7F_FF, 8, 12);
+        let e: i16x4 = i16x4::new(8, -32768, 0x7F_FF, 12);
         let r: i16x4 = transmute(vqrshl_s16(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
     unsafe fn test_vqrshlq_s16() {
-        let a: i16x8 = i16x8::new(-32768, 0x7F_FF, 2, 3, 4, 5, 6, 7);
+        let a: i16x8 = i16x8::new(2, -32768, 0x7F_FF, 3, 4, 5, 6, 7);
         let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
-        let e: i16x8 = i16x8::new(-32768, 0x7F_FF, 8, 12, 16, 20, 24, 28);
+        let e: i16x8 = i16x8::new(8, -32768, 0x7F_FF, 12, 16, 20, 24, 28);
         let r: i16x8 = transmute(vqrshlq_s16(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
     unsafe fn test_vqrshl_s32() {
-        let a: i32x2 = i32x2::new(-2147483648, 0x7F_FF_FF_FF);
+        let a: i32x2 = i32x2::new(2, -2147483648);
         let b: i32x2 = i32x2::new(2, 2);
-        let e: i32x2 = i32x2::new(-2147483648, 0x7F_FF_FF_FF);
+        let e: i32x2 = i32x2::new(8, -2147483648);
         let r: i32x2 = transmute(vqrshl_s32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
     unsafe fn test_vqrshlq_s32() {
-        let a: i32x4 = i32x4::new(-2147483648, 0x7F_FF_FF_FF, 2, 3);
+        let a: i32x4 = i32x4::new(2, -2147483648, 0x7F_FF_FF_FF, 3);
         let b: i32x4 = i32x4::new(2, 2, 2, 2);
-        let e: i32x4 = i32x4::new(-2147483648, 0x7F_FF_FF_FF, 8, 12);
+        let e: i32x4 = i32x4::new(8, -2147483648, 0x7F_FF_FF_FF, 12);
         let r: i32x4 = transmute(vqrshlq_s32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
     unsafe fn test_vqrshl_s64() {
-        let a: i64x1 = i64x1::new(-9223372036854775808);
+        let a: i64x1 = i64x1::new(2);
         let b: i64x1 = i64x1::new(2);
-        let e: i64x1 = i64x1::new(-9223372036854775808);
+        let e: i64x1 = i64x1::new(8);
         let r: i64x1 = transmute(vqrshl_s64(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
     unsafe fn test_vqrshlq_s64() {
-        let a: i64x2 = i64x2::new(-9223372036854775808, 0x7F_FF_FF_FF_FF_FF_FF_FF);
+        let a: i64x2 = i64x2::new(2, -9223372036854775808);
         let b: i64x2 = i64x2::new(2, 2);
-        let e: i64x2 = i64x2::new(-9223372036854775808, 0x7F_FF_FF_FF_FF_FF_FF_FF);
+        let e: i64x2 = i64x2::new(8, -9223372036854775808);
         let r: i64x2 = transmute(vqrshlq_s64(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
     unsafe fn test_vqrshl_u8() {
-        let a: u8x8 = u8x8::new(0, 0xFF, 2, 3, 4, 5, 6, 7);
+        let a: u8x8 = u8x8::new(2, 0, 0xFF, 3, 4, 5, 6, 7);
         let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2);
-        let e: u8x8 = u8x8::new(0, 0xFF, 8, 12, 16, 20, 24, 28);
+        let e: u8x8 = u8x8::new(8, 0, 0xFF, 12, 16, 20, 24, 28);
         let r: u8x8 = transmute(vqrshl_u8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
     unsafe fn test_vqrshlq_u8() {
-        let a: u8x16 = u8x16::new(0, 0xFF, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let a: u8x16 = u8x16::new(2, 0, 0xFF, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
         let b: i8x16 = i8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
-        let e: u8x16 = u8x16::new(0, 0xFF, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60);
+        let e: u8x16 = u8x16::new(8, 0, 0xFF, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60);
         let r: u8x16 = transmute(vqrshlq_u8(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
     unsafe fn test_vqrshl_u16() {
-        let a: u16x4 = u16x4::new(0, 0xFF_FF, 2, 3);
+        let a: u16x4 = u16x4::new(2, 0, 0xFF_FF, 3);
         let b: i16x4 = i16x4::new(2, 2, 2, 2);
-        let e: u16x4 = u16x4::new(0, 0xFF_FF, 8, 12);
+        let e: u16x4 = u16x4::new(8, 0, 0xFF_FF, 12);
         let r: u16x4 = transmute(vqrshl_u16(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
     unsafe fn test_vqrshlq_u16() {
-        let a: u16x8 = u16x8::new(0, 0xFF_FF, 2, 3, 4, 5, 6, 7);
+        let a: u16x8 = u16x8::new(2, 0, 0xFF_FF, 3, 4, 5, 6, 7);
         let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
-        let e: u16x8 = u16x8::new(0, 0xFF_FF, 8, 12, 16, 20, 24, 28);
+        let e: u16x8 = u16x8::new(8, 0, 0xFF_FF, 12, 16, 20, 24, 28);
         let r: u16x8 = transmute(vqrshlq_u16(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
     unsafe fn test_vqrshl_u32() {
-        let a: u32x2 = u32x2::new(0, 0xFF_FF_FF_FF);
+        let a: u32x2 = u32x2::new(2, 0);
         let b: i32x2 = i32x2::new(2, 2);
-        let e: u32x2 = u32x2::new(0, 0xFF_FF_FF_FF);
+        let e: u32x2 = u32x2::new(8, 0);
         let r: u32x2 = transmute(vqrshl_u32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
     unsafe fn test_vqrshlq_u32() {
-        let a: u32x4 = u32x4::new(0, 0xFF_FF_FF_FF, 2, 3);
+        let a: u32x4 = u32x4::new(2, 0, 0xFF_FF_FF_FF, 3);
         let b: i32x4 = i32x4::new(2, 2, 2, 2);
-        let e: u32x4 = u32x4::new(0, 0xFF_FF_FF_FF, 8, 12);
+        let e: u32x4 = u32x4::new(8, 0, 0xFF_FF_FF_FF, 12);
         let r: u32x4 = transmute(vqrshlq_u32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
     unsafe fn test_vqrshl_u64() {
-        let a: u64x1 = u64x1::new(0);
+        let a: u64x1 = u64x1::new(2);
         let b: i64x1 = i64x1::new(2);
-        let e: u64x1 = u64x1::new(0);
+        let e: u64x1 = u64x1::new(8);
         let r: u64x1 = transmute(vqrshl_u64(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
     unsafe fn test_vqrshlq_u64() {
-        let a: u64x2 = u64x2::new(0, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+        let a: u64x2 = u64x2::new(2, 0);
         let b: i64x2 = i64x2::new(2, 2);
-        let e: u64x2 = u64x2::new(0, 0xFF_FF_FF_FF_FF_FF_FF_FF);
+        let e: u64x2 = u64x2::new(8, 0);
         let r: u64x2 = transmute(vqrshlq_u64(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
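
The vset_lane/vsetq_lane fixes tighten the const-generic lane bound to match the vector width (for example static_assert_imm2! allows 0..=3 for a 4-lane vector, and the 1-lane d-register variants only accept LANE == 0); the previous *_imm4!/*_imm5!/*_imm6! bounds were too loose and let out-of-range lanes slip past the compile-time check. A hypothetical call site under the corrected bound (nightly aarch64 assumed):

    use core::arch::aarch64::{vdup_n_s16, vset_lane_s16};
    let v = unsafe { vset_lane_s16::<3>(7, vdup_n_s16(0)) };   // ok: LANE in 0..=3
    // let v = unsafe { vset_lane_s16::<4>(7, vdup_n_s16(0)) }; // now rejected at compile time
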
diff --git a/library/stdarch/crates/core_arch/src/x86/macros.rs b/library/stdarch/crates/core_arch/src/x86/macros.rs
index b9550ce79c4..e686e65b303 100644
--- a/library/stdarch/crates/core_arch/src/x86/macros.rs
+++ b/library/stdarch/crates/core_arch/src/x86/macros.rs
@@ -5,7 +5,10 @@
 pub(crate) struct ValidateConstRound<const IMM: i32>;
 impl<const IMM: i32> ValidateConstRound<IMM> {
     pub(crate) const VALID: () = {
-        assert!(IMM == 4 || IMM == 8 || IMM == 9 || IMM == 10 || IMM == 11, "Invalid IMM value");
+        assert!(
+            IMM == 4 || IMM == 8 || IMM == 9 || IMM == 10 || IMM == 11,
+            "Invalid IMM value"
+        );
     };
 }
 
@@ -70,7 +73,10 @@ macro_rules! static_assert_imm_u8 {
 pub(crate) struct ValidateConstGatherScale<const SCALE: i32>;
 impl<const SCALE: i32> ValidateConstGatherScale<SCALE> {
     pub(crate) const VALID: () = {
-        assert!(SCALE == 1 || SCALE == 2 || SCALE == 4 || SCALE == 8, "Invalid SCALE value");
+        assert!(
+            SCALE == 1 || SCALE == 2 || SCALE == 4 || SCALE == 8,
+            "Invalid SCALE value"
+        );
     };
 }
 
diff --git a/library/stdarch/crates/core_arch/src/x86_64/macros.rs b/library/stdarch/crates/core_arch/src/x86_64/macros.rs
index 9e3faf444d3..a3ea0e82163 100644
--- a/library/stdarch/crates/core_arch/src/x86_64/macros.rs
+++ b/library/stdarch/crates/core_arch/src/x86_64/macros.rs
@@ -5,7 +5,10 @@
 pub(crate) struct ValidateConstRound<const IMM: i32>;
 impl<const IMM: i32> ValidateConstRound<IMM> {
     pub(crate) const VALID: () = {
-        assert!(IMM == 4 || IMM == 8 || IMM == 9 || IMM == 10 || IMM == 11, "Invalid IMM value");
+        assert!(
+            IMM == 4 || IMM == 8 || IMM == 9 || IMM == 10 || IMM == 11,
+            "Invalid IMM value"
+        );
     };
 }
 
diff --git a/library/stdarch/crates/stdarch-gen/neon.spec b/library/stdarch/crates/stdarch-gen/neon.spec
index 4b192069b6f..825ecf51155 100644
--- a/library/stdarch/crates/stdarch-gen/neon.spec
+++ b/library/stdarch/crates/stdarch-gen/neon.spec
@@ -1843,9 +1843,23 @@ b = 1
 validate 41
 
 aarch64 = sqsub
-generate i8, i16, i32, i64
+generate i8, i16
 aarch64 = uqsub
-generate u8, u16, u32, u64
+generate u8, u16
+
+/// Saturating subtract
+name = vqsub
+a = 42
+b = 1
+validate 41
+
+aarch64 = uqsub
+link-aarch64 = uqsub._EXT_
+generate u32, u64
+
+aarch64 = sqsub
+link-aarch64 = sqsub._EXT_
+generate i32, i64
 
 /// Halving add
 name = vhadd
@@ -1999,9 +2013,23 @@ b = 1
 validate 43
 
 aarch64 = sqadd
-generate i8, i16, i32, i64
+generate i8, i16
 aarch64 = uqadd
-generate u8, u16, u32, u64
+generate u8, u16
+
+/// Saturating add
+name = vqadd
+a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42
+b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+validate 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58
+
+aarch64 = uqadd
+link-aarch64 = uqadd._EXT_
+generate u32, u64
+
+aarch64 = sqadd
+link-aarch64 = sqadd._EXT_
+generate i32, i64
 
 /// Multiply
 name = vmul
@@ -3383,9 +3411,22 @@ a = 1
 validate 1
 
 aarch64 = sqxtn
-generate i16:i8, i32:i16, i64:i32
+generate i16:i8, i32:i16
 aarch64 = uqxtn
-generate u16:u8, u32:u16, u64:u32
+generate u16:u8, u32:u16
+
+/// Saturating extract narrow
+name = vqmovn
+a = 1
+validate 1
+
+aarch64 = sqxtn
+link-aarch64 = scalar.sqxtn._EXT2_._EXT_
+generate i64:i32
+
+aarch64 = uqxtn
+link-aarch64 = scalar.uqxtn._EXT2_._EXT_
+generate u64:u32
 
 /// Signed saturating extract narrow
 name = vqmovn_high
@@ -3609,12 +3650,13 @@ generate i16:i16:int16x4_t:i16, i16:i16:int16x8_t:i16, i32:i32:int32x2_t:i32, i3
 
 /// Signed saturating rounding shift left
 name = vqrshl
-a = MIN, MAX, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+a = 2, MIN, MAX, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
-validate MIN, MAX, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+validate 8, MIN, MAX, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
 
 aarch64 = sqrshl
 link-aarch64 = sqrshl._EXT_
+generate i32, i64
 
 arm = vqrshl
 link-arm = vqrshifts._EXT_
@@ -3630,17 +3672,18 @@ b = 2
 validate 4
 
 aarch64 = sqrshl
-generate i8, i16, i32, i64
+generate i8, i16
 
 /// Unsigned signed saturating rounding shift left
 name = vqrshl
 out-suffix
-a = MIN, MAX, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+a = 2, MIN, MAX, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
-validate 0, MAX, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+validate 8, 0, MAX, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
 
 aarch64 = uqrshl
 link-aarch64 = uqrshl._EXT_
+generate u32:i32:u32, u64:i64:u64
 
 arm = vqrshl
 link-arm = vqrshiftu._EXT_
@@ -3658,7 +3701,7 @@ b = 2
 validate 4
 
 aarch64 = uqrshl
-generate u8:i8:u8, u16:i16:u16, u32:i32:u32, u64:i64:u64
+generate u8:i8:u8, u16:i16:u16
 
 /// Signed saturating rounded shift right narrow
 name = vqrshrn
@@ -3806,6 +3849,7 @@ validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
 
 aarch64 = sqshl
 link-aarch64 = sqshl._EXT_
+generate i64
 
 arm = vqshl
 link-arm = vqshifts._EXT_
@@ -3820,7 +3864,7 @@ b = 2
 validate 4
 
 aarch64 = sqshl
-generate i8, i16, i32, i64
+generate i8, i16, i32
 
 /// Unsigned saturating shift left
 name = vqshl
@@ -3831,6 +3875,7 @@ validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
 
 aarch64 = uqshl
 link-aarch64 = uqshl._EXT_
+generate u64:i64:u64
 
 arm = vqshl
 link-arm = vqshiftu._EXT_
@@ -3847,7 +3892,7 @@ b = 2
 validate 4
 
 aarch64 = uqshl
-generate u8:i8:u8, u16:i16:u16, u32:i32:u32, u64:i64:u64
+generate u8:i8:u8, u16:i16:u16, u32:i32:u32
 
 /// Signed saturating shift left
 name = vqshl
@@ -3915,6 +3960,7 @@ validate 0, 1, 2, 3, 4, 5, 6, 7
 aarch64 = sqshrn
 link-aarch64 = sqshrn._EXT2_
 const-aarch64 = N
+generate i64:i32
 
 arm = vqshrn
 link-arm = vqshiftns._EXT2_
@@ -3932,7 +3978,7 @@ n = 2
 validate 1
 
 aarch64 = sqshrn
-generate i16:i8, i32:i16, i64:i32
+generate i16:i8, i32:i16
 
 /// Signed saturating shift right narrow
 name = vqshrn_high
@@ -3960,6 +4006,7 @@ validate 0, 1, 2, 3, 4, 5, 6, 7
 aarch64 = uqshrn
 link-aarch64 = uqshrn._EXT2_
 const-aarch64 = N
+generate u64:u32
 
 arm = vqshrn
 link-arm = vqshiftnu._EXT2_
@@ -3977,7 +4024,7 @@ n = 2
 validate 1
 
 aarch64 = uqshrn
-generate u16:u8, u32:u16, u64:u32
+generate u16:u8, u32:u16
 
 /// Unsigned saturating shift right narrow
 name = vqshrn_high
@@ -4261,21 +4308,12 @@ validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
 
 aarch64 = srshl
 link-aarch64 = srshl._EXT_
+generate i64
 
 arm = vrshl
 link-arm = vrshifts._EXT_
 generate int*_t, int64x*_t
 
-/// Signed rounding shift left
-name = vrshl
-multi_fn = transmute, {vrshl-in_ntt-noext, transmute(a), transmute(b)}
-a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
-b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
-validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
-
-aarch64 = srshl
-generate i64
-
 /// Unsigned rounding shift left
 name = vrshl
 out-suffix
@@ -4285,23 +4323,13 @@ validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
 
 aarch64 = urshl
 link-aarch64 = urshl._EXT_
+generate u64:i64:u64
 
 arm = vrshl
 link-arm = vrshiftu._EXT_
 generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t
 generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t
 
-/// Unsigned rounding shift left
-name = vrshl
-out-suffix
-multi_fn = transmute, {vrshl-out_ntt-noext, transmute(a), transmute(b)}
-a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
-b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
-validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
-
-aarch64 = urshl
-generate u64:i64:u64
-
 /// Signed rounding shift right
 name = vrshr
 n-suffix
@@ -4438,15 +4466,14 @@ name = vrsra
 n-suffix
 constn = N
 multi_fn = static_assert-N-1-bits
-multi_fn = vrshr_n-in_ntt-::<N>, b:in_ntt, transmute(b)
-multi_fn = transmute, {simd_add, transmute(a), b}
+multi_fn = vrshr-nself-::<N>, b:in_t, b
+multi_fn = a + b
 a = 1
 b = 4
 n = 2
 validate 2
 
-// We use "nop" here to skip the instruction test, since it cannot be optimized correctly.
-aarch64 = nop
+aarch64 = srsra
 generate i64
 
 /// Unsigned rounding shift right and accumulate.
@@ -4454,21 +4481,20 @@ name = vrsra
 n-suffix
 constn = N
 multi_fn = static_assert-N-1-bits
-multi_fn = vrshr_n-in_ntt-::<N>, b:in_ntt, transmute(b)
-multi_fn = transmute, {simd_add, transmute(a), b}
+multi_fn = vrshr-nself-::<N>, b:in_t, b
+multi_fn = a + b
 a = 1
 b = 4
 n = 2
 validate 2
 
-// We use "nop" here to skip the instruction test, since it cannot be optimized correctly.
-aarch64 = nop
+aarch64 = ursra
 generate u64
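
The vrsra rewrite replaces the transmute/vector detour with the scalar rounding shift plus a plain add, which lets the instruction test assert srsra/ursra instead of nop. Against the test vector above (a = 1, b = 4, n = 2, validate 2), an illustrative scalar model of the rounding-shift-right step (an assumption; edge cases near the type bounds are glossed over):

    fn rsra_model(a: i64, b: i64, n: u32) -> i64 {
        let rounded = (b + (1i64 << (n - 1))) >> n; // add half the rounding increment first
        a.wrapping_add(rounded)
    }
    // rsra_model(1, 4, 2) == 2, matching the spec's validate line
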
 
 /// Insert vector element from another vector element
 name = vset_lane
 constn = LANE
-multi_fn = static_assert_imm-in_bits_exp_len-LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
 multi_fn = simd_insert, b, LANE as u32, a
 a = 1
 b = 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
@@ -4490,7 +4516,7 @@ generate p64:poly64x1_t:poly64x1_t
 name = vsetq_lane
 no-q
 constn = LANE
-multi_fn = static_assert_imm-in_bits_exp_len-LANE
+multi_fn = static_assert_imm-in_exp_len-LANE
 multi_fn = simd_insert, b, LANE as u32, a
 a = 1
 b = 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
@@ -4547,10 +4573,10 @@ a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
 validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
 
-arm = vshl
-link-arm = vshifts._EXT_
 aarch64 = sshl
 link-aarch64 = sshl._EXT_
+arm = vshl
+link-arm = vshifts._EXT_
 generate int*_t, int64x*_t
 
 /// Signed Shift left
@@ -4570,10 +4596,10 @@ a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
 b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
 validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
 
-arm = vshl
-link-arm = vshiftu._EXT_
 aarch64 = ushl
 link-aarch64 = ushl._EXT_
+arm = vshl
+link-arm = vshiftu._EXT_
 generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t
 generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t
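
Taken together, the spec-side changes mirror the generated code earlier in this diff: moving a type out of the plain generate list into a block with a link-aarch64 directive is what swaps the vdup/simd_extract shim for a direct LLVM intrinsic binding. As a concrete pairing, the uqsub block above (link-aarch64 = uqsub._EXT_, generate u32) corresponds to the vqsubs_u32 shown near the top of this diff:

    pub unsafe fn vqsubs_u32(a: u32, b: u32) -> u32 {
        #[allow(improper_ctypes)]
        extern "C" {
            #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.i32")]
            fn vqsubs_u32_(a: u32, b: u32) -> u32;
        }
        vqsubs_u32_(a, b)
    }
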