7 files changed, 172 insertions, 277 deletions
diff --git a/library/stdarch/crates/stdarch-gen-arm/Cargo.toml b/library/stdarch/crates/stdarch-gen-arm/Cargo.toml
index 899296d25ea..312019f454c 100644
--- a/library/stdarch/crates/stdarch-gen-arm/Cargo.toml
+++ b/library/stdarch/crates/stdarch-gen-arm/Cargo.toml
@@ -13,7 +13,6 @@ edition = "2024"
 
 [dependencies]
 itertools = "0.14.0"
-lazy_static = "1.4.0"
 proc-macro2 = "1.0"
 quote = "1.0"
 regex = "1.5"
diff --git a/library/stdarch/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml b/library/stdarch/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
index f0dce681d9c..a31613e6b1a 100644
--- a/library/stdarch/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
+++ b/library/stdarch/crates/stdarch-gen-arm/spec/neon/aarch64.spec.yml
@@ -187,7 +187,7 @@ intrinsics:
     arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"]
     return_type: "{neon_type[1]}"
     attr: [*neon-stable]
-    assert_instr: [sabdl]
+    assert_instr: [sabdl2]
     safety: safe
     types:
       - [int8x16_t, int16x8_t, int8x8_t, uint8x8_t]
@@ -230,7 +230,7 @@ intrinsics:
           - stable
           - - 'feature = "neon_intrinsics"'
             - 'since = "1.59.0"'
-    assert_instr: [sabdl]
+    assert_instr: [sabdl2]
     safety: safe
     types:
       - [int16x8_t, int32x4_t, int16x4_t, uint16x4_t]
@@ -273,7 +273,7 @@ intrinsics:
           - stable
           - - 'feature = "neon_intrinsics"'
             - 'since = "1.59.0"'
-    assert_instr: [sabdl]
+    assert_instr: [sabdl2]
     safety: safe
     types:
       - [int32x4_t, int64x2_t, int32x2_t, uint32x2_t]
@@ -1462,7 +1462,7 @@ intrinsics:
     arguments: ["a: {neon_type[0]}"]
     return_type: "{neon_type[1]}"
     attr:
-      - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtl]]}]]
+      - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtl2]]}]]
       - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']]
     safety: safe
     types:
@@ -1530,7 +1530,7 @@ intrinsics:
     arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"]
     return_type: "{neon_type[2]}"
     attr:
-      - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtn]]}]]
+      - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtn2]]}]]
       - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']]
     safety: safe
     types:
@@ -1582,7 +1582,7 @@ intrinsics:
     arguments: ["a: {type[0]}", "b: {neon_type[1]}"]
     return_type: "{type[2]}"
     attr:
-      - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtxn]]}]]
+      - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [fcvtxn2]]}]]
       - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']]
     safety: safe
     types:
@@ -5147,7 +5147,7 @@ intrinsics:
     attr:
       - *neon-stable
     safety: safe
-    assert_instr: [pmull]
+    assert_instr: [pmull2]
     types:
       - [poly8x16_t, poly8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]', poly16x8_t]
     compose:
@@ -5169,7 +5169,7 @@ intrinsics:
       - *neon-aes
       - *neon-stable
     safety: safe
-    assert_instr: [pmull]
+    assert_instr: [pmull2]
     types:
       - [poly64x2_t, "p128"]
     compose:
@@ -5741,7 +5741,7 @@ intrinsics:
     arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"]
     return_type: "{neon_type[0]}"
     attr: [*neon-stable]
-    assert_instr: [ssubw]
+    assert_instr: [ssubw2]
     safety: safe
     types:
       - [int16x8_t, int8x16_t, int8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]']
@@ -5762,7 +5762,7 @@ intrinsics:
     arguments: ["a: {neon_type[0]}", "b: {neon_type[1]}"]
     return_type: "{neon_type[0]}"
     attr: [*neon-stable]
-    assert_instr: [usubw]
+    assert_instr: [usubw2]
     safety: safe
     types:
       - [uint16x8_t, uint8x16_t, uint8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]']
@@ -5783,7 +5783,7 @@ intrinsics:
     arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"]
     return_type: "{neon_type[1]}"
     attr: [*neon-stable]
-    assert_instr: [ssubl]
+    assert_instr: [ssubl2]
     safety: safe
     types:
       - [int8x16_t, int16x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]', int8x8_t]
@@ -5813,7 +5813,7 @@ intrinsics:
     arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"]
     return_type: "{neon_type[1]}"
     attr: [*neon-stable]
-    assert_instr: [usubl]
+    assert_instr: [usubl2]
     safety: safe
     types:
       - [uint8x16_t, uint16x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]', uint8x8_t]
@@ -6580,7 +6580,6 @@ intrinsics:
               arch: aarch64,arm64ec
 
 
-
   - name: "vmaxnm{neon_type.no}"
     doc: Floating-point Maximum Number (vector)
     arguments: ["a: {neon_type}", "b: {neon_type}"]
@@ -6592,11 +6591,7 @@ intrinsics:
       - float64x1_t
       - float64x2_t
     compose:
-      - LLVMLink:
-          name: "fmaxnm.{neon_type}"
-          links:
-            - link: "llvm.aarch64.neon.fmaxnm.{neon_type}"
-              arch: aarch64,arm64ec
+      - FnCall: [simd_fmax, [a, b]]
 
 
   - name: "vmaxnmh_{type}"
@@ -6611,11 +6606,7 @@ intrinsics:
     types:
       - f16
     compose:
-      - LLVMLink:
-          name: "vmaxh.{neon_type}"
-          links:
-            - link: "llvm.aarch64.neon.fmaxnm.{type}"
-              arch: aarch64,arm64ec
+      - FnCall: ["f16::max", [a, b]]
 
 
   - name: "vminnmh_{type}"
@@ -6630,11 +6621,7 @@ intrinsics:
     types:
       - f16
     compose:
-      - LLVMLink:
-          name: "vminh.{neon_type}"
-          links:
-            - link: "llvm.aarch64.neon.fminnm.{type}"
-              arch: aarch64,arm64ec
+      - FnCall: ["f16::min", [a, b]]
 
 
   - name: "vmaxnmv{neon_type[0].no}"
@@ -6648,11 +6635,7 @@ intrinsics:
       - [float32x2_t, f32]
       - [float64x2_t, f64]
     compose:
-      - LLVMLink:
-          name: "fmaxnmv.{neon_type[0]}"
-          links:
-            - link: "llvm.aarch64.neon.fmaxnmv.{type[1]}.{neon_type[0]}"
-              arch: aarch64,arm64ec
+      - FnCall: [simd_reduce_max, [a]]
 
   - name: "vmaxnmv{neon_type[0].no}"
     doc: Floating-point maximum number across vector
@@ -6664,11 +6647,7 @@ intrinsics:
     types:
       - [float32x4_t, f32]
     compose:
-      - LLVMLink:
-          name: "fmaxnmv.{neon_type[0]}"
-          links:
-            - link: "llvm.aarch64.neon.fmaxnmv.{type[1]}.{neon_type[0]}"
-              arch: aarch64,arm64ec
+      - FnCall: [simd_reduce_max, [a]]
 
 
   - name: "vmaxnmv{neon_type[0].no}"
@@ -6684,11 +6663,7 @@ intrinsics:
       - [float16x4_t, f16]
       - [float16x8_t, f16]
     compose:
-      - LLVMLink:
-          name: "fmaxnmv.{neon_type[0]}"
-          links:
-            - link: "llvm.aarch64.neon.fmaxnmv.{type[1]}.{neon_type[0]}"
-              arch: aarch64,arm64ec
+      - FnCall: [simd_reduce_max, [a]]
 
 
   - name: "vminnmv{neon_type[0].no}"
@@ -6704,11 +6679,7 @@ intrinsics:
       - [float16x4_t, f16]
       - [float16x8_t, f16]
     compose:
-      - LLVMLink:
-          name: "fminnmv.{neon_type[0]}"
-          links:
-            - link: "llvm.aarch64.neon.fminnmv.{type[1]}.{neon_type[0]}"
-              arch: aarch64,arm64ec
+      - FnCall: [simd_reduce_min, [a]]
 
 
   - name: "vmaxv{neon_type[0].no}"
@@ -6814,11 +6785,7 @@ intrinsics:
       - float64x1_t
       - float64x2_t
     compose:
-      - LLVMLink:
-          name: "fminnm.{neon_type}"
-          links:
-            - link: "llvm.aarch64.neon.fminnm.{neon_type}"
-              arch: aarch64,arm64ec
+      - FnCall: [simd_fmin, [a, b]]
 
   - name: "vminnmv{neon_type[0].no}"
     doc: "Floating-point minimum number across vector"
@@ -6832,11 +6799,7 @@ intrinsics:
       - [float32x2_t, "f32"]
       - [float64x2_t, "f64"]
     compose:
-      - LLVMLink:
-          name: "vminnmv.{neon_type[0]}"
-          links:
-            - link: "llvm.aarch64.neon.fminnmv.{type[1]}.{neon_type[0]}"
-              arch: aarch64,arm64ec
+      - FnCall: [simd_reduce_min, [a]]
 
   - name: "vminnmv{neon_type[0].no}"
     doc: "Floating-point minimum number across vector"
@@ -6849,11 +6812,7 @@ intrinsics:
     types:
       - [float32x4_t, "f32"]
     compose:
-      - LLVMLink:
-          name: "vminnmv.{neon_type[0]}"
-          links:
-            - link: "llvm.aarch64.neon.fminnmv.{type[1]}.{neon_type[0]}"
-              arch: aarch64,arm64ec
+      - FnCall: [simd_reduce_min, [a]]
 
   - name: "vmovl_high{neon_type[0].noq}"
     doc: Vector move
@@ -9950,7 +9909,7 @@ intrinsics:
     return_type: "{neon_type[0]}"
     attr:
       - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']]
-      - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [uabal]]}]]
+      - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [uabal2]]}]]
     safety: safe
     types:
       - [uint16x8_t, uint8x16_t, uint8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]', '[8, 9, 10, 11, 12, 13, 14, 15]']
@@ -9977,7 +9936,7 @@ intrinsics:
     return_type: "{neon_type[0]}"
     attr:
       - *neon-stable
-      - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [sabal]]}]]
+      - FnCall: [cfg_attr, [{FnCall: [all, [test, {FnCall: [not, ['target_env = "msvc"']]}]]}, {FnCall: [assert_instr, [sabal2]]}]]
     safety: safe
     types:
       - [int16x8_t, int8x16_t, int8x16_t, '[8, 9, 10, 11, 12, 13, 14, 15]', int8x8_t, uint8x8_t]
@@ -11386,7 +11345,7 @@ intrinsics:
     arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"]
     return_type: "{neon_type[1]}"
     attr:
-      - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [uabdl]]}]]
+      - FnCall: [cfg_attr, [test, {FnCall: [assert_instr, [uabdl2]]}]]
       - FnCall: [stable, ['feature = "neon_intrinsics"', 'since = "1.59.0"']]
     safety: safe
     types:
@@ -13023,6 +12982,26 @@ intrinsics:
             - link: "llvm.aarch64.crc32cx"
               arch: aarch64,arm64ec
 
+  - name: "vabsd_s64"
+    doc: "Absolute Value (wrapping)."
+    arguments: ["a: {type[1]}"]
+    return_type: "{type[1]}"
+    attr:
+      - *neon-stable
+    assert_instr: [abs]
+    safety: safe
+    types:
+      - [i64, i64]
+    compose:
+      # This is behaviorally equivalent to `i64::wrapping_abs`, but keeps the value in a SIMD
+      # register. That can be beneficial when combined with other instructions. This LLVM
+      # issue provides some extra context https://github.com/llvm/llvm-project/issues/148388.
+      - LLVMLink:
+          name: "vabsd_s64"
+          links:
+             - link: "llvm.aarch64.neon.abs.i64"
+               arch: aarch64,arm64ec
+
   - name: "{type[0]}"
     doc: "Absolute Value (wrapping)."
     arguments: ["a: {type[1]}"]
@@ -13032,15 +13011,18 @@ intrinsics:
     assert_instr: [abs]
     safety: safe
     types:
-      - ['vabsd_s64', i64, i64]
       - ['vabs_s64', int64x1_t, v1i64]
       - ['vabsq_s64', int64x2_t, v2i64]
     compose:
-      - LLVMLink:
-          name: "{type[0]}"
-          links:
-            - link: "llvm.aarch64.neon.abs.{type[2]}"
-              arch: aarch64,arm64ec
+      - Let:
+          - neg
+          - "{type[1]}"
+          - FnCall: [simd_neg, [a]]
+      - Let:
+          - mask
+          - "{type[1]}"
+          - FnCall: [simd_ge, [a, neg]]
+      - FnCall: [simd_select, [mask, a, neg]]
 
   - name: "vuqadd{neon_type[0].no}"
     doc: "Signed saturating Accumulate of Unsigned value."
@@ -13142,11 +13124,7 @@ intrinsics:
     types:
       - [int64x2_t, i64]
     compose:
-      - FnCall:
-          - transmute
-          - - FnCall:
-                - "vaddvq_u64"
-                - - FnCall: [transmute, [a]]
+      - FnCall: [simd_reduce_add_unordered, [a]]
 
   - name: "vpaddd_u64"
     doc: "Add pairwise"
@@ -13159,7 +13137,7 @@ intrinsics:
     types:
       - [uint64x2_t, u64]
     compose:
-      - FnCall: [vaddvq_u64, [a]]
+      - FnCall: [simd_reduce_add_unordered, [a]]
 
   - name: "vaddv{neon_type[0].no}"
     doc: "Add across vector"
@@ -13176,11 +13154,7 @@ intrinsics:
       - [int16x8_t, i16]
       - [int32x4_t, i32]
     compose:
-      - LLVMLink:
-          name: "vaddv{neon_type[0].no}"
-          links:
-            - link: "llvm.aarch64.neon.saddv.{type[1]}.{neon_type[0]}"
-              arch: aarch64,arm64ec
+      - FnCall: [simd_reduce_add_unordered, [a]]
 
   - name: "vaddv{neon_type[0].no}"
     doc: "Add across vector"
@@ -13193,11 +13167,7 @@ intrinsics:
     types:
       - [int32x2_t, i32]
     compose:
-      - LLVMLink:
-          name: "vaddv{neon_type[0].no}"
-          links:
-            - link: "llvm.aarch64.neon.saddv.i32.{neon_type[0]}"
-              arch: aarch64,arm64ec
+      - FnCall: [simd_reduce_add_unordered, [a]]
 
   - name: "vaddv{neon_type[0].no}"
     doc: "Add across vector"
@@ -13210,11 +13180,7 @@ intrinsics:
     types:
       - [int64x2_t, i64]
     compose:
-      - LLVMLink:
-          name: "vaddv{neon_type[0].no}"
-          links:
-            - link: "llvm.aarch64.neon.saddv.i64.{neon_type[0]}"
-              arch: aarch64,arm64ec
+      - FnCall: [simd_reduce_add_unordered, [a]]
 
   - name: "vaddv{neon_type[0].no}"
     doc: "Add across vector"
@@ -13231,11 +13197,7 @@ intrinsics:
       - [uint16x8_t, u16]
       - [uint32x4_t, u32]
     compose:
-      - LLVMLink:
-          name: "vaddv{neon_type[0].no}"
-          links:
-            - link: "llvm.aarch64.neon.uaddv.{type[1]}.{neon_type[0]}"
-              arch: aarch64,arm64ec
+      - FnCall: [simd_reduce_add_unordered, [a]]
 
   - name: "vaddv{neon_type[0].no}"
     doc: "Add across vector"
@@ -13248,11 +13210,7 @@ intrinsics:
     types:
       - [uint32x2_t, u32, i32]
     compose:
-      - LLVMLink:
-          name: "vaddv{neon_type[0].no}"
-          links:
-            - link: "llvm.aarch64.neon.uaddv.{type[2]}.{neon_type[0]}"
-              arch: aarch64,arm64ec
+      - FnCall: [simd_reduce_add_unordered, [a]]
 
   - name: "vaddv{neon_type[0].no}"
     doc: "Add across vector"
@@ -13265,11 +13223,7 @@ intrinsics:
     types:
       - [uint64x2_t, u64, i64]
     compose:
-      - LLVMLink:
-          name: "vaddv{neon_type[0].no}"
-          links:
-            - link: "llvm.aarch64.neon.uaddv.{type[2]}.{neon_type[0]}"
-              arch: aarch64,arm64ec
+      - FnCall: [simd_reduce_add_unordered, [a]]
 
   - name: "vaddlv{neon_type[0].no}"
     doc: "Signed Add Long across Vector"
@@ -13327,11 +13281,7 @@ intrinsics:
       - [int16x8_t, i16, 'smaxv']
       - [int32x4_t, i32, 'smaxv']
     compose:
-      - LLVMLink:
-          name: "vmaxv{neon_type[0].no}"
-          links:
-            - link: "llvm.aarch64.neon.smaxv.{type[1]}.{neon_type[0]}"
-              arch: aarch64,arm64ec
+      - FnCall: [simd_reduce_max, [a]]
 
   - name: "vmaxv{neon_type[0].no}"
     doc: "Horizontal vector max."
@@ -13349,11 +13299,7 @@ intrinsics:
       - [uint16x8_t, u16, 'umaxv']
       - [uint32x4_t, u32, 'umaxv']
     compose:
-      - LLVMLink:
-          name: "vmaxv{neon_type[0].no}"
-          links:
-            - link: "llvm.aarch64.neon.umaxv.{type[1]}.{neon_type[0]}"
-              arch: aarch64,arm64ec
+      - FnCall: [simd_reduce_max, [a]]
 
   - name: "vmaxv{neon_type[0].no}"
     doc: "Horizontal vector max."
@@ -13390,11 +13336,7 @@ intrinsics:
       - [int16x8_t, i16, 'sminv']
       - [int32x4_t, i32, 'sminv']
     compose:
-      - LLVMLink:
-          name: "vminv{neon_type[0].no}"
-          links:
-            - link: "llvm.aarch64.neon.sminv.{type[1]}.{neon_type[0]}"
-              arch: aarch64,arm64ec
+      - FnCall: [simd_reduce_min, [a]]
 
   - name: "vminv{neon_type[0].no}"
     doc: "Horizontal vector min."
@@ -13412,11 +13354,7 @@ intrinsics:
       - [uint16x8_t, u16, 'uminv']
       - [uint32x4_t, u32, 'uminv']
     compose:
-      - LLVMLink:
-          name: "vminv{neon_type[0].no}"
-          links:
-            - link: "llvm.aarch64.neon.uminv.{type[1]}.{neon_type[0]}"
-              arch: aarch64,arm64ec
+      - FnCall: [simd_reduce_min, [a]]
 
   - name: "vminv{neon_type[0].no}"
     doc: "Horizontal vector min."
diff --git a/library/stdarch/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml b/library/stdarch/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml
index 07959cf380e..c96c6e2a0c0 100644
--- a/library/stdarch/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml
+++ b/library/stdarch/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml
@@ -7135,13 +7135,8 @@ intrinsics:
       - int32x2_t
       - int32x4_t
     compose:
-      - LLVMLink:
-          name: "smax.{neon_type}"
-          links:
-            - link: "llvm.arm.neon.vmaxs.{neon_type}"
-              arch: arm
-            - link: "llvm.aarch64.neon.smax.{neon_type}"
-              arch: aarch64,arm64ec
+      - Let: [mask, "{neon_type}", {FnCall: [simd_ge, [a, b]]}]
+      - FnCall: [simd_select, [mask, a, b]]
 
   - name: "vmax{neon_type.no}"
     doc: Maximum (vector)
@@ -7162,13 +7157,8 @@ intrinsics:
       - uint32x2_t
       - uint32x4_t
     compose:
-      - LLVMLink:
-          name: "smax.{neon_type}"
-          links:
-            - link: "llvm.arm.neon.vmaxu.{neon_type}"
-              arch: arm
-            - link: "llvm.aarch64.neon.umax.{neon_type}"
-              arch: aarch64,arm64ec
+      - Let: [mask, "{neon_type}", {FnCall: [simd_ge, [a, b]]}]
+      - FnCall: [simd_select, [mask, a, b]]
 
   - name: "vmax{neon_type.no}"
     doc: Maximum (vector)
@@ -7233,13 +7223,7 @@ intrinsics:
       - float32x2_t
       - float32x4_t
     compose:
-      - LLVMLink:
-          name: "fmaxnm.{neon_type}"
-          links:
-            - link: "llvm.arm.neon.vmaxnm.{neon_type}"
-              arch: arm
-            - link: "llvm.aarch64.neon.fmaxnm.{neon_type}"
-              arch: aarch64,arm64ec
+      - FnCall: [simd_fmax, [a, b]]
 
 
   - name: "vmaxnm{neon_type.no}"
@@ -7257,13 +7241,7 @@ intrinsics:
       - float16x4_t
       - float16x8_t
     compose:
-      - LLVMLink:
-          name: "fmaxnm.{neon_type}"
-          links:
-            - link: "llvm.arm.neon.vmaxnm.{neon_type}"
-              arch: arm
-            - link: "llvm.aarch64.neon.fmaxnm.{neon_type}"
-              arch: aarch64,arm64ec
+      - FnCall: [simd_fmax, [a, b]]
 
 
   - name: "vminnm{neon_type.no}"
@@ -7281,13 +7259,7 @@ intrinsics:
       - float16x4_t
       - float16x8_t
     compose:
-      - LLVMLink:
-          name: "fminnm.{neon_type}"
-          links:
-            - link: "llvm.arm.neon.vminnm.{neon_type}"
-              arch: arm
-            - link: "llvm.aarch64.neon.fminnm.{neon_type}"
-              arch: aarch64,arm64ec
+      - FnCall: [simd_fmin, [a, b]]
 
 
   - name: "vmin{neon_type.no}"
@@ -7309,13 +7281,8 @@ intrinsics:
       - int32x2_t
       - int32x4_t
     compose:
-      - LLVMLink:
-          name: "smin.{neon_type}"
-          links:
-            - link: "llvm.arm.neon.vmins.{neon_type}"
-              arch: arm
-            - link: "llvm.aarch64.neon.smin.{neon_type}"
-              arch: aarch64,arm64ec
+      - Let: [mask, "{neon_type}", {FnCall: [simd_le, [a, b]]}]
+      - FnCall: [simd_select, [mask, a, b]]
 
   - name: "vmin{neon_type.no}"
     doc: "Minimum (vector)"
@@ -7336,13 +7303,8 @@ intrinsics:
       - uint32x2_t
       - uint32x4_t
     compose:
-      - LLVMLink:
-          name: "umin.{neon_type}"
-          links:
-            - link: "llvm.arm.neon.vminu.{neon_type}"
-              arch: arm
-            - link: "llvm.aarch64.neon.umin.{neon_type}"
-              arch: aarch64,arm64ec
+      - Let: [mask, "{neon_type}", {FnCall: [simd_le, [a, b]]}]
+      - FnCall: [simd_select, [mask, a, b]]
 
   - name: "vmin{neon_type.no}"
     doc: "Minimum (vector)"
@@ -7408,13 +7370,7 @@ intrinsics:
       - float32x2_t
       - float32x4_t
     compose:
-      - LLVMLink:
-          name: "fminnm.{neon_type}"
-          links:
-            - link: "llvm.arm.neon.vminnm.{neon_type}"
-              arch: arm
-            - link: "llvm.aarch64.neon.fminnm.{neon_type}"
-              arch: aarch64,arm64ec
+      - FnCall: [simd_fmin, [a, b]]
 
   - name: "vpadd{neon_type.no}"
     doc: Floating-point add pairwise
@@ -7874,9 +7830,9 @@ intrinsics:
     static_defs: ['const N: i32']
     safety: safe
     types:
-      - [int16x8_t, int8x8_t, 'N >= 1 && N <= 8', 'const { int16x8_t([-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16]) }']
-      - [int32x4_t, int16x4_t, 'N >= 1 && N <= 16', 'const { int32x4_t([-N as i32, -N as i32, -N as i32, -N as i32]) }']
-      - [int64x2_t, int32x2_t, 'N >= 1 && N <= 32', 'const { int64x2_t([-N as i64, -N as i64]) }']
+      - [int16x8_t, int8x8_t, 'N >= 1 && N <= 8', 'const { int16x8_t([-N as i16; 8]) }']
+      - [int32x4_t, int16x4_t, 'N >= 1 && N <= 16', 'const { int32x4_t([-N; 4]) }']
+      - [int64x2_t, int32x2_t, 'N >= 1 && N <= 32', 'const { int64x2_t([-N as i64; 2]) }']
     compose:
       - FnCall: [static_assert!, ["{type[2]}"]]
       - LLVMLink:
@@ -7929,9 +7885,9 @@ intrinsics:
     static_defs: ['const N: i32']
     safety: safe
     types:
-      - [int16x8_t, uint8x8_t, 'N >= 1 && N <= 8', 'const { int16x8_t([-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16]) }']
-      - [int32x4_t, uint16x4_t, 'N >= 1 && N <= 16', 'const { int32x4_t([-N as i32, -N as i32, -N as i32, -N as i32]) }']
-      - [int64x2_t, uint32x2_t, 'N >= 1 && N <= 32', 'const { int64x2_t([-N as i64, -N as i64]) }']
+      - [int16x8_t, uint8x8_t, 'N >= 1 && N <= 8', 'const { int16x8_t([-N as i16; 8]) }']
+      - [int32x4_t, uint16x4_t, 'N >= 1 && N <= 16', 'const { int32x4_t([-N; 4]) }']
+      - [int64x2_t, uint32x2_t, 'N >= 1 && N <= 32', 'const { int64x2_t([-N as i64; 2]) }']
     compose:
       - FnCall: [static_assert!, ["{type[2]}"]]
       - LLVMLink:
@@ -8105,9 +8061,9 @@ intrinsics:
     static_defs: ['const N: i32']
     safety: safe
     types:
-      - [int16x8_t, int8x8_t, 'N >= 1 && N <= 8', 'const { int16x8_t([-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16]) }']
-      - [int32x4_t, int16x4_t, 'N >= 1 && N <= 16', 'const { int32x4_t([-N as i32, -N as i32, -N as i32, -N as i32]) }']
-      - [int64x2_t, int32x2_t, 'N >= 1 && N <= 32', 'const { int64x2_t([-N as i64, -N as i64]) }']
+      - [int16x8_t, int8x8_t, 'N >= 1 && N <= 8', 'const { int16x8_t([-N as i16; 8]) }']
+      - [int32x4_t, int16x4_t, 'N >= 1 && N <= 16', 'const { int32x4_t([-N; 4]) }']
+      - [int64x2_t, int32x2_t, 'N >= 1 && N <= 32', 'const { int64x2_t([-N as i64; 2]) }']
     compose:
       - FnCall: [static_assert!, ["{type[2]}"]]
       - LLVMLink:
@@ -8215,9 +8171,9 @@ intrinsics:
     static_defs: ['const N: i32']
     safety: safe
     types:
-      - [int16x8_t, uint8x8_t, 'N >= 1 && N <= 8', 'const { int16x8_t([-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16]) }']
-      - [int32x4_t, uint16x4_t, 'N >= 1 && N <= 16', 'const { int32x4_t([-N as i32, -N as i32, -N as i32, -N as i32]) }']
-      - [int64x2_t, uint32x2_t, 'N >= 1 && N <= 32', 'const { int64x2_t([-N as i64, -N as i64]) }']
+      - [int16x8_t, uint8x8_t, 'N >= 1 && N <= 8', 'const { int16x8_t([-N as i16; 8]) }']
+      - [int32x4_t, uint16x4_t, 'N >= 1 && N <= 16', 'const { int32x4_t([-N; 4]) }']
+      - [int64x2_t, uint32x2_t, 'N >= 1 && N <= 32', 'const { int64x2_t([-N as i64; 2]) }']
     compose:
       - FnCall: [static_assert!, ["{type[2]}"]]
       - LLVMLink:
@@ -8939,9 +8895,9 @@ intrinsics:
     static_defs: ['const N: i32']
     safety: safe
     types:
-      - [int16x8_t, int8x8_t, 'N >= 1 && N <= 8', 'const { int16x8_t([-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16]) }']
-      - [int32x4_t, int16x4_t, 'N >= 1 && N <= 16', 'const { int32x4_t([-N as i32, -N as i32, -N as i32, -N as i32]) }']
-      - [int64x2_t, int32x2_t, 'N >= 1 && N <= 32', 'const { int64x2_t([-N as i64, -N as i64]) }']
+      - [int16x8_t, int8x8_t, 'N >= 1 && N <= 8', 'const { int16x8_t([-N as i16; 8]) }']
+      - [int32x4_t, int16x4_t, 'N >= 1 && N <= 16', 'const { int32x4_t([-N; 4]) }']
+      - [int64x2_t, int32x2_t, 'N >= 1 && N <= 32', 'const { int64x2_t([-N as i64; 2]) }']
     compose:
       - FnCall: [static_assert!, ["{type[2]}"]]
       - LLVMLink:
@@ -9576,7 +9532,8 @@ intrinsics:
     attr:
       - *neon-v7
       - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vtrn]]}]]
-      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [trn]]}]]
+      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [trn1]]}]]
+      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [trn2]]}]]
       - *neon-not-arm-stable
       - *neon-cfg-arm-unstable
     safety: safe
@@ -9617,7 +9574,8 @@ intrinsics:
     attr:
       - *neon-v7
       - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vtrn]]}]]
-      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [trn]]}]]
+      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [trn1]]}]]
+      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [trn2]]}]]
       - *neon-fp16
       - *neon-unstable-f16
     safety: safe
@@ -9645,7 +9603,8 @@ intrinsics:
     attr:
       - *neon-v7
       - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vtrn]]}]]
-      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [zip]]}]]
+      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [zip1]]}]]
+      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [zip2]]}]]
       - *neon-not-arm-stable
       - *neon-cfg-arm-unstable
     safety: safe
@@ -9673,7 +9632,8 @@ intrinsics:
     attr:
       - *neon-v7
       - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vorr]]}]]
-      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [zip]]}]]
+      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [zip1]]}]]
+      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [zip2]]}]]
       - *neon-not-arm-stable
       - *neon-cfg-arm-unstable
     safety: safe
@@ -9707,7 +9667,8 @@ intrinsics:
     attr:
       - *neon-v7
       - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vtrn]]}]]
-      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [zip]]}]]
+      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [zip1]]}]]
+      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [zip2]]}]]
       - *neon-not-arm-stable
       - *neon-cfg-arm-unstable
     safety: safe
@@ -9735,7 +9696,8 @@ intrinsics:
     attr:
       - *neon-v7
       - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vzip]]}]]
-      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [zip]]}]]
+      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [zip1]]}]]
+      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [zip2]]}]]
       - *neon-not-arm-stable
       - *neon-cfg-arm-unstable
     safety: safe
@@ -9767,7 +9729,8 @@ intrinsics:
     attr:
       - *neon-v7
       - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, ['"vzip.16"']]}]]
-      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [zip]]}]]
+      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [zip1]]}]]
+      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [zip2]]}]]
       - *neon-fp16
       - *neon-unstable-f16
     safety: safe
@@ -9794,7 +9757,8 @@ intrinsics:
     attr:
       - *neon-v7
       - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vuzp]]}]]
-      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [uzp]]}]]
+      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [uzp1]]}]]
+      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [uzp2]]}]]
       - *neon-not-arm-stable
       - *neon-cfg-arm-unstable
     safety: safe
@@ -9835,7 +9799,8 @@ intrinsics:
     attr:
       - *neon-v7
       - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vuzp]]}]]
-      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [uzp]]}]]
+      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [uzp1]]}]]
+      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [uzp2]]}]]
       - *neon-fp16
       - *neon-unstable-f16
     safety: safe
@@ -9863,7 +9828,8 @@ intrinsics:
     attr:
       - *neon-v7
       - FnCall: [cfg_attr, [*test-is-arm, {FnCall: [assert_instr, [vtrn]]}]]
-      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [zip]]}]]
+      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [zip1]]}]]
+      - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [zip2]]}]]
       - *neon-not-arm-stable
       - *neon-cfg-arm-unstable
     safety: safe
@@ -12881,13 +12847,16 @@ intrinsics:
       - int16x8_t
       - int32x4_t
     compose:
-      - LLVMLink:
-          name: "vabs{neon_type.no}"
-          links:
-            - link: "llvm.aarch64.neon.abs.{neon_type}"
-              arch: aarch64,arm64ec
-            - link: "llvm.arm.neon.vabs.{neon_type}"
-              arch: arm
+      - Let:
+          - neg
+          - "{neon_type}"
+          - FnCall: [simd_neg, [a]]
+      - Let:
+          - mask
+          - "{neon_type}"
+          - FnCall: [simd_ge, [a, neg]]
+      - FnCall: [simd_select, [mask, a, neg]]
+
 
   - name: "vpmin{neon_type.no}"
     doc: "Folding minimum of adjacent pairs"
@@ -13862,8 +13831,8 @@ intrinsics:
       - [int8x16_t, '8',  '1 <= N && N <= 8',  'v16i8', 'int8x16_t::splat', '-N as i8']
       - [int16x4_t, '16', '1 <= N && N <= 16', 'v4i16', 'int16x4_t::splat', '-N as i16']
       - [int16x8_t, '16', '1 <= N && N <= 16', 'v8i16', 'int16x8_t::splat', '-N as i16']
-      - [int32x2_t, '32', '1 <= N && N <= 32', 'v2i32', 'int32x2_t::splat', '-N as i32']
-      - [int32x4_t, '32', '1 <= N && N <= 32', 'v4i32', 'int32x4_t::splat', '-N as i32']
+      - [int32x2_t, '32', '1 <= N && N <= 32', 'v2i32', 'int32x2_t::splat', '-N']
+      - [int32x4_t, '32', '1 <= N && N <= 32', 'v4i32', 'int32x4_t::splat', '-N']
       - [int64x1_t, '64', '1 <= N && N <= 64', 'v1i64', 'int64x1_t::splat', '-N as i64']
       - [int64x2_t, '64', '1 <= N && N <= 64', 'v2i64', 'int64x2_t::splat', '-N as i64']
     compose:
@@ -13891,8 +13860,8 @@ intrinsics:
       - [uint8x16_t, "neon,v7", '8',  'static_assert_uimm_bits!', 'N, 3',    'v16i8', 'int8x16_t::splat', 'N as i8']
       - [uint16x4_t, "neon,v7", '16', 'static_assert_uimm_bits!', 'N, 4',    'v4i16', 'int16x4_t::splat', 'N as i16']
       - [uint16x8_t, "neon,v7", '16', 'static_assert_uimm_bits!', 'N, 4',    'v8i16', 'int16x8_t::splat', 'N as i16']
-      - [uint32x2_t, "neon,v7", '32', 'static_assert!', 'N >= 0 && N <= 31', 'v2i32', 'int32x2_t::splat', 'N as i32']
-      - [uint32x4_t, "neon,v7", '32', 'static_assert!', 'N >= 0 && N <= 31', 'v4i32', 'int32x4_t::splat', 'N as i32']
+      - [uint32x2_t, "neon,v7", '32', 'static_assert!', 'N >= 0 && N <= 31', 'v2i32', 'int32x2_t::splat', 'N']
+      - [uint32x4_t, "neon,v7", '32', 'static_assert!', 'N >= 0 && N <= 31', 'v4i32', 'int32x4_t::splat', 'N']
       - [uint64x1_t, "neon,v7", '64', 'static_assert!', 'N >= 0 && N <= 63', 'v1i64', 'int64x1_t::splat', 'N as i64']
       - [uint64x2_t, "neon,v7", '64', 'static_assert!', 'N >= 0 && N <= 63', 'v2i64', 'int64x2_t::splat', 'N as i64']
       - [poly8x8_t,  "neon,v7", '8',  'static_assert_uimm_bits!', 'N, 3',     'v8i8', 'int8x8_t::splat',  'N as i8']
@@ -14138,6 +14107,7 @@ intrinsics:
     doc: "Load one single-element structure and Replicate to all lanes (of one register)."
     arguments: ["ptr: {type[1]}"]
     return_type: "{neon_type[2]}"
+    big_endian_inverse: false
     attr:
       - *neon-v7
       - FnCall: [cfg_attr, [*test-is-arm, { FnCall: [assert_instr, ['"{type[3]}"']] }  ]]
@@ -14147,40 +14117,36 @@ intrinsics:
     safety:
       unsafe: [neon]
     types:
-      - ['vld1_dup_s8', '*const i8', 'int8x8_t', 'vld1.8', 'ld1r', 'vld1_lane_s8::<0>', 'i8x8::splat(0)', '[0, 0, 0, 0, 0, 0, 0, 0]']
-      - ['vld1_dup_u8', '*const u8', 'uint8x8_t', 'vld1.8', 'ld1r', 'vld1_lane_u8::<0>', 'u8x8::splat(0)', '[0, 0, 0, 0, 0, 0, 0, 0]']
-      - ['vld1_dup_p8', '*const p8', 'poly8x8_t', 'vld1.8', 'ld1r', 'vld1_lane_p8::<0>', 'u8x8::splat(0)', '[0, 0, 0, 0, 0, 0, 0, 0]']
+      - ['vld1_dup_s8', '*const i8', 'int8x8_t', 'vld1.8', 'ld1r', 'i8x8::splat']
+      - ['vld1_dup_u8', '*const u8', 'uint8x8_t', 'vld1.8', 'ld1r', 'u8x8::splat']
+      - ['vld1_dup_p8', '*const p8', 'poly8x8_t', 'vld1.8', 'ld1r', 'u8x8::splat']
 
-      - ['vld1q_dup_s8', '*const i8', 'int8x16_t', 'vld1.8', 'ld1r', 'vld1q_lane_s8::<0>', 'i8x16::splat(0)', '[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]']
-      - ['vld1q_dup_u8', '*const u8', 'uint8x16_t', 'vld1.8', 'ld1r', 'vld1q_lane_u8::<0>', 'u8x16::splat(0)', '[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]']
-      - ['vld1q_dup_p8', '*const p8', 'poly8x16_t', 'vld1.8', 'ld1r', 'vld1q_lane_p8::<0>', 'u8x16::splat(0)', '[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]']
+      - ['vld1q_dup_s8', '*const i8', 'int8x16_t', 'vld1.8', 'ld1r', 'i8x16::splat']
+      - ['vld1q_dup_u8', '*const u8', 'uint8x16_t', 'vld1.8', 'ld1r', 'u8x16::splat']
+      - ['vld1q_dup_p8', '*const p8', 'poly8x16_t', 'vld1.8', 'ld1r', 'u8x16::splat']
 
-      - ['vld1_dup_s16', '*const i16', 'int16x4_t', 'vld1.16', 'ld1r', 'vld1_lane_s16::<0>', 'i16x4::splat(0)', '[0, 0, 0, 0]']
-      - ['vld1_dup_u16', '*const u16', 'uint16x4_t', 'vld1.16', 'ld1r', 'vld1_lane_u16::<0>', 'u16x4::splat(0)', '[0, 0, 0, 0]']
-      - ['vld1_dup_p16', '*const p16', 'poly16x4_t', 'vld1.16', 'ld1r', 'vld1_lane_p16::<0>', 'u16x4::splat(0)', '[0, 0, 0, 0]']
+      - ['vld1_dup_s16', '*const i16', 'int16x4_t', 'vld1.16', 'ld1r', 'i16x4::splat']
+      - ['vld1_dup_u16', '*const u16', 'uint16x4_t', 'vld1.16', 'ld1r', 'u16x4::splat']
+      - ['vld1_dup_p16', '*const p16', 'poly16x4_t', 'vld1.16', 'ld1r', 'u16x4::splat']
 
-      - ['vld1q_dup_s16', '*const i16', 'int16x8_t', 'vld1.16', 'ld1r', 'vld1q_lane_s16::<0>', 'i16x8::splat(0)', '[0, 0, 0, 0, 0, 0, 0, 0]']
-      - ['vld1q_dup_u16', '*const u16', 'uint16x8_t', 'vld1.16', 'ld1r', 'vld1q_lane_u16::<0>', 'u16x8::splat(0)', '[0, 0, 0, 0, 0, 0, 0, 0]']
-      - ['vld1q_dup_p16', '*const p16', 'poly16x8_t', 'vld1.16', 'ld1r', 'vld1q_lane_p16::<0>', 'u16x8::splat(0)', '[0, 0, 0, 0, 0, 0, 0, 0]']
+      - ['vld1q_dup_s16', '*const i16', 'int16x8_t', 'vld1.16', 'ld1r', 'i16x8::splat']
+      - ['vld1q_dup_u16', '*const u16', 'uint16x8_t', 'vld1.16', 'ld1r', 'u16x8::splat']
+      - ['vld1q_dup_p16', '*const p16', 'poly16x8_t', 'vld1.16', 'ld1r', 'u16x8::splat']
 
-      - ['vld1_dup_s32', '*const i32', 'int32x2_t', 'vld1.32', 'ld1r', 'vld1_lane_s32::<0>', 'i32x2::splat(0)', '[0, 0]']
-      - ['vld1_dup_u32', '*const u32', 'uint32x2_t', 'vld1.32', 'ld1r', 'vld1_lane_u32::<0>', 'u32x2::splat(0)', '[0, 0]']
-      - ['vld1_dup_f32', '*const f32', 'float32x2_t', 'vld1.32', 'ld1r', 'vld1_lane_f32::<0>', 'f32x2::splat(0.0)', '[0, 0]']
+      - ['vld1_dup_s32', '*const i32', 'int32x2_t', 'vld1.32', 'ld1r', 'i32x2::splat']
+      - ['vld1_dup_u32', '*const u32', 'uint32x2_t', 'vld1.32', 'ld1r', 'u32x2::splat']
+      - ['vld1_dup_f32', '*const f32', 'float32x2_t', 'vld1.32', 'ld1r', 'f32x2::splat']
 
-      - ['vld1q_dup_s32', '*const i32', 'int32x4_t', 'vld1.32', 'ld1r', 'vld1q_lane_s32::<0>', 'i32x4::splat(0)', '[0, 0, 0, 0]']
-      - ['vld1q_dup_u32', '*const u32', 'uint32x4_t', 'vld1.32', 'ld1r', 'vld1q_lane_u32::<0>', 'u32x4::splat(0)', '[0, 0, 0, 0]']
-      - ['vld1q_dup_f32', '*const f32', 'float32x4_t', 'vld1.32', 'ld1r', 'vld1q_lane_f32::<0>', 'f32x4::splat(0.0)', '[0, 0, 0, 0]']
+      - ['vld1q_dup_s32', '*const i32', 'int32x4_t', 'vld1.32', 'ld1r', 'i32x4::splat']
+      - ['vld1q_dup_u32', '*const u32', 'uint32x4_t', 'vld1.32', 'ld1r', 'u32x4::splat']
+      - ['vld1q_dup_f32', '*const f32', 'float32x4_t', 'vld1.32', 'ld1r', 'f32x4::splat']
 
-      - ['vld1q_dup_s64', '*const i64', 'int64x2_t', 'vldr', 'ld1', 'vld1q_lane_s64::<0>', 'i64x2::splat(0)', '[0, 0]']
-      - ['vld1q_dup_u64', '*const u64', 'uint64x2_t', 'vldr', 'ld1', 'vld1q_lane_u64::<0>', 'u64x2::splat(0)', '[0, 0]']
+      - ['vld1q_dup_s64', '*const i64', 'int64x2_t', 'vldr', 'ld1r', 'i64x2::splat']
+      - ['vld1q_dup_u64', '*const u64', 'uint64x2_t', 'vldr', 'ld1r', 'u64x2::splat']
     compose:
-      - Let:
-          - x
-          - FnCall:
-              - '{type[5]}'
-              - - ptr
-                - FnCall: [transmute, ['{type[6]}']]
-      - FnCall: ['simd_shuffle!', [x, x, '{type[7]}']]
+      - FnCall:
+          - transmute
+          - - FnCall: ['{type[5]}', ["*ptr"]]
 
   - name: "{type[0]}"
     doc: "Absolute difference and accumulate (64-bit)"
diff --git a/library/stdarch/crates/stdarch-gen-arm/src/expression.rs b/library/stdarch/crates/stdarch-gen-arm/src/expression.rs
index 56c94602fff..d5644ef27d4 100644
--- a/library/stdarch/crates/stdarch-gen-arm/src/expression.rs
+++ b/library/stdarch/crates/stdarch-gen-arm/src/expression.rs
@@ -1,5 +1,4 @@
 use itertools::Itertools;
-use lazy_static::lazy_static;
 use proc_macro2::{Literal, Punct, Spacing, TokenStream};
 use quote::{ToTokens, TokenStreamExt, format_ident, quote};
 use regex::Regex;
@@ -7,6 +6,7 @@ use serde::de::{self, MapAccess, Visitor};
 use serde::{Deserialize, Deserializer, Serialize};
 use std::fmt;
 use std::str::FromStr;
+use std::sync::LazyLock;
 
 use crate::intrinsic::Intrinsic;
 use crate::wildstring::WildStringPart;
@@ -374,10 +374,8 @@ impl FromStr for Expression {
     type Err = String;
 
     fn from_str(s: &str) -> Result<Self, Self::Err> {
-        lazy_static! {
-            static ref MACRO_RE: Regex =
-                Regex::new(r"^(?P<name>[\w\d_]+)!\((?P<ex>.*?)\);?$").unwrap();
-        }
+        static MACRO_RE: LazyLock<Regex> =
+            LazyLock::new(|| Regex::new(r"^(?P<name>[\w\d_]+)!\((?P<ex>.*?)\);?$").unwrap());
 
         if s == "SvUndef" {
             Ok(Expression::SvUndef)
diff --git a/library/stdarch/crates/stdarch-gen-arm/src/load_store_tests.rs b/library/stdarch/crates/stdarch-gen-arm/src/load_store_tests.rs
index 5cf39b2e11a..3f3bfed132c 100644
--- a/library/stdarch/crates/stdarch-gen-arm/src/load_store_tests.rs
+++ b/library/stdarch/crates/stdarch-gen-arm/src/load_store_tests.rs
@@ -2,6 +2,7 @@ use std::fs::File;
 use std::io::Write;
 use std::path::PathBuf;
 use std::str::FromStr;
+use std::sync::LazyLock;
 
 use crate::format_code;
 use crate::input::InputType;
@@ -10,7 +11,6 @@ use crate::typekinds::BaseType;
 use crate::typekinds::{ToRepr, TypeKind};
 
 use itertools::Itertools;
-use lazy_static::lazy_static;
 use proc_macro2::TokenStream;
 use quote::{format_ident, quote};
 
@@ -639,8 +639,8 @@ impl LdIntrCharacteristics {
     }
 }
 
-lazy_static! {
-    static ref PREAMBLE: String = format!(
+static PREAMBLE: LazyLock<String> = LazyLock::new(|| {
+    format!(
         r#"#![allow(unused)]
 
 use super::*;
@@ -801,13 +801,11 @@ fn assert_vector_matches_u64(vector: svuint64_t, expected: svuint64_t) {{
     assert!(!svptest_any(defined, cmp))
 }}
 "#
-    );
-}
+    )
+});
 
-lazy_static! {
-    static ref MANUAL_TESTS: String = format!(
-        "#[simd_test(enable = \"sve\")]
-unsafe fn test_ffr() {{
+const MANUAL_TESTS: &str = "#[simd_test(enable = \"sve\")]
+unsafe fn test_ffr() {
     svsetffr();
     let ffr = svrdffr();
     assert_vector_matches_u8(svdup_n_u8_z(ffr, 1), svindex_u8(1, 0));
@@ -816,7 +814,5 @@ unsafe fn test_ffr() {{
     svwrffr(pred);
     let ffr = svrdffr_z(svptrue_b8());
     assert_vector_matches_u8(svdup_n_u8_z(ffr, 1), svdup_n_u8_z(pred, 1));
-}}
-"
-    );
 }
+";
diff --git a/library/stdarch/crates/stdarch-gen-arm/src/typekinds.rs b/library/stdarch/crates/stdarch-gen-arm/src/typekinds.rs
index 7c697cb7c0c..bd47ff2bd15 100644
--- a/library/stdarch/crates/stdarch-gen-arm/src/typekinds.rs
+++ b/library/stdarch/crates/stdarch-gen-arm/src/typekinds.rs
@@ -1,10 +1,10 @@
-use lazy_static::lazy_static;
 use proc_macro2::TokenStream;
 use quote::{ToTokens, TokenStreamExt, quote};
 use regex::Regex;
 use serde_with::{DeserializeFromStr, SerializeDisplay};
 use std::fmt;
 use std::str::FromStr;
+use std::sync::LazyLock;
 
 use crate::context;
 use crate::expression::{Expression, FnCall};
@@ -496,9 +496,9 @@ impl FromStr for VectorType {
     type Err = String;
 
     fn from_str(s: &str) -> Result<Self, Self::Err> {
-        lazy_static! {
-            static ref RE: Regex = Regex::new(r"^(?:(?:sv(?P<sv_ty>(?:uint|int|bool|float)(?:\d+)?))|(?:(?P<ty>(?:uint|int|bool|poly|float)(?:\d+)?)x(?P<lanes>(?:\d+)?)))(?:x(?P<tuple_size>2|3|4))?_t$").unwrap();
-        }
+        static RE: LazyLock<Regex> = LazyLock::new(|| {
+            Regex::new(r"^(?:(?:sv(?P<sv_ty>(?:uint|int|bool|float)(?:\d+)?))|(?:(?P<ty>(?:uint|int|bool|poly|float)(?:\d+)?)x(?P<lanes>(?:\d+)?)))(?:x(?P<tuple_size>2|3|4))?_t$").unwrap()
+        });
 
         if let Some(c) = RE.captures(s) {
             let (base_type, lanes) = Self::sanitise_lanes(
@@ -698,9 +698,8 @@ impl FromStr for BaseType {
     type Err = String;
 
     fn from_str(s: &str) -> Result<Self, Self::Err> {
-        lazy_static! {
-            static ref RE: Regex = Regex::new(r"^(?P<kind>[a-zA-Z]+)(?P<size>\d+)?(_t)?$").unwrap();
-        }
+        static RE: LazyLock<Regex> =
+            LazyLock::new(|| Regex::new(r"^(?P<kind>[a-zA-Z]+)(?P<size>\d+)?(_t)?$").unwrap());
 
         if let Some(c) = RE.captures(s) {
             let kind = c["kind"].parse()?;
diff --git a/library/stdarch/crates/stdarch-gen-arm/src/wildcards.rs b/library/stdarch/crates/stdarch-gen-arm/src/wildcards.rs
index 25aa8034892..6c40d88df45 100644
--- a/library/stdarch/crates/stdarch-gen-arm/src/wildcards.rs
+++ b/library/stdarch/crates/stdarch-gen-arm/src/wildcards.rs
@@ -1,8 +1,7 @@
-use lazy_static::lazy_static;
 use regex::Regex;
 use serde_with::{DeserializeFromStr, SerializeDisplay};
-use std::fmt;
 use std::str::FromStr;
+use std::{fmt, sync::LazyLock};
 
 use crate::{
     fn_suffix::SuffixKind,
@@ -66,9 +65,9 @@ impl FromStr for Wildcard {
     type Err = String;
 
     fn from_str(s: &str) -> Result<Self, Self::Err> {
-        lazy_static! {
-            static ref RE: Regex = Regex::new(r"^(?P<wildcard>\w+?)(?:_x(?P<tuple_size>[2-4]))?(?:\[(?P<index>\d+)\])?(?:\.(?P<modifiers>\w+))?(?:\s+as\s+(?P<scale_to>.*?))?$").unwrap();
-        }
+        static RE: LazyLock<Regex> = LazyLock::new(|| {
+            Regex::new(r"^(?P<wildcard>\w+?)(?:_x(?P<tuple_size>[2-4]))?(?:\[(?P<index>\d+)\])?(?:\.(?P<modifiers>\w+))?(?:\s+as\s+(?P<scale_to>.*?))?$").unwrap()
+        });
 
         if let Some(c) = RE.captures(s) {
             let wildcard_name = &c["wildcard"];