19 files changed, 331 insertions, 81 deletions
diff --git a/tests/codegen/array-cmp.rs b/tests/codegen/array-cmp.rs
index 2565a385b61..0d337655401 100644
--- a/tests/codegen/array-cmp.rs
+++ b/tests/codegen/array-cmp.rs
@@ -1,6 +1,7 @@
 // Ensure the asm for array comparisons is properly optimized.
 
 //@ compile-flags: -C opt-level=2
+//@ needs-deterministic-layouts (checks depend on tuple layout)
 
 #![crate_type = "lib"]
 
@@ -17,3 +18,57 @@ pub fn compare() -> bool {
             [0x00, 0x00, 0x48, 0x41]
         }
 }
+
+// CHECK-LABEL: @array_of_tuple_le
+#[no_mangle]
+pub fn array_of_tuple_le(a: &[(i16, u16); 2], b: &[(i16, u16); 2]) -> bool {
+    // Ensure that, after all the optimizations have run, the happy path just checks
+    // `eq` on each corresponding pair and moves onto the next one if it is.
+    // Then there's a dedup'd comparison for the place that's different.
+    // (As opposed to, say, running a full `[su]cmp` as part of checking equality.)
+
+    // This is written quite specifically because different library code was triggering
+    // <https://github.com/llvm/llvm-project/issues/132678> along the way, so this
+    // has enough checks to make sure that's not happening. It doesn't need to be
+    // *exactly* this IR, but be careful if you ever need to update these checks.
+
+    // CHECK: start:
+    // CHECK: %[[A00:.+]] = load i16, ptr %a
+    // CHECK: %[[B00:.+]] = load i16, ptr %b
+    // CHECK-NOT: cmp
+    // CHECK: %[[EQ00:.+]] = icmp eq i16 %[[A00]], %[[B00]]
+    // CHECK-NEXT: br i1 %[[EQ00]], label %[[L01:.+]], label %[[EXIT_S:.+]]
+
+    // CHECK: [[L01]]:
+    // CHECK: %[[PA01:.+]] = getelementptr{{.+}}i8, ptr %a, {{i32|i64}} 2
+    // CHECK: %[[PB01:.+]] = getelementptr{{.+}}i8, ptr %b, {{i32|i64}} 2
+    // CHECK: %[[A01:.+]] = load i16, ptr %[[PA01]]
+    // CHECK: %[[B01:.+]] = load i16, ptr %[[PB01]]
+    // CHECK-NOT: cmp
+    // CHECK: %[[EQ01:.+]] = icmp eq i16 %[[A01]], %[[B01]]
+    // CHECK-NEXT: br i1 %[[EQ01]], label %[[L10:.+]], label %[[EXIT_U:.+]]
+
+    // CHECK: [[L10]]:
+    // CHECK: %[[PA10:.+]] = getelementptr{{.+}}i8, ptr %a, {{i32|i64}} 4
+    // CHECK: %[[PB10:.+]] = getelementptr{{.+}}i8, ptr %b, {{i32|i64}} 4
+    // CHECK: %[[A10:.+]] = load i16, ptr %[[PA10]]
+    // CHECK: %[[B10:.+]] = load i16, ptr %[[PB10]]
+    // CHECK-NOT: cmp
+    // CHECK: %[[EQ10:.+]] = icmp eq i16 %[[A10]], %[[B10]]
+    // CHECK-NEXT: br i1 %[[EQ10]], label %[[L11:.+]], label %[[EXIT_S]]
+
+    // CHECK: [[L11]]:
+    // CHECK: %[[PA11:.+]] = getelementptr{{.+}}i8, ptr %a, {{i32|i64}} 6
+    // CHECK: %[[PB11:.+]] = getelementptr{{.+}}i8, ptr %b, {{i32|i64}} 6
+    // CHECK: %[[A11:.+]] = load i16, ptr %[[PA11]]
+    // CHECK: %[[B11:.+]] = load i16, ptr %[[PB11]]
+    // CHECK-NOT: cmp
+    // CHECK: %[[EQ11:.+]] = icmp eq i16 %[[A11]], %[[B11]]
+    // CHECK-NEXT: br i1 %[[EQ11]], label %[[DONE:.+]], label %[[EXIT_U]]
+
+    // CHECK: [[DONE]]:
+    // CHECK: %[[RET:.+]] = phi i1 [ %{{.+}}, %[[EXIT_S]] ], [ %{{.+}}, %[[EXIT_U]] ], [ true, %[[L11]] ]
+    // CHECK: ret i1 %[[RET]]
+
+    a <= b
+}
diff --git a/tests/codegen/async-closure-debug.rs b/tests/codegen/async-closure-debug.rs
index 2d67e02eb9c..b5b369e6e54 100644
--- a/tests/codegen/async-closure-debug.rs
+++ b/tests/codegen/async-closure-debug.rs
@@ -1,6 +1,7 @@
 // Just make sure that async closures don't ICE.
 //
-//@ compile-flags: -C debuginfo=2 --edition=2018
+//@ compile-flags: -C debuginfo=2
+//@ edition: 2018
 //@ ignore-msvc
 
 // CHECK-DAG:  [[GEN_FN:!.*]] = !DINamespace(name: "async_closure_test"
diff --git a/tests/codegen/async-fn-debug-awaitee-field.rs b/tests/codegen/async-fn-debug-awaitee-field.rs
index ab13d4509e2..50860c90662 100644
--- a/tests/codegen/async-fn-debug-awaitee-field.rs
+++ b/tests/codegen/async-fn-debug-awaitee-field.rs
@@ -7,7 +7,8 @@
 //@[MSVC] only-msvc
 //@[NONMSVC] ignore-msvc
 
-//@ compile-flags: -C debuginfo=2 --edition=2018 -Copt-level=0
+//@ compile-flags: -C debuginfo=2 -Copt-level=0
+//@ edition: 2018
 
 #![crate_type = "lib"]
 
diff --git a/tests/codegen/async-fn-debug-msvc.rs b/tests/codegen/async-fn-debug-msvc.rs
index 7c695042b42..e0c601146f8 100644
--- a/tests/codegen/async-fn-debug-msvc.rs
+++ b/tests/codegen/async-fn-debug-msvc.rs
@@ -4,7 +4,8 @@
 //  - Other fields are not marked artificial
 //
 //
-//@ compile-flags: -C debuginfo=2 --edition=2018
+//@ compile-flags: -C debuginfo=2
+//@ edition: 2018
 //@ only-msvc
 
 async fn foo() {}
@@ -19,23 +20,23 @@ async fn async_fn_test() {
 // CHECK-DAG:  [[GEN:!.*]] = !DICompositeType(tag: DW_TAG_union_type, name: "enum2$<async_fn_debug_msvc::async_fn_test::async_fn_env$0>",
 // CHECK:      {{!.*}} = !DIDerivedType(tag: DW_TAG_member, name: "variant0", scope: [[GEN]],
 // For brevity, we only check the struct name and members of the last variant.
-// CHECK-SAME: file: [[FILE:![0-9]*]], line: 11,
+// CHECK-SAME: file: [[FILE:![0-9]*]], line: 12,
 // CHECK-NOT:  flags: DIFlagArtificial
 // CHECK-SAME: )
 // CHECK:      {{!.*}} = !DIDerivedType(tag: DW_TAG_member, name: "variant1", scope: [[GEN]],
-// CHECK-SAME: file: [[FILE]], line: 15,
+// CHECK-SAME: file: [[FILE]], line: 16,
 // CHECK-NOT:  flags: DIFlagArtificial
 // CHECK-SAME: )
 // CHECK:      {{!.*}} = !DIDerivedType(tag: DW_TAG_member, name: "variant2", scope: [[GEN]],
-// CHECK-SAME: file: [[FILE]], line: 15,
+// CHECK-SAME: file: [[FILE]], line: 16,
 // CHECK-NOT:  flags: DIFlagArtificial
 // CHECK-SAME: )
 // CHECK:      {{!.*}} = !DIDerivedType(tag: DW_TAG_member, name: "variant3", scope: [[GEN]],
-// CHECK-SAME: file: [[FILE]], line: 12,
+// CHECK-SAME: file: [[FILE]], line: 13,
 // CHECK-NOT:  flags: DIFlagArtificial
 // CHECK-SAME: )
 // CHECK:      {{!.*}} = !DIDerivedType(tag: DW_TAG_member, name: "variant4", scope: [[GEN]],
-// CHECK-SAME: file: [[FILE]], line: 14,
+// CHECK-SAME: file: [[FILE]], line: 15,
 // CHECK-SAME: baseType: [[VARIANT_WRAPPER:![0-9]*]]
 // CHECK-NOT:  flags: DIFlagArtificial
 // CHECK-SAME: )
diff --git a/tests/codegen/async-fn-debug.rs b/tests/codegen/async-fn-debug.rs
index 7be4ad45665..ed704c7cc8b 100644
--- a/tests/codegen/async-fn-debug.rs
+++ b/tests/codegen/async-fn-debug.rs
@@ -4,7 +4,8 @@
 //  - Other fields are not marked artificial
 //
 //
-//@ compile-flags: -C debuginfo=2 --edition=2018
+//@ compile-flags: -C debuginfo=2
+//@ edition: 2018
 //@ ignore-msvc
 
 async fn foo() {}
@@ -22,26 +23,26 @@ async fn async_fn_test() {
 // CHECK-NOT:  flags: DIFlagArtificial
 // CHECK-SAME: discriminator: [[DISC:![0-9]*]]
 // CHECK:      {{!.*}} = !DIDerivedType(tag: DW_TAG_member, name: "0", scope: [[VARIANT]],
-// CHECK-SAME: file: [[FILE:![0-9]*]], line: 11,
+// CHECK-SAME: file: [[FILE:![0-9]*]], line: 12,
 // CHECK-NOT:  flags: DIFlagArtificial
 // CHECK-SAME: )
 // CHECK:      {{!.*}} = !DICompositeType(tag: DW_TAG_structure_type, name: "Unresumed", scope: [[GEN]],
 // CHECK-NOT:  flags: DIFlagArtificial
 // CHECK-SAME: )
 // CHECK:      {{!.*}} = !DIDerivedType(tag: DW_TAG_member, name: "1", scope: [[VARIANT]],
-// CHECK-SAME: file: [[FILE]], line: 15,
+// CHECK-SAME: file: [[FILE]], line: 16,
 // CHECK-NOT:  flags: DIFlagArtificial
 // CHECK-SAME: )
 // CHECK:      {{!.*}} = !DIDerivedType(tag: DW_TAG_member, name: "2", scope: [[VARIANT]],
-// CHECK-SAME: file: [[FILE]], line: 15,
+// CHECK-SAME: file: [[FILE]], line: 16,
 // CHECK-NOT:  flags: DIFlagArtificial
 // CHECK-SAME: )
 // CHECK:      {{!.*}} = !DIDerivedType(tag: DW_TAG_member, name: "3", scope: [[VARIANT]],
-// CHECK-SAME: file: [[FILE]], line: 12,
+// CHECK-SAME: file: [[FILE]], line: 13,
 // CHECK-NOT:  flags: DIFlagArtificial
 // CHECK-SAME: )
 // CHECK:      {{!.*}} = !DIDerivedType(tag: DW_TAG_member, name: "4", scope: [[VARIANT]],
-// CHECK-SAME: file: [[FILE]], line: 14,
+// CHECK-SAME: file: [[FILE]], line: 15,
 // CHECK-NOT:  flags: DIFlagArtificial
 // CHECK-SAME: )
 // CHECK:      [[S1:!.*]] = !DICompositeType(tag: DW_TAG_structure_type, name: "Suspend1", scope: [[GEN]],
diff --git a/tests/codegen/coroutine-debug.rs b/tests/codegen/coroutine-debug.rs
index d00667a37d5..ff62e9709b4 100644
--- a/tests/codegen/coroutine-debug.rs
+++ b/tests/codegen/coroutine-debug.rs
@@ -4,7 +4,8 @@
 //  - Other fields are not marked artificial
 //
 //
-//@ compile-flags: -C debuginfo=2 --edition=2018
+//@ compile-flags: -C debuginfo=2
+//@ edition: 2018
 //@ ignore-msvc
 
 #![feature(coroutines, coroutine_trait)]
@@ -27,26 +28,26 @@ fn coroutine_test() -> impl Coroutine<Yield = i32, Return = ()> {
 // CHECK-NOT:  flags: DIFlagArtificial
 // CHECK-SAME: discriminator: [[DISC:![0-9]*]]
 // CHECK:      {{!.*}} = !DIDerivedType(tag: DW_TAG_member, name: "0", scope: [[VARIANT]],
-// CHECK-SAME: file: [[FILE:![0-9]*]], line: 15,
+// CHECK-SAME: file: [[FILE:![0-9]*]], line: 16,
 // CHECK-NOT:  flags: DIFlagArtificial
 // CHECK-SAME: )
 // CHECK:      {{!.*}} = !DICompositeType(tag: DW_TAG_structure_type, name: "Unresumed", scope: [[GEN]],
 // CHECK-NOT:  flags: DIFlagArtificial
 // CHECK-SAME: )
 // CHECK:      {{!.*}} = !DIDerivedType(tag: DW_TAG_member, name: "1", scope: [[VARIANT]],
-// CHECK-SAME: file: [[FILE]], line: 19,
+// CHECK-SAME: file: [[FILE]], line: 20,
 // CHECK-NOT:  flags: DIFlagArtificial
 // CHECK-SAME: )
 // CHECK:      {{!.*}} = !DIDerivedType(tag: DW_TAG_member, name: "2", scope: [[VARIANT]],
-// CHECK-SAME: file: [[FILE]], line: 19,
+// CHECK-SAME: file: [[FILE]], line: 20,
 // CHECK-NOT:  flags: DIFlagArtificial
 // CHECK-SAME: )
 // CHECK:      {{!.*}} = !DIDerivedType(tag: DW_TAG_member, name: "3", scope: [[VARIANT]],
-// CHECK-SAME: file: [[FILE]], line: 16,
+// CHECK-SAME: file: [[FILE]], line: 17,
 // CHECK-NOT:  flags: DIFlagArtificial
 // CHECK-SAME: )
 // CHECK:      {{!.*}} = !DIDerivedType(tag: DW_TAG_member, name: "4", scope: [[VARIANT]],
-// CHECK-SAME: file: [[FILE]], line: 18,
+// CHECK-SAME: file: [[FILE]], line: 19,
 // CHECK-NOT:  flags: DIFlagArtificial
 // CHECK-SAME: )
 // CHECK:      [[S1:!.*]] = !DICompositeType(tag: DW_TAG_structure_type, name: "Suspend1", scope: [[GEN]],
diff --git a/tests/codegen/debuginfo-generic-closure-env-names.rs b/tests/codegen/debuginfo-generic-closure-env-names.rs
index 6b314c9abae..64bc58e1df7 100644
--- a/tests/codegen/debuginfo-generic-closure-env-names.rs
+++ b/tests/codegen/debuginfo-generic-closure-env-names.rs
@@ -18,7 +18,8 @@
 // legacy mangling scheme rustc version and generic parameters are both hashed into a single part
 // of the name, thus randomizing item order with respect to rustc version.
 
-//@ compile-flags: -Cdebuginfo=2 --edition 2021 -Copt-level=0 -Csymbol-mangling-version=v0
+//@ compile-flags: -Cdebuginfo=2 -Copt-level=0 -Csymbol-mangling-version=v0
+//@ edition: 2021
 
 // non_generic_closure()
 // NONMSVC: !DICompositeType(tag: DW_TAG_structure_type, name: "{closure_env#0}", scope: ![[non_generic_closure_NAMESPACE:[0-9]+]],
diff --git a/tests/codegen/dont-shuffle-bswaps.rs b/tests/codegen/dont-shuffle-bswaps.rs
index e100474f606..c1dab2bc295 100644
--- a/tests/codegen/dont-shuffle-bswaps.rs
+++ b/tests/codegen/dont-shuffle-bswaps.rs
@@ -1,8 +1,11 @@
-//@ revisions: OPT2 OPT3
+//@ revisions: OPT2 OPT3 OPT3_S390X
 //@[OPT2] compile-flags: -Copt-level=2
 //@[OPT3] compile-flags: -C opt-level=3
 // some targets don't do the opt we are looking for
 //@[OPT3] only-64bit
+//@[OPT3] ignore-s390x
+//@[OPT3_S390X] compile-flags: -C opt-level=3 -C target-cpu=z13
+//@[OPT3_S390X] only-s390x
 
 #![crate_type = "lib"]
 #![no_std]
@@ -17,6 +20,10 @@
 // OPT3-NEXT: call <8 x i16> @llvm.bswap
 // OPT3-NEXT: store <8 x i16>
 // OPT3-NEXT: ret void
+// OPT3_S390X: load <8 x i16>
+// OPT3_S390X-NEXT: call <8 x i16> @llvm.bswap
+// OPT3_S390X-NEXT: store <8 x i16>
+// OPT3_S390X-NEXT: ret void
 #[no_mangle]
 pub fn convert(value: [u16; 8]) -> [u8; 16] {
     #[cfg(target_endian = "little")]
diff --git a/tests/codegen/infallible-unwrap-in-opt-z.rs b/tests/codegen/infallible-unwrap-in-opt-z.rs
index 3756fafe384..c2297c58e77 100644
--- a/tests/codegen/infallible-unwrap-in-opt-z.rs
+++ b/tests/codegen/infallible-unwrap-in-opt-z.rs
@@ -1,4 +1,5 @@
-//@ compile-flags: -C opt-level=z --edition=2021
+//@ compile-flags: -C opt-level=z
+//@ edition: 2021
 
 #![crate_type = "lib"]
 
diff --git a/tests/codegen/inline-function-args-debug-info.rs b/tests/codegen/inline-function-args-debug-info.rs
index 53a179160dc..c31419cb914 100644
--- a/tests/codegen/inline-function-args-debug-info.rs
+++ b/tests/codegen/inline-function-args-debug-info.rs
@@ -2,7 +2,8 @@
 // gets inlined by MIR inlining. Without function argument indexes, `info args` in gdb won't show
 // arguments and their values for the current function.
 
-//@ compile-flags: -Zinline-mir=yes -Cdebuginfo=2 --edition=2021
+//@ compile-flags: -Zinline-mir=yes -Cdebuginfo=2
+//@ edition: 2021
 
 #![crate_type = "lib"]
 
@@ -14,9 +15,9 @@ pub fn outer_function(x: usize, y: usize) -> usize {
 #[inline]
 fn inner_function(aaaa: usize, bbbb: usize) -> usize {
     // CHECK: !DILocalVariable(name: "aaaa", arg: 1
-    // CHECK-SAME: line: 15
+    // CHECK-SAME: line: 16
     // CHECK-NOT: !DILexicalBlock(
     // CHECK: !DILocalVariable(name: "bbbb", arg: 2
-    // CHECK-SAME: line: 15
+    // CHECK-SAME: line: 16
     aaaa + bbbb
 }
diff --git a/tests/codegen/intrinsics/select_unpredictable.rs b/tests/codegen/intrinsics/select_unpredictable.rs
index 68a02c8342d..2db4ae174b3 100644
--- a/tests/codegen/intrinsics/select_unpredictable.rs
+++ b/tests/codegen/intrinsics/select_unpredictable.rs
@@ -46,21 +46,21 @@ pub fn test_zst(p: bool, a: (), b: ()) -> () {
 pub fn test_int2(p: bool, a: u64, b: u64) -> u64 {
     // CHECK-LABEL: define{{.*}} @test_int2
     // CHECK: select i1 %p, i64 %a, i64 %b, !unpredictable
-    p.select_unpredictable(a, b)
+    core::hint::select_unpredictable(p, a, b)
 }
 
 #[no_mangle]
 pub fn test_pair2(p: bool, a: (u64, u64), b: (u64, u64)) -> (u64, u64) {
     // CHECK-LABEL: define{{.*}} @test_pair2
     // CHECK: select i1 %p, {{.*}}, !unpredictable
-    p.select_unpredictable(a, b)
+    core::hint::select_unpredictable(p, a, b)
 }
 
 #[no_mangle]
 pub fn test_struct2(p: bool, a: Large, b: Large) -> Large {
     // CHECK-LABEL: define{{.*}} @test_struct2
     // CHECK: select i1 %p, {{.*}}, !unpredictable
-    p.select_unpredictable(a, b)
+    core::hint::select_unpredictable(p, a, b)
 }
 
 #[no_mangle]
@@ -68,5 +68,5 @@ pub fn test_zst2(p: bool, a: (), b: ()) -> () {
     // CHECK-LABEL: define{{.*}} @test_zst2
     // CHECK-NEXT: start:
     // CHECK-NEXT: ret void
-    p.select_unpredictable(a, b)
+    core::hint::select_unpredictable(p, a, b)
 }
diff --git a/tests/codegen/issues/issue-101082.rs b/tests/codegen/issues/issue-101082.rs
index 7fb850ca253..96cdff64dda 100644
--- a/tests/codegen/issues/issue-101082.rs
+++ b/tests/codegen/issues/issue-101082.rs
@@ -1,8 +1,16 @@
 //@ compile-flags: -Copt-level=3
-//@ revisions: host x86-64-v3
+//@ revisions: host x86-64 x86-64-v3
 //@ min-llvm-version: 20
 
-// This particular CPU regressed in #131563
+//@[host] ignore-x86_64
+
+// Set the base cpu explicitly, in case the default has been changed.
+//@[x86-64] only-x86_64
+//@[x86-64] compile-flags: -Ctarget-cpu=x86-64
+
+// FIXME(cuviper) x86-64-v3 in particular regressed in #131563, and the workaround
+// at the time still sometimes fails, so only verify it for the power-of-two size
+// - https://github.com/llvm/llvm-project/issues/134735
 //@[x86-64-v3] only-x86_64
 //@[x86-64-v3] compile-flags: -Ctarget-cpu=x86-64-v3
 
@@ -11,7 +19,16 @@
 #[no_mangle]
 pub fn test() -> usize {
     // CHECK-LABEL: @test(
-    // CHECK: ret {{i64|i32}} 165
+    // host: ret {{i64|i32}} 165
+    // x86-64: ret {{i64|i32}} 165
+
+    // FIXME: Now that this autovectorizes via a masked load, it doesn't actually
+    // const-fold for certain widths.  The `test_eight` case below shows that, yes,
+    // what we're emitting *can* be const-folded, except that the way LLVM does it
+    // for certain widths doesn't today.  We should be able to put this back to
+    // the same check after <https://github.com/llvm/llvm-project/issues/134513>
+    // x86-64-v3: masked.load
+
     let values = [23, 16, 54, 3, 60, 9];
     let mut acc = 0;
     for item in values {
@@ -19,3 +36,15 @@ pub fn test() -> usize {
     }
     acc
 }
+
+#[no_mangle]
+pub fn test_eight() -> usize {
+    // CHECK-LABEL: @test_eight(
+    // CHECK: ret {{i64|i32}} 220
+    let values = [23, 16, 54, 3, 60, 9, 13, 42];
+    let mut acc = 0;
+    for item in values {
+        acc += item;
+    }
+    acc
+}
diff --git a/tests/codegen/issues/issue-119422.rs b/tests/codegen/issues/issue-119422.rs
index e1a082c377f..17ae71605b5 100644
--- a/tests/codegen/issues/issue-119422.rs
+++ b/tests/codegen/issues/issue-119422.rs
@@ -1,7 +1,8 @@
 //! This test checks that compiler don't generate useless compares to zeros
 //! for `NonZero` integer types.
 //!
-//@ compile-flags: -Copt-level=3 --edition=2021 -Zmerge-functions=disabled
+//@ compile-flags: -Copt-level=3 -Zmerge-functions=disabled
+//@ edition: 2021
 //@ only-64bit (because the LLVM type of i64 for usize shows up)
 #![crate_type = "lib"]
 
diff --git a/tests/codegen/simd/extract-insert-dyn.rs b/tests/codegen/simd/extract-insert-dyn.rs
new file mode 100644
index 00000000000..584e2c7887a
--- /dev/null
+++ b/tests/codegen/simd/extract-insert-dyn.rs
@@ -0,0 +1,75 @@
+//@compile-flags: -C opt-level=3 -C no-prepopulate-passes
+
+#![feature(core_intrinsics, repr_simd)]
+#![no_std]
+#![crate_type = "lib"]
+#![allow(non_camel_case_types)]
+
+// Test that `core::intrinsics::simd::{simd_extract_dyn, simd_insert_dyn}`
+// lower to an LLVM extractelement or insertelement operation.
+
+use core::intrinsics::simd::{simd_extract, simd_extract_dyn, simd_insert, simd_insert_dyn};
+
+#[repr(simd)]
+#[derive(Clone, Copy)]
+pub struct u32x16([u32; 16]);
+
+#[repr(simd)]
+#[derive(Clone, Copy)]
+pub struct i8x16([i8; 16]);
+
+// CHECK-LABEL: dyn_simd_extract
+// CHECK: extractelement <16 x i8> %x, i32 %idx
+#[no_mangle]
+unsafe extern "C" fn dyn_simd_extract(x: i8x16, idx: u32) -> i8 {
+    simd_extract_dyn(x, idx)
+}
+
+// CHECK-LABEL: literal_dyn_simd_extract
+// CHECK: extractelement <16 x i8> %x, i32 7
+#[no_mangle]
+unsafe extern "C" fn literal_dyn_simd_extract(x: i8x16) -> i8 {
+    simd_extract_dyn(x, 7)
+}
+
+// CHECK-LABEL: const_dyn_simd_extract
+// CHECK: extractelement <16 x i8> %x, i32 7
+#[no_mangle]
+unsafe extern "C" fn const_dyn_simd_extract(x: i8x16) -> i8 {
+    simd_extract_dyn(x, const { 3 + 4 })
+}
+
+// CHECK-LABEL: const_simd_extract
+// CHECK: extractelement <16 x i8> %x, i32 7
+#[no_mangle]
+unsafe extern "C" fn const_simd_extract(x: i8x16) -> i8 {
+    simd_extract(x, const { 3 + 4 })
+}
+
+// CHECK-LABEL: dyn_simd_insert
+// CHECK: insertelement <16 x i8> %x, i8 %e, i32 %idx
+#[no_mangle]
+unsafe extern "C" fn dyn_simd_insert(x: i8x16, e: i8, idx: u32) -> i8x16 {
+    simd_insert_dyn(x, idx, e)
+}
+
+// CHECK-LABEL: literal_dyn_simd_insert
+// CHECK: insertelement <16 x i8> %x, i8 %e, i32 7
+#[no_mangle]
+unsafe extern "C" fn literal_dyn_simd_insert(x: i8x16, e: i8) -> i8x16 {
+    simd_insert_dyn(x, 7, e)
+}
+
+// CHECK-LABEL: const_dyn_simd_insert
+// CHECK: insertelement <16 x i8> %x, i8 %e, i32 7
+#[no_mangle]
+unsafe extern "C" fn const_dyn_simd_insert(x: i8x16, e: i8) -> i8x16 {
+    simd_insert_dyn(x, const { 3 + 4 }, e)
+}
+
+// CHECK-LABEL: const_simd_insert
+// CHECK: insertelement <16 x i8> %x, i8 %e, i32 7
+#[no_mangle]
+unsafe extern "C" fn const_simd_insert(x: i8x16, e: i8) -> i8x16 {
+    simd_insert(x, const { 3 + 4 }, e)
+}
diff --git a/tests/codegen/simd/simd-wide-sum.rs b/tests/codegen/simd/simd-wide-sum.rs
index fb9b61884e7..95117b2c748 100644
--- a/tests/codegen/simd/simd-wide-sum.rs
+++ b/tests/codegen/simd/simd-wide-sum.rs
@@ -1,5 +1,6 @@
 //@ revisions: llvm mir-opt3
-//@ compile-flags: -C opt-level=3 -Z merge-functions=disabled --edition=2021
+//@ compile-flags: -C opt-level=3 -Z merge-functions=disabled
+//@ edition: 2021
 //@ only-x86_64
 //@ [mir-opt3]compile-flags: -Zmir-opt-level=3
 //@ [mir-opt3]build-pass
diff --git a/tests/codegen/simd/swap-simd-types.rs b/tests/codegen/simd/swap-simd-types.rs
index 69767d0a755..c063cc683a6 100644
--- a/tests/codegen/simd/swap-simd-types.rs
+++ b/tests/codegen/simd/swap-simd-types.rs
@@ -23,8 +23,8 @@ pub fn swap_single_m256(x: &mut __m256, y: &mut __m256) {
 #[no_mangle]
 pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) {
     // CHECK-NOT: alloca
-    // CHECK: load <8 x float>{{.+}}align 32
-    // CHECK: store <8 x float>{{.+}}align 32
+    // CHECK-COUNT-2: load <4 x i64>{{.+}}align 32
+    // CHECK-COUNT-2: store <4 x i64>{{.+}}align 32
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
@@ -34,7 +34,7 @@ pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) {
 #[no_mangle]
 pub fn swap_bytes32(x: &mut [u8; 32], y: &mut [u8; 32]) {
     // CHECK-NOT: alloca
-    // CHECK: load <32 x i8>{{.+}}align 1
-    // CHECK: store <32 x i8>{{.+}}align 1
+    // CHECK-COUNT-2: load <4 x i64>{{.+}}align 1
+    // CHECK-COUNT-2: store <4 x i64>{{.+}}align 1
     swap(x, y)
 }
diff --git a/tests/codegen/swap-large-types.rs b/tests/codegen/swap-large-types.rs
index 49a41bb1469..08c486affd9 100644
--- a/tests/codegen/swap-large-types.rs
+++ b/tests/codegen/swap-large-types.rs
@@ -12,6 +12,16 @@ type KeccakBuffer = [[u64; 5]; 5];
 // to stack for large types, which is completely unnecessary as the lack of
 // overlap means we can just do whatever fits in registers at a time.
 
+// The tests here (after the first one showing that the problem still exists)
+// are less about testing *exactly* what the codegen is, and more about testing
+// 1) That things are swapped directly from one argument to the other,
+//    never going through stack along the way, and
+// 2) That we're doing the swapping for big things using large vector types,
+//    rather then `i64` or `<8 x i8>` (or, even worse, `i8`) at a time.
+//
+// (There are separate tests for intrinsics::typed_swap_nonoverlapping that
+//  check that it, as an intrinsic, are emitting exactly what it should.)
+
 // CHECK-LABEL: @swap_basic
 #[no_mangle]
 pub fn swap_basic(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
@@ -26,55 +36,55 @@ pub fn swap_basic(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
     }
 }
 
-// This test verifies that the library does something smarter, and thus
-// doesn't need any scratch space on the stack.
-
 // CHECK-LABEL: @swap_std
 #[no_mangle]
 pub fn swap_std(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i64>
-    // CHECK: store <{{[0-9]+}} x i64>
+    // CHECK: load <{{2|4}} x i64>
+    // CHECK: store <{{2|4}} x i64>
     swap(x, y)
 }
 
-// Verify that types with usize alignment are swapped via vectored usizes,
-// not falling back to byte-level code.
-
 // CHECK-LABEL: @swap_slice
 #[no_mangle]
 pub fn swap_slice(x: &mut [KeccakBuffer], y: &mut [KeccakBuffer]) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i64>
-    // CHECK: store <{{[0-9]+}} x i64>
+    // CHECK: load <{{2|4}} x i64>
+    // CHECK: store <{{2|4}} x i64>
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
 }
 
-// But for a large align-1 type, vectorized byte copying is what we want.
-
 type OneKilobyteBuffer = [u8; 1024];
 
 // CHECK-LABEL: @swap_1kb_slices
 #[no_mangle]
 pub fn swap_1kb_slices(x: &mut [OneKilobyteBuffer], y: &mut [OneKilobyteBuffer]) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i8>
-    // CHECK: store <{{[0-9]+}} x i8>
+
+    // CHECK-NOT: load i32
+    // CHECK-NOT: store i32
+    // CHECK-NOT: load i16
+    // CHECK-NOT: store i16
+    // CHECK-NOT: load i8
+    // CHECK-NOT: store i8
+
+    // CHECK: load <{{2|4}} x i64>{{.+}}align 1,
+    // CHECK: store <{{2|4}} x i64>{{.+}}align 1,
+
+    // CHECK-NOT: load i32
+    // CHECK-NOT: store i32
+    // CHECK-NOT: load i16
+    // CHECK-NOT: store i16
+    // CHECK-NOT: load i8
+    // CHECK-NOT: store i8
+
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
 }
 
-// This verifies that the 2×read + 2×write optimizes to just 3 memcpys
-// for an unusual type like this.  It's not clear whether we should do anything
-// smarter in Rust for these, so for now it's fine to leave these up to the backend.
-// That's not as bad as it might seem, as for example, LLVM will lower the
-// memcpys below to VMOVAPS on YMMs if one enables the AVX target feature.
-// Eventually we'll be able to pass `align_of::<T>` to a const generic and
-// thus pick a smarter chunk size ourselves without huge code duplication.
-
 #[repr(align(64))]
 pub struct BigButHighlyAligned([u8; 64 * 3]);
 
@@ -82,9 +92,25 @@ pub struct BigButHighlyAligned([u8; 64 * 3]);
 #[no_mangle]
 pub fn swap_big_aligned(x: &mut BigButHighlyAligned, y: &mut BigButHighlyAligned) {
     // CHECK-NOT: call void @llvm.memcpy
-    // CHECK: call void @llvm.memcpy.{{.+}}(ptr noundef nonnull align 64 dereferenceable(192)
-    // CHECK: call void @llvm.memcpy.{{.+}}(ptr noundef nonnull align 64 dereferenceable(192)
-    // CHECK: call void @llvm.memcpy.{{.+}}(ptr noundef nonnull align 64 dereferenceable(192)
+    // CHECK-NOT: load i32
+    // CHECK-NOT: store i32
+    // CHECK-NOT: load i16
+    // CHECK-NOT: store i16
+    // CHECK-NOT: load i8
+    // CHECK-NOT: store i8
+
+    // CHECK-COUNT-2: load <{{2|4}} x i64>{{.+}}align 64,
+    // CHECK-COUNT-2: store <{{2|4}} x i64>{{.+}}align 64,
+
+    // CHECK-COUNT-2: load <{{2|4}} x i64>{{.+}}align 32,
+    // CHECK-COUNT-2: store <{{2|4}} x i64>{{.+}}align 32,
+
+    // CHECK-NOT: load i32
+    // CHECK-NOT: store i32
+    // CHECK-NOT: load i16
+    // CHECK-NOT: store i16
+    // CHECK-NOT: load i8
+    // CHECK-NOT: store i8
     // CHECK-NOT: call void @llvm.memcpy
     swap(x, y)
 }
diff --git a/tests/codegen/swap-small-types.rs b/tests/codegen/swap-small-types.rs
index 76bb853e642..7aa613ae9c2 100644
--- a/tests/codegen/swap-small-types.rs
+++ b/tests/codegen/swap-small-types.rs
@@ -1,5 +1,7 @@
 //@ compile-flags: -Copt-level=3 -Z merge-functions=disabled
 //@ only-x86_64
+//@ min-llvm-version: 20
+//@ ignore-std-debug-assertions (`ptr::swap_nonoverlapping` has one which blocks some optimizations)
 
 #![crate_type = "lib"]
 
@@ -27,13 +29,19 @@ pub fn swap_rgb48_manually(x: &mut RGB48, y: &mut RGB48) {
 pub fn swap_rgb48(x: &mut RGB48, y: &mut RGB48) {
     // CHECK-NOT: alloca
 
-    // Whether `i8` is the best for this is unclear, but
-    // might as well record what's actually happening right now.
-
-    // CHECK: load i8
-    // CHECK: load i8
-    // CHECK: store i8
-    // CHECK: store i8
+    // Swapping `i48` might be cleaner in LLVM-IR here, but `i32`+`i16` isn't bad,
+    // and is closer to the assembly it generates anyway.
+
+    // CHECK-NOT: load{{ }}
+    // CHECK: load i32{{.+}}align 2
+    // CHECK-NEXT: load i32{{.+}}align 2
+    // CHECK-NEXT: store i32{{.+}}align 2
+    // CHECK-NEXT: store i32{{.+}}align 2
+    // CHECK: load i16{{.+}}align 2
+    // CHECK-NEXT: load i16{{.+}}align 2
+    // CHECK-NEXT: store i16{{.+}}align 2
+    // CHECK-NEXT: store i16{{.+}}align 2
+    // CHECK-NOT: store{{ }}
     swap(x, y)
 }
 
@@ -76,30 +84,49 @@ pub fn swap_slices<'a>(x: &mut &'a [u32], y: &mut &'a [u32]) {
     swap(x, y)
 }
 
-// LLVM doesn't vectorize a loop over 3-byte elements,
-// so we chunk it down to bytes and loop over those instead.
 type RGB24 = [u8; 3];
 
 // CHECK-LABEL: @swap_rgb24_slices
 #[no_mangle]
 pub fn swap_rgb24_slices(x: &mut [RGB24], y: &mut [RGB24]) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i8>
-    // CHECK: store <{{[0-9]+}} x i8>
+
+    // CHECK: mul nuw nsw i64 %{{x|y}}.1, 3
+
+    // CHECK: load <{{[0-9]+}} x i64>
+    // CHECK: store <{{[0-9]+}} x i64>
+
+    // CHECK-COUNT-2: load i32
+    // CHECK-COUNT-2: store i32
+    // CHECK-COUNT-2: load i16
+    // CHECK-COUNT-2: store i16
+    // CHECK-COUNT-2: load i8
+    // CHECK-COUNT-2: store i8
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
 }
 
-// This one has a power-of-two size, so we iterate over it directly
 type RGBA32 = [u8; 4];
 
 // CHECK-LABEL: @swap_rgba32_slices
 #[no_mangle]
 pub fn swap_rgba32_slices(x: &mut [RGBA32], y: &mut [RGBA32]) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i32>
-    // CHECK: store <{{[0-9]+}} x i32>
+
+    // Because the size in bytes in a multiple of 4, we can skip the smallest sizes.
+
+    // CHECK: load <{{[0-9]+}} x i64>
+    // CHECK: store <{{[0-9]+}} x i64>
+
+    // CHECK-COUNT-2: load i32
+    // CHECK-COUNT-2: store i32
+
+    // CHECK-NOT: load i16
+    // CHECK-NOT: store i16
+    // CHECK-NOT: load i8
+    // CHECK-NOT: store i8
+
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
@@ -113,8 +140,8 @@ const _: () = assert!(!std::mem::size_of::<String>().is_power_of_two());
 #[no_mangle]
 pub fn swap_string_slices(x: &mut [String], y: &mut [String]) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i64>
-    // CHECK: store <{{[0-9]+}} x i64>
+    // CHECK: load <{{[0-9]+}} x i64>{{.+}}, align 8,
+    // CHECK: store <{{[0-9]+}} x i64>{{.+}}, align 8,
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
@@ -130,6 +157,26 @@ pub struct Packed {
 #[no_mangle]
 pub fn swap_packed_structs(x: &mut Packed, y: &mut Packed) {
     // CHECK-NOT: alloca
+
+    // CHECK-NOT: load
+    // CHECK-NOT: store
+
+    // CHECK: %[[A:.+]] = load i64, ptr %x, align 1,
+    // CHECK-NEXT: %[[B:.+]] = load i64, ptr %y, align 1,
+    // CHECK-NEXT: store i64 %[[B]], ptr %x, align 1,
+    // CHECK-NEXT: store i64 %[[A]], ptr %y, align 1,
+
+    // CHECK-NOT: load
+    // CHECK-NOT: store
+
+    // CHECK: %[[C:.+]] = load i8, ptr %[[X8:.+]], align 1,
+    // CHECK-NEXT: %[[D:.+]] = load i8, ptr %[[Y8:.+]], align 1,
+    // CHECK-NEXT: store i8 %[[D]], ptr %[[X8]], align 1,
+    // CHECK-NEXT: store i8 %[[C]], ptr %[[Y8]], align 1,
+
+    // CHECK-NOT: load
+    // CHECK-NOT: store
+
     // CHECK: ret void
     swap(x, y)
 }
diff --git a/tests/codegen/try_question_mark_nop.rs b/tests/codegen/try_question_mark_nop.rs
index 9f68d742a75..398c9a580bc 100644
--- a/tests/codegen/try_question_mark_nop.rs
+++ b/tests/codegen/try_question_mark_nop.rs
@@ -1,4 +1,5 @@
-//@ compile-flags: -Copt-level=3 -Z merge-functions=disabled --edition=2021
+//@ compile-flags: -Copt-level=3 -Z merge-functions=disabled
+//@ edition: 2021
 //@ only-x86_64
 //@ revisions: NINETEEN TWENTY
 //@[NINETEEN] exact-llvm-major-version: 19