31 files changed, 476 insertions, 186 deletions
diff --git a/tests/codegen/abi-win64-zst.rs b/tests/codegen/abi-win64-zst.rs
index dd361898144..825a5c1b09c 100644
--- a/tests/codegen/abi-win64-zst.rs
+++ b/tests/codegen/abi-win64-zst.rs
@@ -1,4 +1,5 @@
 //@ compile-flags: -Z merge-functions=disabled
+//@ add-core-stubs
 
 //@ revisions: windows-gnu
 //@[windows-gnu] compile-flags: --target x86_64-pc-windows-gnu
@@ -13,12 +14,12 @@
 //@[linux] compile-flags: --target x86_64-unknown-linux-gnu
 //@[linux] needs-llvm-components: x86
 
-#![feature(no_core, lang_items, rustc_attrs, abi_vectorcall)]
+#![feature(no_core, rustc_attrs, abi_vectorcall)]
 #![no_core]
 #![crate_type = "lib"]
 
-#[lang = "sized"]
-trait Sized {}
+extern crate minicore;
+use minicore::*;
 
 // Make sure the argument is always passed when explicitly requesting a Windows ABI.
 // Our goal here is to match clang: <https://clang.godbolt.org/z/Wr4jMWq3P>.
diff --git a/tests/codegen/abi-x86-sse.rs b/tests/codegen/abi-x86-sse.rs
new file mode 100644
index 00000000000..837bf6134b0
--- /dev/null
+++ b/tests/codegen/abi-x86-sse.rs
@@ -0,0 +1,36 @@
+//@ compile-flags: -Z merge-functions=disabled
+
+//@ revisions: x86-64
+//@[x86-64] compile-flags: --target x86_64-unknown-linux-gnu
+//@[x86-64] needs-llvm-components: x86
+
+//@ revisions: x86-32
+//@[x86-32] compile-flags: --target i686-unknown-linux-gnu
+//@[x86-32] needs-llvm-components: x86
+
+//@ revisions: x86-32-nosse
+//@[x86-32-nosse] compile-flags: --target i586-unknown-linux-gnu
+//@[x86-32-nosse] needs-llvm-components: x86
+
+#![feature(no_core, lang_items, rustc_attrs, repr_simd)]
+#![no_core]
+#![crate_type = "lib"]
+
+#[lang = "sized"]
+trait Sized {}
+
+#[lang = "copy"]
+trait Copy {}
+
+// Ensure this type is passed without ptr indirection on targets that
+// require SSE2.
+#[repr(simd)]
+pub struct Sse([f32; 4]);
+
+// x86-64: <4 x float> @sse_id(<4 x float> {{[^,]*}})
+// x86-32: <4 x float> @sse_id(<4 x float> {{[^,]*}})
+// x86-32-nosse: void @sse_id(ptr{{( [^,]*)?}} sret([16 x i8]){{( .*)?}}, ptr{{( [^,]*)?}})
+#[no_mangle]
+pub fn sse_id(x: Sse) -> Sse {
+    x
+}
diff --git a/tests/codegen/align-byval-alignment-mismatch.rs b/tests/codegen/align-byval-alignment-mismatch.rs
index 835cc7393e5..46cfb2972df 100644
--- a/tests/codegen/align-byval-alignment-mismatch.rs
+++ b/tests/codegen/align-byval-alignment-mismatch.rs
@@ -1,4 +1,5 @@
 // ignore-tidy-linelength
+//@ add-core-stubs
 //@ revisions:i686-linux x86_64-linux
 
 //@[i686-linux] compile-flags: --target i686-unknown-linux-gnu -C panic=abort
@@ -16,18 +17,14 @@
 // on i686-unknown-linux-gnu, since the alignment needs to be increased, and should codegen
 // to a direct call on x86_64-unknown-linux-gnu, where byval alignment matches Rust alignment.
 
-#![feature(no_core, lang_items)]
+#![feature(no_core)]
 #![crate_type = "lib"]
 #![no_std]
 #![no_core]
 #![allow(non_camel_case_types)]
 
-#[lang = "sized"]
-trait Sized {}
-#[lang = "freeze"]
-trait Freeze {}
-#[lang = "copy"]
-trait Copy {}
+extern crate minicore;
+use minicore::*;
 
 // This type has align 1 in Rust, but as a byval argument on i686-linux, it will have align 4.
 #[repr(C)]
diff --git a/tests/codegen/align-byval-vector.rs b/tests/codegen/align-byval-vector.rs
index 60d49f93081..c33b41a7bbe 100644
--- a/tests/codegen/align-byval-vector.rs
+++ b/tests/codegen/align-byval-vector.rs
@@ -1,3 +1,4 @@
+//@ add-core-stubs
 //@ revisions:x86-linux x86-darwin
 
 //@[x86-linux] compile-flags: --target i686-unknown-linux-gnu
@@ -7,18 +8,14 @@
 
 // Tests that aggregates containing vector types get their alignment increased to 16 on Darwin.
 
-#![feature(no_core, lang_items, repr_simd, simd_ffi)]
+#![feature(no_core, repr_simd, simd_ffi)]
 #![crate_type = "lib"]
 #![no_std]
 #![no_core]
 #![allow(non_camel_case_types)]
 
-#[lang = "sized"]
-trait Sized {}
-#[lang = "freeze"]
-trait Freeze {}
-#[lang = "copy"]
-trait Copy {}
+extern crate minicore;
+use minicore::*;
 
 #[repr(simd)]
 pub struct i32x4([i32; 4]);
diff --git a/tests/codegen/align-byval.rs b/tests/codegen/align-byval.rs
index b057147ab13..75dabd74a79 100644
--- a/tests/codegen/align-byval.rs
+++ b/tests/codegen/align-byval.rs
@@ -1,4 +1,5 @@
 // ignore-tidy-linelength
+//@ add-core-stubs
 //@ revisions:m68k x86_64-linux x86_64-windows i686-linux i686-windows
 
 //@[m68k] compile-flags: --target m68k-unknown-linux-gnu
@@ -16,20 +17,13 @@
 // The only targets that use `byval` are m68k, x86-64, and x86.
 // Note also that Windows mandates a by-ref ABI here, so it does not use byval.
 
-#![feature(no_core, lang_items)]
+#![feature(no_core)]
 #![crate_type = "lib"]
 #![no_std]
 #![no_core]
 
-#[lang = "sized"]
-trait Sized {}
-#[lang = "freeze"]
-trait Freeze {}
-#[lang = "copy"]
-trait Copy {}
-
-impl Copy for i32 {}
-impl Copy for i64 {}
+extern crate minicore;
+use minicore::*;
 
 // This struct can be represented as a pair, so it exercises the OperandValue::Pair
 // codepath in `codegen_argument`.
diff --git a/tests/codegen/asm/avr-clobbers.rs b/tests/codegen/asm/avr-clobbers.rs
index 56218cd7bcf..9451127bf04 100644
--- a/tests/codegen/asm/avr-clobbers.rs
+++ b/tests/codegen/asm/avr-clobbers.rs
@@ -1,6 +1,6 @@
 //@ add-core-stubs
 //@ assembly-output: emit-asm
-//@ compile-flags: --target avr-unknown-gnu-atmega328
+//@ compile-flags: --target avr-none -C target-cpu=atmega328p
 //@ needs-llvm-components: avr
 
 #![crate_type = "rlib"]
diff --git a/tests/codegen/autodiff.rs b/tests/codegen/autodiff.rs
index abf7fcf3e4b..cace0edb2b5 100644
--- a/tests/codegen/autodiff.rs
+++ b/tests/codegen/autodiff.rs
@@ -1,4 +1,4 @@
-//@ compile-flags: -C opt-level=3  -Clto=fat
+//@ compile-flags: -Zautodiff=Enable -C opt-level=3  -Clto=fat
 //@ no-prefer-dynamic
 //@ needs-enzyme
 #![feature(autodiff)]
diff --git a/tests/codegen/avr/avr-func-addrspace.rs b/tests/codegen/avr/avr-func-addrspace.rs
index ed8acccb1ad..2ec7c86b5f4 100644
--- a/tests/codegen/avr/avr-func-addrspace.rs
+++ b/tests/codegen/avr/avr-func-addrspace.rs
@@ -1,4 +1,4 @@
-//@ compile-flags: -Copt-level=3 --target=avr-unknown-gnu-atmega328 --crate-type=rlib -C panic=abort
+//@ compile-flags: -Copt-level=3 --target=avr-none -C target-cpu=atmega328p --crate-type=rlib -C panic=abort
 //@ needs-llvm-components: avr
 
 // This test validates that function pointers can be stored in global variables
diff --git a/tests/codegen/box-default-debug-copies.rs b/tests/codegen/box-default-debug-copies.rs
new file mode 100644
index 00000000000..06cc41b21c0
--- /dev/null
+++ b/tests/codegen/box-default-debug-copies.rs
@@ -0,0 +1,28 @@
+//@ compile-flags: -Copt-level=0
+
+// Test to make sure that `<Box<T>>::default` does not create too many copies of `T` on the stack
+// in debug mode. This regressed in dd0620b86721ae8cae86736443acd3f72ba6fc32 to
+// four `T` allocas.
+//
+// See https://github.com/rust-lang/rust/issues/136043 for more context.
+//
+// FIXME: Ideally this test would only ensure that at most two allocas of `T` are created,
+// but it currently checks for exactly two.
+
+#![crate_type = "lib"]
+
+#[allow(dead_code)]
+pub struct Thing([u8; 1000000]);
+
+impl Default for Thing {
+    fn default() -> Self {
+        Thing([0; 1000000])
+    }
+}
+
+// CHECK-COUNT-2: %{{.*}} = alloca {{.*}}1000000
+// CHECK-NOT: %{{.*}} = alloca {{.*}}1000000
+#[no_mangle]
+pub fn box_default_single_copy() -> Box<Thing> {
+    Box::default()
+}
diff --git a/tests/codegen/float/f128.rs b/tests/codegen/float/f128.rs
index 562a8e6c9e9..d87bab1172a 100644
--- a/tests/codegen/float/f128.rs
+++ b/tests/codegen/float/f128.rs
@@ -1,8 +1,11 @@
 // 32-bit x86 returns float types differently to avoid the x87 stack.
 // 32-bit systems will return 128bit values using a return area pointer.
 // Emscripten aligns f128 to 8 bytes, not 16.
-//@ revisions: x86 bit32 bit64 emscripten
-//@[x86] only-x86
+//@ revisions: x86-sse x86-nosse bit32 bit64 emscripten
+//@[x86-sse] only-x86
+//@[x86-sse] only-rustc_abi-x86-sse2
+//@[x86-nosse] only-x86
+//@[x86-nosse] ignore-rustc_abi-x86-sse2
 //@[bit32] ignore-x86
 //@[bit32] ignore-emscripten
 //@[bit32] only-32bit
@@ -60,7 +63,8 @@ pub fn f128_le(a: f128, b: f128) -> bool {
     a <= b
 }
 
-// x86-LABEL: void @f128_neg({{.*}}sret([16 x i8])
+// x86-nosse-LABEL: void @f128_neg({{.*}}sret([16 x i8])
+// x86-sse-LABEL: <16 x i8> @f128_neg(fp128
 // bit32-LABEL: void @f128_neg({{.*}}sret([16 x i8])
 // bit64-LABEL: fp128 @f128_neg(
 // emscripten-LABEL: void @f128_neg({{.*}}sret([16 x i8])
@@ -70,7 +74,8 @@ pub fn f128_neg(a: f128) -> f128 {
     -a
 }
 
-// x86-LABEL: void @f128_add({{.*}}sret([16 x i8])
+// x86-nosse-LABEL: void @f128_add({{.*}}sret([16 x i8])
+// x86-sse-LABEL: <16 x i8> @f128_add(fp128
 // bit32-LABEL: void @f128_add({{.*}}sret([16 x i8])
 // bit64-LABEL: fp128 @f128_add(
 // emscripten-LABEL: void @f128_add({{.*}}sret([16 x i8])
@@ -80,7 +85,8 @@ pub fn f128_add(a: f128, b: f128) -> f128 {
     a + b
 }
 
-// x86-LABEL: void @f128_sub({{.*}}sret([16 x i8])
+// x86-nosse-LABEL: void @f128_sub({{.*}}sret([16 x i8])
+// x86-sse-LABEL: <16 x i8> @f128_sub(fp128
 // bit32-LABEL: void @f128_sub({{.*}}sret([16 x i8])
 // bit64-LABEL: fp128 @f128_sub(
 // emscripten-LABEL: void @f128_sub({{.*}}sret([16 x i8])
@@ -90,7 +96,8 @@ pub fn f128_sub(a: f128, b: f128) -> f128 {
     a - b
 }
 
-// x86-LABEL: void @f128_mul({{.*}}sret([16 x i8])
+// x86-nosse-LABEL: void @f128_mul({{.*}}sret([16 x i8])
+// x86-sse-LABEL: <16 x i8> @f128_mul(fp128
 // bit32-LABEL: void @f128_mul({{.*}}sret([16 x i8])
 // bit64-LABEL: fp128 @f128_mul(
 // emscripten-LABEL: void @f128_mul({{.*}}sret([16 x i8])
@@ -100,7 +107,8 @@ pub fn f128_mul(a: f128, b: f128) -> f128 {
     a * b
 }
 
-// x86-LABEL: void @f128_div({{.*}}sret([16 x i8])
+// x86-nosse-LABEL: void @f128_div({{.*}}sret([16 x i8])
+// x86-sse-LABEL: <16 x i8> @f128_div(fp128
 // bit32-LABEL: void @f128_div({{.*}}sret([16 x i8])
 // bit64-LABEL: fp128 @f128_div(
 // emscripten-LABEL: void @f128_div({{.*}}sret([16 x i8])
@@ -110,7 +118,8 @@ pub fn f128_div(a: f128, b: f128) -> f128 {
     a / b
 }
 
-// x86-LABEL: void @f128_rem({{.*}}sret([16 x i8])
+// x86-nosse-LABEL: void @f128_rem({{.*}}sret([16 x i8])
+// x86-sse-LABEL: <16 x i8> @f128_rem(fp128
 // bit32-LABEL: void @f128_rem({{.*}}sret([16 x i8])
 // bit64-LABEL: fp128 @f128_rem(
 // emscripten-LABEL: void @f128_rem({{.*}}sret([16 x i8])
@@ -162,7 +171,8 @@ pub fn f128_rem_assign(a: &mut f128, b: f128) {
 
 /* float to float conversions */
 
-// x86-LABEL: i16 @f128_as_f16(
+// x86-sse-LABEL: <2 x i8> @f128_as_f16(
+// x86-nosse-LABEL: i16 @f128_as_f16(
 // bits32-LABEL: half @f128_as_f16(
 // bits64-LABEL: half @f128_as_f16(
 #[no_mangle]
@@ -171,7 +181,8 @@ pub fn f128_as_f16(a: f128) -> f16 {
     a as f16
 }
 
-// x86-LABEL: i32 @f128_as_f32(
+// x86-sse-LABEL: <4 x i8> @f128_as_f32(
+// x86-nosse-LABEL: i32 @f128_as_f32(
 // bit32-LABEL: float @f128_as_f32(
 // bit64-LABEL: float @f128_as_f32(
 // emscripten-LABEL: float @f128_as_f32(
@@ -181,7 +192,8 @@ pub fn f128_as_f32(a: f128) -> f32 {
     a as f32
 }
 
-// x86-LABEL: void @f128_as_f64(
+// x86-sse-LABEL: <8 x i8> @f128_as_f64(
+// x86-nosse-LABEL: void @f128_as_f64({{.*}}sret([8 x i8])
 // bit32-LABEL: double @f128_as_f64(
 // bit64-LABEL: double @f128_as_f64(
 // emscripten-LABEL: double @f128_as_f64(
@@ -191,7 +203,8 @@ pub fn f128_as_f64(a: f128) -> f64 {
     a as f64
 }
 
-// x86-LABEL: void @f128_as_self({{.*}}sret([16 x i8])
+// x86-sse-LABEL: <16 x i8> @f128_as_self(
+// x86-nosse-LABEL: void @f128_as_self({{.*}}sret([16 x i8])
 // bit32-LABEL: void @f128_as_self({{.*}}sret([16 x i8])
 // bit64-LABEL: fp128 @f128_as_self(
 // emscripten-LABEL: void @f128_as_self({{.*}}sret([16 x i8])
@@ -204,7 +217,8 @@ pub fn f128_as_self(a: f128) -> f128 {
     a as f128
 }
 
-// x86-LABEL: void @f16_as_f128({{.*}}sret([16 x i8])
+// x86-sse-LABEL: <16 x i8> @f16_as_f128(
+// x86-nosse-LABEL: void @f16_as_f128({{.*}}sret([16 x i8])
 // bit32-LABEL: void @f16_as_f128({{.*}}sret([16 x i8])
 // bit64-LABEL: fp128 @f16_as_f128(
 // emscripten-LABEL: void @f16_as_f128({{.*}}sret([16 x i8])
@@ -214,7 +228,8 @@ pub fn f16_as_f128(a: f16) -> f128 {
     a as f128
 }
 
-// x86-LABEL: void @f32_as_f128({{.*}}sret([16 x i8])
+// x86-sse-LABEL: <16 x i8> @f32_as_f128(
+// x86-nosse-LABEL: void @f32_as_f128({{.*}}sret([16 x i8])
 // bit32-LABEL: void @f32_as_f128({{.*}}sret([16 x i8])
 // bit64-LABEL: fp128 @f32_as_f128(
 // emscripten-LABEL: void @f32_as_f128({{.*}}sret([16 x i8])
@@ -224,7 +239,8 @@ pub fn f32_as_f128(a: f32) -> f128 {
     a as f128
 }
 
-// x86-LABEL: void @f64_as_f128({{.*}}sret([16 x i8])
+// x86-sse-LABEL: <16 x i8> @f64_as_f128(
+// x86-nosse-LABEL: void @f64_as_f128({{.*}}sret([16 x i8])
 // bit32-LABEL: void @f64_as_f128({{.*}}sret([16 x i8])
 // bit64-LABEL: fp128 @f64_as_f128(
 // emscripten-LABEL: void @f64_as_f128({{.*}}sret([16 x i8])
@@ -263,7 +279,8 @@ pub fn f128_as_u64(a: f128) -> u64 {
     a as u64
 }
 
-// x86-LABEL: void @f128_as_u128({{.*}}sret([16 x i8])
+// x86-sse-LABEL: void @f128_as_u128({{.*}}sret([16 x i8])
+// x86-nosse-LABEL: void @f128_as_u128({{.*}}sret([16 x i8])
 // bit32-LABEL: void @f128_as_u128({{.*}}sret([16 x i8])
 // bit64-LABEL: i128 @f128_as_u128(
 // emscripten-LABEL: void @f128_as_u128({{.*}}sret([16 x i8])
@@ -300,7 +317,8 @@ pub fn f128_as_i64(a: f128) -> i64 {
     a as i64
 }
 
-// x86-LABEL: void @f128_as_i128({{.*}}sret([16 x i8])
+// x86-sse-LABEL: void @f128_as_i128({{.*}}sret([16 x i8])
+// x86-nosse-LABEL: void @f128_as_i128({{.*}}sret([16 x i8])
 // bit32-LABEL: void @f128_as_i128({{.*}}sret([16 x i8])
 // bit64-LABEL: i128 @f128_as_i128(
 // emscripten-LABEL: void @f128_as_i128({{.*}}sret([16 x i8])
@@ -312,7 +330,8 @@ pub fn f128_as_i128(a: f128) -> i128 {
 
 /* int to float conversions */
 
-// x86-LABEL: void @u8_as_f128({{.*}}sret([16 x i8])
+// x86-sse-LABEL: <16 x i8> @u8_as_f128(
+// x86-nosse-LABEL: void @u8_as_f128({{.*}}sret([16 x i8])
 // bit32-LABEL: void @u8_as_f128({{.*}}sret([16 x i8])
 // bit64-LABEL: fp128 @u8_as_f128(
 // emscripten-LABEL: void @u8_as_f128({{.*}}sret([16 x i8])
@@ -322,7 +341,8 @@ pub fn u8_as_f128(a: u8) -> f128 {
     a as f128
 }
 
-// x86-LABEL: void @u16_as_f128({{.*}}sret([16 x i8])
+// x86-sse-LABEL: <16 x i8> @u16_as_f128(
+// x86-nosse-LABEL: void @u16_as_f128({{.*}}sret([16 x i8])
 // bit32-LABEL: void @u16_as_f128({{.*}}sret([16 x i8])
 // bit64-LABEL: fp128 @u16_as_f128(
 // emscripten-LABEL: void @u16_as_f128({{.*}}sret([16 x i8])
@@ -332,7 +352,8 @@ pub fn u16_as_f128(a: u16) -> f128 {
     a as f128
 }
 
-// x86-LABEL: void @u32_as_f128({{.*}}sret([16 x i8])
+// x86-sse-LABEL: <16 x i8> @u32_as_f128(
+// x86-nosse-LABEL: void @u32_as_f128({{.*}}sret([16 x i8])
 // bit32-LABEL: void @u32_as_f128({{.*}}sret([16 x i8])
 // bit64-LABEL: fp128 @u32_as_f128(
 // emscripten-LABEL: void @u32_as_f128({{.*}}sret([16 x i8])
@@ -342,7 +363,8 @@ pub fn u32_as_f128(a: u32) -> f128 {
     a as f128
 }
 
-// x86-LABEL: void @u64_as_f128({{.*}}sret([16 x i8])
+// x86-sse-LABEL: <16 x i8> @u64_as_f128(
+// x86-nosse-LABEL: void @u64_as_f128({{.*}}sret([16 x i8])
 // bit32-LABEL: void @u64_as_f128({{.*}}sret([16 x i8])
 // bit64-LABEL: fp128 @u64_as_f128(
 // emscripten-LABEL: void @u64_as_f128({{.*}}sret([16 x i8])
@@ -352,7 +374,8 @@ pub fn u64_as_f128(a: u64) -> f128 {
     a as f128
 }
 
-// x86-LABEL: void @u128_as_f128({{.*}}sret([16 x i8])
+// x86-sse-LABEL: <16 x i8> @u128_as_f128(
+// x86-nosse-LABEL: void @u128_as_f128({{.*}}sret([16 x i8])
 // bit32-LABEL: void @u128_as_f128({{.*}}sret([16 x i8])
 // bit64-LABEL: fp128 @u128_as_f128(
 // emscripten-LABEL: void @u128_as_f128({{.*}}sret([16 x i8])
@@ -362,7 +385,8 @@ pub fn u128_as_f128(a: u128) -> f128 {
     a as f128
 }
 
-// x86-LABEL: void @i8_as_f128({{.*}}sret([16 x i8])
+// x86-sse-LABEL: <16 x i8> @i8_as_f128(
+// x86-nosse-LABEL: void @i8_as_f128({{.*}}sret([16 x i8])
 // bit32-LABEL: void @i8_as_f128({{.*}}sret([16 x i8])
 // bit64-LABEL: fp128 @i8_as_f128(
 // emscripten-LABEL: void @i8_as_f128({{.*}}sret([16 x i8])
@@ -372,7 +396,8 @@ pub fn i8_as_f128(a: i8) -> f128 {
     a as f128
 }
 
-// x86-LABEL: void @i16_as_f128({{.*}}sret([16 x i8])
+// x86-sse-LABEL: <16 x i8> @i16_as_f128(
+// x86-nosse-LABEL: void @i16_as_f128({{.*}}sret([16 x i8])
 // bit32-LABEL: void @i16_as_f128({{.*}}sret([16 x i8])
 // bit64-LABEL: fp128 @i16_as_f128(
 // emscripten-LABEL: void @i16_as_f128({{.*}}sret([16 x i8])
@@ -382,7 +407,8 @@ pub fn i16_as_f128(a: i16) -> f128 {
     a as f128
 }
 
-// x86-LABEL: void @i32_as_f128({{.*}}sret([16 x i8])
+// x86-sse-LABEL: <16 x i8> @i32_as_f128(
+// x86-nosse-LABEL: void @i32_as_f128({{.*}}sret([16 x i8])
 // bit32-LABEL: void @i32_as_f128({{.*}}sret([16 x i8])
 // bit64-LABEL: fp128 @i32_as_f128(
 // emscripten-LABEL: void @i32_as_f128({{.*}}sret([16 x i8])
@@ -392,7 +418,8 @@ pub fn i32_as_f128(a: i32) -> f128 {
     a as f128
 }
 
-// x86-LABEL: void @i64_as_f128({{.*}}sret([16 x i8])
+// x86-sse-LABEL: <16 x i8> @i64_as_f128(
+// x86-nosse-LABEL: void @i64_as_f128({{.*}}sret([16 x i8])
 // bit32-LABEL: void @i64_as_f128({{.*}}sret([16 x i8])
 // bit64-LABEL: fp128 @i64_as_f128(
 // emscripten-LABEL: void @i64_as_f128({{.*}}sret([16 x i8])
@@ -402,7 +429,8 @@ pub fn i64_as_f128(a: i64) -> f128 {
     a as f128
 }
 
-// x86-LABEL: void @i128_as_f128({{.*}}sret([16 x i8])
+// x86-sse-LABEL: <16 x i8> @i128_as_f128(
+// x86-nosse-LABEL: void @i128_as_f128({{.*}}sret([16 x i8])
 // bit32-LABEL: void @i128_as_f128({{.*}}sret([16 x i8])
 // bit64-LABEL: fp128 @i128_as_f128(
 // emscripten-LABEL: void @i128_as_f128({{.*}}sret([16 x i8])
diff --git a/tests/codegen/float/f16.rs b/tests/codegen/float/f16.rs
index 5c3a5893b9d..0c40606ad8a 100644
--- a/tests/codegen/float/f16.rs
+++ b/tests/codegen/float/f16.rs
@@ -1,7 +1,10 @@
 // 32-bit x86 returns float types differently to avoid the x87 stack.
 // 32-bit systems will return 128bit values using a return area pointer.
-//@ revisions: x86 bit32 bit64
-//@[x86] only-x86
+//@ revisions: x86-sse x86-nosse bit32 bit64
+//@[x86-sse] only-x86
+//@[x86-sse] only-rustc_abi-x86-sse2
+//@[x86-nosse] only-x86
+//@[x86-nosse] ignore-rustc_abi-x86-sse2
 //@[bit32] ignore-x86
 //@[bit32] only-32bit
 //@[bit64] ignore-x86
@@ -59,8 +62,10 @@ pub fn f16_le(a: f16, b: f16) -> bool {
 }
 
 // This is where we check the argument and return ABI for f16.
-// other-LABEL: half @f16_neg(half
-// x86-LABEL: i16 @f16_neg(half
+// bit32-LABEL: half @f16_neg(half
+// bit64-LABEL: half @f16_neg(half
+// x86-sse-LABEL: <2 x i8> @f16_neg(half
+// x86-nosse-LABEL: i16 @f16_neg(half
 #[no_mangle]
 pub fn f16_neg(a: f16) -> f16 {
     // CHECK: fneg half %{{.+}}
@@ -144,17 +149,23 @@ pub fn f16_rem_assign(a: &mut f16, b: f16) {
 
 /* float to float conversions */
 
-// other-LABEL: half @f16_as_self(
-// x86-LABEL: i16 @f16_as_self(
+// bit32-LABEL: half @f16_as_self(
+// bit64-LABEL: half @f16_as_self(
+// x86-sse-LABEL: <2 x i8> @f16_as_self(
+// x86-nosse-LABEL: i16 @f16_as_self(
 #[no_mangle]
 pub fn f16_as_self(a: f16) -> f16 {
-    // other-CHECK: ret half %{{.+}}
-    // x86-CHECK: bitcast half
-    // x86-CHECK: ret i16
+    // bit32-CHECK: ret half %{{.+}}
+    // bit64-CHECK: ret half %{{.+}}
+    // x86-sse-CHECK: bitcast half
+    // x86-nosse-CHECK: bitcast half
+    // x86-sse-CHECK: ret i16
+    // x86-nosse-CHECK: ret i16
     a as f16
 }
 
-// x86-LABEL: i32 @f16_as_f32(
+// x86-sse-LABEL: <4 x i8> @f16_as_f32(
+// x86-nosse-LABEL: i32 @f16_as_f32(
 // bit32-LABEL: float @f16_as_f32(
 // bit64-LABEL: float @f16_as_f32(
 #[no_mangle]
@@ -163,7 +174,8 @@ pub fn f16_as_f32(a: f16) -> f32 {
     a as f32
 }
 
-// x86-LABEL: void @f16_as_f64(
+// x86-sse-LABEL: <8 x i8> @f16_as_f64(
+// x86-nosse-LABEL: void @f16_as_f64({{.*}}sret([8 x i8])
 // bit32-LABEL: double @f16_as_f64(
 // bit64-LABEL: double @f16_as_f64(
 #[no_mangle]
@@ -172,7 +184,8 @@ pub fn f16_as_f64(a: f16) -> f64 {
     a as f64
 }
 
-// x86-LABEL: void @f16_as_f128({{.*}}sret([16 x i8])
+// x86-sse-LABEL: <16 x i8> @f16_as_f128(
+// x86-nosse-LABEL: void @f16_as_f128({{.*}}sret([16 x i8])
 // bit32-LABEL: void @f16_as_f128({{.*}}sret([16 x i8])
 // bit64-LABEL: fp128 @f16_as_f128(
 #[no_mangle]
@@ -231,7 +244,8 @@ pub fn f16_as_u64(a: f16) -> u64 {
     a as u64
 }
 
-// x86-LABEL: void @f16_as_u128({{.*}}sret([16 x i8])
+// x86-sse-LABEL: void @f16_as_u128({{.*}}sret([16 x i8])
+// x86-nosse-LABEL: void @f16_as_u128({{.*}}sret([16 x i8])
 // bit32-LABEL: void @f16_as_u128({{.*}}sret([16 x i8])
 // bit64-LABEL: i128 @f16_as_u128(
 #[no_mangle]
@@ -267,7 +281,8 @@ pub fn f16_as_i64(a: f16) -> i64 {
     a as i64
 }
 
-// x86-LABEL: void @f16_as_i128({{.*}}sret([16 x i8])
+// x86-sse-LABEL: void @f16_as_i128({{.*}}sret([16 x i8])
+// x86-nosse-LABEL: void @f16_as_i128({{.*}}sret([16 x i8])
 // bit32-LABEL: void @f16_as_i128({{.*}}sret([16 x i8])
 // bit64-LABEL: i128 @f16_as_i128(
 #[no_mangle]
diff --git a/tests/codegen/gep-index.rs b/tests/codegen/gep-index.rs
index 1f5e8855910..bfb2511af87 100644
--- a/tests/codegen/gep-index.rs
+++ b/tests/codegen/gep-index.rs
@@ -11,27 +11,27 @@ struct Foo(i32, i32);
 // CHECK-LABEL: @index_on_struct(
 #[no_mangle]
 fn index_on_struct(a: &[Foo], index: usize) -> &Foo {
-    // CHECK: getelementptr inbounds %Foo, ptr %a.0, {{i64|i32}} %index
+    // CHECK: getelementptr inbounds{{( nuw)?}} %Foo, ptr %a.0, {{i64|i32}} %index
     &a[index]
 }
 
 // CHECK-LABEL: @offset_on_struct(
 #[no_mangle]
 fn offset_on_struct(a: *const Foo, index: usize) -> *const Foo {
-    // CHECK: getelementptr inbounds %Foo, ptr %a, {{i64|i32}} %index
+    // CHECK: getelementptr inbounds{{( nuw)?}} %Foo, ptr %a, {{i64|i32}} %index
     unsafe { a.add(index) }
 }
 
 // CHECK-LABEL: @index_on_i32(
 #[no_mangle]
 fn index_on_i32(a: &[i32], index: usize) -> &i32 {
-    // CHECK: getelementptr inbounds i32, ptr %a.0, {{i64|i32}} %index
+    // CHECK: getelementptr inbounds{{( nuw)?}} i32, ptr %a.0, {{i64|i32}} %index
     &a[index]
 }
 
 // CHECK-LABEL: @offset_on_i32(
 #[no_mangle]
 fn offset_on_i32(a: *const i32, index: usize) -> *const i32 {
-    // CHECK: getelementptr inbounds i32, ptr %a, {{i64|i32}} %index
+    // CHECK: getelementptr inbounds{{( nuw)?}} i32, ptr %a, {{i64|i32}} %index
     unsafe { a.add(index) }
 }
diff --git a/tests/codegen/i128-x86-callconv.rs b/tests/codegen/i128-x86-callconv.rs
index 9a9c9002fc0..41c30c09c1a 100644
--- a/tests/codegen/i128-x86-callconv.rs
+++ b/tests/codegen/i128-x86-callconv.rs
@@ -4,13 +4,18 @@
 //@ compile-flags: -Copt-level=1
 
 //@ add-core-stubs
-//@ revisions: MSVC MINGW
+//@ revisions: MSVC MINGW softfloat
 //@ [MSVC] needs-llvm-components: x86
-//@ [MINGW] needs-llvm-components: x86
 //@ [MSVC] compile-flags: --target x86_64-pc-windows-msvc
-//@ [MINGW] compile-flags: --target x86_64-pc-windows-gnu
+// Use `WIN` as a common prefix for MSVC and MINGW but *not* the softfloat test.
 //@ [MSVC] filecheck-flags: --check-prefix=WIN
+//@ [MINGW] needs-llvm-components: x86
+//@ [MINGW] compile-flags: --target x86_64-pc-windows-gnu
 //@ [MINGW] filecheck-flags: --check-prefix=WIN
+// The `x86_64-unknown-uefi` target also uses the Windows calling convention,
+// but does not have SSE registers available.
+//@ [softfloat] needs-llvm-components: x86
+//@ [softfloat] compile-flags: --target x86_64-unknown-uefi
 
 #![crate_type = "lib"]
 #![no_std]
@@ -28,24 +33,26 @@ extern "C" {
 pub extern "C" fn pass(_arg0: u32, arg1: i128) {
     // CHECK-LABEL: @pass(
     // i128 is passed indirectly on Windows. It should load the pointer to the stack and pass
-    // a pointer to that allocation.
-    // WIN-SAME: %_arg0, ptr{{.*}} %arg1)
-    // WIN: [[PASS:%[_0-9]+]] = alloca [16 x i8], align 16
-    // WIN: [[LOADED:%[_0-9]+]] = load i128, ptr %arg1
-    // WIN: store i128 [[LOADED]], ptr [[PASS]]
-    // WIN: call void @extern_call
+    // a pointer to that allocation. The softfloat ABI works the same.
+    // CHECK-SAME: %_arg0, ptr{{.*}} %arg1)
+    // CHECK: [[PASS:%[_0-9]+]] = alloca [16 x i8], align 16
+    // CHECK: [[LOADED:%[_0-9]+]] = load i128, ptr %arg1
+    // CHECK: store i128 [[LOADED]], ptr [[PASS]]
+    // CHECK: call void @extern_call
     unsafe { extern_call(arg1) };
 }
 
 // Check that we produce the correct return ABI
 #[no_mangle]
 pub extern "C" fn ret(_arg0: u32, arg1: i128) -> i128 {
-    // CHECK-LABEL: @ret(
+    // WIN-LABEL: @ret(
     // i128 is returned in xmm0 on Windows
     // FIXME(#134288): This may change for the `-msvc` targets in the future.
     // WIN-SAME: i32{{.*}} %_arg0, ptr{{.*}} %arg1)
     // WIN: [[LOADED:%[_0-9]+]] = load <16 x i8>, ptr %arg1
     // WIN-NEXT: ret <16 x i8> [[LOADED]]
+    // The softfloat ABI has no SSE registers, so this is returned directly as an `i128`.
+    // softfloat-LABEL: i128 @ret(i32{{.*}} %_arg0, ptr{{.*}} %arg1)
     arg1
 }
 
@@ -57,6 +64,7 @@ pub extern "C" fn forward(dst: *mut i128) {
     // WIN: [[RETURNED:%[_0-9]+]] = tail call <16 x i8> @extern_ret()
     // WIN: store <16 x i8> [[RETURNED]], ptr %dst
     // WIN: ret void
+    // softfloat: [[RETURNED:%[_0-9]+]] = tail call {{.*}}i128 @extern_ret()
     unsafe { *dst = extern_ret() };
 }
 
@@ -70,10 +78,10 @@ struct RetAggregate {
 pub extern "C" fn ret_aggregate(_arg0: u32, arg1: i128) -> RetAggregate {
     // CHECK-LABEL: @ret_aggregate(
     // Aggregates should also be returned indirectly
-    // WIN-SAME: ptr{{.*}}sret([32 x i8]){{.*}}[[RET:%[_0-9]+]], i32{{.*}}%_arg0, ptr{{.*}}%arg1)
-    // WIN: [[LOADED:%[_0-9]+]] = load i128, ptr %arg1
-    // WIN: [[GEP:%[_0-9]+]] = getelementptr{{.*}}, ptr [[RET]]
-    // WIN: store i128 [[LOADED]], ptr [[GEP]]
-    // WIN: ret void
+    // CHECK-SAME: ptr{{.*}}sret([32 x i8]){{.*}}[[RET:%[_0-9]+]], i32{{.*}}%_arg0, ptr{{.*}}%arg1)
+    // CHECK: [[LOADED:%[_0-9]+]] = load i128, ptr %arg1
+    // CHECK: [[GEP:%[_0-9]+]] = getelementptr{{.*}}, ptr [[RET]]
+    // CHECK: store i128 [[LOADED]], ptr [[GEP]]
+    // CHECK: ret void
     RetAggregate { a: 1, b: arg1 }
 }
diff --git a/tests/codegen/intrinsics/cold_path2.rs b/tests/codegen/intrinsics/cold_path2.rs
new file mode 100644
index 00000000000..1e7e0478f4f
--- /dev/null
+++ b/tests/codegen/intrinsics/cold_path2.rs
@@ -0,0 +1,36 @@
+//@ compile-flags: -O
+#![crate_type = "lib"]
+#![feature(core_intrinsics)]
+
+use std::intrinsics::cold_path;
+
+#[inline(never)]
+#[no_mangle]
+pub fn path_a() {
+    println!("path a");
+}
+
+#[inline(never)]
+#[no_mangle]
+pub fn path_b() {
+    println!("path b");
+}
+
+#[no_mangle]
+pub fn test(x: Option<bool>) {
+    if let Some(_) = x {
+        path_a();
+    } else {
+        cold_path();
+        path_b();
+    }
+
+    // CHECK-LABEL: @test(
+    // CHECK: br i1 %1, label %bb2, label %bb1, !prof ![[NUM:[0-9]+]]
+    // CHECK: bb1:
+    // CHECK: path_a
+    // CHECK: bb2:
+    // CHECK: path_b
+}
+
+// CHECK: ![[NUM]] = !{!"branch_weights", {{(!"expected", )?}}i32 1, i32 2000}
diff --git a/tests/codegen/intrinsics/cold_path3.rs b/tests/codegen/intrinsics/cold_path3.rs
new file mode 100644
index 00000000000..bf3347de665
--- /dev/null
+++ b/tests/codegen/intrinsics/cold_path3.rs
@@ -0,0 +1,87 @@
+//@ compile-flags: -O
+#![crate_type = "lib"]
+#![feature(core_intrinsics)]
+
+use std::intrinsics::cold_path;
+
+#[inline(never)]
+#[no_mangle]
+pub fn path_a() {
+    println!("path a");
+}
+
+#[inline(never)]
+#[no_mangle]
+pub fn path_b() {
+    println!("path b");
+}
+
+#[inline(never)]
+#[no_mangle]
+pub fn path_c() {
+    println!("path c");
+}
+
+#[inline(never)]
+#[no_mangle]
+pub fn path_d() {
+    println!("path d");
+}
+
+#[no_mangle]
+pub fn test(x: Option<u32>) {
+    match x {
+        Some(0) => path_a(),
+        Some(1) => {
+            cold_path();
+            path_b()
+        }
+        Some(2) => path_c(),
+        Some(3) => {
+            cold_path();
+            path_d()
+        }
+        _ => path_a(),
+    }
+
+    // CHECK-LABEL: @test(
+    // CHECK: switch i32 %1, label %bb1 [
+    // CHECK: i32 0, label %bb6
+    // CHECK: i32 1, label %bb5
+    // CHECK: i32 2, label %bb4
+    // CHECK: i32 3, label %bb3
+    // CHECK: ], !prof ![[NUM1:[0-9]+]]
+}
+
+#[no_mangle]
+pub fn test2(x: Option<u32>) {
+    match x {
+        Some(10) => path_a(),
+        Some(11) => {
+            cold_path();
+            path_b()
+        }
+        Some(12) => {
+            unsafe { core::intrinsics::unreachable() };
+            path_c()
+        }
+        Some(13) => {
+            cold_path();
+            path_d()
+        }
+        _ => {
+            cold_path();
+            path_a()
+        }
+    }
+
+    // CHECK-LABEL: @test2(
+    // CHECK: switch i32 %1, label %bb1 [
+    // CHECK: i32 10, label %bb5
+    // CHECK: i32 11, label %bb4
+    // CHECK: i32 13, label %bb3
+    // CHECK: ], !prof ![[NUM2:[0-9]+]]
+}
+
+// CHECK: ![[NUM1]] = !{!"branch_weights", i32 2000, i32 2000, i32 1, i32 2000, i32 1}
+// CHECK: ![[NUM2]] = !{!"branch_weights", i32 1, i32 2000, i32 1, i32 1}
diff --git a/tests/codegen/intrinsics/nearby.rs b/tests/codegen/intrinsics/nearby.rs
deleted file mode 100644
index 520fe2f1886..00000000000
--- a/tests/codegen/intrinsics/nearby.rs
+++ /dev/null
@@ -1,18 +0,0 @@
-#![crate_type = "lib"]
-#![feature(core_intrinsics)]
-
-use std::intrinsics;
-
-// CHECK-LABEL: @nearbyintf32
-#[no_mangle]
-pub unsafe fn nearbyintf32(a: f32) -> f32 {
-    // CHECK: llvm.nearbyint.f32
-    intrinsics::nearbyintf32(a)
-}
-
-// CHECK-LABEL: @nearbyintf64
-#[no_mangle]
-pub unsafe fn nearbyintf64(a: f64) -> f64 {
-    // CHECK: llvm.nearbyint.f64
-    intrinsics::nearbyintf64(a)
-}
diff --git a/tests/codegen/intrinsics/offset.rs b/tests/codegen/intrinsics/offset.rs
index d76d3e705ab..cf0c7c7ac7d 100644
--- a/tests/codegen/intrinsics/offset.rs
+++ b/tests/codegen/intrinsics/offset.rs
@@ -27,7 +27,7 @@ pub unsafe fn offset_isize(p: *const u32, d: isize) -> *const u32 {
 // CHECK-SAME: (ptr noundef %p, [[SIZE]] noundef %d)
 #[no_mangle]
 pub unsafe fn offset_usize(p: *const u64, d: usize) -> *const u64 {
-    // CHECK: %[[R:.*]] = getelementptr inbounds i64, ptr %p, [[SIZE]] %d
+    // CHECK: %[[R:.*]] = getelementptr inbounds{{( nuw)?}} i64, ptr %p, [[SIZE]] %d
     // CHECK-NEXT: ret ptr %[[R]]
     offset(p, d)
 }
diff --git a/tests/codegen/intrinsics/ptr_metadata.rs b/tests/codegen/intrinsics/ptr_metadata.rs
index 87a32fa3d24..044dbc20486 100644
--- a/tests/codegen/intrinsics/ptr_metadata.rs
+++ b/tests/codegen/intrinsics/ptr_metadata.rs
@@ -28,7 +28,7 @@ pub unsafe fn dyn_byte_offset(
     p: *const dyn std::fmt::Debug,
     n: usize,
 ) -> *const dyn std::fmt::Debug {
-    // CHECK: %[[Q:.+]] = getelementptr inbounds i8, ptr %p.0, i64 %n
+    // CHECK: %[[Q:.+]] = getelementptr inbounds{{( nuw)?}} i8, ptr %p.0, i64 %n
     // CHECK: %[[TEMP1:.+]] = insertvalue { ptr, ptr } poison, ptr %[[Q]], 0
     // CHECK: %[[TEMP2:.+]] = insertvalue { ptr, ptr } %[[TEMP1]], ptr %p.1, 1
     // CHECK: ret { ptr, ptr } %[[TEMP2]]
diff --git a/tests/codegen/intrinsics/transmute-niched.rs b/tests/codegen/intrinsics/transmute-niched.rs
index 5ded0e192fa..8ff5cc8ee4f 100644
--- a/tests/codegen/intrinsics/transmute-niched.rs
+++ b/tests/codegen/intrinsics/transmute-niched.rs
@@ -170,7 +170,7 @@ pub unsafe fn check_bool_from_ordering(x: std::cmp::Ordering) -> bool {
     // OPT: call void @llvm.assume(i1 %2)
     // CHECK-NOT: icmp
     // CHECK-NOT: assume
-    // CHECK: %[[R:.+]] = trunc i8 %x to i1
+    // CHECK: %[[R:.+]] = trunc{{( nuw)?}} i8 %x to i1
     // CHECK: ret i1 %[[R]]
 
     transmute(x)
diff --git a/tests/codegen/intrinsics/transmute-x64.rs b/tests/codegen/intrinsics/transmute-x64.rs
index fe68f183667..be45e4db90f 100644
--- a/tests/codegen/intrinsics/transmute-x64.rs
+++ b/tests/codegen/intrinsics/transmute-x64.rs
@@ -6,15 +6,6 @@
 use std::arch::x86_64::{__m128, __m128i, __m256i};
 use std::mem::transmute;
 
-// CHECK-LABEL: @check_sse_float_to_int(
-#[no_mangle]
-pub unsafe fn check_sse_float_to_int(x: __m128) -> __m128i {
-    // CHECK-NOT: alloca
-    // CHECK: %0 = load <4 x float>, ptr %x, align 16
-    // CHECK: store <4 x float> %0, ptr %_0, align 16
-    transmute(x)
-}
-
 // CHECK-LABEL: @check_sse_pair_to_avx(
 #[no_mangle]
 pub unsafe fn check_sse_pair_to_avx(x: (__m128i, __m128i)) -> __m256i {
diff --git a/tests/codegen/intrinsics/transmute.rs b/tests/codegen/intrinsics/transmute.rs
index 5b40a6a12c0..ff297b27065 100644
--- a/tests/codegen/intrinsics/transmute.rs
+++ b/tests/codegen/intrinsics/transmute.rs
@@ -11,6 +11,9 @@ use std::intrinsics::mir::*;
 use std::intrinsics::{transmute, transmute_unchecked};
 use std::mem::MaybeUninit;
 
+// FIXME(LLVM18REMOVED): `trunc nuw` doesn't exist in LLVM 18, so once we no
+// longer support it the optional flag checks can be changed to required.
+
 pub enum ZstNever {}
 
 #[repr(align(2))]
@@ -153,7 +156,7 @@ pub unsafe fn check_from_newtype(x: Scalar64) -> u64 {
 pub unsafe fn check_aggregate_to_bool(x: Aggregate8) -> bool {
     // CHECK: %x = alloca [1 x i8], align 1
     // CHECK: %[[BYTE:.+]] = load i8, ptr %x, align 1
-    // CHECK: %[[BOOL:.+]] = trunc i8 %[[BYTE]] to i1
+    // CHECK: %[[BOOL:.+]] = trunc{{( nuw)?}} i8 %[[BYTE]] to i1
     // CHECK: ret i1 %[[BOOL]]
     transmute(x)
 }
@@ -171,7 +174,7 @@ pub unsafe fn check_aggregate_from_bool(x: bool) -> Aggregate8 {
 #[no_mangle]
 pub unsafe fn check_byte_to_bool(x: u8) -> bool {
     // CHECK-NOT: alloca
-    // CHECK: %[[R:.+]] = trunc i8 %x to i1
+    // CHECK: %[[R:.+]] = trunc{{( nuw)?}} i8 %x to i1
     // CHECK: ret i1 %[[R]]
     transmute(x)
 }
@@ -284,7 +287,7 @@ pub unsafe fn check_long_array_more_aligned(x: [u8; 100]) -> [u32; 25] {
 #[no_mangle]
 pub unsafe fn check_pair_with_bool(x: (u8, bool)) -> (bool, i8) {
     // CHECK-NOT: alloca
-    // CHECK: trunc i8 %x.0 to i1
+    // CHECK: trunc{{( nuw)?}} i8 %x.0 to i1
     // CHECK: zext i1 %x.1 to i8
     transmute(x)
 }
@@ -338,7 +341,7 @@ pub unsafe fn check_heterogeneous_integer_pair(x: (i32, bool)) -> (bool, u32) {
     // CHECK: store i8 %[[WIDER]]
 
     // CHECK: %[[BYTE:.+]] = load i8
-    // CHECK: trunc i8 %[[BYTE:.+]] to i1
+    // CHECK: trunc{{( nuw)?}} i8 %[[BYTE]] to i1
     // CHECK: load i32
     transmute(x)
 }
diff --git a/tests/codegen/issues/issue-32031.rs b/tests/codegen/issues/issue-32031.rs
index 4d6895166f1..559e8d947fb 100644
--- a/tests/codegen/issues/issue-32031.rs
+++ b/tests/codegen/issues/issue-32031.rs
@@ -1,7 +1,7 @@
 //@ compile-flags: -C no-prepopulate-passes -Copt-level=0
 // 32-bit x86 returns `f32` and `f64` differently to avoid the x87 stack.
 //@ revisions: x86 other
-//@[x86] only-x86
+//@[x86] only-rustc_abi-x86-sse2
 //@[other] ignore-x86
 
 #![crate_type = "lib"]
@@ -10,7 +10,7 @@
 pub struct F32(f32);
 
 // other: define{{.*}}float @add_newtype_f32(float %a, float %b)
-// x86: define{{.*}}i32 @add_newtype_f32(float %a, float %b)
+// x86: define{{.*}}<4 x i8> @add_newtype_f32(float %a, float %b)
 #[inline(never)]
 #[no_mangle]
 pub fn add_newtype_f32(a: F32, b: F32) -> F32 {
@@ -21,7 +21,7 @@ pub fn add_newtype_f32(a: F32, b: F32) -> F32 {
 pub struct F64(f64);
 
 // other: define{{.*}}double @add_newtype_f64(double %a, double %b)
-// x86: define{{.*}}void @add_newtype_f64(ptr{{.*}}sret([8 x i8]){{.*}}%_0, double %a, double %b)
+// x86: define{{.*}}<8 x i8> @add_newtype_f64(double %a, double %b)
 #[inline(never)]
 #[no_mangle]
 pub fn add_newtype_f64(a: F64, b: F64) -> F64 {
diff --git a/tests/codegen/ptr-arithmetic.rs b/tests/codegen/ptr-arithmetic.rs
index ecb44b30f5c..fc4441ef448 100644
--- a/tests/codegen/ptr-arithmetic.rs
+++ b/tests/codegen/ptr-arithmetic.rs
@@ -6,7 +6,7 @@
 // CHECK-SAME: [[WORD:i[0-9]+]] noundef %n)
 #[no_mangle]
 pub unsafe fn i32_add(p: *const i32, n: usize) -> *const i32 {
-    // CHECK: %[[TEMP:.+]] = getelementptr inbounds i32, ptr %p, [[WORD]] %n
+    // CHECK: %[[TEMP:.+]] = getelementptr inbounds{{( nuw)?}} i32, ptr %p, [[WORD]] %n
     // CHECK: ret ptr %[[TEMP]]
     p.add(n)
 }
diff --git a/tests/codegen/simd-intrinsic/simd-intrinsic-transmute-array.rs b/tests/codegen/simd-intrinsic/simd-intrinsic-transmute-array.rs
index 75f989d6e12..0d21d510557 100644
--- a/tests/codegen/simd-intrinsic/simd-intrinsic-transmute-array.rs
+++ b/tests/codegen/simd-intrinsic/simd-intrinsic-transmute-array.rs
@@ -1,5 +1,14 @@
 //
 //@ compile-flags: -C no-prepopulate-passes
+// LLVM IR isn't very portable and the one tested here depends on the ABI
+// which is different between x86 (where we use SSE registers) and others.
+// `x86-64` and `x86-32-sse2` are identical, but compiletest does not support
+// taking the union of multiple `only` annotations.
+//@ revisions: x86-64 x86-32-sse2 other
+//@[x86-64] only-x86_64
+//@[x86-32-sse2] only-rustc_abi-x86-sse2
+//@[other] ignore-rustc_abi-x86-sse2
+//@[other] ignore-x86_64
 
 #![crate_type = "lib"]
 #![allow(non_camel_case_types)]
@@ -38,7 +47,9 @@ pub fn build_array_s(x: [f32; 4]) -> S<4> {
 #[no_mangle]
 pub fn build_array_transmute_s(x: [f32; 4]) -> S<4> {
     // CHECK: %[[VAL:.+]] = load <4 x float>, ptr %x, align [[ARRAY_ALIGN]]
-    // CHECK: store <4 x float> %[[VAL:.+]], ptr %_0, align [[VECTOR_ALIGN]]
+    // x86-32-sse2: ret <4 x float> %[[VAL:.+]]
+    // x86-64: ret <4 x float> %[[VAL:.+]]
+    // other: store <4 x float> %[[VAL:.+]], ptr %_0, align [[VECTOR_ALIGN]]
     unsafe { std::mem::transmute(x) }
 }
 
@@ -53,6 +64,8 @@ pub fn build_array_t(x: [f32; 4]) -> T {
 #[no_mangle]
 pub fn build_array_transmute_t(x: [f32; 4]) -> T {
     // CHECK: %[[VAL:.+]] = load <4 x float>, ptr %x, align [[ARRAY_ALIGN]]
-    // CHECK: store <4 x float> %[[VAL:.+]], ptr %_0, align [[VECTOR_ALIGN]]
+    // x86-32-sse2: ret <4 x float> %[[VAL:.+]]
+    // x86-64: ret <4 x float> %[[VAL:.+]]
+    // other: store <4 x float> %[[VAL:.+]], ptr %_0, align [[VECTOR_ALIGN]]
     unsafe { std::mem::transmute(x) }
 }
diff --git a/tests/codegen/simd/packed-simd.rs b/tests/codegen/simd/packed-simd.rs
index 1df09c96e6c..a27d5e3af45 100644
--- a/tests/codegen/simd/packed-simd.rs
+++ b/tests/codegen/simd/packed-simd.rs
@@ -1,4 +1,5 @@
 //@ revisions:opt3 noopt
+//@ only-x86_64
 //@[opt3] compile-flags: -Copt-level=3
 //@[noopt] compile-flags: -Cno-prepopulate-passes
 
@@ -14,14 +15,14 @@ use core::{mem, ptr};
 
 #[repr(simd, packed)]
 #[derive(Copy, Clone)]
-pub struct Simd<T, const N: usize>([T; N]);
+pub struct PackedSimd<T, const N: usize>([T; N]);
 
 #[repr(simd)]
 #[derive(Copy, Clone)]
 pub struct FullSimd<T, const N: usize>([T; N]);
 
 // non-powers-of-two have padding and need to be expanded to full vectors
-fn load<T, const N: usize>(v: Simd<T, N>) -> FullSimd<T, N> {
+fn load<T, const N: usize>(v: PackedSimd<T, N>) -> FullSimd<T, N> {
     unsafe {
         let mut tmp = mem::MaybeUninit::<FullSimd<T, N>>::uninit();
         ptr::copy_nonoverlapping(&v as *const _, tmp.as_mut_ptr().cast(), 1);
@@ -29,18 +30,16 @@ fn load<T, const N: usize>(v: Simd<T, N>) -> FullSimd<T, N> {
     }
 }
 
-// CHECK-LABEL: square_packed_full
-// CHECK-SAME: ptr{{[a-z_ ]*}} sret([[RET_TYPE:[^)]+]]) [[RET_ALIGN:align (8|16)]]{{[^%]*}} [[RET_VREG:%[_0-9]*]]
-// CHECK-SAME: ptr{{[a-z_ ]*}} align 4
+// CHECK-LABEL: define <3 x float> @square_packed_full(ptr{{[a-z_ ]*}} align 4 {{[^,]*}})
 #[no_mangle]
-pub fn square_packed_full(x: Simd<f32, 3>) -> FullSimd<f32, 3> {
-    // CHECK-NEXT: start
-    // noopt: alloca [[RET_TYPE]], [[RET_ALIGN]]
-    // CHECK: load <3 x float>
+pub fn square_packed_full(x: PackedSimd<f32, 3>) -> FullSimd<f32, 3> {
+    // The unoptimized version of this is not very interesting to check
+    // since `load` does not get inlined.
+    // opt3-NEXT: start:
+    // opt3-NEXT: load <3 x float>
     let x = load(x);
-    // CHECK: [[VREG:%[a-z0-9_]+]] = fmul <3 x float>
-    // CHECK-NEXT: store <3 x float> [[VREG]], ptr [[RET_VREG]], [[RET_ALIGN]]
-    // CHECK-NEXT: ret void
+    // opt3-NEXT: [[VREG:%[a-z0-9_]+]] = fmul <3 x float>
+    // opt3-NEXT: ret <3 x float> [[VREG]]
     unsafe { intrinsics::simd_mul(x, x) }
 }
 
@@ -48,7 +47,7 @@ pub fn square_packed_full(x: Simd<f32, 3>) -> FullSimd<f32, 3> {
 // CHECK-SAME: ptr{{[a-z_ ]*}} sret([[RET_TYPE:[^)]+]]) [[RET_ALIGN:align 4]]{{[^%]*}} [[RET_VREG:%[_0-9]*]]
 // CHECK-SAME: ptr{{[a-z_ ]*}} align 4
 #[no_mangle]
-pub fn square_packed(x: Simd<f32, 3>) -> Simd<f32, 3> {
+pub fn square_packed(x: PackedSimd<f32, 3>) -> PackedSimd<f32, 3> {
     // CHECK-NEXT: start
     // CHECK-NEXT: load <3 x float>
     // noopt-NEXT: load <3 x float>
diff --git a/tests/codegen/simd/project-to-simd-array-field.rs b/tests/codegen/simd/project-to-simd-array-field.rs
new file mode 100644
index 00000000000..29fab640633
--- /dev/null
+++ b/tests/codegen/simd/project-to-simd-array-field.rs
@@ -0,0 +1,31 @@
+//@ compile-flags: -Copt-level=3
+
+#![crate_type = "lib"]
+#![feature(repr_simd, core_intrinsics)]
+
+#[allow(non_camel_case_types)]
+#[derive(Clone, Copy)]
+#[repr(simd)]
+struct i32x4([i32; 4]);
+
+#[inline(always)]
+fn to_array4(a: i32x4) -> [i32; 4] {
+    a.0
+}
+
+// CHECK-LABEL: simd_add_self_then_return_array(
+// CHECK-SAME: ptr{{.+}}sret{{.+}}%[[RET:.+]],
+// CHECK-SAME: ptr{{.+}}%a)
+#[no_mangle]
+pub fn simd_add_self_then_return_array(a: &i32x4) -> [i32; 4] {
+    // It would be nice to just ban `.0` into simd types,
+    // but until we do this has to keep working.
+    // See also <https://github.com/rust-lang/rust/issues/105439>
+
+    // CHECK: %[[T1:.+]] = load <4 x i32>, ptr %a
+    // CHECK: %[[T2:.+]] = shl <4 x i32> %[[T1]], {{splat \(i32 1\)|<i32 1, i32 1, i32 1, i32 1>}}
+    // CHECK: store <4 x i32> %[[T2]], ptr %[[RET]]
+    let a = *a;
+    let b = unsafe { core::intrinsics::simd::simd_add(a, a) };
+    to_array4(b)
+}
diff --git a/tests/codegen/slice-iter-nonnull.rs b/tests/codegen/slice-iter-nonnull.rs
index 98a1b961a64..87907e7ad0a 100644
--- a/tests/codegen/slice-iter-nonnull.rs
+++ b/tests/codegen/slice-iter-nonnull.rs
@@ -14,11 +14,11 @@
 // CHECK-LABEL: @slice_iter_next(
 #[no_mangle]
 pub fn slice_iter_next<'a>(it: &mut std::slice::Iter<'a, u32>) -> Option<&'a u32> {
-    // CHECK: %[[ENDP:.+]] = getelementptr inbounds{{( nuw)?}} i8, ptr %it, {{i32 4|i64 8}}
-    // CHECK: %[[END:.+]] = load ptr, ptr %[[ENDP]]
+    // CHECK: %[[START:.+]] = load ptr, ptr %it,
     // CHECK-SAME: !nonnull
     // CHECK-SAME: !noundef
-    // CHECK: %[[START:.+]] = load ptr, ptr %it,
+    // CHECK: %[[ENDP:.+]] = getelementptr inbounds{{( nuw)?}} i8, ptr %it, {{i32 4|i64 8}}
+    // CHECK: %[[END:.+]] = load ptr, ptr %[[ENDP]]
     // CHECK-SAME: !nonnull
     // CHECK-SAME: !noundef
     // CHECK: icmp eq ptr %[[START]], %[[END]]
diff --git a/tests/codegen/transmute-scalar.rs b/tests/codegen/transmute-scalar.rs
index 43da7c1781e..c080259a917 100644
--- a/tests/codegen/transmute-scalar.rs
+++ b/tests/codegen/transmute-scalar.rs
@@ -26,7 +26,7 @@ pub fn bool_to_byte(b: bool) -> u8 {
 }
 
 // CHECK-LABEL: define{{.*}}zeroext i1 @byte_to_bool(i8{{.*}} %byte)
-// CHECK: %_0 = trunc i8 %byte to i1
+// CHECK: %_0 = trunc{{( nuw)?}} i8 %byte to i1
 // CHECK-NEXT: ret i1 %_0
 #[no_mangle]
 pub unsafe fn byte_to_bool(byte: u8) -> bool {
diff --git a/tests/codegen/unchecked_shifts.rs b/tests/codegen/unchecked_shifts.rs
index b27eb73c0cc..9fccaf2252e 100644
--- a/tests/codegen/unchecked_shifts.rs
+++ b/tests/codegen/unchecked_shifts.rs
@@ -1,4 +1,10 @@
-//@ compile-flags: -Copt-level=3
+//@ revisions: LLVM18 LLVM19PLUS
+//@ compile-flags: -Copt-level=3 -C no-prepopulate-passes
+//@[LLVM18] exact-llvm-major-version: 18
+//@[LLVM19PLUS] min-llvm-version: 19
+
+// This runs mir-opts to inline the standard library call, but doesn't run LLVM
+// optimizations so it doesn't need to worry about them adding more flags.
 
 #![crate_type = "lib"]
 #![feature(unchecked_shifts)]
@@ -17,13 +23,10 @@ pub unsafe fn unchecked_shl_unsigned_same(a: u32, b: u32) -> u32 {
 // CHECK-LABEL: @unchecked_shl_unsigned_smaller
 #[no_mangle]
 pub unsafe fn unchecked_shl_unsigned_smaller(a: u16, b: u32) -> u16 {
-    // This uses -DAG to avoid failing on irrelevant reorderings,
-    // like emitting the truncation earlier.
-
-    // CHECK-DAG: %[[INRANGE:.+]] = icmp ult i32 %b, 16
-    // CHECK-DAG: tail call void @llvm.assume(i1 %[[INRANGE]])
-    // CHECK-DAG: %[[TRUNC:.+]] = trunc{{( nuw)?( nsw)?}} i32 %b to i16
-    // CHECK-DAG: shl i16 %a, %[[TRUNC]]
+    // CHECK-NOT: assume
+    // LLVM18: %[[TRUNC:.+]] = trunc i32 %b to i16
+    // LLVM19PLUS: %[[TRUNC:.+]] = trunc nuw i32 %b to i16
+    // CHECK: shl i16 %a, %[[TRUNC]]
     a.unchecked_shl(b)
 }
 
@@ -31,7 +34,7 @@ pub unsafe fn unchecked_shl_unsigned_smaller(a: u16, b: u32) -> u16 {
 #[no_mangle]
 pub unsafe fn unchecked_shl_unsigned_bigger(a: u64, b: u32) -> u64 {
     // CHECK-NOT: assume
-    // CHECK: %[[EXT:.+]] = zext{{( nneg)?}} i32 %b to i64
+    // CHECK: %[[EXT:.+]] = zext i32 %b to i64
     // CHECK: shl i64 %a, %[[EXT]]
     a.unchecked_shl(b)
 }
@@ -49,13 +52,10 @@ pub unsafe fn unchecked_shr_signed_same(a: i32, b: u32) -> i32 {
 // CHECK-LABEL: @unchecked_shr_signed_smaller
 #[no_mangle]
 pub unsafe fn unchecked_shr_signed_smaller(a: i16, b: u32) -> i16 {
-    // This uses -DAG to avoid failing on irrelevant reorderings,
-    // like emitting the truncation earlier.
-
-    // CHECK-DAG: %[[INRANGE:.+]] = icmp ult i32 %b, 16
-    // CHECK-DAG: tail call void @llvm.assume(i1 %[[INRANGE]])
-    // CHECK-DAG: %[[TRUNC:.+]] = trunc{{( nuw)?( nsw)?}}  i32 %b to i16
-    // CHECK-DAG: ashr i16 %a, %[[TRUNC]]
+    // CHECK-NOT: assume
+    // LLVM18: %[[TRUNC:.+]] = trunc i32 %b to i16
+    // LLVM19PLUS: %[[TRUNC:.+]] = trunc nuw i32 %b to i16
+    // CHECK: ashr i16 %a, %[[TRUNC]]
     a.unchecked_shr(b)
 }
 
@@ -63,7 +63,7 @@ pub unsafe fn unchecked_shr_signed_smaller(a: i16, b: u32) -> i16 {
 #[no_mangle]
 pub unsafe fn unchecked_shr_signed_bigger(a: i64, b: u32) -> i64 {
     // CHECK-NOT: assume
-    // CHECK: %[[EXT:.+]] = zext{{( nneg)?}} i32 %b to i64
+    // CHECK: %[[EXT:.+]] = zext i32 %b to i64
     // CHECK: ashr i64 %a, %[[EXT]]
     a.unchecked_shr(b)
 }
@@ -72,7 +72,7 @@ pub unsafe fn unchecked_shr_signed_bigger(a: i64, b: u32) -> i64 {
 #[no_mangle]
 pub unsafe fn unchecked_shr_u128_i8(a: u128, b: i8) -> u128 {
     // CHECK-NOT: assume
-    // CHECK: %[[EXT:.+]] = zext{{( nneg)?}} i8 %b to i128
+    // CHECK: %[[EXT:.+]] = zext i8 %b to i128
     // CHECK: lshr i128 %a, %[[EXT]]
     std::intrinsics::unchecked_shr(a, b)
 }
@@ -81,7 +81,7 @@ pub unsafe fn unchecked_shr_u128_i8(a: u128, b: i8) -> u128 {
 #[no_mangle]
 pub unsafe fn unchecked_shl_i128_u8(a: i128, b: u8) -> i128 {
     // CHECK-NOT: assume
-    // CHECK: %[[EXT:.+]] = zext{{( nneg)?}} i8 %b to i128
+    // CHECK: %[[EXT:.+]] = zext i8 %b to i128
     // CHECK: shl i128 %a, %[[EXT]]
     std::intrinsics::unchecked_shl(a, b)
 }
@@ -89,25 +89,19 @@ pub unsafe fn unchecked_shl_i128_u8(a: i128, b: u8) -> i128 {
 // CHECK-LABEL: @unchecked_shl_u8_i128
 #[no_mangle]
 pub unsafe fn unchecked_shl_u8_i128(a: u8, b: i128) -> u8 {
-    // This uses -DAG to avoid failing on irrelevant reorderings,
-    // like emitting the truncation earlier.
-
-    // CHECK-DAG: %[[INRANGE:.+]] = icmp ult i128 %b, 8
-    // CHECK-DAG: tail call void @llvm.assume(i1 %[[INRANGE]])
-    // CHECK-DAG: %[[TRUNC:.+]] = trunc{{( nuw)?( nsw)?}} i128 %b to i8
-    // CHECK-DAG: shl i8 %a, %[[TRUNC]]
+    // CHECK-NOT: assume
+    // LLVM18: %[[TRUNC:.+]] = trunc i128 %b to i8
+    // LLVM19PLUS: %[[TRUNC:.+]] = trunc nuw i128 %b to i8
+    // CHECK: shl i8 %a, %[[TRUNC]]
     std::intrinsics::unchecked_shl(a, b)
 }
 
 // CHECK-LABEL: @unchecked_shr_i8_u128
 #[no_mangle]
 pub unsafe fn unchecked_shr_i8_u128(a: i8, b: u128) -> i8 {
-    // This uses -DAG to avoid failing on irrelevant reorderings,
-    // like emitting the truncation earlier.
-
-    // CHECK-DAG: %[[INRANGE:.+]] = icmp ult i128 %b, 8
-    // CHECK-DAG: tail call void @llvm.assume(i1 %[[INRANGE]])
-    // CHECK-DAG: %[[TRUNC:.+]] = trunc{{( nuw)?( nsw)?}} i128 %b to i8
-    // CHECK-DAG: ashr i8 %a, %[[TRUNC]]
+    // CHECK-NOT: assume
+    // LLVM18: %[[TRUNC:.+]] = trunc i128 %b to i8
+    // LLVM19PLUS: %[[TRUNC:.+]] = trunc nuw i128 %b to i8
+    // CHECK: ashr i8 %a, %[[TRUNC]]
     std::intrinsics::unchecked_shr(a, b)
 }
diff --git a/tests/codegen/uninhabited-transparent-return-abi.rs b/tests/codegen/uninhabited-transparent-return-abi.rs
new file mode 100644
index 00000000000..6e8b1683163
--- /dev/null
+++ b/tests/codegen/uninhabited-transparent-return-abi.rs
@@ -0,0 +1,44 @@
+//@ compile-flags: -Copt-level=3
+
+// See https://github.com/rust-lang/rust/issues/135802
+
+#![crate_type = "lib"]
+
+enum Void {}
+
+// Should be ABI-compatible with T, but wasn't prior to the PR adding this test.
+#[repr(transparent)]
+struct NoReturn<T>(T, Void);
+
+// Returned by invisible reference (in most ABIs)
+#[allow(dead_code)]
+struct Large(u64, u64, u64);
+
+extern "Rust" {
+    fn opaque() -> NoReturn<Large>;
+    fn opaque_with_arg(rsi: u32) -> NoReturn<Large>;
+}
+
+// CHECK-LABEL: @test_uninhabited_ret_by_ref
+#[no_mangle]
+pub fn test_uninhabited_ret_by_ref() {
+    // CHECK: %_1 = alloca [24 x i8], align {{8|4}}
+    // CHECK-NEXT: call void @llvm.lifetime.start.p0({{(i64 24, )?}}ptr nonnull %_1)
+    // CHECK-NEXT: call void @opaque(ptr{{.*}} sret([24 x i8]) {{.*}} %_1) #{{[0-9]+}}
+    // CHECK-NEXT: unreachable
+    unsafe {
+        opaque();
+    }
+}
+
+// CHECK-LABEL: @test_uninhabited_ret_by_ref_with_arg
+#[no_mangle]
+pub fn test_uninhabited_ret_by_ref_with_arg(rsi: u32) {
+    // CHECK: %_2 = alloca [24 x i8], align {{8|4}}
+    // CHECK-NEXT: call void @llvm.lifetime.start.p0({{(i64 24, )?}}ptr nonnull %_2)
+    // CHECK-NEXT: call void @opaque_with_arg(ptr{{.*}} sret([24 x i8]) {{.*}} %_2, i32 noundef %rsi) #{{[0-9]+}}
+    // CHECK-NEXT: unreachable
+    unsafe {
+        opaque_with_arg(rsi);
+    }
+}
diff --git a/tests/codegen/union-abi.rs b/tests/codegen/union-abi.rs
index 92d40d8ac14..28acc4de2f3 100644
--- a/tests/codegen/union-abi.rs
+++ b/tests/codegen/union-abi.rs
@@ -2,8 +2,11 @@
 //@ compile-flags: -Copt-level=3 -C no-prepopulate-passes
 // 32-bit x86 returns `f32` differently to avoid the x87 stack.
 // 32-bit systems will return 128bit values using a return area pointer.
-//@ revisions: x86 bit32 bit64
-//@[x86] only-x86
+//@ revisions: x86-sse x86-nosse bit32 bit64
+//@[x86-sse] only-x86
+//@[x86-sse] only-rustc_abi-x86-sse2
+//@[x86-nosse] only-x86
+//@[x86-nosse] ignore-rustc_abi-x86-sse2
 //@[bit32] ignore-x86
 //@[bit32] only-32bit
 //@[bit64] ignore-x86
@@ -75,7 +78,8 @@ pub union UnionF32 {
     a: f32,
 }
 
-// x86: define {{(dso_local )?}}i32 @test_UnionF32(float %_1)
+// x86-sse: define {{(dso_local )?}}<4 x i8> @test_UnionF32(float %_1)
+// x86-nosse: define {{(dso_local )?}}i32 @test_UnionF32(float %_1)
 // bit32: define {{(dso_local )?}}float @test_UnionF32(float %_1)
 // bit64: define {{(dso_local )?}}float @test_UnionF32(float %_1)
 #[no_mangle]
@@ -88,7 +92,8 @@ pub union UnionF32F32 {
     b: f32,
 }
 
-// x86: define {{(dso_local )?}}i32 @test_UnionF32F32(float %_1)
+// x86-sse: define {{(dso_local )?}}<4 x i8> @test_UnionF32F32(float %_1)
+// x86-nosse: define {{(dso_local )?}}i32 @test_UnionF32F32(float %_1)
 // bit32: define {{(dso_local )?}}float @test_UnionF32F32(float %_1)
 // bit64: define {{(dso_local )?}}float @test_UnionF32F32(float %_1)
 #[no_mangle]
@@ -110,7 +115,8 @@ pub fn test_UnionF32U32(_: UnionF32U32) -> UnionF32U32 {
 pub union UnionU128 {
     a: u128,
 }
-// x86: define {{(dso_local )?}}void @test_UnionU128({{.*}}sret([16 x i8]){{.*}}, i128 %_1)
+// x86-sse: define {{(dso_local )?}}void @test_UnionU128({{.*}}sret([16 x i8]){{.*}}, i128 %_1)
+// x86-nosse: define {{(dso_local )?}}void @test_UnionU128({{.*}}sret([16 x i8]){{.*}}, i128 %_1)
 // bit32: define {{(dso_local )?}}void @test_UnionU128({{.*}}sret([16 x i8]){{.*}}, i128 %_1)
 // bit64: define {{(dso_local )?}}i128 @test_UnionU128(i128 %_1)
 #[no_mangle]
@@ -136,4 +142,4 @@ pub union UnionBool {
 pub fn test_UnionBool(b: UnionBool) -> bool {
     unsafe { b.b }
 }
-// CHECK: %_0 = trunc i8 %b to i1
+// CHECK: %_0 = trunc{{( nuw)?}} i8 %b to i1