Diffstat (limited to 'tests/codegen-llvm/simd')

 tests/codegen-llvm/simd/aggregate-simd.rs        | 102
 tests/codegen-llvm/simd/extract-insert-dyn.rs    | 121
 tests/codegen-llvm/simd/packed-simd-alignment.rs |  44
 tests/codegen-llvm/simd/packed-simd.rs           |  56
 tests/codegen-llvm/simd/simd-wide-sum.rs         |  59
 tests/codegen-llvm/simd/simd_arith_offset.rs     |  22
 tests/codegen-llvm/simd/swap-simd-types.rs       |  40
 tests/codegen-llvm/simd/unpadded-simd.rs         |  19
 8 files changed, 463 insertions(+), 0 deletions(-)
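Several of these tests import a shared helper via #[path = "../../auxiliary/minisimd.rs"], which is not part of this diff. As a rough sketch of what the tests assume it provides (the real auxiliary file may differ in details), the two vector wrappers presumably look like:

    // Hypothetical sketch of the minisimd auxiliary these tests import;
    // the actual tests/auxiliary/minisimd.rs may define more items.
    // Requires the repr_simd nightly feature, enabled by the including crate.

    /// A "full" SIMD vector; rustc may give it extra alignment and padding.
    #[repr(simd)]
    #[derive(Copy, Clone)]
    pub struct Simd<T, const N: usize>(pub [T; N]);

    /// A "packed" SIMD vector: element alignment and no padding.
    #[repr(packed, simd)]
    #[derive(Copy, Clone)]
    pub struct PackedSimd<T, const N: usize>(pub [T; N]);

The construction style `Simd(x)` / `PackedSimd(x)` and the generic `<T, N>` parameters used throughout the diff below match this shape.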
diff --git a/tests/codegen-llvm/simd/aggregate-simd.rs b/tests/codegen-llvm/simd/aggregate-simd.rs
new file mode 100644
index 00000000000..57a301d634c
--- /dev/null
+++ b/tests/codegen-llvm/simd/aggregate-simd.rs
@@ -0,0 +1,102 @@
+//@ compile-flags: -C opt-level=3 -C no-prepopulate-passes
+//@ only-64bit
+
+#![feature(core_intrinsics, repr_simd)]
+#![no_std]
+#![crate_type = "lib"]
+
+#[path = "../../auxiliary/minisimd.rs"]
+mod minisimd;
+use core::intrinsics::simd::{simd_add, simd_extract};
+
+use minisimd::*;
+
+#[repr(transparent)]
+pub struct Transparent<T>(T);
+
+// These tests don't actually care about the add/extract, but they ensure the
+// aggregated temporaries are only used in potentially-SSA ways.
+
+#[no_mangle]
+pub fn simd_aggregate_pot(x: [u32; 4], y: [u32; 4]) -> u32 {
+    // CHECK-LABEL: simd_aggregate_pot
+    // CHECK: %a = load <4 x i32>, ptr %x, align 4
+    // CHECK: %b = load <4 x i32>, ptr %y, align 4
+    // CHECK: add <4 x i32> %a, %b
+
+    unsafe {
+        let a = Simd(x);
+        let b = Simd(y);
+        let c = simd_add(a, b);
+        simd_extract(c, 1)
+    }
+}
+
+#[no_mangle]
+pub fn simd_aggregate_npot(x: [u32; 7], y: [u32; 7]) -> u32 {
+    // CHECK-LABEL: simd_aggregate_npot
+    // CHECK: %a = load <7 x i32>, ptr %x, align 4
+    // CHECK: %b = load <7 x i32>, ptr %y, align 4
+    // CHECK: add <7 x i32> %a, %b
+
+    unsafe {
+        let a = Simd(x);
+        let b = Simd(y);
+        let c = simd_add(a, b);
+        simd_extract(c, 1)
+    }
+}
+
+#[no_mangle]
+pub fn packed_simd_aggregate_pot(x: [u32; 4], y: [u32; 4]) -> u32 {
+    // CHECK-LABEL: packed_simd_aggregate_pot
+    // CHECK: %a = load <4 x i32>, ptr %x, align 4
+    // CHECK: %b = load <4 x i32>, ptr %y, align 4
+    // CHECK: add <4 x i32> %a, %b
+
+    unsafe {
+        let a = PackedSimd(x);
+        let b = PackedSimd(y);
+        let c = simd_add(a, b);
+        simd_extract(c, 1)
+    }
+}
+
+#[no_mangle]
+pub fn packed_simd_aggregate_npot(x: [u32; 7], y: [u32; 7]) -> u32 {
+    // CHECK-LABEL: packed_simd_aggregate_npot
+    // CHECK: %b = alloca [28 x i8], align 4
+    // CHECK: %a = alloca [28 x i8], align 4
+    // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %a, ptr align 4 %x, i64 28, i1 false)
+    // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %b, ptr align 4 %y, i64 28, i1 false)
+    // CHECK: %[[TEMPA:.+]] = load <7 x i32>, ptr %a, align 4
+    // CHECK: %[[TEMPB:.+]] = load <7 x i32>, ptr %b, align 4
+    // CHECK: add <7 x i32> %[[TEMPA]], %[[TEMPB]]
+
+    unsafe {
+        let a = PackedSimd(x);
+        let b = PackedSimd(y);
+        let c = simd_add(a, b);
+        simd_extract(c, 1)
+    }
+}
+
+#[no_mangle]
+pub fn transparent_simd_aggregate(x: [u32; 4]) -> u32 {
+    // The transparent wrapper can just use the same SSA value as its field.
+    // No extra processing or spilling needed.
+
+    // CHECK-LABEL: transparent_simd_aggregate
+    // CHECK-NOT: alloca
+    // CHECK: %[[RET:.+]] = alloca [4 x i8]
+    // CHECK-NOT: alloca
+    // CHECK: %a = load <4 x i32>, ptr %x, align 4
+    // CHECK: %[[TEMP:.+]] = extractelement <4 x i32> %a, i32 1
+    // CHECK: store i32 %[[TEMP]], ptr %[[RET]]
+
+    unsafe {
+        let a = Simd(x);
+        let b = Transparent(a);
+        simd_extract(b.0, 1)
+    }
+}
diff --git a/tests/codegen-llvm/simd/extract-insert-dyn.rs b/tests/codegen-llvm/simd/extract-insert-dyn.rs
new file mode 100644
index 00000000000..729f0145314
--- /dev/null
+++ b/tests/codegen-llvm/simd/extract-insert-dyn.rs
@@ -0,0 +1,121 @@
+//@ compile-flags: -C opt-level=3 -C no-prepopulate-passes
+
+#![feature(
+    core_intrinsics,
+    repr_simd,
+    arm_target_feature,
+    mips_target_feature,
+    s390x_target_feature
+)]
+#![no_std]
+#![crate_type = "lib"]
+#![allow(non_camel_case_types)]
+
+// Test that `core::intrinsics::simd::{simd_extract_dyn, simd_insert_dyn}`
+// lower to LLVM `extractelement` / `insertelement` operations.
+
+use core::intrinsics::simd::{simd_extract, simd_extract_dyn, simd_insert, simd_insert_dyn};
+
+#[repr(simd)]
+#[derive(Clone, Copy)]
+pub struct u32x16([u32; 16]);
+
+#[repr(simd)]
+#[derive(Clone, Copy)]
+pub struct i8x16([i8; 16]);
+
+// CHECK-LABEL: dyn_simd_extract
+// CHECK: extractelement <16 x i8> %x, i32 %idx
+#[no_mangle]
+#[cfg_attr(target_family = "wasm", target_feature(enable = "simd128"))]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "neon"))]
+#[cfg_attr(target_arch = "x86", target_feature(enable = "sse"))]
+#[cfg_attr(target_arch = "mips", target_feature(enable = "msa"))]
+#[cfg_attr(target_arch = "s390x", target_feature(enable = "vector"))]
+unsafe extern "C" fn dyn_simd_extract(x: i8x16, idx: u32) -> i8 {
+    simd_extract_dyn(x, idx)
+}
+
+// CHECK-LABEL: literal_dyn_simd_extract
+// CHECK: extractelement <16 x i8> %x, i32 7
+#[no_mangle]
+#[cfg_attr(target_family = "wasm", target_feature(enable = "simd128"))]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "neon"))]
+#[cfg_attr(target_arch = "x86", target_feature(enable = "sse"))]
+#[cfg_attr(target_arch = "mips", target_feature(enable = "msa"))]
+#[cfg_attr(target_arch = "s390x", target_feature(enable = "vector"))]
+unsafe extern "C" fn literal_dyn_simd_extract(x: i8x16) -> i8 {
+    simd_extract_dyn(x, 7)
+}
+
+// CHECK-LABEL: const_dyn_simd_extract
+// CHECK: extractelement <16 x i8> %x, i32 7
+#[no_mangle]
+#[cfg_attr(target_family = "wasm", target_feature(enable = "simd128"))]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "neon"))]
+#[cfg_attr(target_arch = "x86", target_feature(enable = "sse"))]
+#[cfg_attr(target_arch = "mips", target_feature(enable = "msa"))]
+#[cfg_attr(target_arch = "s390x", target_feature(enable = "vector"))]
+unsafe extern "C" fn const_dyn_simd_extract(x: i8x16) -> i8 {
+    simd_extract_dyn(x, const { 3 + 4 })
+}
+
+// CHECK-LABEL: const_simd_extract
+// CHECK: extractelement <16 x i8> %x, i32 7
+#[no_mangle]
+#[cfg_attr(target_family = "wasm", target_feature(enable = "simd128"))]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "neon"))]
+#[cfg_attr(target_arch = "x86", target_feature(enable = "sse"))]
+#[cfg_attr(target_arch = "mips", target_feature(enable = "msa"))]
+#[cfg_attr(target_arch = "s390x", target_feature(enable = "vector"))]
+unsafe extern "C" fn const_simd_extract(x: i8x16) -> i8 {
+    simd_extract(x, const { 3 + 4 })
+}
+
+// CHECK-LABEL: dyn_simd_insert
+// CHECK: insertelement <16 x i8> %x, i8 %e, i32 %idx
+#[no_mangle]
+#[cfg_attr(target_family = "wasm", target_feature(enable = "simd128"))]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "neon"))]
+#[cfg_attr(target_arch = "x86", target_feature(enable = "sse"))]
+#[cfg_attr(target_arch = "mips", target_feature(enable = "msa"))]
+#[cfg_attr(target_arch = "s390x", target_feature(enable = "vector"))]
+unsafe extern "C" fn dyn_simd_insert(x: i8x16, e: i8, idx: u32) -> i8x16 {
+    simd_insert_dyn(x, idx, e)
+}
+
+// CHECK-LABEL: literal_dyn_simd_insert
+// CHECK: insertelement <16 x i8> %x, i8 %e, i32 7
+#[no_mangle]
+#[cfg_attr(target_family = "wasm", target_feature(enable = "simd128"))]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "neon"))]
+#[cfg_attr(target_arch = "x86", target_feature(enable = "sse"))]
+#[cfg_attr(target_arch = "mips", target_feature(enable = "msa"))]
+#[cfg_attr(target_arch = "s390x", target_feature(enable = "vector"))]
+unsafe extern "C" fn literal_dyn_simd_insert(x: i8x16, e: i8) -> i8x16 {
+    simd_insert_dyn(x, 7, e)
+}
+
+// CHECK-LABEL: const_dyn_simd_insert
+// CHECK: insertelement <16 x i8> %x, i8 %e, i32 7
+#[no_mangle]
+#[cfg_attr(target_family = "wasm", target_feature(enable = "simd128"))]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "neon"))]
+#[cfg_attr(target_arch = "x86", target_feature(enable = "sse"))]
+#[cfg_attr(target_arch = "mips", target_feature(enable = "msa"))]
+#[cfg_attr(target_arch = "s390x", target_feature(enable = "vector"))]
+unsafe extern "C" fn const_dyn_simd_insert(x: i8x16, e: i8) -> i8x16 {
+    simd_insert_dyn(x, const { 3 + 4 }, e)
+}
+
+// CHECK-LABEL: const_simd_insert
+// CHECK: insertelement <16 x i8> %x, i8 %e, i32 7
+#[no_mangle]
+#[cfg_attr(target_family = "wasm", target_feature(enable = "simd128"))]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "neon"))]
+#[cfg_attr(target_arch = "x86", target_feature(enable = "sse"))]
+#[cfg_attr(target_arch = "mips", target_feature(enable = "msa"))]
+#[cfg_attr(target_arch = "s390x", target_feature(enable = "vector"))]
+unsafe extern "C" fn const_simd_insert(x: i8x16, e: i8) -> i8x16 {
+    simd_insert(x, const { 3 + 4 }, e)
+}
diff --git a/tests/codegen-llvm/simd/packed-simd-alignment.rs b/tests/codegen-llvm/simd/packed-simd-alignment.rs
new file mode 100644
index 00000000000..53e88d8e5cf
--- /dev/null
+++ b/tests/codegen-llvm/simd/packed-simd-alignment.rs
@@ -0,0 +1,44 @@
+//@ compile-flags: -Cno-prepopulate-passes
+
+#![crate_type = "lib"]
+#![feature(repr_simd, core_intrinsics)]
+// Make sure that codegen emits correctly-aligned loads and stores for repr(packed, simd) types:
+// the alignment of a load should be no less than that of T, and no more than the size of the vector type.
+use std::intrinsics::simd as intrinsics;
+
+#[derive(Copy, Clone)]
+#[repr(packed, simd)]
+struct f32x3([f32; 3]);
+
+#[derive(Copy, Clone)]
+#[repr(packed, simd)]
+struct f32x4([f32; 4]);
+
+// CHECK-LABEL: load_f32x3
+#[no_mangle]
+pub fn load_f32x3(floats: &f32x3) -> f32x3 {
+    // FIXME: Is a memcpy really the best we can do?
+    // CHECK: @llvm.memcpy.{{.*}}ptr align 4 {{.*}}ptr align 4
+    *floats
+}
+
+// CHECK-LABEL: load_f32x4
+#[no_mangle]
+pub fn load_f32x4(floats: &f32x4) -> f32x4 {
+    // CHECK: load <4 x float>, ptr %{{[a-z0-9_]*}}, align {{4|8|16}}
+    *floats
+}
+
+// CHECK-LABEL: add_f32x3
+#[no_mangle]
+pub fn add_f32x3(x: f32x3, y: f32x3) -> f32x3 {
+    // CHECK: load <3 x float>, ptr %{{[a-z0-9_]*}}, align 4
+    unsafe { intrinsics::simd_add(x, y) }
+}
+
+// CHECK-LABEL: add_f32x4
+#[no_mangle]
+pub fn add_f32x4(x: f32x4, y: f32x4) -> f32x4 {
+    // CHECK: load <4 x float>, ptr %{{[a-z0-9_]*}}, align {{4|8|16}}
+    unsafe { intrinsics::simd_add(x, y) }
+}
diff --git a/tests/codegen-llvm/simd/packed-simd.rs b/tests/codegen-llvm/simd/packed-simd.rs
new file mode 100644
index 00000000000..70c03fcc955
--- /dev/null
+++ b/tests/codegen-llvm/simd/packed-simd.rs
@@ -0,0 +1,56 @@
+//@ revisions: opt3 noopt
+//@ only-x86_64
+//@[opt3] compile-flags: -Copt-level=3
+//@[noopt] compile-flags: -Cno-prepopulate-passes
+
+#![crate_type = "lib"]
+#![no_std]
+#![feature(repr_simd, core_intrinsics)]
+use core::intrinsics::simd as intrinsics;
+use core::{mem, ptr};
+
+#[path = "../../auxiliary/minisimd.rs"]
+mod minisimd;
+use minisimd::{PackedSimd, Simd as FullSimd};
+
+// Test codegen for not only "packed" but also "fully aligned" SIMD types, and conversion between
+// them. A repr(packed,simd) type with 3 elements can't exceed its element alignment, whereas the
+// same type as repr(simd) will instead have padding.
+
+// Non-power-of-two vectors have padding as repr(simd) and need to be expanded to full vectors.
+fn load<T, const N: usize>(v: PackedSimd<T, N>) -> FullSimd<T, N> {
+    unsafe {
+        let mut tmp = mem::MaybeUninit::<FullSimd<T, N>>::uninit();
+        ptr::copy_nonoverlapping(&v as *const _, tmp.as_mut_ptr().cast(), 1);
+        tmp.assume_init()
+    }
+}
+
+// CHECK-LABEL: square_packed_full
+// CHECK-SAME: ptr{{[a-z_ ]*}} sret([[RET_TYPE:[^)]+]]) [[RET_ALIGN:align (8|16)]]{{[^%]*}} [[RET_VREG:%[_0-9]*]]
+// CHECK-SAME: ptr{{[a-z_ ]*}} align 4
+#[no_mangle]
+pub fn square_packed_full(x: PackedSimd<f32, 3>) -> FullSimd<f32, 3> {
+    // CHECK-NEXT: start
+    // noopt: alloca [[RET_TYPE]], [[RET_ALIGN]]
+    // CHECK: load <3 x float>
+    let x = load(x);
+    // CHECK: [[VREG:%[a-z0-9_]+]] = fmul <3 x float>
+    // CHECK-NEXT: store <3 x float> [[VREG]], ptr [[RET_VREG]], [[RET_ALIGN]]
+    // CHECK-NEXT: ret void
+    unsafe { intrinsics::simd_mul(x, x) }
+}
+
+// CHECK-LABEL: square_packed
+// CHECK-SAME: ptr{{[a-z_ ]*}} sret([[RET_TYPE:[^)]+]]) [[RET_ALIGN:align 4]]{{[^%]*}} [[RET_VREG:%[_0-9]*]]
+// CHECK-SAME: ptr{{[a-z_ ]*}} align 4
+#[no_mangle]
+pub fn square_packed(x: PackedSimd<f32, 3>) -> PackedSimd<f32, 3> {
+    // CHECK-NEXT: start
+    // CHECK-NEXT: load <3 x float>
+    // noopt-NEXT: load <3 x float>
+    // CHECK-NEXT: [[VREG:%[a-z0-9_]+]] = fmul <3 x float>
+    // CHECK-NEXT: store <3 x float> [[VREG]], ptr [[RET_VREG]], [[RET_ALIGN]]
+    // CHECK-NEXT: ret void
+    unsafe { intrinsics::simd_mul(x, x) }
+}
diff --git a/tests/codegen-llvm/simd/simd-wide-sum.rs b/tests/codegen-llvm/simd/simd-wide-sum.rs
new file mode 100644
index 00000000000..95117b2c748
--- /dev/null
+++ b/tests/codegen-llvm/simd/simd-wide-sum.rs
@@ -0,0 +1,59 @@
+//@ revisions: llvm mir-opt3
+//@ compile-flags: -C opt-level=3 -Z merge-functions=disabled
+//@ edition: 2021
+//@ only-x86_64
+//@[mir-opt3] compile-flags: -Zmir-opt-level=3
+//@[mir-opt3] build-pass
+
+// mir-opt3 is a regression test for https://github.com/rust-lang/rust/issues/98016
+
+#![crate_type = "lib"]
+#![feature(portable_simd)]
+
+use std::simd::prelude::*;
+const N: usize = 16;
+
+#[no_mangle]
+// CHECK-LABEL: @wider_reduce_simd
+pub fn wider_reduce_simd(x: Simd<u8, N>) -> u16 {
+    // CHECK: zext <16 x i8>
+    // CHECK-SAME: to <16 x i16>
+    // CHECK: call i16 @llvm.vector.reduce.add.v16i16(<16 x i16>
+    let x: Simd<u16, N> = x.cast();
+    x.reduce_sum()
+}
+
+#[no_mangle]
+// CHECK-LABEL: @wider_reduce_loop
+pub fn wider_reduce_loop(x: Simd<u8, N>) -> u16 {
+    // CHECK: zext <16 x i8>
+    // CHECK-SAME: to <16 x i16>
+    // CHECK: call i16 @llvm.vector.reduce.add.v16i16(<16 x i16>
+    let mut sum = 0_u16;
+    for i in 0..N {
+        sum += u16::from(x[i]);
+    }
+    sum
+}
+
+#[no_mangle]
+// CHECK-LABEL: @wider_reduce_iter
+pub fn wider_reduce_iter(x: Simd<u8, N>) -> u16 {
+    // CHECK: zext <16 x i8>
+    // CHECK-SAME: to <16 x i16>
+    // CHECK: call i16 @llvm.vector.reduce.add.v16i16(<16 x i16>
+    x.as_array().iter().copied().map(u16::from).sum()
+}
+
+// This last case is the most interesting, as it's the one
+// that used to fail to auto-vectorize due to a suboptimality in the
+// `<array::IntoIter as Iterator>::fold` implementation.
+
+#[no_mangle]
+// CHECK-LABEL: @wider_reduce_into_iter
+pub fn wider_reduce_into_iter(x: Simd<u8, N>) -> u16 {
+    // CHECK: zext <16 x i8>
+    // CHECK-SAME: to <16 x i16>
+    // CHECK: call i16 @llvm.vector.reduce.add.v16i16(<16 x i16>
+    x.to_array().into_iter().map(u16::from).sum()
+}
diff --git a/tests/codegen-llvm/simd/simd_arith_offset.rs b/tests/codegen-llvm/simd/simd_arith_offset.rs
new file mode 100644
index 00000000000..210b4e9bb50
--- /dev/null
+++ b/tests/codegen-llvm/simd/simd_arith_offset.rs
@@ -0,0 +1,22 @@
+//@ compile-flags: -C no-prepopulate-passes
+//@ only-64bit (because the i64 LLVM type for usize shows up)
+//
+
+#![crate_type = "lib"]
+#![feature(repr_simd, core_intrinsics)]
+
+#[path = "../../auxiliary/minisimd.rs"]
+mod minisimd;
+use std::intrinsics::simd::simd_arith_offset;
+
+use minisimd::*;
+
+/// A vector of *const T.
+pub type SimdConstPtr<T, const LANES: usize> = Simd<*const T, LANES>;
+
+// CHECK-LABEL: smoke
+#[no_mangle]
+pub fn smoke(ptrs: SimdConstPtr<u8, 8>, offsets: Simd<usize, 8>) -> SimdConstPtr<u8, 8> {
+    // CHECK: getelementptr i8, <8 x ptr> %0, <8 x i64> %1
+    unsafe { simd_arith_offset(ptrs, offsets) }
+}
diff --git a/tests/codegen-llvm/simd/swap-simd-types.rs b/tests/codegen-llvm/simd/swap-simd-types.rs
new file mode 100644
index 00000000000..c063cc683a6
--- /dev/null
+++ b/tests/codegen-llvm/simd/swap-simd-types.rs
@@ -0,0 +1,40 @@
+//@ compile-flags: -Copt-level=3 -C target-feature=+avx
+//@ only-x86_64
+
+#![crate_type = "lib"]
+
+use std::mem::swap;
+
+// SIMD types are highly aligned already, so make sure the swap code leaves their
+// types alone and doesn't pessimize them (such as by swapping them as `usize`s).
+extern crate core;
+use core::arch::x86_64::__m256;
+
+// CHECK-LABEL: @swap_single_m256
+#[no_mangle]
+pub fn swap_single_m256(x: &mut __m256, y: &mut __m256) {
+    // CHECK-NOT: alloca
+    // CHECK: load <8 x float>{{.+}}align 32
+    // CHECK: store <8 x float>{{.+}}align 32
+    swap(x, y)
+}
+
+// CHECK-LABEL: @swap_m256_slice
+#[no_mangle]
+pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) {
+    // CHECK-NOT: alloca
+    // CHECK-COUNT-2: load <4 x i64>{{.+}}align 32
+    // CHECK-COUNT-2: store <4 x i64>{{.+}}align 32
+    if x.len() == y.len() {
+        x.swap_with_slice(y);
+    }
+}
+
+// CHECK-LABEL: @swap_bytes32
+#[no_mangle]
+pub fn swap_bytes32(x: &mut [u8; 32], y: &mut [u8; 32]) {
+    // CHECK-NOT: alloca
+    // CHECK-COUNT-2: load <4 x i64>{{.+}}align 1
+    // CHECK-COUNT-2: store <4 x i64>{{.+}}align 1
+    swap(x, y)
+}
diff --git a/tests/codegen-llvm/simd/unpadded-simd.rs b/tests/codegen-llvm/simd/unpadded-simd.rs
new file mode 100644
index 00000000000..ef067a15702
--- /dev/null
+++ b/tests/codegen-llvm/simd/unpadded-simd.rs
@@ -0,0 +1,19 @@
+// Make sure that no 0-sized padding is inserted in structs and that
+// structs are represented as expected by Neon intrinsics in LLVM.
+// See #87254.
+
+#![crate_type = "lib"]
+#![feature(repr_simd, abi_unadjusted)]
+
+#[derive(Copy, Clone)]
+#[repr(simd)]
+pub struct int16x4_t(pub [i16; 4]);
+
+#[derive(Copy, Clone)]
+pub struct int16x4x2_t(pub int16x4_t, pub int16x4_t);
+
+// CHECK: %int16x4x2_t = type { <4 x i16>, <4 x i16> }
+#[no_mangle]
+extern "unadjusted" fn takes_int16x4x2_t(t: int16x4x2_t) -> int16x4x2_t {
+    t
+}
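The padding contrast that packed-simd.rs describes can also be seen directly in the type layouts. A minimal sketch, assuming the minisimd definitions sketched above and an x86-64 target (the full vector's exact alignment is target-dependent, which is why the sret CHECK line accepts "align (8|16)"):

    use core::mem::{align_of, size_of};

    fn layout_demo() {
        // Packed: three f32 lanes back to back, element alignment, no padding.
        assert_eq!(size_of::<PackedSimd<f32, 3>>(), 12);
        assert_eq!(align_of::<PackedSimd<f32, 3>>(), 4);

        // Full repr(simd): padded out like a 4-lane vector. The 16-byte size
        // here is what x86-64 typically gives; other targets may differ.
        assert_eq!(size_of::<FullSimd<f32, 3>>(), 16);
    }

This is why square_packed_full above needs the memcpy-based load helper before it can operate on whole vectors, while square_packed can load and store at element alignment.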
