3 files changed, 140 insertions, 0 deletions
diff --git a/src/test/codegen/swap-large-types.rs b/src/test/codegen/swap-large-types.rs
new file mode 100644
index 00000000000..535d301a3d2
--- /dev/null
+++ b/src/test/codegen/swap-large-types.rs
@@ -0,0 +1,64 @@
+// compile-flags: -O
+// only-x86_64
+// ignore-debug: the debug assertions get in the way
+
+#![crate_type = "lib"]
+
+use std::mem::swap;
+use std::ptr::{read, copy_nonoverlapping, write};
+
+type KeccakBuffer = [[u64; 5]; 5];
+
+// A basic read+copy+write swap implementation ends up copying one of the values
+// to stack for large types, which is completely unnecessary as the lack of
+// overlap means we can just do whatever fits in registers at a time.
+
+// CHECK-LABEL: @swap_basic
+#[no_mangle]
+pub fn swap_basic(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
+// CHECK: alloca [5 x [5 x i64]]
+
+    // SAFETY: exclusive references are always valid to read/write,
+    // are non-overlapping, and nothing here panics so it's drop-safe.
+    unsafe {
+        let z = read(x);
+        copy_nonoverlapping(y, x, 1);
+        write(y, z);
+    }
+}
+
+// This test verifies that the library does something smarter, and thus
+// doesn't need any scratch space on the stack.
+
+// CHECK-LABEL: @swap_std
+#[no_mangle]
+pub fn swap_std(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
+// CHECK-NOT: alloca
+// CHECK: load <{{[0-9]+}} x i64>
+// CHECK: store <{{[0-9]+}} x i64>
+    swap(x, y)
+}
+
+// CHECK-LABEL: @swap_slice
+#[no_mangle]
+pub fn swap_slice(x: &mut [KeccakBuffer], y: &mut [KeccakBuffer]) {
+// CHECK-NOT: alloca
+// CHECK: load <{{[0-9]+}} x i64>
+// CHECK: store <{{[0-9]+}} x i64>
+    if x.len() == y.len() {
+        x.swap_with_slice(y);
+    }
+}
+
+type OneKilobyteBuffer = [u8; 1024];
+
+// CHECK-LABEL: @swap_1kb_slices
+#[no_mangle]
+pub fn swap_1kb_slices(x: &mut [OneKilobyteBuffer], y: &mut [OneKilobyteBuffer]) {
+// CHECK-NOT: alloca
+// CHECK: load <{{[0-9]+}} x i8>
+// CHECK: store <{{[0-9]+}} x i8>
+    if x.len() == y.len() {
+        x.swap_with_slice(y);
+    }
+}
diff --git a/src/test/codegen/swap-simd-types.rs b/src/test/codegen/swap-simd-types.rs
new file mode 100644
index 00000000000..c90b277eb44
--- /dev/null
+++ b/src/test/codegen/swap-simd-types.rs
@@ -0,0 +1,32 @@
+// compile-flags: -O -C target-feature=+avx
+// only-x86_64
+// ignore-debug: the debug assertions get in the way
+
+#![crate_type = "lib"]
+
+use std::mem::swap;
+
+// SIMD types are highly-aligned already, so make sure the swap code leaves their
+// types alone and doesn't pessimize them (such as by swapping them as `usize`s).
+extern crate core;
+use core::arch::x86_64::__m256;
+
+// CHECK-LABEL: @swap_single_m256
+#[no_mangle]
+pub fn swap_single_m256(x: &mut __m256, y: &mut __m256) {
+// CHECK-NOT: alloca
+// CHECK: load <8 x float>{{.+}}align 32
+// CHECK: store <8 x float>{{.+}}align 32
+    swap(x, y)
+}
+
+// CHECK-LABEL: @swap_m256_slice
+#[no_mangle]
+pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) {
+// CHECK-NOT: alloca
+// CHECK: load <8 x float>{{.+}}align 32
+// CHECK: store <8 x float>{{.+}}align 32
+    if x.len() == y.len() {
+        x.swap_with_slice(y);
+    }
+}
diff --git a/src/test/codegen/swap-small-types.rs b/src/test/codegen/swap-small-types.rs
index 6205e6a6559..2f375844cc7 100644
--- a/src/test/codegen/swap-small-types.rs
+++ b/src/test/codegen/swap-small-types.rs
@@ -16,3 +16,47 @@ pub fn swap_rgb48(x: &mut RGB48, y: &mut RGB48) {
 // CHECK: store i48
     swap(x, y)
 }
+
+// LLVM doesn't vectorize a loop over 3-byte elements,
+// so we chunk it down to bytes and loop over those instead.
+type RGB24 = [u8; 3];
+
+// CHECK-LABEL: @swap_rgb24_slices
+#[no_mangle]
+pub fn swap_rgb24_slices(x: &mut [RGB24], y: &mut [RGB24]) {
+// CHECK-NOT: alloca
+// CHECK: load <{{[0-9]+}} x i8>
+// CHECK: store <{{[0-9]+}} x i8>
+    if x.len() == y.len() {
+        x.swap_with_slice(y);
+    }
+}
+
+// This one has a power-of-two size, so we iterate over it directly
+type RGBA32 = [u8; 4];
+
+// CHECK-LABEL: @swap_rgba32_slices
+#[no_mangle]
+pub fn swap_rgba32_slices(x: &mut [RGBA32], y: &mut [RGBA32]) {
+// CHECK-NOT: alloca
+// CHECK: load <{{[0-9]+}} x i32>
+// CHECK: store <{{[0-9]+}} x i32>
+    if x.len() == y.len() {
+        x.swap_with_slice(y);
+    }
+}
+
+// Strings have a non-power-of-two size, but have pointer alignment,
+// so we swap usizes instead of dropping all the way down to bytes.
+const _: () = assert!(!std::mem::size_of::<String>().is_power_of_two());
+
+// CHECK-LABEL: @swap_string_slices
+#[no_mangle]
+pub fn swap_string_slices(x: &mut [String], y: &mut [String]) {
+// CHECK-NOT: alloca
+// CHECK: load <{{[0-9]+}} x i64>
+// CHECK: store <{{[0-9]+}} x i64>
+    if x.len() == y.len() {
+        x.swap_with_slice(y);
+    }
+}