Ensure `swap_nonoverlapping` is really always untyped

author: Scott McMurray <scottmcm@users.noreply.github.com> 2025-02-21 21:00:43 -0800
committer: Scott McMurray <scottmcm@users.noreply.github.com> 2025-04-09 09:09:37 -0700
commit: 50d0ce1b42d67ce98b2ccac55d22a2ff8abe3273 (patch)
tree: b372a833e17bf40219302ce5aa5d180ca42675f4 /tests
parent: f06e5c1e35bc5bc6131c6f8a0eb782097e3f28c3 (diff)
download: rust-50d0ce1b42d67ce98b2ccac55d22a2ff8abe3273.tar.gz
rust-50d0ce1b42d67ce98b2ccac55d22a2ff8abe3273.zip
5 files changed, 148 insertions, 48 deletions
diff --git a/tests/assembly/x86_64-typed-swap.rs b/tests/assembly/x86_64-typed-swap.rs
index dfd6ee565bc..a6753011d36 100644
--- a/tests/assembly/x86_64-typed-swap.rs
+++ b/tests/assembly/x86_64-typed-swap.rs
@@ -51,3 +51,31 @@ pub fn swap_simd(x: &mut __m128, y: &mut __m128) {
     // CHECK-NEXT: retq
     swap(x, y)
 }
+
+// CHECK-LABEL: swap_string:
+#[no_mangle]
+pub fn swap_string(x: &mut String, y: &mut String) {
+    // CHECK-NOT: mov
+    // CHECK-COUNT-4: movups
+    // CHECK-NOT: mov
+    // CHECK-COUNT-4: movq
+    // CHECK-NOT: mov
+    swap(x, y)
+}
+
+// CHECK-LABEL: swap_44_bytes:
+#[no_mangle]
+pub fn swap_44_bytes(x: &mut [u8; 44], y: &mut [u8; 44]) {
+    // Ensure we do better than a long run of byte copies,
+    // see <https://github.com/rust-lang/rust/issues/134946>
+
+    // CHECK-NOT: movb
+    // CHECK-COUNT-8: movups{{.+}}xmm
+    // CHECK-NOT: movb
+    // CHECK-COUNT-4: movq
+    // CHECK-NOT: movb
+    // CHECK-COUNT-4: movl
+    // CHECK-NOT: movb
+    // CHECK: retq
+    swap(x, y)
+}
diff --git a/tests/codegen/simd/swap-simd-types.rs b/tests/codegen/simd/swap-simd-types.rs
index 69767d0a755..c063cc683a6 100644
--- a/tests/codegen/simd/swap-simd-types.rs
+++ b/tests/codegen/simd/swap-simd-types.rs
@@ -23,8 +23,8 @@ pub fn swap_single_m256(x: &mut __m256, y: &mut __m256) {
 #[no_mangle]
 pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) {
     // CHECK-NOT: alloca
-    // CHECK: load <8 x float>{{.+}}align 32
-    // CHECK: store <8 x float>{{.+}}align 32
+    // CHECK-COUNT-2: load <4 x i64>{{.+}}align 32
+    // CHECK-COUNT-2: store <4 x i64>{{.+}}align 32
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
@@ -34,7 +34,7 @@ pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) {
 #[no_mangle]
 pub fn swap_bytes32(x: &mut [u8; 32], y: &mut [u8; 32]) {
     // CHECK-NOT: alloca
-    // CHECK: load <32 x i8>{{.+}}align 1
-    // CHECK: store <32 x i8>{{.+}}align 1
+    // CHECK-COUNT-2: load <4 x i64>{{.+}}align 1
+    // CHECK-COUNT-2: store <4 x i64>{{.+}}align 1
     swap(x, y)
 }
diff --git a/tests/codegen/swap-large-types.rs b/tests/codegen/swap-large-types.rs
index 49a41bb1469..08c486affd9 100644
--- a/tests/codegen/swap-large-types.rs
+++ b/tests/codegen/swap-large-types.rs
@@ -12,6 +12,16 @@ type KeccakBuffer = [[u64; 5]; 5];
 // to stack for large types, which is completely unnecessary as the lack of
 // overlap means we can just do whatever fits in registers at a time.
 
+// The tests here (after the first one showing that the problem still exists)
+// are less about testing *exactly* what the codegen is, and more about testing
+// 1) That things are swapped directly from one argument to the other,
+//    never going through stack along the way, and
+// 2) That we're doing the swapping for big things using large vector types,
+//    rather then `i64` or `<8 x i8>` (or, even worse, `i8`) at a time.
+//
+// (There are separate tests for intrinsics::typed_swap_nonoverlapping that
+//  check that it, as an intrinsic, are emitting exactly what it should.)
+
 // CHECK-LABEL: @swap_basic
 #[no_mangle]
 pub fn swap_basic(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
@@ -26,55 +36,55 @@ pub fn swap_basic(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
     }
 }
 
-// This test verifies that the library does something smarter, and thus
-// doesn't need any scratch space on the stack.
-
 // CHECK-LABEL: @swap_std
 #[no_mangle]
 pub fn swap_std(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i64>
-    // CHECK: store <{{[0-9]+}} x i64>
+    // CHECK: load <{{2|4}} x i64>
+    // CHECK: store <{{2|4}} x i64>
     swap(x, y)
 }
 
-// Verify that types with usize alignment are swapped via vectored usizes,
-// not falling back to byte-level code.
-
 // CHECK-LABEL: @swap_slice
 #[no_mangle]
 pub fn swap_slice(x: &mut [KeccakBuffer], y: &mut [KeccakBuffer]) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i64>
-    // CHECK: store <{{[0-9]+}} x i64>
+    // CHECK: load <{{2|4}} x i64>
+    // CHECK: store <{{2|4}} x i64>
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
 }
 
-// But for a large align-1 type, vectorized byte copying is what we want.
-
 type OneKilobyteBuffer = [u8; 1024];
 
 // CHECK-LABEL: @swap_1kb_slices
 #[no_mangle]
 pub fn swap_1kb_slices(x: &mut [OneKilobyteBuffer], y: &mut [OneKilobyteBuffer]) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i8>
-    // CHECK: store <{{[0-9]+}} x i8>
+
+    // CHECK-NOT: load i32
+    // CHECK-NOT: store i32
+    // CHECK-NOT: load i16
+    // CHECK-NOT: store i16
+    // CHECK-NOT: load i8
+    // CHECK-NOT: store i8
+
+    // CHECK: load <{{2|4}} x i64>{{.+}}align 1,
+    // CHECK: store <{{2|4}} x i64>{{.+}}align 1,
+
+    // CHECK-NOT: load i32
+    // CHECK-NOT: store i32
+    // CHECK-NOT: load i16
+    // CHECK-NOT: store i16
+    // CHECK-NOT: load i8
+    // CHECK-NOT: store i8
+
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
 }
 
-// This verifies that the 2×read + 2×write optimizes to just 3 memcpys
-// for an unusual type like this.  It's not clear whether we should do anything
-// smarter in Rust for these, so for now it's fine to leave these up to the backend.
-// That's not as bad as it might seem, as for example, LLVM will lower the
-// memcpys below to VMOVAPS on YMMs if one enables the AVX target feature.
-// Eventually we'll be able to pass `align_of::<T>` to a const generic and
-// thus pick a smarter chunk size ourselves without huge code duplication.
-
 #[repr(align(64))]
 pub struct BigButHighlyAligned([u8; 64 * 3]);
 
@@ -82,9 +92,25 @@ pub struct BigButHighlyAligned([u8; 64 * 3]);
 #[no_mangle]
 pub fn swap_big_aligned(x: &mut BigButHighlyAligned, y: &mut BigButHighlyAligned) {
     // CHECK-NOT: call void @llvm.memcpy
-    // CHECK: call void @llvm.memcpy.{{.+}}(ptr noundef nonnull align 64 dereferenceable(192)
-    // CHECK: call void @llvm.memcpy.{{.+}}(ptr noundef nonnull align 64 dereferenceable(192)
-    // CHECK: call void @llvm.memcpy.{{.+}}(ptr noundef nonnull align 64 dereferenceable(192)
+    // CHECK-NOT: load i32
+    // CHECK-NOT: store i32
+    // CHECK-NOT: load i16
+    // CHECK-NOT: store i16
+    // CHECK-NOT: load i8
+    // CHECK-NOT: store i8
+
+    // CHECK-COUNT-2: load <{{2|4}} x i64>{{.+}}align 64,
+    // CHECK-COUNT-2: store <{{2|4}} x i64>{{.+}}align 64,
+
+    // CHECK-COUNT-2: load <{{2|4}} x i64>{{.+}}align 32,
+    // CHECK-COUNT-2: store <{{2|4}} x i64>{{.+}}align 32,
+
+    // CHECK-NOT: load i32
+    // CHECK-NOT: store i32
+    // CHECK-NOT: load i16
+    // CHECK-NOT: store i16
+    // CHECK-NOT: load i8
+    // CHECK-NOT: store i8
     // CHECK-NOT: call void @llvm.memcpy
     swap(x, y)
 }
diff --git a/tests/codegen/swap-small-types.rs b/tests/codegen/swap-small-types.rs
index 76bb853e642..ffa573c9a43 100644
--- a/tests/codegen/swap-small-types.rs
+++ b/tests/codegen/swap-small-types.rs
@@ -1,5 +1,6 @@
 //@ compile-flags: -Copt-level=3 -Z merge-functions=disabled
 //@ only-x86_64
+//@ min-llvm-version: 20
 
 #![crate_type = "lib"]
 
@@ -27,13 +28,19 @@ pub fn swap_rgb48_manually(x: &mut RGB48, y: &mut RGB48) {
 pub fn swap_rgb48(x: &mut RGB48, y: &mut RGB48) {
     // CHECK-NOT: alloca
 
-    // Whether `i8` is the best for this is unclear, but
-    // might as well record what's actually happening right now.
-
-    // CHECK: load i8
-    // CHECK: load i8
-    // CHECK: store i8
-    // CHECK: store i8
+    // Swapping `i48` might be cleaner in LLVM-IR here, but `i32`+`i16` isn't bad,
+    // and is closer to the assembly it generates anyway.
+
+    // CHECK-NOT: load{{ }}
+    // CHECK: load i32{{.+}}align 2
+    // CHECK-NEXT: load i32{{.+}}align 2
+    // CHECK-NEXT: store i32{{.+}}align 2
+    // CHECK-NEXT: store i32{{.+}}align 2
+    // CHECK: load i16{{.+}}align 2
+    // CHECK-NEXT: load i16{{.+}}align 2
+    // CHECK-NEXT: store i16{{.+}}align 2
+    // CHECK-NEXT: store i16{{.+}}align 2
+    // CHECK-NOT: store{{ }}
     swap(x, y)
 }
 
@@ -76,30 +83,49 @@ pub fn swap_slices<'a>(x: &mut &'a [u32], y: &mut &'a [u32]) {
     swap(x, y)
 }
 
-// LLVM doesn't vectorize a loop over 3-byte elements,
-// so we chunk it down to bytes and loop over those instead.
 type RGB24 = [u8; 3];
 
 // CHECK-LABEL: @swap_rgb24_slices
 #[no_mangle]
 pub fn swap_rgb24_slices(x: &mut [RGB24], y: &mut [RGB24]) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i8>
-    // CHECK: store <{{[0-9]+}} x i8>
+
+    // CHECK: mul nuw nsw i64 %{{x|y}}.1, 3
+
+    // CHECK: load <{{[0-9]+}} x i64>
+    // CHECK: store <{{[0-9]+}} x i64>
+
+    // CHECK-COUNT-2: load i32
+    // CHECK-COUNT-2: store i32
+    // CHECK-COUNT-2: load i16
+    // CHECK-COUNT-2: store i16
+    // CHECK-COUNT-2: load i8
+    // CHECK-COUNT-2: store i8
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
 }
 
-// This one has a power-of-two size, so we iterate over it directly
 type RGBA32 = [u8; 4];
 
 // CHECK-LABEL: @swap_rgba32_slices
 #[no_mangle]
 pub fn swap_rgba32_slices(x: &mut [RGBA32], y: &mut [RGBA32]) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i32>
-    // CHECK: store <{{[0-9]+}} x i32>
+
+    // Because the size in bytes in a multiple of 4, we can skip the smallest sizes.
+
+    // CHECK: load <{{[0-9]+}} x i64>
+    // CHECK: store <{{[0-9]+}} x i64>
+
+    // CHECK-COUNT-2: load i32
+    // CHECK-COUNT-2: store i32
+
+    // CHECK-NOT: load i16
+    // CHECK-NOT: store i16
+    // CHECK-NOT: load i8
+    // CHECK-NOT: store i8
+
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
@@ -113,8 +139,8 @@ const _: () = assert!(!std::mem::size_of::<String>().is_power_of_two());
 #[no_mangle]
 pub fn swap_string_slices(x: &mut [String], y: &mut [String]) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i64>
-    // CHECK: store <{{[0-9]+}} x i64>
+    // CHECK: load <{{[0-9]+}} x i64>{{.+}}, align 8,
+    // CHECK: store <{{[0-9]+}} x i64>{{.+}}, align 8,
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
@@ -130,6 +156,26 @@ pub struct Packed {
 #[no_mangle]
 pub fn swap_packed_structs(x: &mut Packed, y: &mut Packed) {
     // CHECK-NOT: alloca
+
+    // CHECK-NOT: load
+    // CHECK-NOT: store
+
+    // CHECK: %[[A:.+]] = load i64, ptr %x, align 1,
+    // CHECK-NEXT: %[[B:.+]] = load i64, ptr %y, align 1,
+    // CHECK-NEXT: store i64 %[[B]], ptr %x, align 1,
+    // CHECK-NEXT: store i64 %[[A]], ptr %y, align 1,
+
+    // CHECK-NOT: load
+    // CHECK-NOT: store
+
+    // CHECK: %[[C:.+]] = load i8, ptr %[[X8:.+]], align 1,
+    // CHECK-NEXT: %[[D:.+]] = load i8, ptr %[[Y8:.+]], align 1,
+    // CHECK-NEXT: store i8 %[[D]], ptr %[[X8]], align 1,
+    // CHECK-NEXT: store i8 %[[C]], ptr %[[Y8]], align 1,
+
+    // CHECK-NOT: load
+    // CHECK-NOT: store
+
     // CHECK: ret void
     swap(x, y)
 }
diff --git a/tests/ui/consts/missing_span_in_backtrace.stderr b/tests/ui/consts/missing_span_in_backtrace.stderr
index 2f3a65302bd..aad3d76dd26 100644
--- a/tests/ui/consts/missing_span_in_backtrace.stderr
+++ b/tests/ui/consts/missing_span_in_backtrace.stderr
@@ -12,10 +12,10 @@ note: inside `swap_nonoverlapping::<MaybeUninit<u8>>`
   --> $SRC_DIR/core/src/ptr/mod.rs:LL:COL
 note: inside `swap_nonoverlapping::compiletime::<MaybeUninit<u8>>`
   --> $SRC_DIR/core/src/ptr/mod.rs:LL:COL
-note: inside `std::ptr::swap_nonoverlapping_simple_untyped::<MaybeUninit<u8>>`
-  --> $SRC_DIR/core/src/ptr/mod.rs:LL:COL
-note: inside `std::ptr::read::<MaybeUninit<MaybeUninit<u8>>>`
+note: inside `std::ptr::swap_nonoverlapping_const::<MaybeUninit<u8>>`
   --> $SRC_DIR/core/src/ptr/mod.rs:LL:COL
+note: inside `copy_nonoverlapping::<MaybeUninit<u8>>`
+  --> $SRC_DIR/core/src/intrinsics/mod.rs:LL:COL
    = help: this code performed an operation that depends on the underlying bytes representing a pointer
    = help: the absolute address of a pointer is not known at compile-time, so such operations are not supported
    = note: this error originates in the macro `$crate::intrinsics::const_eval_select` which comes from the expansion of the macro `const_eval_select` (in Nightly builds, run with -Z macro-backtrace for more info)
author	Scott McMurray <scottmcm@users.noreply.github.com>	2025-02-21 21:00:43 -0800
committer	Scott McMurray <scottmcm@users.noreply.github.com>	2025-04-09 09:09:37 -0700
commit	50d0ce1b42d67ce98b2ccac55d22a2ff8abe3273 (patch)
tree	b372a833e17bf40219302ce5aa5d180ca42675f4 /tests
parent	f06e5c1e35bc5bc6131c6f8a0eb782097e3f28c3 (diff)
download	rust-50d0ce1b42d67ce98b2ccac55d22a2ff8abe3273.tar.gz rust-50d0ce1b42d67ce98b2ccac55d22a2ff8abe3273.zip