Auto merge of #111999 - scottmcm:codegen-less-memcpy, r=compiler-errors

Use `load`+`store` instead of `memcpy` for small integer arrays I was inspired by #98892 to see whether, rather than making `mem::swap` do something smart in the library, we could update MIR assignments like `*_1 = *_2` to do something smarter than `memcpy` for sufficiently-small types that doing it inline is going to be better than a `memcpy` call in assembly anyway. After all, special code may help `mem::swap`, but if the "obvious" MIR can just result in the correct thing that helps everything -- other code like `mem::replace`, people doing it manually, and just passing around by value in general -- as well as makes MIR inlining happier since it doesn't need to deal with all the complicated library code if it just sees a couple assignments. LLVM will turn the short, known-length `memcpy`s into direct instructions in the backend, but that's too late for it to be able to remove `alloca`s. In general, replacing `memcpy`s with typed instructions is hard in the middle-end -- even for `memcpy.inline` where it knows it won't be a function call -- is hard [due to poison propagation issues](https://rust-lang.zulipchat.com/#narrow/stream/187780-t-compiler.2Fwg-llvm/topic/memcpy.20vs.20load-store.20for.20MIR.20assignments/near/360376712). So because we know more about the type invariants -- these are typed copies -- rustc can emit something more specific, allowing LLVM to `mem2reg` away the `alloca`s in some situations. #52051 previously did something like this in the library for `mem::swap`, but it ended up regressing during enabling mir inlining (https://github.com/rust-lang/rust/commit/cbbf06b0cd39dc93033568f1e65f5363cbbdebcd), so this has been suboptimal on stable for ≈5 releases now. The code in this PR is narrowly targeted at just integer arrays in LLVM, but works via a new method on the [`LayoutTypeMethods`](https://doc.rust-lang.org/nightly/nightly-rustc/rustc_codegen_ssa/traits/trait.LayoutTypeMethods.html) trait, so specific backends based on cg_ssa can enable this for more situations over time, as we find them. I don't want to try to bite off too much in this PR, though. (Transparent newtypes and simple things like the 3×usize `String` would be obvious candidates for a follow-up.) Codegen demonstrations: <https://llvm.godbolt.org/z/fK8hT9aqv> Before: ```llvm define void `@swap_rgb48_old(ptr` noalias nocapture noundef align 2 dereferenceable(6) %x, ptr noalias nocapture noundef align 2 dereferenceable(6) %y) unnamed_addr #1 { %a.i = alloca [3 x i16], align 2 call void `@llvm.lifetime.start.p0(i64` 6, ptr nonnull %a.i) call void `@llvm.memcpy.p0.p0.i64(ptr` noundef nonnull align 2 dereferenceable(6) %a.i, ptr noundef nonnull align 2 dereferenceable(6) %x, i64 6, i1 false) tail call void `@llvm.memcpy.p0.p0.i64(ptr` noundef nonnull align 2 dereferenceable(6) %x, ptr noundef nonnull align 2 dereferenceable(6) %y, i64 6, i1 false) call void `@llvm.memcpy.p0.p0.i64(ptr` noundef nonnull align 2 dereferenceable(6) %y, ptr noundef nonnull align 2 dereferenceable(6) %a.i, i64 6, i1 false) call void `@llvm.lifetime.end.p0(i64` 6, ptr nonnull %a.i) ret void } ``` Note it going to stack: ```nasm swap_rgb48_old: # `@swap_rgb48_old` movzx eax, word ptr [rdi + 4] mov word ptr [rsp - 4], ax mov eax, dword ptr [rdi] mov dword ptr [rsp - 8], eax movzx eax, word ptr [rsi + 4] mov word ptr [rdi + 4], ax mov eax, dword ptr [rsi] mov dword ptr [rdi], eax movzx eax, word ptr [rsp - 4] mov word ptr [rsi + 4], ax mov eax, dword ptr [rsp - 8] mov dword ptr [rsi], eax ret ``` Now: ```llvm define void `@swap_rgb48(ptr` noalias nocapture noundef align 2 dereferenceable(6) %x, ptr noalias nocapture noundef align 2 dereferenceable(6) %y) unnamed_addr #0 { start: %0 = load <3 x i16>, ptr %x, align 2 %1 = load <3 x i16>, ptr %y, align 2 store <3 x i16> %1, ptr %x, align 2 store <3 x i16> %0, ptr %y, align 2 ret void } ``` still lowers to `dword`+`word` operations, but has no stack traffic: ```nasm swap_rgb48: # `@swap_rgb48` mov eax, dword ptr [rdi] movzx ecx, word ptr [rdi + 4] movzx edx, word ptr [rsi + 4] mov r8d, dword ptr [rsi] mov dword ptr [rdi], r8d mov word ptr [rdi + 4], dx mov word ptr [rsi + 4], cx mov dword ptr [rsi], eax ret ``` And as a demonstration that this isn't just `mem::swap`, a `mem::replace` on a small array (since replace doesn't use swap since #83022), which used to be `memcpy`s in LLVM changes in IR ```llvm define void `@replace_short_array(ptr` noalias nocapture noundef sret([3 x i32]) dereferenceable(12) %0, ptr noalias noundef align 4 dereferenceable(12) %r, ptr noalias nocapture noundef readonly dereferenceable(12) %v) unnamed_addr #0 { start: %1 = load <3 x i32>, ptr %r, align 4 store <3 x i32> %1, ptr %0, align 4 %2 = load <3 x i32>, ptr %v, align 4 store <3 x i32> %2, ptr %r, align 4 ret void } ``` but that lowers to reasonable `dword`+`qword` instructions still ```nasm replace_short_array: # `@replace_short_array` mov rax, rdi mov rcx, qword ptr [rsi] mov edi, dword ptr [rsi + 8] mov dword ptr [rax + 8], edi mov qword ptr [rax], rcx mov rcx, qword ptr [rdx] mov edx, dword ptr [rdx + 8] mov dword ptr [rsi + 8], edx mov qword ptr [rsi], rcx ret ```
author: bors <bors@rust-lang.org> 2023-06-06 01:50:28 +0000
committer: bors <bors@rust-lang.org> 2023-06-06 01:50:28 +0000
commit: fd9bf594366e73efb1a26a023e0b4de8eff82b94 (patch)
tree: 967f7343876d4b9a1623aaa8d9abb79fa8db2818 /tests
parent: adc719d7147d5e2578ce08e0b4504be44650256e (diff)
parent: e1b020df9f72eab7e8b3e38a5263ddda54ce18e1 (diff)
download: rust-fd9bf594366e73efb1a26a023e0b4de8eff82b94.tar.gz
rust-fd9bf594366e73efb1a26a023e0b4de8eff82b94.zip
4 files changed, 75 insertions, 5 deletions
diff --git a/tests/codegen/array-codegen.rs b/tests/codegen/array-codegen.rs
new file mode 100644
index 00000000000..98488eb92ee
--- /dev/null
+++ b/tests/codegen/array-codegen.rs
@@ -0,0 +1,35 @@
+// compile-flags: -O -C no-prepopulate-passes
+// min-llvm-version: 15.0 (for opaque pointers)
+
+#![crate_type = "lib"]
+
+// CHECK-LABEL: @array_load
+#[no_mangle]
+pub fn array_load(a: &[u8; 4]) -> [u8; 4] {
+    // CHECK: %0 = alloca [4 x i8], align 1
+    // CHECK: %[[TEMP1:.+]] = load <4 x i8>, ptr %a, align 1
+    // CHECK: store <4 x i8> %[[TEMP1]], ptr %0, align 1
+    // CHECK: %[[TEMP2:.+]] = load i32, ptr %0, align 1
+    // CHECK: ret i32 %[[TEMP2]]
+    *a
+}
+
+// CHECK-LABEL: @array_store
+#[no_mangle]
+pub fn array_store(a: [u8; 4], p: &mut [u8; 4]) {
+    // CHECK: %a = alloca [4 x i8]
+    // CHECK: %[[TEMP:.+]] = load <4 x i8>, ptr %a, align 1
+    // CHECK-NEXT: store <4 x i8> %[[TEMP]], ptr %p, align 1
+    *p = a;
+}
+
+// CHECK-LABEL: @array_copy
+#[no_mangle]
+pub fn array_copy(a: &[u8; 4], p: &mut [u8; 4]) {
+    // CHECK: %[[LOCAL:.+]] = alloca [4 x i8], align 1
+    // CHECK: %[[TEMP1:.+]] = load <4 x i8>, ptr %a, align 1
+    // CHECK: store <4 x i8> %[[TEMP1]], ptr %[[LOCAL]], align 1
+    // CHECK: %[[TEMP2:.+]] = load <4 x i8>, ptr %[[LOCAL]], align 1
+    // CHECK: store <4 x i8> %[[TEMP2]], ptr %p, align 1
+    *p = *a;
+}
diff --git a/tests/codegen/mem-replace-simple-type.rs b/tests/codegen/mem-replace-simple-type.rs
index 4253ef13666..6151177de15 100644
--- a/tests/codegen/mem-replace-simple-type.rs
+++ b/tests/codegen/mem-replace-simple-type.rs
@@ -32,3 +32,14 @@ pub fn replace_ref_str<'a>(r: &mut &'a str, v: &'a str) -> &'a str {
     // CHECK: ret { ptr, i64 } %[[P2]]
     std::mem::replace(r, v)
 }
+
+#[no_mangle]
+// CHECK-LABEL: @replace_short_array(
+pub fn replace_short_array(r: &mut [u32; 3], v: [u32; 3]) -> [u32; 3] {
+    // CHECK-NOT: alloca
+    // CHECK: %[[R:.+]] = load <3 x i32>, ptr %r, align 4
+    // CHECK: store <3 x i32> %[[R]], ptr %0
+    // CHECK: %[[V:.+]] = load <3 x i32>, ptr %v, align 4
+    // CHECK: store <3 x i32> %[[V]], ptr %r
+    std::mem::replace(r, v)
+}
diff --git a/tests/codegen/swap-simd-types.rs b/tests/codegen/swap-simd-types.rs
index c90b277eb44..3472a42b0e6 100644
--- a/tests/codegen/swap-simd-types.rs
+++ b/tests/codegen/swap-simd-types.rs
@@ -30,3 +30,12 @@ pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) {
         x.swap_with_slice(y);
     }
 }
+
+// CHECK-LABEL: @swap_bytes32
+#[no_mangle]
+pub fn swap_bytes32(x: &mut [u8; 32], y: &mut [u8; 32]) {
+// CHECK-NOT: alloca
+// CHECK: load <32 x i8>{{.+}}align 1
+// CHECK: store <32 x i8>{{.+}}align 1
+    swap(x, y)
+}
diff --git a/tests/codegen/swap-small-types.rs b/tests/codegen/swap-small-types.rs
index 03e2a2327fc..419645a3fc6 100644
--- a/tests/codegen/swap-small-types.rs
+++ b/tests/codegen/swap-small-types.rs
@@ -1,4 +1,4 @@
-// compile-flags: -O
+// compile-flags: -O -Z merge-functions=disabled
 // only-x86_64
 // ignore-debug: the debug assertions get in the way
 
@@ -8,13 +8,28 @@ use std::mem::swap;
 
 type RGB48 = [u16; 3];
 
+// CHECK-LABEL: @swap_rgb48_manually(
+#[no_mangle]
+pub fn swap_rgb48_manually(x: &mut RGB48, y: &mut RGB48) {
+    // CHECK-NOT: alloca
+    // CHECK: %[[TEMP0:.+]] = load <3 x i16>, ptr %x, align 2
+    // CHECK: %[[TEMP1:.+]] = load <3 x i16>, ptr %y, align 2
+    // CHECK: store <3 x i16> %[[TEMP1]], ptr %x, align 2
+    // CHECK: store <3 x i16> %[[TEMP0]], ptr %y, align 2
+
+    let temp = *x;
+    *x = *y;
+    *y = temp;
+}
+
 // CHECK-LABEL: @swap_rgb48
 #[no_mangle]
 pub fn swap_rgb48(x: &mut RGB48, y: &mut RGB48) {
-    // FIXME MIR inlining messes up LLVM optimizations.
-// WOULD-CHECK-NOT: alloca
-// WOULD-CHECK: load i48
-// WOULD-CHECK: store i48
+    // CHECK-NOT: alloca
+    // CHECK: load <3 x i16>
+    // CHECK: load <3 x i16>
+    // CHECK: store <3 x i16>
+    // CHECK: store <3 x i16>
     swap(x, y)
 }
author	bors <bors@rust-lang.org>	2023-06-06 01:50:28 +0000
committer	bors <bors@rust-lang.org>	2023-06-06 01:50:28 +0000
commit	fd9bf594366e73efb1a26a023e0b4de8eff82b94 (patch)
tree	967f7343876d4b9a1623aaa8d9abb79fa8db2818 /tests
parent	adc719d7147d5e2578ce08e0b4504be44650256e (diff)
parent	e1b020df9f72eab7e8b3e38a5263ddda54ce18e1 (diff)
download	rust-fd9bf594366e73efb1a26a023e0b4de8eff82b94.tar.gz rust-fd9bf594366e73efb1a26a023e0b4de8eff82b94.zip