about summary refs log tree commit diff
path: root/tests/codegen/array-map.rs
diff options
context:
space:
mode:
authorbors <bors@rust-lang.org>2023-02-13 10:18:48 +0000
committerbors <bors@rust-lang.org>2023-02-13 10:18:48 +0000
commit2d91939bb7130a8e6c092a290b7d37f654e3c23c (patch)
treedff12eb677ee849041903b887126bab1b193ba3c /tests/codegen/array-map.rs
parent20081880ad2a98bbc8c8293f96c5b284d1584d86 (diff)
parentbb77860d9ccdc6a920edeedce313446545294c04 (diff)
downloadrust-2d91939bb7130a8e6c092a290b7d37f654e3c23c.tar.gz
rust-2d91939bb7130a8e6c092a290b7d37f654e3c23c.zip
Auto merge of #107634 - scottmcm:array-drain, r=thomcc
Improve the `array::map` codegen

The `map` method on arrays [is documented as sometimes performing poorly](https://doc.rust-lang.org/std/primitive.array.html#note-on-performance-and-stack-usage), and after [a question on URLO](https://users.rust-lang.org/t/try-trait-residual-o-trait-and-try-collect-into-array/88510?u=scottmcm) prompted me to take another look at the core [`try_collect_into_array`](https://github.com/rust-lang/rust/blob/7c46fb2111936ad21a8e3aa41e9128752357f5d8/library/core/src/array/mod.rs#L865-L912) function, I had some ideas that ended up working better than I'd expected.

There's three main ideas in here, split over three commits:
1. Don't use `array::IntoIter` when we can avoid it, since that seems to not get SRoA'd, meaning that every step writes things like loop counters into the stack unnecessarily
2. Don't return arrays in `Result`s unnecessarily, as that doesn't seem to optimize away even with `unwrap_unchecked` (perhaps because it needs to get moved into a new LLVM type to account for the discriminant)
3. Don't distract LLVM with all the `Option` dances when we know for sure we have enough items (like in `map` and `zip`).  This one's a larger commit as to do it I ended up adding a new `pub(crate)` trait, but hopefully those changes are still straight-forward.

(No libs-api changes; everything should be completely implementation-detail-internal.)

It's still not completely fixed -- I think it needs pcwalton's `memcpy` optimizations still (#103830) to get further -- but this seems to go much better than before.  And the remaining `memcpy`s are just `transmute`-equivalent (`[T; N] -> ManuallyDrop<[T; N]>` and `[MaybeUninit<T>; N] -> [T; N]`), so hopefully those will be easier to remove with LLVM16 than the previous subobject copies 🤞

r? `@thomcc`

As a simple example, this test
```rust
pub fn long_integer_map(x: [u32; 64]) -> [u32; 64] {
    x.map(|x| 13 * x + 7)
}
```
On nightly <https://rust.godbolt.org/z/xK7548TGj> takes `sub rsp, 808`
```llvm
start:
  %array.i.i.i.i = alloca [64 x i32], align 4
  %_3.sroa.5.i.i.i = alloca [65 x i32], align 4
  %_5.i = alloca %"core::iter::adapters::map::Map<core::array::iter::IntoIter<u32, 64>, [closure@/app/example.rs:2:11: 2:14]>", align 8
```
(and yes, that's a 6**5**-element array `alloca` despite 6**4**-element input and output)

But with this PR it's only `sub rsp, 520`
```llvm
start:
  %array.i.i.i.i.i.i = alloca [64 x i32], align 4
  %array1.i.i.i = alloca %"core::mem::manually_drop::ManuallyDrop<[u32; 64]>", align 4
```

Similarly, the loop it emits on nightly is scalar-only and horrifying
```nasm
.LBB0_1:
        mov     esi, 64
        mov     edi, 0
        cmp     rdx, 64
        je      .LBB0_3
        lea     rsi, [rdx + 1]
        mov     qword ptr [rsp + 784], rsi
        mov     r8d, dword ptr [rsp + 4*rdx + 528]
        mov     edi, 1
        lea     edx, [r8 + 2*r8]
        lea     r8d, [r8 + 4*rdx]
        add     r8d, 7
.LBB0_3:
        test    edi, edi
        je      .LBB0_11
        mov     dword ptr [rsp + 4*rcx + 272], r8d
        cmp     rsi, 64
        jne     .LBB0_6
        xor     r8d, r8d
        mov     edx, 64
        test    r8d, r8d
        jne     .LBB0_8
        jmp     .LBB0_11
.LBB0_6:
        lea     rdx, [rsi + 1]
        mov     qword ptr [rsp + 784], rdx
        mov     edi, dword ptr [rsp + 4*rsi + 528]
        mov     r8d, 1
        lea     esi, [rdi + 2*rdi]
        lea     edi, [rdi + 4*rsi]
        add     edi, 7
        test    r8d, r8d
        je      .LBB0_11
.LBB0_8:
        mov     dword ptr [rsp + 4*rcx + 276], edi
        add     rcx, 2
        cmp     rcx, 64
        jne     .LBB0_1
```

whereas with this PR it's unrolled and vectorized
```nasm
	vpmulld	ymm1, ymm0, ymmword ptr [rsp + 64]
	vpaddd	ymm1, ymm1, ymm2
	vmovdqu	ymmword ptr [rsp + 328], ymm1
	vpmulld	ymm1, ymm0, ymmword ptr [rsp + 96]
	vpaddd	ymm1, ymm1, ymm2
	vmovdqu	ymmword ptr [rsp + 360], ymm1
```
(though sadly still stack-to-stack)
Diffstat (limited to 'tests/codegen/array-map.rs')
-rw-r--r--tests/codegen/array-map.rs49
1 files changed, 49 insertions, 0 deletions
diff --git a/tests/codegen/array-map.rs b/tests/codegen/array-map.rs
new file mode 100644
index 00000000000..9298e89e397
--- /dev/null
+++ b/tests/codegen/array-map.rs
@@ -0,0 +1,49 @@
+// compile-flags: -C opt-level=3 -C target-cpu=x86-64-v3
+// no-system-llvm
+// only-x86_64
+// ignore-debug (the extra assertions get in the way)
+
+#![crate_type = "lib"]
+#![feature(array_zip)]
+
+// CHECK-LABEL: @short_integer_map
+#[no_mangle]
+pub fn short_integer_map(x: [u32; 8]) -> [u32; 8] {
+    // CHECK: load <8 x i32>
+    // CHECK: shl <8 x i32>
+    // CHECK: or <8 x i32>
+    // CHECK: store <8 x i32>
+    x.map(|x| 2 * x + 1)
+}
+
+// CHECK-LABEL: @short_integer_zip_map
+#[no_mangle]
+pub fn short_integer_zip_map(x: [u32; 8], y: [u32; 8]) -> [u32; 8] {
+    // CHECK: %[[A:.+]] = load <8 x i32>
+    // CHECK: %[[B:.+]] = load <8 x i32>
+    // CHECK: sub <8 x i32> %[[A]], %[[B]]
+    // CHECK: store <8 x i32>
+    x.zip(y).map(|(x, y)| x - y)
+}
+
+// This test is checking that LLVM can SRoA away a bunch of the overhead,
+// like fully moving the iterators to registers.  Notably, previous implementations
+// of `map` ended up `alloca`ing the whole `array::IntoIterator`, meaning both a
+// hard-to-eliminate `memcpy` and that the iteration counts needed to be written
+// out to stack every iteration, even for infallible operations on `Copy` types.
+//
+// This is still imperfect, as there's more copies than would be ideal,
+// but hopefully work like #103830 will improve that in future,
+// and update this test to be stricter.
+//
+// CHECK-LABEL: @long_integer_map
+#[no_mangle]
+pub fn long_integer_map(x: [u32; 64]) -> [u32; 64] {
+    // CHECK: start:
+    // CHECK-NEXT: alloca [64 x i32]
+    // CHECK-NEXT: alloca %"core::mem::manually_drop::ManuallyDrop<[u32; 64]>"
+    // CHECK-NOT: alloca
+    // CHECK: mul <{{[0-9]+}} x i32>
+    // CHECK: add <{{[0-9]+}} x i32>
+    x.map(|x| 13 * x + 7)
+}