author     Dylan DPC <99973273+Dylan-DPC@users.noreply.github.com>   2022-02-24 21:42:14 +0100
committer  GitHub <noreply@github.com>                               2022-02-24 21:42:14 +0100
commit     7fb55b4c3a308d09a4926d6d7dcce5c13c64eb7f (patch)
tree       e291b7dc87796a752ba696dc1835c7bc9213f4b5 /src/test/codegen
parent     000e38d9cb4140048b14600938e73e9cdafb7341 (diff)
parent     8ca47d7ae4e068c94b4ab7b25cc0ccc38d01d52c (diff)
Rollup merge of #94212 - scottmcm:swapper, r=dtolnay
Stop manually SIMDing in `swap_nonoverlapping`

Like I previously did for `reverse` (#90821), this leaves it to LLVM to pick how to vectorize, since it knows the appropriate chunk size better than the hard-coded "32 bytes always" approach we currently have.

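To make the shape of the change concrete, here is a minimal sketch of the idea (illustrative only, not the actual code in `core`): a plain element-by-element swap loop whose vector width LLVM gets to choose.

```rust
use std::mem::swap;

// Sketch only: swap equal-length slices element by element. The two
// `&mut` slices cannot overlap, so LLVM is free to vectorize this with
// whatever chunk size suits the element type and target features.
pub fn swap_elementwise<T>(x: &mut [T], y: &mut [T]) {
    assert_eq!(x.len(), y.len());
    for (a, b) in x.iter_mut().zip(y.iter_mut()) {
        swap(a, b);
    }
}
```
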
A variety of codegen tests are included to confirm that the various cases are still being vectorized.

It does still need type-erasing logic in some cases, though: while LLVM is now smart enough to vectorize over slices of things like `[u8; 4]`, it fails to do so over slices of `[u8; 3]`.

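For illustration, "type-erase" here means something along these lines (a hedged sketch with made-up names, not the exact helper in this PR): reinterpret both slices as raw bytes and swap those, so a slice of `[u8; 3]` turns into a plain byte loop that LLVM will vectorize.

```rust
use std::mem::{size_of, MaybeUninit};
use std::slice::from_raw_parts_mut;

// Sketch: view both slices as `MaybeUninit<u8>` bytes (so padding or
// uninitialized bytes are only ever moved, never inspected as values)
// and swap byte by byte. `swap_bytes_erased` is an illustrative name.
pub fn swap_bytes_erased<T>(x: &mut [T], y: &mut [T]) {
    assert_eq!(x.len(), y.len());
    let len = x.len() * size_of::<T>();
    // SAFETY: the slices are exclusive, non-overlapping, and `len` covers
    // exactly the memory they own; after the loop every element is a
    // complete bit-copy of a previously valid `T`.
    unsafe {
        let xb = from_raw_parts_mut(x.as_mut_ptr() as *mut MaybeUninit<u8>, len);
        let yb = from_raw_parts_mut(y.as_mut_ptr() as *mut MaybeUninit<u8>, len);
        for (a, b) in xb.iter_mut().zip(yb.iter_mut()) {
            std::mem::swap(a, b);
        }
    }
}
```
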
As a bonus, this change also means one no longer gets the spurious `memcpy`s at the end of swapping a slice of `__m256`s: <https://rust.godbolt.org/z/joofr4v8Y>

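For reference, the function in that Godbolt link is essentially the same as the `swap_m256_slice` codegen test added below:

```rust
use core::arch::x86_64::__m256;

// Swap two slices of AVX vectors when the lengths match.
pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) {
    if x.len() == y.len() {
        x.swap_with_slice(y);
    }
}
```
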
<details>

<summary>ASM for this example</summary>

## Before (from godbolt)

Note the `push`/`pop`s and the `memcpy` calls.

```x86
swap_m256_slice:
        push    r15
        push    r14
        push    r13
        push    r12
        push    rbx
        sub     rsp, 32
        cmp     rsi, rcx
        jne     .LBB0_6
        mov     r14, rsi
        shl     r14, 5
        je      .LBB0_6
        mov     r15, rdx
        mov     rbx, rdi
        xor     eax, eax
.LBB0_3:
        mov     rcx, rax
        vmovaps ymm0, ymmword ptr [rbx + rax]
        vmovaps ymm1, ymmword ptr [r15 + rax]
        vmovaps ymmword ptr [rbx + rax], ymm1
        vmovaps ymmword ptr [r15 + rax], ymm0
        add     rax, 32
        add     rcx, 64
        cmp     rcx, r14
        jbe     .LBB0_3
        sub     r14, rax
        jbe     .LBB0_6
        add     rbx, rax
        add     r15, rax
        mov     r12, rsp
        mov     r13, qword ptr [rip + memcpy@GOTPCREL]
        mov     rdi, r12
        mov     rsi, rbx
        mov     rdx, r14
        vzeroupper
        call    r13
        mov     rdi, rbx
        mov     rsi, r15
        mov     rdx, r14
        call    r13
        mov     rdi, r15
        mov     rsi, r12
        mov     rdx, r14
        call    r13
.LBB0_6:
        add     rsp, 32
        pop     rbx
        pop     r12
        pop     r13
        pop     r14
        pop     r15
        vzeroupper
        ret
```

## After (from my machine)

Note that there is no `rsp` manipulation; apologies for the different ASM syntax (AT&T here, Intel above).

```x86
swap_m256_slice:
	cmpq	%r9, %rdx
	jne	.LBB1_6
	testq	%rdx, %rdx
	je	.LBB1_6
	cmpq	$1, %rdx
	jne	.LBB1_7
	xorl	%r10d, %r10d
	jmp	.LBB1_4
.LBB1_7:
	movq	%rdx, %r9
	andq	$-2, %r9
	movl	$32, %eax
	xorl	%r10d, %r10d
	.p2align	4, 0x90
.LBB1_8:
	vmovaps	-32(%rcx,%rax), %ymm0
	vmovaps	-32(%r8,%rax), %ymm1
	vmovaps	%ymm1, -32(%rcx,%rax)
	vmovaps	%ymm0, -32(%r8,%rax)
	vmovaps	(%rcx,%rax), %ymm0
	vmovaps	(%r8,%rax), %ymm1
	vmovaps	%ymm1, (%rcx,%rax)
	vmovaps	%ymm0, (%r8,%rax)
	addq	$2, %r10
	addq	$64, %rax
	cmpq	%r10, %r9
	jne	.LBB1_8
.LBB1_4:
	testb	$1, %dl
	je	.LBB1_6
	shlq	$5, %r10
	vmovaps	(%rcx,%r10), %ymm0
	vmovaps	(%r8,%r10), %ymm1
	vmovaps	%ymm1, (%rcx,%r10)
	vmovaps	%ymm0, (%r8,%r10)
.LBB1_6:
	vzeroupper
	retq
```

</details>

This does all of its copying operations as either the original type or as `MaybeUninit`s, so as far as I know there should be no abstract machine issues with reading padding bytes as integers.

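As an illustration of that point (a sketch under my own naming, not the exact library code), the classic read/copy/write swap can be done through `MaybeUninit<T>` instead of `T`, and the same trick applies when chunking the memory as `MaybeUninit<usize>` words:

```rust
use std::mem::MaybeUninit;
use std::ptr;

/// Sketch: swap two non-overlapping values by copying them as
/// `MaybeUninit<T>`, so padding bytes move around as possibly-uninit
/// memory and are never asserted to be valid integer values.
///
/// # Safety
/// `x` and `y` must be valid for reads and writes and must not overlap.
pub unsafe fn swap_one_via_maybe_uninit<T>(x: *mut T, y: *mut T) {
    let x = x.cast::<MaybeUninit<T>>();
    let y = y.cast::<MaybeUninit<T>>();
    let tmp = ptr::read(x);
    ptr::copy_nonoverlapping(y, x, 1);
    ptr::write(y, tmp);
}
```
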
<details>

<summary>Perf is essentially unchanged</summary>

Though perhaps with more target features enabled this would help more, since LLVM could then pick bigger chunks.

## Before

```
running 10 tests
test slice::swap_with_slice_4x_usize_30                            ... bench:         894 ns/iter (+/- 11)
test slice::swap_with_slice_4x_usize_3000                          ... bench:      99,476 ns/iter (+/- 2,784)
test slice::swap_with_slice_5x_usize_30                            ... bench:       1,257 ns/iter (+/- 7)
test slice::swap_with_slice_5x_usize_3000                          ... bench:     139,922 ns/iter (+/- 959)
test slice::swap_with_slice_rgb_30                                 ... bench:         328 ns/iter (+/- 27)
test slice::swap_with_slice_rgb_3000                               ... bench:      16,215 ns/iter (+/- 176)
test slice::swap_with_slice_u8_30                                  ... bench:         312 ns/iter (+/- 9)
test slice::swap_with_slice_u8_3000                                ... bench:       5,401 ns/iter (+/- 123)
test slice::swap_with_slice_usize_30                               ... bench:         368 ns/iter (+/- 3)
test slice::swap_with_slice_usize_3000                             ... bench:      28,472 ns/iter (+/- 3,913)
```

## After

```
running 10 tests
test slice::swap_with_slice_4x_usize_30                            ... bench:         868 ns/iter (+/- 36)
test slice::swap_with_slice_4x_usize_3000                          ... bench:      99,642 ns/iter (+/- 1,507)
test slice::swap_with_slice_5x_usize_30                            ... bench:       1,194 ns/iter (+/- 11)
test slice::swap_with_slice_5x_usize_3000                          ... bench:     139,761 ns/iter (+/- 5,018)
test slice::swap_with_slice_rgb_30                                 ... bench:         324 ns/iter (+/- 6)
test slice::swap_with_slice_rgb_3000                               ... bench:      15,962 ns/iter (+/- 287)
test slice::swap_with_slice_u8_30                                  ... bench:         281 ns/iter (+/- 5)
test slice::swap_with_slice_u8_3000                                ... bench:       5,324 ns/iter (+/- 40)
test slice::swap_with_slice_usize_30                               ... bench:         275 ns/iter (+/- 5)
test slice::swap_with_slice_usize_3000                             ... bench:      28,277 ns/iter (+/- 277)
```

</details>
Diffstat (limited to 'src/test/codegen')
-rw-r--r--  src/test/codegen/swap-large-types.rs  64
-rw-r--r--  src/test/codegen/swap-simd-types.rs   32
-rw-r--r--  src/test/codegen/swap-small-types.rs  44
3 files changed, 140 insertions, 0 deletions
diff --git a/src/test/codegen/swap-large-types.rs b/src/test/codegen/swap-large-types.rs
new file mode 100644
index 00000000000..535d301a3d2
--- /dev/null
+++ b/src/test/codegen/swap-large-types.rs
@@ -0,0 +1,64 @@
+// compile-flags: -O
+// only-x86_64
+// ignore-debug: the debug assertions get in the way
+
+#![crate_type = "lib"]
+
+use std::mem::swap;
+use std::ptr::{read, copy_nonoverlapping, write};
+
+type KeccakBuffer = [[u64; 5]; 5];
+
+// A basic read+copy+write swap implementation ends up copying one of the values
+// to stack for large types, which is completely unnecessary as the lack of
+// overlap means we can just do whatever fits in registers at a time.
+
+// CHECK-LABEL: @swap_basic
+#[no_mangle]
+pub fn swap_basic(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
+// CHECK: alloca [5 x [5 x i64]]
+
+    // SAFETY: exclusive references are always valid to read/write,
+    // are non-overlapping, and nothing here panics so it's drop-safe.
+    unsafe {
+        let z = read(x);
+        copy_nonoverlapping(y, x, 1);
+        write(y, z);
+    }
+}
+
+// This test verifies that the library does something smarter, and thus
+// doesn't need any scratch space on the stack.
+
+// CHECK-LABEL: @swap_std
+#[no_mangle]
+pub fn swap_std(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
+// CHECK-NOT: alloca
+// CHECK: load <{{[0-9]+}} x i64>
+// CHECK: store <{{[0-9]+}} x i64>
+    swap(x, y)
+}
+
+// CHECK-LABEL: @swap_slice
+#[no_mangle]
+pub fn swap_slice(x: &mut [KeccakBuffer], y: &mut [KeccakBuffer]) {
+// CHECK-NOT: alloca
+// CHECK: load <{{[0-9]+}} x i64>
+// CHECK: store <{{[0-9]+}} x i64>
+    if x.len() == y.len() {
+        x.swap_with_slice(y);
+    }
+}
+
+type OneKilobyteBuffer = [u8; 1024];
+
+// CHECK-LABEL: @swap_1kb_slices
+#[no_mangle]
+pub fn swap_1kb_slices(x: &mut [OneKilobyteBuffer], y: &mut [OneKilobyteBuffer]) {
+// CHECK-NOT: alloca
+// CHECK: load <{{[0-9]+}} x i8>
+// CHECK: store <{{[0-9]+}} x i8>
+    if x.len() == y.len() {
+        x.swap_with_slice(y);
+    }
+}
diff --git a/src/test/codegen/swap-simd-types.rs b/src/test/codegen/swap-simd-types.rs
new file mode 100644
index 00000000000..c90b277eb44
--- /dev/null
+++ b/src/test/codegen/swap-simd-types.rs
@@ -0,0 +1,32 @@
+// compile-flags: -O -C target-feature=+avx
+// only-x86_64
+// ignore-debug: the debug assertions get in the way
+
+#![crate_type = "lib"]
+
+use std::mem::swap;
+
+// SIMD types are highly-aligned already, so make sure the swap code leaves their
+// types alone and doesn't pessimize them (such as by swapping them as `usize`s).
+extern crate core;
+use core::arch::x86_64::__m256;
+
+// CHECK-LABEL: @swap_single_m256
+#[no_mangle]
+pub fn swap_single_m256(x: &mut __m256, y: &mut __m256) {
+// CHECK-NOT: alloca
+// CHECK: load <8 x float>{{.+}}align 32
+// CHECK: store <8 x float>{{.+}}align 32
+    swap(x, y)
+}
+
+// CHECK-LABEL: @swap_m256_slice
+#[no_mangle]
+pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) {
+// CHECK-NOT: alloca
+// CHECK: load <8 x float>{{.+}}align 32
+// CHECK: store <8 x float>{{.+}}align 32
+    if x.len() == y.len() {
+        x.swap_with_slice(y);
+    }
+}
diff --git a/src/test/codegen/swap-small-types.rs b/src/test/codegen/swap-small-types.rs
index 6205e6a6559..2f375844cc7 100644
--- a/src/test/codegen/swap-small-types.rs
+++ b/src/test/codegen/swap-small-types.rs
@@ -16,3 +16,47 @@ pub fn swap_rgb48(x: &mut RGB48, y: &mut RGB48) {
 // CHECK: store i48
     swap(x, y)
 }
+
+// LLVM doesn't vectorize a loop over 3-byte elements,
+// so we chunk it down to bytes and loop over those instead.
+type RGB24 = [u8; 3];
+
+// CHECK-LABEL: @swap_rgb24_slices
+#[no_mangle]
+pub fn swap_rgb24_slices(x: &mut [RGB24], y: &mut [RGB24]) {
+// CHECK-NOT: alloca
+// CHECK: load <{{[0-9]+}} x i8>
+// CHECK: store <{{[0-9]+}} x i8>
+    if x.len() == y.len() {
+        x.swap_with_slice(y);
+    }
+}
+
+// This one has a power-of-two size, so we iterate over it directly
+type RGBA32 = [u8; 4];
+
+// CHECK-LABEL: @swap_rgba32_slices
+#[no_mangle]
+pub fn swap_rgba32_slices(x: &mut [RGBA32], y: &mut [RGBA32]) {
+// CHECK-NOT: alloca
+// CHECK: load <{{[0-9]+}} x i32>
+// CHECK: store <{{[0-9]+}} x i32>
+    if x.len() == y.len() {
+        x.swap_with_slice(y);
+    }
+}
+
+// Strings have a non-power-of-two size, but have pointer alignment,
+// so we swap usizes instead of dropping all the way down to bytes.
+const _: () = assert!(!std::mem::size_of::<String>().is_power_of_two());
+
+// CHECK-LABEL: @swap_string_slices
+#[no_mangle]
+pub fn swap_string_slices(x: &mut [String], y: &mut [String]) {
+// CHECK-NOT: alloca
+// CHECK: load <{{[0-9]+}} x i64>
+// CHECK: store <{{[0-9]+}} x i64>
+    if x.len() == y.len() {
+        x.swap_with_slice(y);
+    }
+}