From 62a182cf7f91b11bfbf200ede67569da0f6488ec Mon Sep 17 00:00:00 2001 From: Simonas Kazlauskas Date: Mon, 4 Jul 2022 00:23:31 +0300 Subject: Add a special case for align_offset /w stride != 1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This generalizes the previous `stride == 1` special case to apply to any situation where the requested alignment is divisible by the stride. This in turn allows the test case from #98809 produce ideal assembly, along the lines of: leaq 15(%rdi), %rax andq $-16, %rax This also produces pretty high quality code for situations where the alignment of the input pointer isn’t known: pub unsafe fn ptr_u32(slice: *const u32) -> *const u32 { slice.offset(slice.align_offset(16) as isize) } // => movl %edi, %eax andl $3, %eax leaq 15(%rdi), %rcx andq $-16, %rcx subq %rdi, %rcx shrq $2, %rcx negq %rax sbbq %rax, %rax orq %rcx, %rax leaq (%rdi,%rax,4), %rax Here LLVM is smart enough to replace the `usize::MAX` special case with a branch-less bitwise-OR approach, where the mask is constructed using the neg and sbb instructions. This appears to work across various architectures I’ve tried. This change ends up introducing more branches and code in situations where there is less knowledge of the arguments. For example when the requested alignment is entirely unknown. This use-case was never really a focus of this function, so I’m not particularly worried, especially since llvm-mca is saying that the new code is still appreciably faster, despite all the new branching. Fixes #98809. Sadly, this does not help with #72356. --- src/test/assembly/align_offset.rs | 48 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 src/test/assembly/align_offset.rs (limited to 'src/test') diff --git a/src/test/assembly/align_offset.rs b/src/test/assembly/align_offset.rs new file mode 100644 index 00000000000..c5eefca3467 --- /dev/null +++ b/src/test/assembly/align_offset.rs @@ -0,0 +1,48 @@ +// assembly-output: emit-asm +// compile-flags: -Copt-level=1 +// only-x86_64 +// min-llvm-version: 14.0 +#![crate_type="rlib"] + +// CHECK-LABEL: align_offset_byte_ptr +// CHECK: leaq 31 +// CHECK: andq $-32 +// CHECK: subq +#[no_mangle] +pub fn align_offset_byte_ptr(ptr: *const u8) -> usize { + ptr.align_offset(32) +} + +// CHECK-LABEL: align_offset_byte_slice +// CHECK: leaq 31 +// CHECK: andq $-32 +// CHECK: subq +#[no_mangle] +pub fn align_offset_byte_slice(slice: &[u8]) -> usize { + slice.as_ptr().align_offset(32) +} + +// CHECK-LABEL: align_offset_word_ptr +// CHECK: leaq 31 +// CHECK: andq $-32 +// CHECK: subq +// CHECK: shrq +// This `ptr` is not known to be aligned, so it is required to check if it is at all possible to +// align. LLVM applies a simple mask. +// CHECK: orq +#[no_mangle] +pub fn align_offset_word_ptr(ptr: *const u32) -> usize { + ptr.align_offset(32) +} + +// CHECK-LABEL: align_offset_word_slice +// CHECK: leaq 31 +// CHECK: andq $-32 +// CHECK: subq +// CHECK: shrq +// `slice` is known to be aligned, so `!0` is not possible as a return +// CHECK-NOT: orq +#[no_mangle] +pub fn align_offset_word_slice(slice: &[u32]) -> usize { + slice.as_ptr().align_offset(32) +} -- cgit 1.4.1-3-g733a5