about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--library/core/src/ptr/mod.rs17
1 file changed, 14 insertions, 3 deletions
diff --git a/library/core/src/ptr/mod.rs b/library/core/src/ptr/mod.rs
index 68b5d1df71c..6f0e1e6a7b3 100644
--- a/library/core/src/ptr/mod.rs
+++ b/library/core/src/ptr/mod.rs
@@ -1168,7 +1168,9 @@ pub unsafe fn write_volatile<T>(dst: *mut T, src: T) {
 pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
     // FIXME(#75598): Direct use of these intrinsics improves codegen significantly at opt-level <=
     // 1, where the method versions of these operations are not inlined.
-    use intrinsics::{unchecked_shl, unchecked_shr, unchecked_sub, wrapping_mul, wrapping_sub};
+    use intrinsics::{
+        unchecked_shl, unchecked_shr, unchecked_sub, wrapping_add, wrapping_mul, wrapping_sub,
+    };
 
     /// Calculate multiplicative modular inverse of `x` modulo `m`.
     ///
@@ -1223,8 +1225,17 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
     // SAFETY: `a` is a power-of-two, therefore non-zero.
     let a_minus_one = unsafe { unchecked_sub(a, 1) };
     if stride == 1 {
-        // `stride == 1` case can be computed more efficiently through `-p (mod a)`.
-        return wrapping_sub(0, p as usize) & a_minus_one;
+        // `stride == 1` case can be computed more simply through `-p (mod a)`, but doing so
+        // inhibits LLVM's ability to select instructions like `lea`. Instead we compute
+        //
+        //    round_up_to_next_alignment(p, a) - p
+        //
+        // which distributes operations around the load-bearing, but pessimizing `and` sufficiently
+        // for LLVM to be able to utilize the various optimizations it knows about.
+        return wrapping_sub(
+            wrapping_add(p as usize, a_minus_one) & wrapping_sub(0, a),
+            p as usize,
+        );
     }
 
     let pmoda = p as usize & a_minus_one;