-rw-r--r--  src/libcore/intrinsics.rs               |  30
-rw-r--r--  src/libcore/ptr.rs                      | 259
-rw-r--r--  src/libcore/slice/mod.rs                |  22
-rw-r--r--  src/librustc/middle/lang_items.rs       |   3
-rw-r--r--  src/librustc_codegen_llvm/intrinsic.rs  |  32
-rw-r--r--  src/librustc_typeck/check/intrinsic.rs  |   3
-rw-r--r--  src/test/run-pass/align-offset-sign.rs  |  82
7 files changed, 363 insertions, 68 deletions
diff --git a/src/libcore/intrinsics.rs b/src/libcore/intrinsics.rs
index 5ec6cb6c710..510a5bb3df7 100644
--- a/src/libcore/intrinsics.rs
+++ b/src/libcore/intrinsics.rs
@@ -1463,15 +1463,26 @@ extern "rust-intrinsic" {
     /// source as well as std's catch implementation.
     pub fn try(f: fn(*mut u8), data: *mut u8, local_ptr: *mut u8) -> i32;
 
-    /// Computes the byte offset that needs to be applied to `ptr` in order to
-    /// make it aligned to `align`.
-    /// If it is not possible to align `ptr`, the implementation returns
+    #[cfg(stage0)]
+    /// Stage0 (bootstrap) version of `align_offset`; see the generic version below for docs.
+    pub fn align_offset(ptr: *const (), align: usize) -> usize;
+
+    /// Computes the offset that needs to be applied to the pointer in order to make it aligned to
+    /// `align`.
+    ///
+    /// If it is not possible to align the pointer, the implementation returns
     /// `usize::max_value()`.
     ///
-    /// There are no guarantees whatsover that offsetting the pointer will not
-    /// overflow or go beyond the allocation that `ptr` points into.
-    /// It is up to the caller to ensure that the returned offset is correct
-    /// in all terms other than alignment.
+    /// The offset is expressed in number of `T` elements, and not bytes. The value returned can be
+    /// used with the `offset` or `offset_to` methods.
+    ///
+    /// There are no guarantees whatsoever that offsetting the pointer will not overflow or go
+    /// beyond the allocation that the pointer points into. It is up to the caller to ensure that
+    /// the returned offset is correct in all terms other than alignment.
+    ///
+    /// # Unsafety
+    ///
+    /// `align` must be a power-of-two.
     ///
     /// # Examples
     ///
@@ -1485,7 +1496,7 @@ extern "rust-intrinsic" {
     /// # unsafe {
     /// let x = [5u8, 6u8, 7u8, 8u8, 9u8];
     /// let ptr = &x[n] as *const u8;
-    /// let offset = align_offset(ptr as *const (), align_of::<u16>());
+    /// let offset = align_offset(ptr, align_of::<u16>());
     /// if offset < x.len() - n - 1 {
     ///     let u16_ptr = ptr.offset(offset as isize) as *const u16;
     ///     assert_ne!(*u16_ptr, 500);
@@ -1495,7 +1506,8 @@ extern "rust-intrinsic" {
     /// }
     /// # } }
     /// ```
-    pub fn align_offset(ptr: *const (), align: usize) -> usize;
+    #[cfg(not(stage0))]
+    pub fn align_offset<T>(ptr: *const T, align: usize) -> usize;
 
     /// Emits a `!nontemporal` store according to LLVM (see their docs).
     /// Probably will never become stable.
diff --git a/src/libcore/ptr.rs b/src/libcore/ptr.rs
index 63bcc024020..a762a8a6f92 100644
--- a/src/libcore/ptr.rs
+++ b/src/libcore/ptr.rs
@@ -1433,15 +1433,22 @@ impl<T: ?Sized> *const T {
         copy_nonoverlapping(self, dest, count)
     }
 
-    /// Computes the byte offset that needs to be applied in order to
-    /// make the pointer aligned to `align`.
+    /// Computes the offset that needs to be applied to the pointer in order to make it aligned to
+    /// `align`.
+    ///
     /// If it is not possible to align the pointer, the implementation returns
     /// `usize::max_value()`.
     ///
-    /// There are no guarantees whatsover that offsetting the pointer will not
-    /// overflow or go beyond the allocation that the pointer points into.
-    /// It is up to the caller to ensure that the returned offset is correct
-    /// in all terms other than alignment.
+    /// The offset is expressed in number of `T` elements, and not bytes. The value returned can be
+    /// used with the `offset` or `offset_to` methods.
+    ///
+    /// There are no guarantees whatsoever that offsetting the pointer will not overflow or go
+    /// beyond the allocation that the pointer points into. It is up to the caller to ensure that
+    /// the returned offset is correct in all terms other than alignment.
+    ///
+    /// # Panics
+    ///
+    /// The function panics if `align` is not a power-of-two.
     ///
     /// # Examples
     ///
@@ -1465,13 +1472,30 @@ impl<T: ?Sized> *const T {
     /// # } }
     /// ```
     #[unstable(feature = "align_offset", issue = "44488")]
-    pub fn align_offset(self, align: usize) -> usize {
+    #[cfg(not(stage0))]
+    pub fn align_offset(self, align: usize) -> usize where T: Sized {
+        if !align.is_power_of_two() {
+            panic!("align_offset: align is not a power-of-two");
+        }
         unsafe {
-            intrinsics::align_offset(self as *const _, align)
+            intrinsics::align_offset(self, align)
+        }
+    }
+
+    /// Stage0 (bootstrap) version of `align_offset`; see the documented version above.
+    #[unstable(feature = "align_offset", issue = "44488")]
+    #[cfg(stage0)]
+    pub fn align_offset(self, align: usize) -> usize where T: Sized {
+        if !align.is_power_of_two() {
+            panic!("align_offset: align is not a power-of-two");
+        }
+        unsafe {
+            intrinsics::align_offset(self as *const (), align)
         }
     }
 }
 
+
 #[lang = "mut_ptr"]
 impl<T: ?Sized> *mut T {
     /// Returns `true` if the pointer is null.
@@ -1804,44 +1828,6 @@ impl<T: ?Sized> *mut T {
         (self as *const T).wrapping_offset_from(origin)
     }
 
-    /// Computes the byte offset that needs to be applied in order to
-    /// make the pointer aligned to `align`.
-    /// If it is not possible to align the pointer, the implementation returns
-    /// `usize::max_value()`.
-    ///
-    /// There are no guarantees whatsover that offsetting the pointer will not
-    /// overflow or go beyond the allocation that the pointer points into.
-    /// It is up to the caller to ensure that the returned offset is correct
-    /// in all terms other than alignment.
-    ///
-    /// # Examples
-    ///
-    /// Accessing adjacent `u8` as `u16`
-    ///
-    /// ```
-    /// # #![feature(align_offset)]
-    /// # fn foo(n: usize) {
-    /// # use std::mem::align_of;
-    /// # unsafe {
-    /// let x = [5u8, 6u8, 7u8, 8u8, 9u8];
-    /// let ptr = &x[n] as *const u8;
-    /// let offset = ptr.align_offset(align_of::<u16>());
-    /// if offset < x.len() - n - 1 {
-    ///     let u16_ptr = ptr.offset(offset as isize) as *const u16;
-    ///     assert_ne!(*u16_ptr, 500);
-    /// } else {
-    ///     // while the pointer can be aligned via `offset`, it would point
-    ///     // outside the allocation
-    /// }
-    /// # } }
-    /// ```
-    #[unstable(feature = "align_offset", issue = "44488")]
-    pub fn align_offset(self, align: usize) -> usize {
-        unsafe {
-            intrinsics::align_offset(self as *const _, align)
-        }
-    }
-
     /// Calculates the offset from a pointer (convenience for `.offset(count as isize)`).
     ///
     /// `count` is in units of T; e.g. a `count` of 3 represents a pointer
@@ -2511,8 +2497,189 @@ impl<T: ?Sized> *mut T {
     {
         swap(self, with)
     }
+
+    /// Computes the offset that needs to be applied to the pointer in order to make it aligned to
+    /// `align`.
+    ///
+    /// If it is not possible to align the pointer, the implementation returns
+    /// `usize::max_value()`.
+    ///
+    /// The offset is expressed in number of `T` elements, and not bytes. The value returned can be
+    /// used with the `offset` or `offset_to` methods.
+    ///
+    /// There are no guarantees whatsoever that offsetting the pointer will not overflow or go
+    /// beyond the allocation that the pointer points into. It is up to the caller to ensure that
+    /// the returned offset is correct in all terms other than alignment.
+    ///
+    /// # Panics
+    ///
+    /// The function panics if `align` is not a power-of-two.
+    ///
+    /// # Examples
+    ///
+    /// Accessing adjacent `u8` as `u16`
+    ///
+    /// ```
+    /// # #![feature(align_offset)]
+    /// # fn foo(n: usize) {
+    /// # use std::mem::align_of;
+    /// # unsafe {
+    /// let x = [5u8, 6u8, 7u8, 8u8, 9u8];
+    /// let ptr = &x[n] as *const u8;
+    /// let offset = ptr.align_offset(align_of::<u16>());
+    /// if offset < x.len() - n - 1 {
+    ///     let u16_ptr = ptr.offset(offset as isize) as *const u16;
+    ///     assert_ne!(*u16_ptr, 500);
+    /// } else {
+    ///     // while the pointer can be aligned via `offset`, it would point
+    ///     // outside the allocation
+    /// }
+    /// # } }
+    /// ```
+    #[unstable(feature = "align_offset", issue = "44488")]
+    #[cfg(not(stage0))]
+    pub fn align_offset(self, align: usize) -> usize where T: Sized {
+        if !align.is_power_of_two() {
+            panic!("align_offset: align is not a power-of-two");
+        }
+        unsafe {
+            intrinsics::align_offset(self, align)
+        }
+    }
+
+    /// Stage0 (bootstrap) version of `align_offset`; see the documented version above.
+    #[unstable(feature = "align_offset", issue = "44488")]
+    #[cfg(stage0)]
+    pub fn align_offset(self, align: usize) -> usize where T: Sized {
+        if !align.is_power_of_two() {
+            panic!("align_offset: align is not a power-of-two");
+        }
+        unsafe {
+            intrinsics::align_offset(self as *const (), align)
+        }
+    }
 }
 
+/// Align pointer `p`.
+///
+/// Calculate the offset (in terms of elements of size `stride`) that has to be applied
+/// to pointer `p` so that it would become aligned to `a`.
+///
+/// This is an implementation of the `align_offset` intrinsic for the case where `stride > 1`.
+///
+/// Note: This implementation has been carefully tailored to not panic. It is UB for this to panic.
+/// The only real change that can be made here is change of `INV_TABLE_MOD_16` and associated
+/// constants.
+///
+/// If we ever decide to make it possible to call the intrinsic with `a` that is not a
+/// power-of-two, it will probably be more prudent to just change to a naive implementation rather
+/// than trying to adapt this to accommodate that change.
+///
+/// Any questions go to @nagisa.
+#[lang="align_offset"]
+#[cfg(not(stage0))]
+unsafe fn align_offset(p: *const (), a: usize, stride: usize) -> usize {
+    /// Calculate multiplicative modular inverse of `x` modulo `m`.
+    ///
+    /// This implementation is tailored for align_offset and has following preconditions:
+    ///
+    /// * `m` is a power-of-two;
+    /// * `x < m`; (if `x ≥ m`, pass in `x % m` instead)
+    ///
+    /// Implementation of this function shall not panic. Ever.
+    fn mod_inv(x: usize, m: usize) -> usize {
+        /// Multiplicative modular inverse table modulo 2⁴ = 16.
+        ///
+        /// Note that this table does not contain values for which an inverse does not exist
+        /// (i.e. `0⁻¹ mod 16`, `2⁻¹ mod 16`, etc.)
+        static INV_TABLE_MOD_16: [usize; 8] = [1, 11, 13, 7, 9, 3, 5, 15];
+        /// Modulo for which the `INV_TABLE_MOD_16` is intended.
+        const INV_TABLE_MOD: usize = 16;
+        /// INV_TABLE_MOD²
+        const INV_TABLE_MOD_SQUARED: usize = INV_TABLE_MOD * INV_TABLE_MOD;
+
+        let table_inverse = INV_TABLE_MOD_16[(x & (INV_TABLE_MOD - 1)) >> 1];
+        if m <= INV_TABLE_MOD {
+            return table_inverse & (m - 1);
+        } else {
+            // We iterate "up" using the following formula:
+            //
+            // $$ xy ≡ 1 (mod 2ⁿ) → xy (2 - xy) ≡ 1 (mod 2²ⁿ) $$
+            //
+            // until 2²ⁿ ≥ m. Then we can reduce to our desired `m` by taking the result `mod m`.
+            let mut inverse = table_inverse;
+            let mut going_mod = INV_TABLE_MOD_SQUARED;
+            loop {
+                // y = y * (2 - xy) mod n
+                //
+                // Note that we use wrapping operations here intentionally – the original formula
+                // uses e.g. subtraction `mod n`. It is entirely fine to do them `mod
+                // usize::max_value()` instead, because we take the result `mod n` at the end
+                // anyway.
+                inverse = inverse.wrapping_mul(
+                    2usize.wrapping_sub(x.wrapping_mul(inverse))
+                ) & (going_mod - 1);
+                if going_mod > m {
+                    return inverse & (m - 1);
+                }
+                going_mod = going_mod.wrapping_mul(going_mod);
+            }
+        }
+    }
+
+    let a_minus_one = a.wrapping_sub(1);
+    let pmoda = p as usize & a_minus_one;
+    let smoda = stride & a_minus_one;
+    // a is power-of-two so cannot be 0. stride = 0 is handled by the intrinsic.
+    let gcdpow = intrinsics::cttz_nonzero(stride).min(intrinsics::cttz_nonzero(a));
+    let gcd = 1usize << gcdpow;
+
+    if pmoda == 0 {
+        // Already aligned. Yay!
+        return 0;
+    }
+
+    if gcd == 1 {
+        // This branch solves for the variable $o$ in following linear congruence equation:
+        //
+        // ⎰ p + o ≡ 0 (mod a)   # $p + o$ must be aligned to specified alignment $a$
+        // ⎱     o ≡ 0 (mod s)   # offset $o$ must be a multiple of stride $s$
+        //
+        // where
+        //
+        // * a, s are co-prime
+        //
+        // This gives us the formula below:
+        //
+        // o = (a - (p mod a)) * (s⁻¹ mod a) * s
+        //
+        // The first term is “the relative alignment of p to a”, the second term is “how does
+        // incrementing p by one s change the relative alignment of p”, the third term is
+        // translating change in units of s to a byte count.
+        //
+        // Furthermore, the result produced by this solution is not “minimal”, so it is necessary
+        // to take the result $o mod lcm(s, a)$. Since $s$ and $a$ are co-prime (i.e. $gcd(s, a) =
+        // 1$) and $lcm(s, a) = s * a / gcd(s, a)$, we can replace $lcm(s, a)$ with just a $s * a$.
+        //
+        // (Author note: we decided later on to express the offset in "elements" rather than bytes,
+        // which drops the multiplication by `s` on both sides of the modulo.)
+        return intrinsics::unchecked_rem(a.wrapping_sub(pmoda).wrapping_mul(mod_inv(smoda, a)), a);
+    }
+
+    if p as usize & (gcd - 1) == 0 {
+        // This can be aligned, but `a` and `stride` are not co-prime, so a somewhat adapted
+        // formula is used.
+        let j = a.wrapping_sub(pmoda) >> gcdpow;
+        let k = smoda >> gcdpow;
+        return intrinsics::unchecked_rem(j.wrapping_mul(mod_inv(k, a)), a >> gcdpow);
+    }
+
+    // Cannot be aligned at all.
+    return usize::max_value();
+}
+
+
+
 // Equality for pointers
 #[stable(feature = "rust1", since = "1.0.0")]
 impl<T: ?Sized> PartialEq for *const T {
diff --git a/src/libcore/slice/mod.rs b/src/libcore/slice/mod.rs
index 82891b691dc..6b3c1fcddc2 100644
--- a/src/libcore/slice/mod.rs
+++ b/src/libcore/slice/mod.rs
@@ -1696,6 +1696,28 @@ impl<T> [T] {
                 self.as_mut_ptr(), other.as_mut_ptr(), self.len());
         }
     }
+
+    // #[unstable(feature = "slice_align_to", issue = "44488")]
+    // pub fn align_to<U>(&self) -> (&[T], &[U], &[T]) {
+    //     // First, find at what point do we split between the first and 2nd slice.
+    //     let x = self.as_ptr();
+    //     let offset = x.align_offset(::mem::align_of::<U>());
+    //     if offset > self.len() {
+    //         return (self, &[], &[]);
+    //     }
+
+    // }
+
+    // #[unstable(feature = "slice_align_to", issue = "44488")]
+    // pub fn align_to_mut<U>(&mut self) -> (&mut [T], &mut [U], &mut [T]) {
+    // }
+}}
+
+#[lang = "slice"]
+#[cfg(not(test))]
+#[cfg(not(stage0))]
+impl<T> [T] {
+    slice_core_methods!();
 }
 
 #[lang = "slice_u8"]
diff --git a/src/librustc/middle/lang_items.rs b/src/librustc/middle/lang_items.rs
index 95e92e21b09..d70f994e87b 100644
--- a/src/librustc/middle/lang_items.rs
+++ b/src/librustc/middle/lang_items.rs
@@ -348,6 +348,9 @@ language_item_table! {
     I128ShroFnLangItem,              "i128_shro",               i128_shro_fn;
     U128ShroFnLangItem,              "u128_shro",               u128_shro_fn;
 
+    // Align offset for stride != 1, must not panic.
+    AlignOffsetLangItem,             "align_offset",            align_offset_fn;
+
     TerminationTraitLangItem,        "termination",             termination;
 }
 
diff --git a/src/librustc_codegen_llvm/intrinsic.rs b/src/librustc_codegen_llvm/intrinsic.rs
index ba04cc7fad5..93351651db9 100644
--- a/src/librustc_codegen_llvm/intrinsic.rs
+++ b/src/librustc_codegen_llvm/intrinsic.rs
@@ -25,6 +25,7 @@ use type_of::LayoutLlvmExt;
 use rustc::ty::{self, Ty};
 use rustc::ty::layout::{HasDataLayout, LayoutOf};
 use rustc::hir;
+use rustc::middle::lang_items::AlignOffsetLangItem;
 use syntax::ast;
 use syntax::symbol::Symbol;
 use builder::Builder;
@@ -390,16 +391,29 @@ pub fn codegen_intrinsic_call<'a, 'tcx>(bx: &Builder<'a, 'tcx>,
         }
 
         "align_offset" => {
-            // `ptr as usize`
-            let ptr_val = bx.ptrtoint(args[0].immediate(), bx.cx.isize_ty);
-            // `ptr_val % align`
-            let align = args[1].immediate();
-            let offset = bx.urem(ptr_val, align);
+            let (ptr, align) = (args[0].immediate(), args[1].immediate());
+            let stride_of_t = bx.cx.layout_of(substs.type_at(0)).size_and_align().0.bytes();
+            let stride = C_usize(bx.cx, stride_of_t);
             let zero = C_null(bx.cx.isize_ty);
-            // `offset == 0`
-            let is_zero = bx.icmp(llvm::IntPredicate::IntEQ, offset, zero);
-            // `if offset == 0 { 0 } else { align - offset }`
-            bx.select(is_zero, zero, bx.sub(align, offset))
+            let max = C_int(cx.isize_ty, -1); // -1isize (wherein I cheat horribly to make !0usize)
+
+            if stride_of_t <= 1 {
+                // offset = ptr as usize % align => offset = ptr as usize & (align - 1)
+                let modmask = bx.sub(align, C_usize(bx.cx, 1));
+                let offset = bx.and(bx.ptrtoint(ptr, bx.cx.isize_ty), modmask);
+                let is_zero = bx.icmp(llvm::IntPredicate::IntEQ, offset, zero);
+                // if offset == 0 { 0 } else { if stride_of_t == 1 { align - offset } else { !0 } }
+                bx.select(is_zero, zero, if stride_of_t == 1 {
+                    bx.sub(align, offset)
+                } else {
+                    max
+                })
+            } else {
+                let did = ::common::langcall(bx.tcx(), Some(span), "", AlignOffsetLangItem);
+                let instance = ty::Instance::mono(bx.tcx(), did);
+                let llfn = ::callee::get_fn(bx.cx, instance);
+                bx.call(llfn, &[ptr, align, stride], None)
+            }
         }
         name if name.starts_with("simd_") => {
             match generic_simd_intrinsic(bx, name,
diff --git a/src/librustc_typeck/check/intrinsic.rs b/src/librustc_typeck/check/intrinsic.rs
index feb26e76162..db6062db1fb 100644
--- a/src/librustc_typeck/check/intrinsic.rs
+++ b/src/librustc_typeck/check/intrinsic.rs
@@ -315,8 +315,7 @@ pub fn check_intrinsic_type<'a, 'tcx>(tcx: TyCtxt<'a, 'tcx, 'tcx>,
             }
 
             "align_offset" => {
-                let ptr_ty = tcx.mk_imm_ptr(tcx.mk_nil());
-                (0, vec![ptr_ty, tcx.types.usize], tcx.types.usize)
+                (1, vec![tcx.mk_imm_ptr(param(0)), tcx.types.usize], tcx.types.usize)
             },
 
             "nontemporal_store" => {
diff --git a/src/test/run-pass/align-offset-sign.rs b/src/test/run-pass/align-offset-sign.rs
index aaa0419d061..f7d427cb7b2 100644
--- a/src/test/run-pass/align-offset-sign.rs
+++ b/src/test/run-pass/align-offset-sign.rs
@@ -10,7 +10,85 @@
 
 #![feature(align_offset)]
 
+#[derive(Clone, Copy)]
+#[repr(packed)]
+struct A3(u16, u8);
+struct A4(u32);
+#[repr(packed)]
+struct A5(u32, u8);
+#[repr(packed)]
+struct A6(u32, u16);
+#[repr(packed)]
+struct A7(u32, u16, u8);
+#[repr(packed)]
+struct A8(u32, u32);
+#[repr(packed)]
+struct A9(u32, u32, u8);
+#[repr(packed)]
+struct A10(u32, u32, u16);
+
+unsafe fn test_weird_stride<T>(ptr: *const T, align: usize) -> bool {
+    let numptr = ptr as usize;
+    let mut expected = usize::max_value();
+    // Naive but definitely correct way to find the *first* aligned element of stride::<T>.
+    for el in (0..align) {
+        if (numptr + el * ::std::mem::size_of::<T>()) % align == 0 {
+            expected = el;
+            break;
+        }
+    }
+    let got = ptr.align_offset(align);
+    if got != expected {
+        eprintln!("aligning {:p} (with stride of {}) to {}, expected {}, got {}", ptr, ::std::mem::size_of::<T>(), align, expected, got);
+        return true;
+    }
+    return false;
+}
+
 fn main() {
-    let x = 1 as *const u8;
-    assert_eq!(x.align_offset(8), 7);
+    unsafe {
+        // For pointers of stride = 0, the pointer is already aligned or it cannot be aligned at
+        // all, because no amount of elements will align the pointer.
+        let mut p = 1;
+        while p < 1024 {
+            assert_eq!((p as *const ()).align_offset(p), 0);
+            if (p != 1) {
+                assert_eq!(((p + 1) as *const ()).align_offset(p), !0);
+            }
+            p = (p + 1).next_power_of_two();
+        }
+
+        // For pointers of stride = 1, the pointer can always be aligned. The offset is equal to
+        // number of bytes.
+        let mut align = 1;
+        while align < 1024 {
+            for ptr in 1..2*align {
+                let expected = ptr % align;
+                let offset = if expected == 0 { 0 } else { align - expected };
+                assert_eq!((ptr as *const u8).align_offset(align), offset,
+                           "ptr = {}, align = {}, size = 1", ptr, align);
+            }
+            align = (align + 1).next_power_of_two();
+        }
+
+
+        // For pointers of stride != 1, we verify the algorithm against the naivest possible
+        // implementation
+        let mut align = 1;
+        let mut x = false;
+        while align < 1024 {
+            for ptr in 1usize..4*align {
+                x |= test_weird_stride::<A3>(ptr as *const A3, align);
+                x |= test_weird_stride::<A4>(ptr as *const A4, align);
+                x |= test_weird_stride::<A5>(ptr as *const A5, align);
+                x |= test_weird_stride::<A6>(ptr as *const A6, align);
+                x |= test_weird_stride::<A7>(ptr as *const A7, align);
+                x |= test_weird_stride::<A8>(ptr as *const A8, align);
+                x |= test_weird_stride::<A9>(ptr as *const A9, align);
+                x |= test_weird_stride::<A10>(ptr as *const A10, align);
+            }
+            align = (align + 1).next_power_of_two();
+        }
+        assert!(!x);
+    }
 }