Auto merge of #42819 - scottmcm:swap-nonoverlapping, r=sfackler

Reuse the mem::swap optimizations to speed up slice::rotate This is most helpful for compound types where LLVM didn't vectorize the loop. Highlight: bench slice::rotate_medium_by727_strings gets 38% faster. Exposes the swapping logic from PR https://github.com/rust-lang/rust/pull/40454 as `pub unsafe fn ptr::swap_nonoverlapping` under library feature `swap_nonoverlapping` https://github.com/rust-lang/rust/issues/42818. (The new method seemed plausible, and was the simplest way to share the logic. I'm not attached to it, though, so let me know if a different way would be better.)
author: bors <bors@rust-lang.org> 2017-06-28 14:33:00 +0000
committer: bors <bors@rust-lang.org> 2017-06-28 14:33:00 +0000
commit: 47faf1d51952ecd9d4c8a7325332fba34fbe00bd (patch)
tree: 48daed35be898e6aa9b342b3f6fb796b571c04e4 /src/libcore/ptr.rs
parent: c16930762a4a7762b26579f7d44273d3bab4c11e (diff)
parent: 47fa016193a729091aef9c531df9385548ad46ab (diff)
download: rust-47faf1d51952ecd9d4c8a7325332fba34fbe00bd.tar.gz
rust-47faf1d51952ecd9d4c8a7325332fba34fbe00bd.zip
1 files changed, 84 insertions, 0 deletions
diff --git a/src/libcore/ptr.rs b/src/libcore/ptr.rs
index 54ae9e0d628..92470299366 100644
--- a/src/libcore/ptr.rs
+++ b/src/libcore/ptr.rs
@@ -117,6 +117,90 @@ pub unsafe fn swap<T>(x: *mut T, y: *mut T) {
     mem::forget(tmp);
 }
 
+/// Swaps a sequence of values at two mutable locations of the same type.
+///
+/// # Safety
+///
+/// The two arguments must each point to the beginning of `count` locations
+/// of valid memory, and the two memory ranges must not overlap.
+///
+/// # Examples
+///
+/// Basic usage:
+///
+/// ```
+/// #![feature(swap_nonoverlapping)]
+///
+/// use std::ptr;
+///
+/// let mut x = [1, 2, 3, 4];
+/// let mut y = [7, 8, 9];
+///
+/// unsafe {
+///     ptr::swap_nonoverlapping(x.as_mut_ptr(), y.as_mut_ptr(), 2);
+/// }
+///
+/// assert_eq!(x, [7, 8, 3, 4]);
+/// assert_eq!(y, [1, 2, 9]);
+/// ```
+#[inline]
+#[unstable(feature = "swap_nonoverlapping", issue = "42818")]
+pub unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
+    let x = x as *mut u8;
+    let y = y as *mut u8;
+    let len = mem::size_of::<T>() * count;
+    swap_nonoverlapping_bytes(x, y, len)
+}
+
+#[inline]
+unsafe fn swap_nonoverlapping_bytes(x: *mut u8, y: *mut u8, len: usize) {
+    // The approach here is to utilize simd to swap x & y efficiently. Testing reveals
+    // that swapping either 32 bytes or 64 bytes at a time is most efficient for intel
+    // Haswell E processors. LLVM is more able to optimize if we give a struct a
+    // #[repr(simd)], even if we don't actually use this struct directly.
+    //
+    // FIXME repr(simd) broken on emscripten and redox
+    #[cfg_attr(not(any(target_os = "emscripten", target_os = "redox")), repr(simd))]
+    struct Block(u64, u64, u64, u64);
+    struct UnalignedBlock(u64, u64, u64, u64);
+
+    let block_size = mem::size_of::<Block>();
+
+    // Loop through x & y, copying them `Block` at a time
+    // The optimizer should unroll the loop fully for most types
+    // N.B. We can't use a for loop as the `range` impl calls `mem::swap` recursively
+    let mut i = 0;
+    while i + block_size <= len {
+        // Create some uninitialized memory as scratch space
+        // Declaring `t` here avoids aligning the stack when this loop is unused
+        let mut t: Block = mem::uninitialized();
+        let t = &mut t as *mut _ as *mut u8;
+        let x = x.offset(i as isize);
+        let y = y.offset(i as isize);
+
+        // Swap a block of bytes of x & y, using t as a temporary buffer
+        // This should be optimized into efficient SIMD operations where available
+        copy_nonoverlapping(x, t, block_size);
+        copy_nonoverlapping(y, x, block_size);
+        copy_nonoverlapping(t, y, block_size);
+        i += block_size;
+    }
+
+    if i < len {
+        // Swap any remaining bytes
+        let mut t: UnalignedBlock = mem::uninitialized();
+        let rem = len - i;
+
+        let t = &mut t as *mut _ as *mut u8;
+        let x = x.offset(i as isize);
+        let y = y.offset(i as isize);
+
+        copy_nonoverlapping(x, t, rem);
+        copy_nonoverlapping(y, x, rem);
+        copy_nonoverlapping(t, y, rem);
+    }
+}
+
 /// Replaces the value at `dest` with `src`, returning the old
 /// value, without dropping either.
 ///
author	bors <bors@rust-lang.org>	2017-06-28 14:33:00 +0000
committer	bors <bors@rust-lang.org>	2017-06-28 14:33:00 +0000
commit	47faf1d51952ecd9d4c8a7325332fba34fbe00bd (patch)
tree	48daed35be898e6aa9b342b3f6fb796b571c04e4 /src/libcore/ptr.rs
parent	c16930762a4a7762b26579f7d44273d3bab4c11e (diff)
parent	47fa016193a729091aef9c531df9385548ad46ab (diff)
download	rust-47faf1d51952ecd9d4c8a7325332fba34fbe00bd.tar.gz rust-47faf1d51952ecd9d4c8a7325332fba34fbe00bd.zip