about summary refs log tree commit diff
diff options
context:
space:
mode:
authorStjepan Glavina <stjepang@gmail.com>2017-02-04 16:26:01 +0100
committerStjepan Glavina <stjepang@gmail.com>2017-02-04 16:44:43 +0100
commita884a6c60d3ec2338d7dfed9998a7aa96e10260e (patch)
treec99db696900f275e050d051b9b1c44a60f565f18
parent7df5c0f17b2966bd7e6cd4326f990fe844254a4f (diff)
downloadrust-a884a6c60d3ec2338d7dfed9998a7aa96e10260e.tar.gz
rust-a884a6c60d3ec2338d7dfed9998a7aa96e10260e.zip
Slightly optimize slice::sort
First, get rid of some bound checks.

Second, instead of comparing by ternary `compare` function, use a binary
function testing whether an element is less than some other element.
This apparently makes it easier for the compiler to reason about the
code.

Benchmark:

```
name                                        before ns/iter        after ns/iter         diff ns/iter   diff %
slice::bench::sort_large_ascending          8,969 (8919 MB/s)     7,410 (10796 MB/s)          -1,559  -17.38%
slice::bench::sort_large_big_ascending      355,640 (3599 MB/s)   359,137 (3564 MB/s)          3,497    0.98%
slice::bench::sort_large_big_descending     427,112 (2996 MB/s)   424,721 (3013 MB/s)         -2,391   -0.56%
slice::bench::sort_large_big_random         2,207,799 (579 MB/s)  2,138,804 (598 MB/s)       -68,995   -3.13%
slice::bench::sort_large_descending         13,694 (5841 MB/s)    13,514 (5919 MB/s)            -180   -1.31%
slice::bench::sort_large_mostly_ascending   239,697 (333 MB/s)    203,542 (393 MB/s)         -36,155  -15.08%
slice::bench::sort_large_mostly_descending  270,102 (296 MB/s)    234,263 (341 MB/s)         -35,839  -13.27%
slice::bench::sort_large_random             513,406 (155 MB/s)    470,084 (170 MB/s)         -43,322   -8.44%
slice::bench::sort_large_random_expensive   23,650,321 (3 MB/s)   23,675,098 (3 MB/s)         24,777    0.10%
slice::bench::sort_medium_ascending         143 (5594 MB/s)       132 (6060 MB/s)                -11   -7.69%
slice::bench::sort_medium_descending        197 (4060 MB/s)       188 (4255 MB/s)                 -9   -4.57%
slice::bench::sort_medium_random            3,358 (238 MB/s)      3,271 (244 MB/s)               -87   -2.59%
slice::bench::sort_small_ascending          32 (2500 MB/s)        32 (2500 MB/s)                   0    0.00%
slice::bench::sort_small_big_ascending      97 (13195 MB/s)       97 (13195 MB/s)                  0    0.00%
slice::bench::sort_small_big_descending     247 (5182 MB/s)       249 (5140 MB/s)                  2    0.81%
slice::bench::sort_small_big_random         502 (2549 MB/s)       498 (2570 MB/s)                 -4   -0.80%
slice::bench::sort_small_descending         55 (1454 MB/s)        61 (1311 MB/s)                   6   10.91%
slice::bench::sort_small_random             358 (223 MB/s)        356 (224 MB/s)                  -2   -0.56%
```
-rw-r--r--src/libcollections/slice.rs68
1 files changed, 36 insertions, 32 deletions
diff --git a/src/libcollections/slice.rs b/src/libcollections/slice.rs
index 11f513ed798..2ea953df873 100644
--- a/src/libcollections/slice.rs
+++ b/src/libcollections/slice.rs
@@ -98,7 +98,7 @@
 #![cfg_attr(test, allow(unused_imports, dead_code))]
 
 use alloc::boxed::Box;
-use core::cmp::Ordering::{self, Greater};
+use core::cmp::Ordering::{self, Less};
 use core::mem::size_of;
 use core::mem;
 use core::ptr;
@@ -1089,7 +1089,7 @@ impl<T> [T] {
     pub fn sort(&mut self)
         where T: Ord
     {
-        self.sort_by(|a, b| a.cmp(b))
+        merge_sort(self, |a, b| a.lt(b));
     }
 
     /// Sorts the slice using `f` to extract a key to compare elements by.
@@ -1119,7 +1119,7 @@ impl<T> [T] {
     pub fn sort_by_key<B, F>(&mut self, mut f: F)
         where F: FnMut(&T) -> B, B: Ord
     {
-        self.sort_by(|a, b| f(a).cmp(&f(b)))
+        merge_sort(self, |a, b| f(a).lt(&f(b)));
     }
 
     /// Sorts the slice using `compare` to compare elements.
@@ -1149,10 +1149,10 @@ impl<T> [T] {
     /// ```
     #[stable(feature = "rust1", since = "1.0.0")]
     #[inline]
-    pub fn sort_by<F>(&mut self, compare: F)
+    pub fn sort_by<F>(&mut self, mut compare: F)
         where F: FnMut(&T, &T) -> Ordering
     {
-        merge_sort(self, compare)
+        merge_sort(self, |a, b| compare(a, b) == Less);
     }
 
     /// Copies the elements from `src` into `self`.
@@ -1355,10 +1355,10 @@ impl<T: Clone> ToOwned for [T] {
 /// Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted.
 ///
 /// This is the integral subroutine of insertion sort.
-fn insert_head<T, F>(v: &mut [T], compare: &mut F)
-    where F: FnMut(&T, &T) -> Ordering
+fn insert_head<T, F>(v: &mut [T], is_less: &mut F)
+    where F: FnMut(&T, &T) -> bool
 {
-    if v.len() >= 2 && compare(&v[0], &v[1]) == Greater {
+    if v.len() >= 2 && is_less(&v[1], &v[0]) {
         unsafe {
             // There are three ways to implement insertion here:
             //
@@ -1381,12 +1381,12 @@ fn insert_head<T, F>(v: &mut [T], compare: &mut F)
 
             // Intermediate state of the insertion process is always tracked by `hole`, which
             // serves two purposes:
-            // 1. Protects integrity of `v` from panics in `compare`.
+            // 1. Protects integrity of `v` from panics in `is_less`.
             // 2. Fills the remaining hole in `v` in the end.
             //
             // Panic safety:
             //
-            // If `compare` panics at any point during the process, `hole` will get dropped and
+            // If `is_less` panics at any point during the process, `hole` will get dropped and
             // fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object it
             // initially held exactly once.
             let mut hole = InsertionHole {
@@ -1396,7 +1396,7 @@ fn insert_head<T, F>(v: &mut [T], compare: &mut F)
             ptr::copy_nonoverlapping(&v[1], &mut v[0], 1);
 
             for i in 2..v.len() {
-                if compare(&tmp.value, &v[i]) != Greater {
+                if !is_less(&v[i], &tmp.value) {
                     break;
                 }
                 ptr::copy_nonoverlapping(&v[i], &mut v[i - 1], 1);
@@ -1432,8 +1432,8 @@ fn insert_head<T, F>(v: &mut [T], compare: &mut F)
 ///
 /// The two slices must be non-empty and `mid` must be in bounds. Buffer `buf` must be long enough
 /// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type.
-unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, compare: &mut F)
-    where F: FnMut(&T, &T) -> Ordering
+unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, is_less: &mut F)
+    where F: FnMut(&T, &T) -> bool
 {
     let len = v.len();
     let v = v.as_mut_ptr();
@@ -1449,12 +1449,12 @@ unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, compare: &mut F)
     // hole in `v`.
     //
     // Intermediate state of the process is always tracked by `hole`, which serves two purposes:
-    // 1. Protects integrity of `v` from panics in `compare`.
+    // 1. Protects integrity of `v` from panics in `is_less`.
     // 2. Fills the remaining hole in `v` if the longer run gets consumed first.
     //
     // Panic safety:
     //
-    // If `compare` panics at any point during the process, `hole` will get dropped and fill the
+    // If `is_less` panics at any point during the process, `hole` will get dropped and fill the
     // hole in `v` with the unconsumed range in `buf`, thus ensuring that `v` still holds every
     // object it initially held exactly once.
     let mut hole;
@@ -1476,7 +1476,7 @@ unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, compare: &mut F)
         while *left < hole.end && right < v_end {
             // Consume the lesser side.
             // If equal, prefer the left run to maintain stability.
-            let to_copy = if compare(&**left, &*right) == Greater {
+            let to_copy = if is_less(&*right, &**left) {
                 get_and_increment(&mut right)
             } else {
                 get_and_increment(left)
@@ -1500,7 +1500,7 @@ unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, compare: &mut F)
         while v < *left && buf < *right {
             // Consume the greater side.
             // If equal, prefer the right run to maintain stability.
-            let to_copy = if compare(&*left.offset(-1), &*right.offset(-1)) == Greater {
+            let to_copy = if is_less(&*right.offset(-1), &*left.offset(-1)) {
                 decrement_and_get(left)
             } else {
                 decrement_and_get(right)
@@ -1550,8 +1550,8 @@ unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, compare: &mut F)
 /// 2. for every `i` in `2..runs.len()`: `runs[i - 2].len > runs[i - 1].len + runs[i].len`
 ///
 /// The invariants ensure that the total running time is `O(n log n)` worst-case.
-fn merge_sort<T, F>(v: &mut [T], mut compare: F)
-    where F: FnMut(&T, &T) -> Ordering
+fn merge_sort<T, F>(v: &mut [T], mut is_less: F)
+    where F: FnMut(&T, &T) -> bool
 {
     // Sorting has no meaningful behavior on zero-sized types.
     if size_of::<T>() == 0 {
@@ -1565,7 +1565,7 @@ fn merge_sort<T, F>(v: &mut [T], mut compare: F)
     //
     // Short runs are extended using insertion sort to span at least `min_run` elements, in order
     // to improve performance.
-    let (max_insertion, min_run) = if size_of::<T>() <= 16 {
+    let (max_insertion, min_run) = if size_of::<T>() <= 2 * mem::size_of::<usize>() {
         (64, 32)
     } else {
         (32, 16)
@@ -1577,7 +1577,7 @@ fn merge_sort<T, F>(v: &mut [T], mut compare: F)
     if len <= max_insertion {
         if len >= 2 {
             for i in (0..len-1).rev() {
-                insert_head(&mut v[i..], &mut compare);
+                insert_head(&mut v[i..], &mut is_less);
             }
         }
         return;
@@ -1585,7 +1585,7 @@ fn merge_sort<T, F>(v: &mut [T], mut compare: F)
 
     // Allocate a buffer to use as scratch memory. We keep the length 0 so we can keep in it
     // shallow copies of the contents of `v` without risking the dtors running on copies if
-    // `compare` panics. When merging two sorted runs, this buffer holds a copy of the shorter run,
+    // `is_less` panics. When merging two sorted runs, this buffer holds a copy of the shorter run,
     // which will always have length at most `len / 2`.
     let mut buf = Vec::with_capacity(len / 2);
 
@@ -1600,14 +1600,18 @@ fn merge_sort<T, F>(v: &mut [T], mut compare: F)
         let mut start = end - 1;
         if start > 0 {
             start -= 1;
-            if compare(&v[start], &v[start + 1]) == Greater {
-                while start > 0 && compare(&v[start - 1], &v[start]) == Greater {
-                    start -= 1;
-                }
-                v[start..end].reverse();
-            } else {
-                while start > 0 && compare(&v[start - 1], &v[start]) != Greater {
-                    start -= 1;
+            unsafe {
+                if is_less(v.get_unchecked(start + 1), v.get_unchecked(start)) {
+                    while start > 0 && is_less(v.get_unchecked(start),
+                                               v.get_unchecked(start - 1)) {
+                        start -= 1;
+                    }
+                    v[start..end].reverse();
+                } else {
+                    while start > 0 && !is_less(v.get_unchecked(start),
+                                                v.get_unchecked(start - 1)) {
+                        start -= 1;
+                    }
                 }
             }
         }
@@ -1616,7 +1620,7 @@ fn merge_sort<T, F>(v: &mut [T], mut compare: F)
         // merge sort on short sequences, so this significantly improves performance.
         while start > 0 && end - start < min_run {
             start -= 1;
-            insert_head(&mut v[start..end], &mut compare);
+            insert_head(&mut v[start..end], &mut is_less);
         }
 
         // Push this run onto the stack.
@@ -1632,7 +1636,7 @@ fn merge_sort<T, F>(v: &mut [T], mut compare: F)
             let right = runs[r];
             unsafe {
                 merge(&mut v[left.start .. right.start + right.len], left.len, buf.as_mut_ptr(),
-                      &mut compare);
+                      &mut is_less);
             }
             runs[r] = Run {
                 start: left.start,