about summary refs log tree commit diff
diff options
context:
space:
mode:
authorbors <bors@rust-lang.org>2022-07-27 01:12:30 +0000
committerbors <bors@rust-lang.org>2022-07-27 01:12:30 +0000
commitb573e10d21b69ebfadf41aa9c2f0a27919fe4480 (patch)
treee4ec1b41755a3e7e6f4e299ba14473e08a578667
parent4d6d601c8a83284d6b23c253a3e2a060fd197316 (diff)
parent4ba7cac359b0180add75d78929ebae4f90813fa1 (diff)
downloadrust-b573e10d21b69ebfadf41aa9c2f0a27919fe4480.tar.gz
rust-b573e10d21b69ebfadf41aa9c2f0a27919fe4480.zip
Auto merge of #98553 - the8472:next_chunk_opt, r=Mark-Simulacrum
Optimized vec::IntoIter::next_chunk impl

```
x86_64v1, default
test vec::bench_next_chunk                               ... bench:         696 ns/iter (+/- 22)
x86_64v1, pr
test vec::bench_next_chunk                               ... bench:         309 ns/iter (+/- 4)

znver2, default
test vec::bench_next_chunk                               ... bench:      17,272 ns/iter (+/- 117)
znver2, pr
test vec::bench_next_chunk                               ... bench:         211 ns/iter (+/- 3)
```

On znver2 the default impl seems to be slow due to different inlining decisions. It goes through `core::array::iter_next_chunk`
which has a deep call tree.
-rw-r--r--library/alloc/benches/lib.rs1
-rw-r--r--library/alloc/benches/vec.rs20
-rw-r--r--library/alloc/src/lib.rs4
-rw-r--r--library/alloc/src/vec/into_iter.rs41
-rw-r--r--library/alloc/tests/lib.rs1
-rw-r--r--library/alloc/tests/vec.rs10
6 files changed, 75 insertions, 2 deletions
diff --git a/library/alloc/benches/lib.rs b/library/alloc/benches/lib.rs
index 7dc0f7cebd5..72ac897d4f1 100644
--- a/library/alloc/benches/lib.rs
+++ b/library/alloc/benches/lib.rs
@@ -2,6 +2,7 @@
 // See https://github.com/rust-lang/rust/issues/73535#event-3477699747
 #![cfg(not(target_os = "android"))]
 #![feature(btree_drain_filter)]
+#![feature(iter_next_chunk)]
 #![feature(map_first_last)]
 #![feature(repr_simd)]
 #![feature(slice_partition_dedup)]
diff --git a/library/alloc/benches/vec.rs b/library/alloc/benches/vec.rs
index efc47327e8a..663f6b9dd1c 100644
--- a/library/alloc/benches/vec.rs
+++ b/library/alloc/benches/vec.rs
@@ -762,3 +762,23 @@ fn bench_retain_whole_100000(b: &mut Bencher) {
     let mut v = black_box(vec![826u32; 100000]);
     b.iter(|| v.retain(|x| *x == 826u32));
 }
+
+#[bench]
+fn bench_next_chunk(b: &mut Bencher) {
+    let v = vec![13u8; 2048];
+
+    b.iter(|| {
+        const CHUNK: usize = 8;
+
+        let mut sum = [0u32; CHUNK];
+        let mut iter = black_box(v.clone()).into_iter();
+
+        while let Ok(chunk) = iter.next_chunk::<CHUNK>() {
+            for i in 0..CHUNK {
+                sum[i] += chunk[i] as u32;
+            }
+        }
+
+        sum
+    })
+}
diff --git a/library/alloc/src/lib.rs b/library/alloc/src/lib.rs
index 315469387e5..8b6f4054851 100644
--- a/library/alloc/src/lib.rs
+++ b/library/alloc/src/lib.rs
@@ -89,6 +89,7 @@
 #![feature(alloc_layout_extra)]
 #![feature(allocator_api)]
 #![feature(array_chunks)]
+#![feature(array_into_iter_constructors)]
 #![feature(array_methods)]
 #![feature(array_windows)]
 #![feature(assert_matches)]
@@ -117,8 +118,11 @@
 #![feature(hasher_prefixfree_extras)]
 #![feature(inplace_iteration)]
 #![feature(iter_advance_by)]
+#![feature(iter_next_chunk)]
 #![feature(layout_for_ptr)]
+#![feature(maybe_uninit_array_assume_init)]
 #![feature(maybe_uninit_slice)]
+#![feature(maybe_uninit_uninit_array)]
 #![cfg_attr(test, feature(new_uninit))]
 #![feature(nonnull_slice_from_raw_parts)]
 #![feature(pattern)]
diff --git a/library/alloc/src/vec/into_iter.rs b/library/alloc/src/vec/into_iter.rs
index 28979457b7f..1b483e3fc77 100644
--- a/library/alloc/src/vec/into_iter.rs
+++ b/library/alloc/src/vec/into_iter.rs
@@ -2,13 +2,14 @@
 use super::AsVecIntoIter;
 use crate::alloc::{Allocator, Global};
 use crate::raw_vec::RawVec;
+use core::array;
 use core::fmt;
 use core::intrinsics::arith_offset;
 use core::iter::{
     FusedIterator, InPlaceIterable, SourceIter, TrustedLen, TrustedRandomAccessNoCoerce,
 };
 use core::marker::PhantomData;
-use core::mem::{self, ManuallyDrop};
+use core::mem::{self, ManuallyDrop, MaybeUninit};
 #[cfg(not(no_global_oom_handling))]
 use core::ops::Deref;
 use core::ptr::{self, NonNull};
@@ -124,7 +125,6 @@ impl<T, A: Allocator> IntoIter<T, A> {
     }
 
     /// Forgets to Drop the remaining elements while still allowing the backing allocation to be freed.
-    #[cfg(not(no_global_oom_handling))]
     pub(crate) fn forget_remaining_elements(&mut self) {
         self.ptr = self.end;
     }
@@ -204,6 +204,43 @@ impl<T, A: Allocator> Iterator for IntoIter<T, A> {
         self.len()
     }
 
+    #[inline]
+    fn next_chunk<const N: usize>(&mut self) -> Result<[T; N], core::array::IntoIter<T, N>> {
+        let mut raw_ary = MaybeUninit::uninit_array();
+
+        let len = self.len();
+
+        if mem::size_of::<T>() == 0 {
+            if len < N {
+                self.forget_remaining_elements();
+                // Safety: ZSTs can be conjured ex nihilo, only the amount has to be correct
+                return Err(unsafe { array::IntoIter::new_unchecked(raw_ary, 0..len) });
+            }
+
+            self.ptr = unsafe { arith_offset(self.ptr as *const i8, N as isize) as *mut T };
+            // Safety: ditto
+            return Ok(unsafe { MaybeUninit::array_assume_init(raw_ary) });
+        }
+
+        if len < N {
+            // Safety: `len` indicates that this many elements are available and we just checked that
+            // it fits into the array.
+            unsafe {
+                ptr::copy_nonoverlapping(self.ptr, raw_ary.as_mut_ptr() as *mut T, len);
+                self.forget_remaining_elements();
+                return Err(array::IntoIter::new_unchecked(raw_ary, 0..len));
+            }
+        }
+
+        // Safety: `len` is larger than the array size. Copy a fixed amount here to fully initialize
+        // the array.
+        return unsafe {
+            ptr::copy_nonoverlapping(self.ptr, raw_ary.as_mut_ptr() as *mut T, N);
+            self.ptr = self.ptr.add(N);
+            Ok(MaybeUninit::array_assume_init(raw_ary))
+        };
+    }
+
     unsafe fn __iterator_get_unchecked(&mut self, i: usize) -> Self::Item
     where
         Self: TrustedRandomAccessNoCoerce,
diff --git a/library/alloc/tests/lib.rs b/library/alloc/tests/lib.rs
index c29e7b9c81e..d83cd29ddba 100644
--- a/library/alloc/tests/lib.rs
+++ b/library/alloc/tests/lib.rs
@@ -27,6 +27,7 @@
 #![feature(binary_heap_as_slice)]
 #![feature(inplace_iteration)]
 #![feature(iter_advance_by)]
+#![feature(iter_next_chunk)]
 #![feature(round_char_boundary)]
 #![feature(slice_group_by)]
 #![feature(slice_partition_dedup)]
diff --git a/library/alloc/tests/vec.rs b/library/alloc/tests/vec.rs
index 699567be5a0..d94da8f5f5a 100644
--- a/library/alloc/tests/vec.rs
+++ b/library/alloc/tests/vec.rs
@@ -1,4 +1,5 @@
 use core::alloc::{Allocator, Layout};
+use core::iter::IntoIterator;
 use core::ptr::NonNull;
 use std::alloc::System;
 use std::assert_matches::assert_matches;
@@ -931,6 +932,15 @@ fn test_into_iter_count() {
 }
 
 #[test]
+fn test_into_iter_next_chunk() {
+    let mut iter = b"lorem".to_vec().into_iter();
+
+    assert_eq!(iter.next_chunk().unwrap(), [b'l', b'o']); // N is inferred as 2
+    assert_eq!(iter.next_chunk().unwrap(), [b'r', b'e', b'm']); // N is inferred as 3
+    assert_eq!(iter.next_chunk::<4>().unwrap_err().as_slice(), &[]); // N is explicitly 4
+}
+
+#[test]
 fn test_into_iter_clone() {
     fn iter_equal<I: Iterator<Item = i32>>(it: I, slice: &[i32]) {
         let v: Vec<i32> = it.collect();