Rollup merge of #115331 - the8472:chars_advance, r=cuviper

optimize str::iter::Chars::advance_by ``` OLD: str::iter::chars_advance_by_0001 0.00ns/iter +/- 0.00ns str::iter::chars_advance_by_0010 13.00ns/iter +/- 1.00ns str::iter::chars_advance_by_1000 1.20µs/iter +/- 15.00ns NEW: str::iter::chars_advance_by_0001 0.00ns/iter +/- 0.00ns str::iter::chars_advance_by_0010 6.00ns/iter +/- 0.00ns str::iter::chars_advance_by_1000 75.00ns/iter +/- 1.00ns ```
author: Matthias Krüger <matthias.krueger@famsik.de> 2023-11-28 09:28:36 +0100
committer: GitHub <noreply@github.com> 2023-11-28 09:28:36 +0100
commit: 4af1f997400dc0c7bf2d1b931c852e92a40fa15b (patch)
tree: 3072accd9c262a54690f8d7d02fdb66ee5cd86e0
parent: c2ec90854a0d3a3e1570cf7acfdd2c9e50aa3b2a (diff)
parent: 40cf1f9257628d40e38a4eca9e1b8ea03a3abcd1 (diff)
download: rust-4af1f997400dc0c7bf2d1b931c852e92a40fa15b.tar.gz
rust-4af1f997400dc0c7bf2d1b931c852e92a40fa15b.zip
5 files changed, 80 insertions, 0 deletions
diff --git a/library/alloc/tests/str.rs b/library/alloc/tests/str.rs
index cb59a9d4ab2..df8a260624a 100644
--- a/library/alloc/tests/str.rs
+++ b/library/alloc/tests/str.rs
@@ -1171,6 +1171,17 @@ fn test_iterator() {
 }
 
 #[test]
+fn test_iterator_advance() {
+    let s = "「赤錆」と呼ばれる鉄錆は、水の存在下での鉄の自然酸化によって生じる、オキシ水酸化鉄(III) 等の（含水）酸化物粒子の疎な凝集膜であるとみなせる。";
+    let chars: Vec<char> = s.chars().collect();
+    let mut it = s.chars();
+    it.advance_by(1).unwrap();
+    assert_eq!(it.next(), Some(chars[1]));
+    it.advance_by(33).unwrap();
+    assert_eq!(it.next(), Some(chars[35]));
+}
+
+#[test]
 fn test_rev_iterator() {
     let s = "ศไทย中华Việt Nam";
     let v = ['m', 'a', 'N', ' ', 't', 'ệ', 'i', 'V', '华', '中', 'ย', 'ท', 'ไ', 'ศ'];
diff --git a/library/core/benches/lib.rs b/library/core/benches/lib.rs
index 74ef0949b8a..fdefc9a714e 100644
--- a/library/core/benches/lib.rs
+++ b/library/core/benches/lib.rs
@@ -5,6 +5,7 @@
 #![feature(trusted_random_access)]
 #![feature(iter_array_chunks)]
 #![feature(iter_next_chunk)]
+#![feature(iter_advance_by)]
 
 extern crate test;
 
diff --git a/library/core/benches/str.rs b/library/core/benches/str.rs
index 78865d81fb9..7d36eff3d6c 100644
--- a/library/core/benches/str.rs
+++ b/library/core/benches/str.rs
@@ -3,6 +3,7 @@ use test::{black_box, Bencher};
 
 mod char_count;
 mod corpora;
+mod iter;
 
 #[bench]
 fn str_validate_emoji(b: &mut Bencher) {
diff --git a/library/core/benches/str/iter.rs b/library/core/benches/str/iter.rs
new file mode 100644
index 00000000000..58ae71fc10f
--- /dev/null
+++ b/library/core/benches/str/iter.rs
@@ -0,0 +1,17 @@
+use super::corpora;
+use test::{black_box, Bencher};
+
+#[bench]
+fn chars_advance_by_1000(b: &mut Bencher) {
+    b.iter(|| black_box(corpora::ru::LARGE).chars().advance_by(1000));
+}
+
+#[bench]
+fn chars_advance_by_0010(b: &mut Bencher) {
+    b.iter(|| black_box(corpora::ru::LARGE).chars().advance_by(10));
+}
+
+#[bench]
+fn chars_advance_by_0001(b: &mut Bencher) {
+    b.iter(|| black_box(corpora::ru::LARGE).chars().advance_by(1));
+}
diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs
index c30f01b3c06..dd2efb00516 100644
--- a/library/core/src/str/iter.rs
+++ b/library/core/src/str/iter.rs
@@ -8,6 +8,7 @@ use crate::iter::{TrustedRandomAccess, TrustedRandomAccessNoCoerce};
 use crate::ops::Try;
 use crate::option;
 use crate::slice::{self, Split as SliceSplit};
+use core::num::NonZeroUsize;
 
 use super::from_utf8_unchecked;
 use super::pattern::Pattern;
@@ -50,6 +51,55 @@ impl<'a> Iterator for Chars<'a> {
     }
 
     #[inline]
+    fn advance_by(&mut self, mut remainder: usize) -> Result<(), NonZeroUsize> {
+        const CHUNK_SIZE: usize = 32;
+
+        if remainder >= CHUNK_SIZE {
+            let mut chunks = self.iter.as_slice().array_chunks::<CHUNK_SIZE>();
+            let mut bytes_skipped: usize = 0;
+
+            while remainder > CHUNK_SIZE
+                && let Some(chunk) = chunks.next()
+            {
+                bytes_skipped += CHUNK_SIZE;
+
+                let mut start_bytes = [false; CHUNK_SIZE];
+
+                for i in 0..CHUNK_SIZE {
+                    start_bytes[i] = !super::validations::utf8_is_cont_byte(chunk[i]);
+                }
+
+                remainder -= start_bytes.into_iter().map(|i| i as u8).sum::<u8>() as usize;
+            }
+
+            // SAFETY: The amount of bytes exists since we just iterated over them,
+            // so advance_by will succeed.
+            unsafe { self.iter.advance_by(bytes_skipped).unwrap_unchecked() };
+
+            // skip trailing continuation bytes
+            while self.iter.len() > 0 {
+                let b = self.iter.as_slice()[0];
+                if !super::validations::utf8_is_cont_byte(b) {
+                    break;
+                }
+                // SAFETY: We just peeked at the byte, therefore it exists
+                unsafe { self.iter.advance_by(1).unwrap_unchecked() };
+            }
+        }
+
+        while (remainder > 0) && (self.iter.len() > 0) {
+            remainder -= 1;
+            let b = self.iter.as_slice()[0];
+            let slurp = super::validations::utf8_char_width(b);
+            // SAFETY: utf8 validity requires that the string must contain
+            // the continuation bytes (if any)
+            unsafe { self.iter.advance_by(slurp).unwrap_unchecked() };
+        }
+
+        NonZeroUsize::new(remainder).map_or(Ok(()), Err)
+    }
+
+    #[inline]
     fn size_hint(&self) -> (usize, Option<usize>) {
         let len = self.iter.len();
         // `(len + 3)` can't overflow, because we know that the `slice::Iter`
author	Matthias Krüger <matthias.krueger@famsik.de>	2023-11-28 09:28:36 +0100
committer	GitHub <noreply@github.com>	2023-11-28 09:28:36 +0100
commit	4af1f997400dc0c7bf2d1b931c852e92a40fa15b (patch)
tree	3072accd9c262a54690f8d7d02fdb66ee5cd86e0
parent	c2ec90854a0d3a3e1570cf7acfdd2c9e50aa3b2a (diff)
parent	40cf1f9257628d40e38a4eca9e1b8ea03a3abcd1 (diff)
download	rust-4af1f997400dc0c7bf2d1b931c852e92a40fa15b.tar.gz rust-4af1f997400dc0c7bf2d1b931c852e92a40fa15b.zip