diff options
| author | Matthias Krüger <matthias.krueger@famsik.de> | 2023-11-28 09:28:36 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-11-28 09:28:36 +0100 |
| commit | 4af1f997400dc0c7bf2d1b931c852e92a40fa15b (patch) | |
| tree | 3072accd9c262a54690f8d7d02fdb66ee5cd86e0 | |
| parent | c2ec90854a0d3a3e1570cf7acfdd2c9e50aa3b2a (diff) | |
| parent | 40cf1f9257628d40e38a4eca9e1b8ea03a3abcd1 (diff) | |
| download | rust-4af1f997400dc0c7bf2d1b931c852e92a40fa15b.tar.gz rust-4af1f997400dc0c7bf2d1b931c852e92a40fa15b.zip | |
Rollup merge of #115331 - the8472:chars_advance, r=cuviper
optimize str::iter::Chars::advance_by
```
OLD:
str::iter::chars_advance_by_0001 0.00ns/iter +/- 0.00ns
str::iter::chars_advance_by_0010 13.00ns/iter +/- 1.00ns
str::iter::chars_advance_by_1000 1.20µs/iter +/- 15.00ns
NEW:
str::iter::chars_advance_by_0001 0.00ns/iter +/- 0.00ns
str::iter::chars_advance_by_0010 6.00ns/iter +/- 0.00ns
str::iter::chars_advance_by_1000 75.00ns/iter +/- 1.00ns
```
| -rw-r--r-- | library/alloc/tests/str.rs | 11 | ||||
| -rw-r--r-- | library/core/benches/lib.rs | 1 | ||||
| -rw-r--r-- | library/core/benches/str.rs | 1 | ||||
| -rw-r--r-- | library/core/benches/str/iter.rs | 17 | ||||
| -rw-r--r-- | library/core/src/str/iter.rs | 50 |
5 files changed, 80 insertions, 0 deletions
diff --git a/library/alloc/tests/str.rs b/library/alloc/tests/str.rs index cb59a9d4ab2..df8a260624a 100644 --- a/library/alloc/tests/str.rs +++ b/library/alloc/tests/str.rs @@ -1171,6 +1171,17 @@ fn test_iterator() { } #[test] +fn test_iterator_advance() { + let s = "「赤錆」と呼ばれる鉄錆は、水の存在下での鉄の自然酸化によって生じる、オキシ水酸化鉄(III) 等の(含水)酸化物粒子の疎な凝集膜であるとみなせる。"; + let chars: Vec<char> = s.chars().collect(); + let mut it = s.chars(); + it.advance_by(1).unwrap(); + assert_eq!(it.next(), Some(chars[1])); + it.advance_by(33).unwrap(); + assert_eq!(it.next(), Some(chars[35])); +} + +#[test] fn test_rev_iterator() { let s = "ศไทย中华Việt Nam"; let v = ['m', 'a', 'N', ' ', 't', 'ệ', 'i', 'V', '华', '中', 'ย', 'ท', 'ไ', 'ศ']; diff --git a/library/core/benches/lib.rs b/library/core/benches/lib.rs index 74ef0949b8a..fdefc9a714e 100644 --- a/library/core/benches/lib.rs +++ b/library/core/benches/lib.rs @@ -5,6 +5,7 @@ #![feature(trusted_random_access)] #![feature(iter_array_chunks)] #![feature(iter_next_chunk)] +#![feature(iter_advance_by)] extern crate test; diff --git a/library/core/benches/str.rs b/library/core/benches/str.rs index 78865d81fb9..7d36eff3d6c 100644 --- a/library/core/benches/str.rs +++ b/library/core/benches/str.rs @@ -3,6 +3,7 @@ use test::{black_box, Bencher}; mod char_count; mod corpora; +mod iter; #[bench] fn str_validate_emoji(b: &mut Bencher) { diff --git a/library/core/benches/str/iter.rs b/library/core/benches/str/iter.rs new file mode 100644 index 00000000000..58ae71fc10f --- /dev/null +++ b/library/core/benches/str/iter.rs @@ -0,0 +1,17 @@ +use super::corpora; +use test::{black_box, Bencher}; + +#[bench] +fn chars_advance_by_1000(b: &mut Bencher) { + b.iter(|| black_box(corpora::ru::LARGE).chars().advance_by(1000)); +} + +#[bench] +fn chars_advance_by_0010(b: &mut Bencher) { + b.iter(|| black_box(corpora::ru::LARGE).chars().advance_by(10)); +} + +#[bench] +fn chars_advance_by_0001(b: &mut Bencher) { + b.iter(|| black_box(corpora::ru::LARGE).chars().advance_by(1)); +} diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs index c30f01b3c06..dd2efb00516 100644 --- a/library/core/src/str/iter.rs +++ b/library/core/src/str/iter.rs @@ -8,6 +8,7 @@ use crate::iter::{TrustedRandomAccess, TrustedRandomAccessNoCoerce}; use crate::ops::Try; use crate::option; use crate::slice::{self, Split as SliceSplit}; +use core::num::NonZeroUsize; use super::from_utf8_unchecked; use super::pattern::Pattern; @@ -50,6 +51,55 @@ impl<'a> Iterator for Chars<'a> { } #[inline] + fn advance_by(&mut self, mut remainder: usize) -> Result<(), NonZeroUsize> { + const CHUNK_SIZE: usize = 32; + + if remainder >= CHUNK_SIZE { + let mut chunks = self.iter.as_slice().array_chunks::<CHUNK_SIZE>(); + let mut bytes_skipped: usize = 0; + + while remainder > CHUNK_SIZE + && let Some(chunk) = chunks.next() + { + bytes_skipped += CHUNK_SIZE; + + let mut start_bytes = [false; CHUNK_SIZE]; + + for i in 0..CHUNK_SIZE { + start_bytes[i] = !super::validations::utf8_is_cont_byte(chunk[i]); + } + + remainder -= start_bytes.into_iter().map(|i| i as u8).sum::<u8>() as usize; + } + + // SAFETY: The amount of bytes exists since we just iterated over them, + // so advance_by will succeed. + unsafe { self.iter.advance_by(bytes_skipped).unwrap_unchecked() }; + + // skip trailing continuation bytes + while self.iter.len() > 0 { + let b = self.iter.as_slice()[0]; + if !super::validations::utf8_is_cont_byte(b) { + break; + } + // SAFETY: We just peeked at the byte, therefore it exists + unsafe { self.iter.advance_by(1).unwrap_unchecked() }; + } + } + + while (remainder > 0) && (self.iter.len() > 0) { + remainder -= 1; + let b = self.iter.as_slice()[0]; + let slurp = super::validations::utf8_char_width(b); + // SAFETY: utf8 validity requires that the string must contain + // the continuation bytes (if any) + unsafe { self.iter.advance_by(slurp).unwrap_unchecked() }; + } + + NonZeroUsize::new(remainder).map_or(Ok(()), Err) + } + + #[inline] fn size_hint(&self) -> (usize, Option<usize>) { let len = self.iter.len(); // `(len + 3)` can't overflow, because we know that the `slice::Iter` |
