diff options
| author | The 8472 <git@infinite-source.de> | 2023-08-29 02:01:24 +0200 |
|---|---|---|
| committer | The 8472 <git@infinite-source.de> | 2023-11-27 22:06:35 +0100 |
| commit | 40cf1f9257628d40e38a4eca9e1b8ea03a3abcd1 (patch) | |
| tree | 46dbd71f9ce402e14ef7c7ffcb6236dc7cbc0564 /library/core/src | |
| parent | 3f55e8665c548d5c2ed33c9823880a8dbdf8a78f (diff) | |
| download | rust-40cf1f9257628d40e38a4eca9e1b8ea03a3abcd1.tar.gz rust-40cf1f9257628d40e38a4eca9e1b8ea03a3abcd1.zip | |
optimize str::iter::Chars::advance_by
This avoids part of the char decoding work by not looking at UTF-8 continuation bytes.
Diffstat (limited to 'library/core/src')
| -rw-r--r-- | library/core/src/str/iter.rs | 50 |
1 file changed, 50 insertions, 0 deletions
```diff
diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs
index c30f01b3c06..dd2efb00516 100644
--- a/library/core/src/str/iter.rs
+++ b/library/core/src/str/iter.rs
@@ -8,6 +8,7 @@ use crate::iter::{TrustedRandomAccess, TrustedRandomAccessNoCoerce};
 use crate::ops::Try;
 use crate::option;
 use crate::slice::{self, Split as SliceSplit};
+use core::num::NonZeroUsize;
 
 use super::from_utf8_unchecked;
 use super::pattern::Pattern;
@@ -50,6 +51,55 @@ impl<'a> Iterator for Chars<'a> {
     }
 
     #[inline]
+    fn advance_by(&mut self, mut remainder: usize) -> Result<(), NonZeroUsize> {
+        const CHUNK_SIZE: usize = 32;
+
+        if remainder >= CHUNK_SIZE {
+            let mut chunks = self.iter.as_slice().array_chunks::<CHUNK_SIZE>();
+            let mut bytes_skipped: usize = 0;
+
+            while remainder > CHUNK_SIZE
+                && let Some(chunk) = chunks.next()
+            {
+                bytes_skipped += CHUNK_SIZE;
+
+                let mut start_bytes = [false; CHUNK_SIZE];
+
+                for i in 0..CHUNK_SIZE {
+                    start_bytes[i] = !super::validations::utf8_is_cont_byte(chunk[i]);
+                }
+
+                remainder -= start_bytes.into_iter().map(|i| i as u8).sum::<u8>() as usize;
+            }
+
+            // SAFETY: The amount of bytes exists since we just iterated over them,
+            // so advance_by will succeed.
+            unsafe { self.iter.advance_by(bytes_skipped).unwrap_unchecked() };
+
+            // skip trailing continuation bytes
+            while self.iter.len() > 0 {
+                let b = self.iter.as_slice()[0];
+                if !super::validations::utf8_is_cont_byte(b) {
+                    break;
+                }
+                // SAFETY: We just peeked at the byte, therefore it exists
+                unsafe { self.iter.advance_by(1).unwrap_unchecked() };
+            }
+        }
+
+        while (remainder > 0) && (self.iter.len() > 0) {
+            remainder -= 1;
+            let b = self.iter.as_slice()[0];
+            let slurp = super::validations::utf8_char_width(b);
+            // SAFETY: utf8 validity requires that the string must contain
+            // the continuation bytes (if any)
+            unsafe { self.iter.advance_by(slurp).unwrap_unchecked() };
+        }
+
+        NonZeroUsize::new(remainder).map_or(Ok(()), Err)
+    }
+
+    #[inline]
     fn size_hint(&self) -> (usize, Option<usize>) {
         let len = self.iter.len();
         // `(len + 3)` can't overflow, because we know that the `slice::Iter`
```
