diff options
| author | The 8472 <git@infinite-source.de> | 2023-08-29 02:01:24 +0200 |
|---|---|---|
| committer | The 8472 <git@infinite-source.de> | 2023-11-27 22:06:35 +0100 |
| commit | 40cf1f9257628d40e38a4eca9e1b8ea03a3abcd1 (patch) | |
| tree | 46dbd71f9ce402e14ef7c7ffcb6236dc7cbc0564 | |
| parent | 3f55e8665c548d5c2ed33c9823880a8dbdf8a78f (diff) | |
| download | rust-40cf1f9257628d40e38a4eca9e1b8ea03a3abcd1.tar.gz rust-40cf1f9257628d40e38a4eca9e1b8ea03a3abcd1.zip | |
optimize str::iter::Chars::advance_by
this avoids part of the char decoding work by not looking at utf8 continuation bytes
| -rw-r--r-- | library/alloc/tests/str.rs | 11 | ||||
| -rw-r--r-- | library/core/src/str/iter.rs | 50 |
2 files changed, 61 insertions, 0 deletions
diff --git a/library/alloc/tests/str.rs b/library/alloc/tests/str.rs index cb59a9d4ab2..df8a260624a 100644 --- a/library/alloc/tests/str.rs +++ b/library/alloc/tests/str.rs @@ -1171,6 +1171,17 @@ fn test_iterator() { } #[test] +fn test_iterator_advance() { + let s = "「赤錆」と呼ばれる鉄錆は、水の存在下での鉄の自然酸化によって生じる、オキシ水酸化鉄(III) 等の(含水)酸化物粒子の疎な凝集膜であるとみなせる。"; + let chars: Vec<char> = s.chars().collect(); + let mut it = s.chars(); + it.advance_by(1).unwrap(); + assert_eq!(it.next(), Some(chars[1])); + it.advance_by(33).unwrap(); + assert_eq!(it.next(), Some(chars[35])); +} + +#[test] fn test_rev_iterator() { let s = "ศไทย中华Việt Nam"; let v = ['m', 'a', 'N', ' ', 't', 'ệ', 'i', 'V', '华', '中', 'ย', 'ท', 'ไ', 'ศ']; diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs index c30f01b3c06..dd2efb00516 100644 --- a/library/core/src/str/iter.rs +++ b/library/core/src/str/iter.rs @@ -8,6 +8,7 @@ use crate::iter::{TrustedRandomAccess, TrustedRandomAccessNoCoerce}; use crate::ops::Try; use crate::option; use crate::slice::{self, Split as SliceSplit}; +use core::num::NonZeroUsize; use super::from_utf8_unchecked; use super::pattern::Pattern; @@ -50,6 +51,55 @@ impl<'a> Iterator for Chars<'a> { } #[inline] + fn advance_by(&mut self, mut remainder: usize) -> Result<(), NonZeroUsize> { + const CHUNK_SIZE: usize = 32; + + if remainder >= CHUNK_SIZE { + let mut chunks = self.iter.as_slice().array_chunks::<CHUNK_SIZE>(); + let mut bytes_skipped: usize = 0; + + while remainder > CHUNK_SIZE + && let Some(chunk) = chunks.next() + { + bytes_skipped += CHUNK_SIZE; + + let mut start_bytes = [false; CHUNK_SIZE]; + + for i in 0..CHUNK_SIZE { + start_bytes[i] = !super::validations::utf8_is_cont_byte(chunk[i]); + } + + remainder -= start_bytes.into_iter().map(|i| i as u8).sum::<u8>() as usize; + } + + // SAFETY: The amount of bytes exists since we just iterated over them, + // so advance_by will succeed. + unsafe { self.iter.advance_by(bytes_skipped).unwrap_unchecked() }; + + // skip trailing continuation bytes + while self.iter.len() > 0 { + let b = self.iter.as_slice()[0]; + if !super::validations::utf8_is_cont_byte(b) { + break; + } + // SAFETY: We just peeked at the byte, therefore it exists + unsafe { self.iter.advance_by(1).unwrap_unchecked() }; + } + } + + while (remainder > 0) && (self.iter.len() > 0) { + remainder -= 1; + let b = self.iter.as_slice()[0]; + let slurp = super::validations::utf8_char_width(b); + // SAFETY: utf8 validity requires that the string must contain + // the continuation bytes (if any) + unsafe { self.iter.advance_by(slurp).unwrap_unchecked() }; + } + + NonZeroUsize::new(remainder).map_or(Ok(()), Err) + } + + #[inline] fn size_hint(&self) -> (usize, Option<usize>) { let len = self.iter.len(); // `(len + 3)` can't overflow, because we know that the `slice::Iter` |
