diff options
| author | bors <bors@rust-lang.org> | 2018-01-01 19:04:33 +0000 |
|---|---|---|
| committer | bors <bors@rust-lang.org> | 2018-01-01 19:04:33 +0000 |
| commit | b65f0bedd2f22d9661ecb7092f07746dc2ccfb0d (patch) | |
| tree | 142a5b0bdf6197e8a87847d881d9b8616494f3ff | |
| parent | 5deba220d4c42b5313d7e71731ce5e8698866684 (diff) | |
| parent | 5cf55165fae5c8538db5c00e252ad9ba42aaf246 (diff) | |
| download | rust-b65f0bedd2f22d9661ecb7092f07746dc2ccfb0d.tar.gz rust-b65f0bedd2f22d9661ecb7092f07746dc2ccfb0d.zip | |
Auto merge of #46735 - Manishearth:memchr-find, r=burntsushi
Use memchr for str::find(char) This is a 10x improvement for searching for characters. This also contains the patches from https://github.com/rust-lang/rust/pull/46713 . Feel free to land both separately or together. cc @mystor @alexcrichton r? @bluss fixes #46693
| -rw-r--r-- | src/libcore/str/pattern.rs | 337 | ||||
| -rw-r--r-- | src/libcore/tests/lib.rs | 2 | ||||
| -rw-r--r-- | src/libcore/tests/pattern.rs | 264 |
3 files changed, 519 insertions, 84 deletions
diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs index edb7bed4520..677c0ecc33d 100644 --- a/src/libcore/str/pattern.rs +++ b/src/libcore/str/pattern.rs @@ -19,6 +19,7 @@ use cmp; use fmt; +use slice::memchr; use usize; // Pattern @@ -127,6 +128,11 @@ pub unsafe trait Searcher<'a> { fn next(&mut self) -> SearchStep; /// Find the next `Match` result. See `next()` + /// + /// Unlike next(), there is no guarantee that the returned ranges + /// of this and next_reject will overlap. This will return (start_match, end_match), + /// where start_match is the index of where the match begins, and end_match is + /// the index after the end of the match. #[inline] fn next_match(&mut self) -> Option<(usize, usize)> { loop { @@ -138,7 +144,10 @@ pub unsafe trait Searcher<'a> { } } - /// Find the next `Reject` result. See `next()` + /// Find the next `Reject` result. See `next()` and `next_match()` + /// + /// Unlike next(), there is no guarantee that the returned ranges + /// of this and next_match will overlap. #[inline] fn next_reject(&mut self) -> Option<(usize, usize)> { loop { @@ -234,62 +243,272 @@ pub unsafe trait ReverseSearcher<'a>: Searcher<'a> { /// `"[aa]a"` or `"a[aa]"`, depending from which side it is searched. pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {} + ///////////////////////////////////////////////////////////////////////////// -// Impl for a CharEq wrapper +// Impl for char ///////////////////////////////////////////////////////////////////////////// -#[doc(hidden)] -trait CharEq { - fn matches(&mut self, c: char) -> bool; - fn only_ascii(&self) -> bool; +/// Associated type for `<char as Pattern<'a>>::Searcher`. +#[derive(Clone, Debug)] +pub struct CharSearcher<'a> { + haystack: &'a str, + // safety invariant: `finger`/`finger_back` must be a valid utf8 byte index of `haystack` + // This invariant can be broken *within* next_match and next_match_back, however + // they must exit with fingers on valid code point boundaries. + + /// `finger` is the current byte index of the forward search. + /// Imagine that it exists before the byte at its index, i.e. + /// haystack[finger] is the first byte of the slice we must inspect during + /// forward searching + finger: usize, + /// `finger_back` is the current byte index of the reverse search. + /// Imagine that it exists after the byte at its index, i.e. + /// haystack[finger_back - 1] is the last byte of the slice we must inspect during + /// forward searching (and thus the first byte to be inspected when calling next_back()) + finger_back: usize, + /// The character being searched for + needle: char, + + // safety invariant: `utf8_size` must be less than 5 + /// The number of bytes `needle` takes up when encoded in utf8 + utf8_size: usize, + /// A utf8 encoded copy of the `needle` + utf8_encoded: [u8; 4], } -impl CharEq for char { +unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { + #[inline] + fn haystack(&self) -> &'a str { + self.haystack + } + #[inline] + fn next(&mut self) -> SearchStep { + let old_finger = self.finger; + let slice = unsafe { self.haystack.get_unchecked(old_finger..self.haystack.len()) }; + let mut iter = slice.chars(); + let old_len = iter.iter.len(); + if let Some(ch) = iter.next() { + // add byte offset of current character + // without re-encoding as utf-8 + self.finger += old_len - iter.iter.len(); + if ch == self.needle { + SearchStep::Match(old_finger, self.finger) + } else { + SearchStep::Reject(old_finger, self.finger) + } + } else { + SearchStep::Done + } + } #[inline] - fn matches(&mut self, c: char) -> bool { *self == c } + fn next_match(&mut self) -> Option<(usize, usize)> { + loop { + // get the haystack after the last character found + let bytes = if let Some(slice) = self.haystack.as_bytes().get(self.finger..) { + slice + } else { + return None; + }; + // the last byte of the utf8 encoded needle + let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) }; + if let Some(index) = memchr::memchr(last_byte, bytes) { + // The new finger is the index of the byte we found, + // plus one, since we memchr'd for the last byte of the character. + // + // Note that this doesn't always give us a finger on a UTF8 boundary. + // If we *didn't* find our character + // we may have indexed to the non-last byte of a 3-byte or 4-byte character. + // We can't just skip to the next valid starting byte because a character like + // ꁁ (U+A041 YI SYLLABLE PA), utf-8 `EA 81 81` will have us always find + // the second byte when searching for the third. + // + // However, this is totally okay. While we have the invariant that + // self.finger is on a UTF8 boundary, this invariant is not relid upon + // within this method (it is relied upon in CharSearcher::next()). + // + // We only exit this method when we reach the end of the string, or if we + // find something. When we find something the `finger` will be set + // to a UTF8 boundary. + self.finger += index + 1; + if self.finger >= self.utf8_size { + let found_char = self.finger - self.utf8_size; + if let Some(slice) = self.haystack.as_bytes().get(found_char..self.finger) { + if slice == &self.utf8_encoded[0..self.utf8_size] { + return Some((found_char, self.finger)); + } + } + } + } else { + // found nothing, exit + self.finger = self.haystack.len(); + return None; + } + } + } + // let next_reject use the default implementation from the Searcher trait +} + +unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { + #[inline] + fn next_back(&mut self) -> SearchStep { + let old_finger = self.finger_back; + let slice = unsafe { self.haystack.slice_unchecked(0, old_finger) }; + let mut iter = slice.chars(); + let old_len = iter.iter.len(); + if let Some(ch) = iter.next_back() { + // subtract byte offset of current character + // without re-encoding as utf-8 + self.finger_back -= old_len - iter.iter.len(); + if ch == self.needle { + SearchStep::Match(self.finger_back, old_finger) + } else { + SearchStep::Reject(self.finger_back, old_finger) + } + } else { + SearchStep::Done + } + } #[inline] - fn only_ascii(&self) -> bool { (*self as u32) < 128 } + fn next_match_back(&mut self) -> Option<(usize, usize)> { + let haystack = self.haystack.as_bytes(); + loop { + // get the haystack up to but not including the last character searched + let bytes = if let Some(slice) = haystack.get(..self.finger_back) { + slice + } else { + return None; + }; + // the last byte of the utf8 encoded needle + let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) }; + if let Some(index) = memchr::memrchr(last_byte, bytes) { + // memrchr will return the index of the byte we wish to + // find. In case of an ASCII character, this is indeed + // were we wish our new finger to be ("after" the found + // char in the paradigm of reverse iteration). For + // multibyte chars we need to skip down by the number of more + // bytes they have than ASCII + let shift = self.utf8_size - 1; + if index >= shift { + let found_char = index - shift; + if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size)) { + if slice == &self.utf8_encoded[0..self.utf8_size] { + // move finger to before the character found (i.e. at its start index) + self.finger_back = found_char; + return Some((self.finger_back, self.finger_back + self.utf8_size)); + } + } + } + // We can't use finger_back = index - size + 1 here. If we found the last char + // of a different-sized character (or the middle byte of a different character) + // we need to bump the finger_back down to `index`. This similarly makes + // `finger_back` have the potential to no longer be on a boundary, + // but this is OK since we only exit this function on a boundary + // or when the haystack has been searched completely. + // + // Unlike next_match this does not + // have the problem of repeated bytes in utf-8 because + // we're searching for the last byte, and we can only have + // found the last byte when searching in reverse. + self.finger_back = index; + } else { + self.finger_back = 0; + // found nothing, exit + return None; + } + } + } + + // let next_reject_back use the default implementation from the Searcher trait } -impl<F> CharEq for F where F: FnMut(char) -> bool { +impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {} + +/// Searches for chars that are equal to a given char +impl<'a> Pattern<'a> for char { + type Searcher = CharSearcher<'a>; + #[inline] - fn matches(&mut self, c: char) -> bool { (*self)(c) } + fn into_searcher(self, haystack: &'a str) -> Self::Searcher { + let mut utf8_encoded = [0; 4]; + self.encode_utf8(&mut utf8_encoded); + let utf8_size = self.len_utf8(); + CharSearcher { + haystack, + finger: 0, + finger_back: haystack.len(), + needle: self, + utf8_size, + utf8_encoded + } + } #[inline] - fn only_ascii(&self) -> bool { false } -} + fn is_contained_in(self, haystack: &'a str) -> bool { + if (self as u32) < 128 { + haystack.as_bytes().contains(&(self as u8)) + } else { + let mut buffer = [0u8; 4]; + self.encode_utf8(&mut buffer).is_contained_in(haystack) + } + } -impl<'a> CharEq for &'a [char] { #[inline] - fn matches(&mut self, c: char) -> bool { - self.iter().any(|&m| { let mut m = m; m.matches(c) }) + fn is_prefix_of(self, haystack: &'a str) -> bool { + if let Some(ch) = haystack.chars().next() { + self == ch + } else { + false + } + } + + #[inline] + fn is_suffix_of(self, haystack: &'a str) -> bool where Self::Searcher: ReverseSearcher<'a> + { + if let Some(ch) = haystack.chars().next_back() { + self == ch + } else { + false + } } +} + +///////////////////////////////////////////////////////////////////////////// +// Impl for a MultiCharEq wrapper +///////////////////////////////////////////////////////////////////////////// + +#[doc(hidden)] +trait MultiCharEq { + fn matches(&mut self, c: char) -> bool; +} +impl<F> MultiCharEq for F where F: FnMut(char) -> bool { #[inline] - fn only_ascii(&self) -> bool { - self.iter().all(|m| m.only_ascii()) + fn matches(&mut self, c: char) -> bool { (*self)(c) } +} + +impl<'a> MultiCharEq for &'a [char] { + #[inline] + fn matches(&mut self, c: char) -> bool { + self.iter().any(|&m| { m == c }) } } -struct CharEqPattern<C: CharEq>(C); +struct MultiCharEqPattern<C: MultiCharEq>(C); #[derive(Clone, Debug)] -struct CharEqSearcher<'a, C: CharEq> { +struct MultiCharEqSearcher<'a, C: MultiCharEq> { char_eq: C, haystack: &'a str, char_indices: super::CharIndices<'a>, - #[allow(dead_code)] - ascii_only: bool, } -impl<'a, C: CharEq> Pattern<'a> for CharEqPattern<C> { - type Searcher = CharEqSearcher<'a, C>; +impl<'a, C: MultiCharEq> Pattern<'a> for MultiCharEqPattern<C> { + type Searcher = MultiCharEqSearcher<'a, C>; #[inline] - fn into_searcher(self, haystack: &'a str) -> CharEqSearcher<'a, C> { - CharEqSearcher { - ascii_only: self.0.only_ascii(), + fn into_searcher(self, haystack: &'a str) -> MultiCharEqSearcher<'a, C> { + MultiCharEqSearcher { haystack, char_eq: self.0, char_indices: haystack.char_indices(), @@ -297,7 +516,7 @@ impl<'a, C: CharEq> Pattern<'a> for CharEqPattern<C> { } } -unsafe impl<'a, C: CharEq> Searcher<'a> for CharEqSearcher<'a, C> { +unsafe impl<'a, C: MultiCharEq> Searcher<'a> for MultiCharEqSearcher<'a, C> { #[inline] fn haystack(&self) -> &'a str { self.haystack @@ -322,7 +541,7 @@ unsafe impl<'a, C: CharEq> Searcher<'a> for CharEqSearcher<'a, C> { } } -unsafe impl<'a, C: CharEq> ReverseSearcher<'a> for CharEqSearcher<'a, C> { +unsafe impl<'a, C: MultiCharEq> ReverseSearcher<'a> for MultiCharEqSearcher<'a, C> { #[inline] fn next_back(&mut self) -> SearchStep { let s = &mut self.char_indices; @@ -342,7 +561,7 @@ unsafe impl<'a, C: CharEq> ReverseSearcher<'a> for CharEqSearcher<'a, C> { } } -impl<'a, C: CharEq> DoubleEndedSearcher<'a> for CharEqSearcher<'a, C> {} +impl<'a, C: MultiCharEq> DoubleEndedSearcher<'a> for MultiCharEqSearcher<'a, C> {} ///////////////////////////////////////////////////////////////////////////// @@ -410,55 +629,6 @@ macro_rules! searcher_methods { } ///////////////////////////////////////////////////////////////////////////// -// Impl for char -///////////////////////////////////////////////////////////////////////////// - -/// Associated type for `<char as Pattern<'a>>::Searcher`. -#[derive(Clone, Debug)] -pub struct CharSearcher<'a>(<CharEqPattern<char> as Pattern<'a>>::Searcher); - -unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { - searcher_methods!(forward); -} - -unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { - searcher_methods!(reverse); -} - -impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {} - -/// Searches for chars that are equal to a given char -impl<'a> Pattern<'a> for char { - type Searcher = CharSearcher<'a>; - - #[inline] - fn into_searcher(self, haystack: &'a str) -> Self::Searcher { - CharSearcher(CharEqPattern(self).into_searcher(haystack)) - } - - #[inline] - fn is_contained_in(self, haystack: &'a str) -> bool { - if (self as u32) < 128 { - haystack.as_bytes().contains(&(self as u8)) - } else { - let mut buffer = [0u8; 4]; - self.encode_utf8(&mut buffer).is_contained_in(haystack) - } - } - - #[inline] - fn is_prefix_of(self, haystack: &'a str) -> bool { - CharEqPattern(self).is_prefix_of(haystack) - } - - #[inline] - fn is_suffix_of(self, haystack: &'a str) -> bool where Self::Searcher: ReverseSearcher<'a> - { - CharEqPattern(self).is_suffix_of(haystack) - } -} - -///////////////////////////////////////////////////////////////////////////// // Impl for &[char] ///////////////////////////////////////////////////////////////////////////// @@ -466,7 +636,7 @@ impl<'a> Pattern<'a> for char { /// Associated type for `<&[char] as Pattern<'a>>::Searcher`. #[derive(Clone, Debug)] -pub struct CharSliceSearcher<'a, 'b>(<CharEqPattern<&'b [char]> as Pattern<'a>>::Searcher); +pub struct CharSliceSearcher<'a, 'b>(<MultiCharEqPattern<&'b [char]> as Pattern<'a>>::Searcher); unsafe impl<'a, 'b> Searcher<'a> for CharSliceSearcher<'a, 'b> { searcher_methods!(forward); @@ -480,7 +650,7 @@ impl<'a, 'b> DoubleEndedSearcher<'a> for CharSliceSearcher<'a, 'b> {} /// Searches for chars that are equal to any of the chars in the array impl<'a, 'b> Pattern<'a> for &'b [char] { - pattern_methods!(CharSliceSearcher<'a, 'b>, CharEqPattern, CharSliceSearcher); + pattern_methods!(CharSliceSearcher<'a, 'b>, MultiCharEqPattern, CharSliceSearcher); } ///////////////////////////////////////////////////////////////////////////// @@ -489,7 +659,7 @@ impl<'a, 'b> Pattern<'a> for &'b [char] { /// Associated type for `<F as Pattern<'a>>::Searcher`. #[derive(Clone)] -pub struct CharPredicateSearcher<'a, F>(<CharEqPattern<F> as Pattern<'a>>::Searcher) +pub struct CharPredicateSearcher<'a, F>(<MultiCharEqPattern<F> as Pattern<'a>>::Searcher) where F: FnMut(char) -> bool; impl<'a, F> fmt::Debug for CharPredicateSearcher<'a, F> @@ -499,7 +669,6 @@ impl<'a, F> fmt::Debug for CharPredicateSearcher<'a, F> f.debug_struct("CharPredicateSearcher") .field("haystack", &self.0.haystack) .field("char_indices", &self.0.char_indices) - .field("ascii_only", &self.0.ascii_only) .finish() } } @@ -520,7 +689,7 @@ impl<'a, F> DoubleEndedSearcher<'a> for CharPredicateSearcher<'a, F> /// Searches for chars that match the given predicate impl<'a, F> Pattern<'a> for F where F: FnMut(char) -> bool { - pattern_methods!(CharPredicateSearcher<'a, F>, CharEqPattern, CharPredicateSearcher); + pattern_methods!(CharPredicateSearcher<'a, F>, MultiCharEqPattern, CharPredicateSearcher); } ///////////////////////////////////////////////////////////////////////////// diff --git a/src/libcore/tests/lib.rs b/src/libcore/tests/lib.rs index 0e445cdac35..c4b85b82981 100644 --- a/src/libcore/tests/lib.rs +++ b/src/libcore/tests/lib.rs @@ -28,6 +28,7 @@ #![feature(iter_rfind)] #![feature(iter_rfold)] #![feature(nonzero)] +#![feature(pattern)] #![feature(raw)] #![feature(refcell_replace_swap)] #![feature(sip_hash_13)] @@ -61,6 +62,7 @@ mod nonzero; mod num; mod ops; mod option; +mod pattern; mod ptr; mod result; mod slice; diff --git a/src/libcore/tests/pattern.rs b/src/libcore/tests/pattern.rs new file mode 100644 index 00000000000..d0fd15263b2 --- /dev/null +++ b/src/libcore/tests/pattern.rs @@ -0,0 +1,264 @@ +// Copyright 2017 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use std::str::pattern::*; + +// This macro makes it easier to write +// tests that do a series of iterations +macro_rules! search_asserts { + ($haystack:expr, $needle:expr, $testname:expr, [$($func:ident),*], $result:expr) => { + let mut searcher = $needle.into_searcher($haystack); + let arr = [$( Step::from(searcher.$func()) ),+]; + assert_eq!(&arr[..], &$result, $testname); + } +} + +/// Combined enum for the results of next() and next_match()/next_reject() +#[derive(Debug, PartialEq, Eq)] +enum Step { + // variant names purposely chosen to + // be the same length for easy alignment + Matches(usize, usize), + Rejects(usize, usize), + InRange(usize, usize), + Done +} + +use self::Step::*; + +impl From<SearchStep> for Step { + fn from(x: SearchStep) -> Self { + match x { + SearchStep::Match(a, b) => Matches(a, b), + SearchStep::Reject(a, b) => Rejects(a, b), + SearchStep::Done => Done + } + } +} + +impl From<Option<(usize, usize)>> for Step { + fn from(x: Option<(usize, usize)>) -> Self { + match x { + Some((a, b)) => InRange(a, b), + None => Done + } + } +} + +// ignore-tidy-linelength + +// FIXME(Manishearth) these tests focus on single-character searching (CharSearcher) +// and on next()/next_match(), not next_reject(). This is because +// the memchr changes make next_match() for single chars complex, but next_reject() +// continues to use next() under the hood. We should add more test cases for all +// of these, as well as tests for StrSearcher and higher level tests for str::find() (etc) + +#[test] +fn test_simple_iteration() { + search_asserts! ("abcdeabcd", 'a', "forward iteration for ASCII string", + // a b c d e a b c d EOF + [next, next, next, next, next, next, next, next, next, next], + [Matches(0, 1), Rejects(1, 2), Rejects(2, 3), Rejects(3, 4), Rejects(4, 5), Matches(5, 6), Rejects(6, 7), Rejects(7, 8), Rejects(8, 9), Done] + ); + + search_asserts! ("abcdeabcd", 'a', "reverse iteration for ASCII string", + // d c b a e d c b a EOF + [next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back], + [Rejects(8, 9), Rejects(7, 8), Rejects(6, 7), Matches(5, 6), Rejects(4, 5), Rejects(3, 4), Rejects(2, 3), Rejects(1, 2), Matches(0, 1), Done] + ); + + search_asserts! ("我爱我的猫", '我', "forward iteration for Chinese string", + // 我 愛 我 的 貓 EOF + [next, next, next, next, next, next], + [Matches(0, 3), Rejects(3, 6), Matches(6, 9), Rejects(9, 12), Rejects(12, 15), Done] + ); + + search_asserts! ("我的猫说meow", 'm', "forward iteration for mixed string", + // 我 的 猫 说 m e o w EOF + [next, next, next, next, next, next, next, next, next], + [Rejects(0, 3), Rejects(3, 6), Rejects(6, 9), Rejects(9, 12), Matches(12, 13), Rejects(13, 14), Rejects(14, 15), Rejects(15, 16), Done] + ); + + search_asserts! ("我的猫说meow", '猫', "reverse iteration for mixed string", + // w o e m 说 猫 的 我 EOF + [next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back], + [Rejects(15, 16), Rejects(14, 15), Rejects(13, 14), Rejects(12, 13), Rejects(9, 12), Matches(6, 9), Rejects(3, 6), Rejects(0, 3), Done] + ); +} + +#[test] +fn test_simple_search() { + search_asserts!("abcdeabcdeabcde", 'a', "next_match for ASCII string", + [next_match, next_match, next_match, next_match], + [InRange(0, 1), InRange(5, 6), InRange(10, 11), Done] + ); + + search_asserts!("abcdeabcdeabcde", 'a', "next_match_back for ASCII string", + [next_match_back, next_match_back, next_match_back, next_match_back], + [InRange(10, 11), InRange(5, 6), InRange(0, 1), Done] + ); + + search_asserts!("abcdeab", 'a', "next_reject for ASCII string", + [next_reject, next_reject, next_match, next_reject, next_reject], + [InRange(1, 2), InRange(2, 3), InRange(5, 6), InRange(6, 7), Done] + ); + + search_asserts!("abcdeabcdeabcde", 'a', "next_reject_back for ASCII string", + [next_reject_back, next_reject_back, next_match_back, next_reject_back, next_reject_back, next_reject_back], + [InRange(14, 15), InRange(13, 14), InRange(10, 11), InRange(9, 10), InRange(8, 9), InRange(7, 8)] + ); +} + +// Á, 각, ก, 😀 all end in 0x81 +// 🁀, ᘀ do not end in 0x81 but contain the byte +// ꁁ has 0x81 as its second and third bytes. +// +// The memchr-using implementation of next_match +// and next_match_back temporarily violate +// the property that the search is always on a unicode boundary, +// which is fine as long as this never reaches next() or next_back(). +// So we test if next() is correct after each next_match() as well. +const STRESS: &str = "Áa🁀bÁꁁfg😁각กᘀ각aÁ각ꁁก😁a"; + +#[test] +fn test_stress_indices() { + // this isn't really a test, more of documentation on the indices of each character in the stresstest string + + search_asserts!(STRESS, 'x', "Indices of characters in stress test", + [next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next], + [Rejects(0, 2), // Á + Rejects(2, 3), // a + Rejects(3, 7), // 🁀 + Rejects(7, 8), // b + Rejects(8, 10), // Á + Rejects(10, 13), // ꁁ + Rejects(13, 14), // f + Rejects(14, 15), // g + Rejects(15, 19), // 😀 + Rejects(19, 22), // 각 + Rejects(22, 25), // ก + Rejects(25, 28), // ᘀ + Rejects(28, 31), // 각 + Rejects(31, 32), // a + Rejects(32, 34), // Á + Rejects(34, 37), // 각 + Rejects(37, 40), // ꁁ + Rejects(40, 43), // ก + Rejects(43, 47), // 😀 + Rejects(47, 48), // a + Done] + ); +} + +#[test] +fn test_forward_search_shared_bytes() { + search_asserts!(STRESS, 'Á', "Forward search for two-byte Latin character", + [next_match, next_match, next_match, next_match], + [InRange(0, 2), InRange(8, 10), InRange(32, 34), Done] + ); + + search_asserts!(STRESS, 'Á', "Forward search for two-byte Latin character; check if next() still works", + [next_match, next, next_match, next, next_match, next, next_match], + [InRange(0, 2), Rejects(2, 3), InRange(8, 10), Rejects(10, 13), InRange(32, 34), Rejects(34, 37), Done] + ); + + search_asserts!(STRESS, '각', "Forward search for three-byte Hangul character", + [next_match, next, next_match, next_match, next_match], + [InRange(19, 22), Rejects(22, 25), InRange(28, 31), InRange(34, 37), Done] + ); + + search_asserts!(STRESS, '각', "Forward search for three-byte Hangul character; check if next() still works", + [next_match, next, next_match, next, next_match, next, next_match], + [InRange(19, 22), Rejects(22, 25), InRange(28, 31), Rejects(31, 32), InRange(34, 37), Rejects(37, 40), Done] + ); + + search_asserts!(STRESS, 'ก', "Forward search for three-byte Thai character", + [next_match, next, next_match, next, next_match], + [InRange(22, 25), Rejects(25, 28), InRange(40, 43), Rejects(43, 47), Done] + ); + + search_asserts!(STRESS, 'ก', "Forward search for three-byte Thai character; check if next() still works", + [next_match, next, next_match, next, next_match], + [InRange(22, 25), Rejects(25, 28), InRange(40, 43), Rejects(43, 47), Done] + ); + + search_asserts!(STRESS, '😁', "Forward search for four-byte emoji", + [next_match, next, next_match, next, next_match], + [InRange(15, 19), Rejects(19, 22), InRange(43, 47), Rejects(47, 48), Done] + ); + + search_asserts!(STRESS, '😁', "Forward search for four-byte emoji; check if next() still works", + [next_match, next, next_match, next, next_match], + [InRange(15, 19), Rejects(19, 22), InRange(43, 47), Rejects(47, 48), Done] + ); + + search_asserts!(STRESS, 'ꁁ', "Forward search for three-byte Yi character with repeated bytes", + [next_match, next, next_match, next, next_match], + [InRange(10, 13), Rejects(13, 14), InRange(37, 40), Rejects(40, 43), Done] + ); + + search_asserts!(STRESS, 'ꁁ', "Forward search for three-byte Yi character with repeated bytes; check if next() still works", + [next_match, next, next_match, next, next_match], + [InRange(10, 13), Rejects(13, 14), InRange(37, 40), Rejects(40, 43), Done] + ); +} + +#[test] +fn test_reverse_search_shared_bytes() { + search_asserts!(STRESS, 'Á', "Reverse search for two-byte Latin character", + [next_match_back, next_match_back, next_match_back, next_match_back], + [InRange(32, 34), InRange(8, 10), InRange(0, 2), Done] + ); + + search_asserts!(STRESS, 'Á', "Reverse search for two-byte Latin character; check if next_back() still works", + [next_match_back, next_back, next_match_back, next_back, next_match_back, next_back], + [InRange(32, 34), Rejects(31, 32), InRange(8, 10), Rejects(7, 8), InRange(0, 2), Done] + ); + + search_asserts!(STRESS, '각', "Reverse search for three-byte Hangul character", + [next_match_back, next_back, next_match_back, next_match_back, next_match_back], + [InRange(34, 37), Rejects(32, 34), InRange(28, 31), InRange(19, 22), Done] + ); + + search_asserts!(STRESS, '각', "Reverse search for three-byte Hangul character; check if next_back() still works", + [next_match_back, next_back, next_match_back, next_back, next_match_back, next_back, next_match_back], + [InRange(34, 37), Rejects(32, 34), InRange(28, 31), Rejects(25, 28), InRange(19, 22), Rejects(15, 19), Done] + ); + + search_asserts!(STRESS, 'ก', "Reverse search for three-byte Thai character", + [next_match_back, next_back, next_match_back, next_back, next_match_back], + [InRange(40, 43), Rejects(37, 40), InRange(22, 25), Rejects(19, 22), Done] + ); + + search_asserts!(STRESS, 'ก', "Reverse search for three-byte Thai character; check if next_back() still works", + [next_match_back, next_back, next_match_back, next_back, next_match_back], + [InRange(40, 43), Rejects(37, 40), InRange(22, 25), Rejects(19, 22), Done] + ); + + search_asserts!(STRESS, '😁', "Reverse search for four-byte emoji", + [next_match_back, next_back, next_match_back, next_back, next_match_back], + [InRange(43, 47), Rejects(40, 43), InRange(15, 19), Rejects(14, 15), Done] + ); + + search_asserts!(STRESS, '😁', "Reverse search for four-byte emoji; check if next_back() still works", + [next_match_back, next_back, next_match_back, next_back, next_match_back], + [InRange(43, 47), Rejects(40, 43), InRange(15, 19), Rejects(14, 15), Done] + ); + + search_asserts!(STRESS, 'ꁁ', "Reverse search for three-byte Yi character with repeated bytes", + [next_match_back, next_back, next_match_back, next_back, next_match_back], + [InRange(37, 40), Rejects(34, 37), InRange(10, 13), Rejects(8, 10), Done] + ); + + search_asserts!(STRESS, 'ꁁ', "Reverse search for three-byte Yi character with repeated bytes; check if next_back() still works", + [next_match_back, next_back, next_match_back, next_back, next_match_back], + [InRange(37, 40), Rejects(34, 37), InRange(10, 13), Rejects(8, 10), Done] + ); +} |
