Auto merge of #46735 - Manishearth:memchr-find, r=burntsushi

Use memchr for str::find(char) This is a 10x improvement for searching for characters. This also contains the patches from https://github.com/rust-lang/rust/pull/46713 . Feel free to land both separately or together. cc @mystor @alexcrichton r? @bluss fixes #46693
author: bors <bors@rust-lang.org> 2018-01-01 19:04:33 +0000
committer: bors <bors@rust-lang.org> 2018-01-01 19:04:33 +0000
commit: b65f0bedd2f22d9661ecb7092f07746dc2ccfb0d (patch)
tree: 142a5b0bdf6197e8a87847d881d9b8616494f3ff
parent: 5deba220d4c42b5313d7e71731ce5e8698866684 (diff)
parent: 5cf55165fae5c8538db5c00e252ad9ba42aaf246 (diff)
download: rust-b65f0bedd2f22d9661ecb7092f07746dc2ccfb0d.tar.gz
rust-b65f0bedd2f22d9661ecb7092f07746dc2ccfb0d.zip
3 files changed, 519 insertions, 84 deletions
diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs
index edb7bed4520..677c0ecc33d 100644
--- a/src/libcore/str/pattern.rs
+++ b/src/libcore/str/pattern.rs
@@ -19,6 +19,7 @@
 
 use cmp;
 use fmt;
+use slice::memchr;
 use usize;
 
 // Pattern
@@ -127,6 +128,11 @@ pub unsafe trait Searcher<'a> {
     fn next(&mut self) -> SearchStep;
 
     /// Find the next `Match` result. See `next()`
+    ///
+    /// Unlike next(), there is no guarantee that the returned ranges
+    /// of this and next_reject will overlap. This will return (start_match, end_match),
+    /// where start_match is the index of where the match begins, and end_match is
+    /// the index after the end of the match.
     #[inline]
     fn next_match(&mut self) -> Option<(usize, usize)> {
         loop {
@@ -138,7 +144,10 @@ pub unsafe trait Searcher<'a> {
         }
     }
 
-    /// Find the next `Reject` result. See `next()`
+    /// Find the next `Reject` result. See `next()` and `next_match()`
+    ///
+    /// Unlike next(), there is no guarantee that the returned ranges
+    /// of this and next_match will overlap.
     #[inline]
     fn next_reject(&mut self) -> Option<(usize, usize)> {
         loop {
@@ -234,62 +243,272 @@ pub unsafe trait ReverseSearcher<'a>: Searcher<'a> {
 /// `"[aa]a"` or `"a[aa]"`, depending from which side it is searched.
 pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {}
 
+
 /////////////////////////////////////////////////////////////////////////////
-// Impl for a CharEq wrapper
+// Impl for char
 /////////////////////////////////////////////////////////////////////////////
 
-#[doc(hidden)]
-trait CharEq {
-    fn matches(&mut self, c: char) -> bool;
-    fn only_ascii(&self) -> bool;
+/// Associated type for `<char as Pattern<'a>>::Searcher`.
+#[derive(Clone, Debug)]
+pub struct CharSearcher<'a> {
+    haystack: &'a str,
+    // safety invariant: `finger`/`finger_back` must be a valid utf8 byte index of `haystack`
+    // This invariant can be broken *within* next_match and next_match_back, however
+    // they must exit with fingers on valid code point boundaries.
+
+    /// `finger` is the current byte index of the forward search.
+    /// Imagine that it exists before the byte at its index, i.e.
+    /// haystack[finger] is the first byte of the slice we must inspect during
+    /// forward searching
+    finger: usize,
+    /// `finger_back` is the current byte index of the reverse search.
+    /// Imagine that it exists after the byte at its index, i.e.
+    /// haystack[finger_back - 1] is the last byte of the slice we must inspect during
+    /// forward searching (and thus the first byte to be inspected when calling next_back())
+    finger_back: usize,
+    /// The character being searched for
+    needle: char,
+
+    // safety invariant: `utf8_size` must be less than 5
+    /// The number of bytes `needle` takes up when encoded in utf8
+    utf8_size: usize,
+    /// A utf8 encoded copy of the `needle`
+    utf8_encoded: [u8; 4],
 }
 
-impl CharEq for char {
+unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
+    #[inline]
+    fn haystack(&self) -> &'a str {
+        self.haystack
+    }
+    #[inline]
+    fn next(&mut self) -> SearchStep {
+        let old_finger = self.finger;
+        let slice = unsafe { self.haystack.get_unchecked(old_finger..self.haystack.len()) };
+        let mut iter = slice.chars();
+        let old_len = iter.iter.len();
+        if let Some(ch) = iter.next() {
+            // add byte offset of current character
+            // without re-encoding as utf-8
+            self.finger += old_len - iter.iter.len();
+            if ch == self.needle {
+                SearchStep::Match(old_finger, self.finger)
+            } else {
+                SearchStep::Reject(old_finger, self.finger)
+            }
+        } else {
+            SearchStep::Done
+        }
+    }
     #[inline]
-    fn matches(&mut self, c: char) -> bool { *self == c }
+    fn next_match(&mut self) -> Option<(usize, usize)> {
+        loop {
+            // get the haystack after the last character found
+            let bytes = if let Some(slice) = self.haystack.as_bytes().get(self.finger..) {
+                slice
+            } else {
+                return None;
+            };
+            // the last byte of the utf8 encoded needle
+            let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
+            if let Some(index) = memchr::memchr(last_byte, bytes) {
+                // The new finger is the index of the byte we found,
+                // plus one, since we memchr'd for the last byte of the character.
+                //
+                // Note that this doesn't always give us a finger on a UTF8 boundary.
+                // If we *didn't* find our character
+                // we may have indexed to the non-last byte of a 3-byte or 4-byte character.
+                // We can't just skip to the next valid starting byte because a character like
+                // ꁁ (U+A041 YI SYLLABLE PA), utf-8 `EA 81 81` will have us always find
+                // the second byte when searching for the third.
+                //
+                // However, this is totally okay. While we have the invariant that
+                // self.finger is on a UTF8 boundary, this invariant is not relid upon
+                // within this method (it is relied upon in CharSearcher::next()).
+                //
+                // We only exit this method when we reach the end of the string, or if we
+                // find something. When we find something the `finger` will be set
+                // to a UTF8 boundary.
+                self.finger += index + 1;
+                if self.finger >= self.utf8_size {
+                    let found_char = self.finger - self.utf8_size;
+                    if let Some(slice) = self.haystack.as_bytes().get(found_char..self.finger) {
+                        if slice == &self.utf8_encoded[0..self.utf8_size] {
+                            return Some((found_char, self.finger));
+                        }
+                    }
+                }
+            } else {
+                // found nothing, exit
+                self.finger = self.haystack.len();
+                return None;
+            }
+        }
+    }
 
+    // let next_reject use the default implementation from the Searcher trait
+}
+
+unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
+    #[inline]
+    fn next_back(&mut self) -> SearchStep {
+        let old_finger = self.finger_back;
+        let slice = unsafe { self.haystack.slice_unchecked(0, old_finger) };
+        let mut iter = slice.chars();
+        let old_len = iter.iter.len();
+        if let Some(ch) = iter.next_back() {
+            // subtract byte offset of current character
+            // without re-encoding as utf-8
+            self.finger_back -= old_len - iter.iter.len();
+            if ch == self.needle {
+                SearchStep::Match(self.finger_back, old_finger)
+            } else {
+                SearchStep::Reject(self.finger_back, old_finger)
+            }
+        } else {
+            SearchStep::Done
+        }
+    }
     #[inline]
-    fn only_ascii(&self) -> bool { (*self as u32) < 128 }
+    fn next_match_back(&mut self) -> Option<(usize, usize)> {
+        let haystack = self.haystack.as_bytes();
+        loop {
+            // get the haystack up to but not including the last character searched
+            let bytes = if let Some(slice) = haystack.get(..self.finger_back) {
+                slice
+            } else {
+                return None;
+            };
+            // the last byte of the utf8 encoded needle
+            let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
+            if let Some(index) = memchr::memrchr(last_byte, bytes) {
+                // memrchr will return the index of the byte we wish to
+                // find. In case of an ASCII character, this is indeed
+                // were we wish our new finger to be ("after" the found
+                // char in the paradigm of reverse iteration). For
+                // multibyte chars we need to skip down by the number of more
+                // bytes they have than ASCII
+                let shift = self.utf8_size - 1;
+                if index >= shift {
+                    let found_char = index - shift;
+                    if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size)) {
+                        if slice == &self.utf8_encoded[0..self.utf8_size] {
+                            // move finger to before the character found (i.e. at its start index)
+                            self.finger_back = found_char;
+                            return Some((self.finger_back, self.finger_back + self.utf8_size));
+                        }
+                    }
+                }
+                // We can't use finger_back = index - size + 1 here. If we found the last char
+                // of a different-sized character (or the middle byte of a different character)
+                // we need to bump the finger_back down to `index`. This similarly makes
+                // `finger_back` have the potential to no longer be on a boundary,
+                // but this is OK since we only exit this function on a boundary
+                // or when the haystack has been searched completely.
+                //
+                // Unlike next_match this does not
+                // have the problem of repeated bytes in utf-8 because
+                // we're searching for the last byte, and we can only have
+                // found the last byte when searching in reverse.
+                self.finger_back = index;
+            } else {
+                self.finger_back = 0;
+                // found nothing, exit
+                return None;
+            }
+        }
+    }
+
+    // let next_reject_back use the default implementation from the Searcher trait
 }
 
-impl<F> CharEq for F where F: FnMut(char) -> bool {
+impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {}
+
+/// Searches for chars that are equal to a given char
+impl<'a> Pattern<'a> for char {
+    type Searcher = CharSearcher<'a>;
+
     #[inline]
-    fn matches(&mut self, c: char) -> bool { (*self)(c) }
+    fn into_searcher(self, haystack: &'a str) -> Self::Searcher {
+        let mut utf8_encoded = [0; 4];
+        self.encode_utf8(&mut utf8_encoded);
+        let utf8_size = self.len_utf8();
+        CharSearcher {
+            haystack,
+            finger: 0,
+            finger_back: haystack.len(),
+            needle: self,
+            utf8_size,
+            utf8_encoded
+        }
+    }
 
     #[inline]
-    fn only_ascii(&self) -> bool { false }
-}
+    fn is_contained_in(self, haystack: &'a str) -> bool {
+        if (self as u32) < 128 {
+            haystack.as_bytes().contains(&(self as u8))
+        } else {
+            let mut buffer = [0u8; 4];
+            self.encode_utf8(&mut buffer).is_contained_in(haystack)
+        }
+    }
 
-impl<'a> CharEq for &'a [char] {
     #[inline]
-    fn matches(&mut self, c: char) -> bool {
-        self.iter().any(|&m| { let mut m = m; m.matches(c) })
+    fn is_prefix_of(self, haystack: &'a str) -> bool {
+        if let Some(ch) = haystack.chars().next() {
+            self == ch
+        } else {
+            false
+        }
+    }
+
+    #[inline]
+    fn is_suffix_of(self, haystack: &'a str) -> bool where Self::Searcher: ReverseSearcher<'a>
+    {
+        if let Some(ch) = haystack.chars().next_back() {
+            self == ch
+        } else {
+            false
+        }
     }
+}
+
+/////////////////////////////////////////////////////////////////////////////
+// Impl for a MultiCharEq wrapper
+/////////////////////////////////////////////////////////////////////////////
+
+#[doc(hidden)]
+trait MultiCharEq {
+    fn matches(&mut self, c: char) -> bool;
+}
 
+impl<F> MultiCharEq for F where F: FnMut(char) -> bool {
     #[inline]
-    fn only_ascii(&self) -> bool {
-        self.iter().all(|m| m.only_ascii())
+    fn matches(&mut self, c: char) -> bool { (*self)(c) }
+}
+
+impl<'a> MultiCharEq for &'a [char] {
+    #[inline]
+    fn matches(&mut self, c: char) -> bool {
+        self.iter().any(|&m| { m == c })
     }
 }
 
-struct CharEqPattern<C: CharEq>(C);
+struct MultiCharEqPattern<C: MultiCharEq>(C);
 
 #[derive(Clone, Debug)]
-struct CharEqSearcher<'a, C: CharEq> {
+struct MultiCharEqSearcher<'a, C: MultiCharEq> {
     char_eq: C,
     haystack: &'a str,
     char_indices: super::CharIndices<'a>,
-    #[allow(dead_code)]
-    ascii_only: bool,
 }
 
-impl<'a, C: CharEq> Pattern<'a> for CharEqPattern<C> {
-    type Searcher = CharEqSearcher<'a, C>;
+impl<'a, C: MultiCharEq> Pattern<'a> for MultiCharEqPattern<C> {
+    type Searcher = MultiCharEqSearcher<'a, C>;
 
     #[inline]
-    fn into_searcher(self, haystack: &'a str) -> CharEqSearcher<'a, C> {
-        CharEqSearcher {
-            ascii_only: self.0.only_ascii(),
+    fn into_searcher(self, haystack: &'a str) -> MultiCharEqSearcher<'a, C> {
+        MultiCharEqSearcher {
             haystack,
             char_eq: self.0,
             char_indices: haystack.char_indices(),
@@ -297,7 +516,7 @@ impl<'a, C: CharEq> Pattern<'a> for CharEqPattern<C> {
     }
 }
 
-unsafe impl<'a, C: CharEq> Searcher<'a> for CharEqSearcher<'a, C> {
+unsafe impl<'a, C: MultiCharEq> Searcher<'a> for MultiCharEqSearcher<'a, C> {
     #[inline]
     fn haystack(&self) -> &'a str {
         self.haystack
@@ -322,7 +541,7 @@ unsafe impl<'a, C: CharEq> Searcher<'a> for CharEqSearcher<'a, C> {
     }
 }
 
-unsafe impl<'a, C: CharEq> ReverseSearcher<'a> for CharEqSearcher<'a, C> {
+unsafe impl<'a, C: MultiCharEq> ReverseSearcher<'a> for MultiCharEqSearcher<'a, C> {
     #[inline]
     fn next_back(&mut self) -> SearchStep {
         let s = &mut self.char_indices;
@@ -342,7 +561,7 @@ unsafe impl<'a, C: CharEq> ReverseSearcher<'a> for CharEqSearcher<'a, C> {
     }
 }
 
-impl<'a, C: CharEq> DoubleEndedSearcher<'a> for CharEqSearcher<'a, C> {}
+impl<'a, C: MultiCharEq> DoubleEndedSearcher<'a> for MultiCharEqSearcher<'a, C> {}
 
 /////////////////////////////////////////////////////////////////////////////
 
@@ -410,55 +629,6 @@ macro_rules! searcher_methods {
 }
 
 /////////////////////////////////////////////////////////////////////////////
-// Impl for char
-/////////////////////////////////////////////////////////////////////////////
-
-/// Associated type for `<char as Pattern<'a>>::Searcher`.
-#[derive(Clone, Debug)]
-pub struct CharSearcher<'a>(<CharEqPattern<char> as Pattern<'a>>::Searcher);
-
-unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
-    searcher_methods!(forward);
-}
-
-unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
-    searcher_methods!(reverse);
-}
-
-impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {}
-
-/// Searches for chars that are equal to a given char
-impl<'a> Pattern<'a> for char {
-    type Searcher = CharSearcher<'a>;
-
-    #[inline]
-    fn into_searcher(self, haystack: &'a str) -> Self::Searcher {
-        CharSearcher(CharEqPattern(self).into_searcher(haystack))
-    }
-
-    #[inline]
-    fn is_contained_in(self, haystack: &'a str) -> bool {
-        if (self as u32) < 128 {
-            haystack.as_bytes().contains(&(self as u8))
-        } else {
-            let mut buffer = [0u8; 4];
-            self.encode_utf8(&mut buffer).is_contained_in(haystack)
-        }
-    }
-
-    #[inline]
-    fn is_prefix_of(self, haystack: &'a str) -> bool {
-        CharEqPattern(self).is_prefix_of(haystack)
-    }
-
-    #[inline]
-    fn is_suffix_of(self, haystack: &'a str) -> bool where Self::Searcher: ReverseSearcher<'a>
-    {
-        CharEqPattern(self).is_suffix_of(haystack)
-    }
-}
-
-/////////////////////////////////////////////////////////////////////////////
 // Impl for &[char]
 /////////////////////////////////////////////////////////////////////////////
 
@@ -466,7 +636,7 @@ impl<'a> Pattern<'a> for char {
 
 /// Associated type for `<&[char] as Pattern<'a>>::Searcher`.
 #[derive(Clone, Debug)]
-pub struct CharSliceSearcher<'a, 'b>(<CharEqPattern<&'b [char]> as Pattern<'a>>::Searcher);
+pub struct CharSliceSearcher<'a, 'b>(<MultiCharEqPattern<&'b [char]> as Pattern<'a>>::Searcher);
 
 unsafe impl<'a, 'b> Searcher<'a> for CharSliceSearcher<'a, 'b> {
     searcher_methods!(forward);
@@ -480,7 +650,7 @@ impl<'a, 'b> DoubleEndedSearcher<'a> for CharSliceSearcher<'a, 'b> {}
 
 /// Searches for chars that are equal to any of the chars in the array
 impl<'a, 'b> Pattern<'a> for &'b [char] {
-    pattern_methods!(CharSliceSearcher<'a, 'b>, CharEqPattern, CharSliceSearcher);
+    pattern_methods!(CharSliceSearcher<'a, 'b>, MultiCharEqPattern, CharSliceSearcher);
 }
 
 /////////////////////////////////////////////////////////////////////////////
@@ -489,7 +659,7 @@ impl<'a, 'b> Pattern<'a> for &'b [char] {
 
 /// Associated type for `<F as Pattern<'a>>::Searcher`.
 #[derive(Clone)]
-pub struct CharPredicateSearcher<'a, F>(<CharEqPattern<F> as Pattern<'a>>::Searcher)
+pub struct CharPredicateSearcher<'a, F>(<MultiCharEqPattern<F> as Pattern<'a>>::Searcher)
     where F: FnMut(char) -> bool;
 
 impl<'a, F> fmt::Debug for CharPredicateSearcher<'a, F>
@@ -499,7 +669,6 @@ impl<'a, F> fmt::Debug for CharPredicateSearcher<'a, F>
         f.debug_struct("CharPredicateSearcher")
             .field("haystack", &self.0.haystack)
             .field("char_indices", &self.0.char_indices)
-            .field("ascii_only", &self.0.ascii_only)
             .finish()
     }
 }
@@ -520,7 +689,7 @@ impl<'a, F> DoubleEndedSearcher<'a> for CharPredicateSearcher<'a, F>
 
 /// Searches for chars that match the given predicate
 impl<'a, F> Pattern<'a> for F where F: FnMut(char) -> bool {
-    pattern_methods!(CharPredicateSearcher<'a, F>, CharEqPattern, CharPredicateSearcher);
+    pattern_methods!(CharPredicateSearcher<'a, F>, MultiCharEqPattern, CharPredicateSearcher);
 }
 
 /////////////////////////////////////////////////////////////////////////////
diff --git a/src/libcore/tests/lib.rs b/src/libcore/tests/lib.rs
index 0e445cdac35..c4b85b82981 100644
--- a/src/libcore/tests/lib.rs
+++ b/src/libcore/tests/lib.rs
@@ -28,6 +28,7 @@
 #![feature(iter_rfind)]
 #![feature(iter_rfold)]
 #![feature(nonzero)]
+#![feature(pattern)]
 #![feature(raw)]
 #![feature(refcell_replace_swap)]
 #![feature(sip_hash_13)]
@@ -61,6 +62,7 @@ mod nonzero;
 mod num;
 mod ops;
 mod option;
+mod pattern;
 mod ptr;
 mod result;
 mod slice;
diff --git a/src/libcore/tests/pattern.rs b/src/libcore/tests/pattern.rs
new file mode 100644
index 00000000000..d0fd15263b2
--- /dev/null
+++ b/src/libcore/tests/pattern.rs
@@ -0,0 +1,264 @@
+// Copyright 2017 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use std::str::pattern::*;
+
+// This macro makes it easier to write
+// tests that do a series of iterations
+macro_rules! search_asserts {
+    ($haystack:expr, $needle:expr, $testname:expr, [$($func:ident),*], $result:expr) => {
+        let mut searcher = $needle.into_searcher($haystack);
+        let arr = [$( Step::from(searcher.$func()) ),+];
+        assert_eq!(&arr[..], &$result, $testname);
+    }
+}
+
+/// Combined enum for the results of next() and next_match()/next_reject()
+#[derive(Debug, PartialEq, Eq)]
+enum Step {
+    // variant names purposely chosen to
+    // be the same length for easy alignment
+    Matches(usize, usize),
+    Rejects(usize, usize),
+    InRange(usize, usize),
+    Done
+}
+
+use self::Step::*;
+
+impl From<SearchStep> for Step {
+    fn from(x: SearchStep) -> Self {
+        match x {
+            SearchStep::Match(a, b) => Matches(a, b),
+            SearchStep::Reject(a, b) => Rejects(a, b),
+            SearchStep::Done => Done
+        }
+    }
+}
+
+impl From<Option<(usize, usize)>> for Step {
+    fn from(x: Option<(usize, usize)>) -> Self {
+        match x {
+            Some((a, b)) => InRange(a, b),
+            None => Done
+        }
+    }
+}
+
+// ignore-tidy-linelength
+
+// FIXME(Manishearth) these tests focus on single-character searching  (CharSearcher)
+// and on next()/next_match(), not next_reject(). This is because
+// the memchr changes make next_match() for single chars complex, but next_reject()
+// continues to use next() under the hood. We should add more test cases for all
+// of these, as well as tests for StrSearcher and higher level tests for str::find() (etc)
+
+#[test]
+fn test_simple_iteration() {
+    search_asserts! ("abcdeabcd", 'a', "forward iteration for ASCII string",
+        // a            b              c              d              e              a              b              c              d              EOF
+        [next,          next,          next,          next,          next,          next,          next,          next,          next,          next],
+        [Matches(0, 1), Rejects(1, 2), Rejects(2, 3), Rejects(3, 4), Rejects(4, 5), Matches(5, 6), Rejects(6, 7), Rejects(7, 8), Rejects(8, 9), Done]
+    );
+
+    search_asserts! ("abcdeabcd", 'a', "reverse iteration for ASCII string",
+        // d            c              b              a            e                d              c              b              a             EOF
+        [next_back,     next_back,     next_back,     next_back,     next_back,     next_back,     next_back,     next_back,     next_back,     next_back],
+        [Rejects(8, 9), Rejects(7, 8), Rejects(6, 7), Matches(5, 6), Rejects(4, 5), Rejects(3, 4), Rejects(2, 3), Rejects(1, 2), Matches(0, 1), Done]
+    );
+
+    search_asserts! ("我爱我的猫", '我', "forward iteration for Chinese string",
+        // 我           愛             我             的              貓               EOF
+        [next,          next,          next,          next,           next,            next],
+        [Matches(0, 3), Rejects(3, 6), Matches(6, 9), Rejects(9, 12), Rejects(12, 15), Done]
+    );
+
+    search_asserts! ("我的猫说meow", 'm', "forward iteration for mixed string",
+        // 我           的             猫             说              m                e                o                w                EOF
+        [next,          next,          next,          next,           next,            next,            next,            next,            next],
+        [Rejects(0, 3), Rejects(3, 6), Rejects(6, 9), Rejects(9, 12), Matches(12, 13), Rejects(13, 14), Rejects(14, 15), Rejects(15, 16), Done]
+    );
+
+    search_asserts! ("我的猫说meow", '猫', "reverse iteration for mixed string",
+        // w             o                 e                m                说              猫             的             我             EOF
+        [next_back,       next_back,       next_back,       next_back,       next_back,      next_back,      next_back,    next_back,     next_back],
+        [Rejects(15, 16), Rejects(14, 15), Rejects(13, 14), Rejects(12, 13), Rejects(9, 12), Matches(6, 9), Rejects(3, 6), Rejects(0, 3), Done]
+    );
+}
+
+#[test]
+fn test_simple_search() {
+    search_asserts!("abcdeabcdeabcde", 'a', "next_match for ASCII string",
+        [next_match,    next_match,    next_match,      next_match],
+        [InRange(0, 1), InRange(5, 6), InRange(10, 11), Done]
+    );
+
+    search_asserts!("abcdeabcdeabcde", 'a', "next_match_back for ASCII string",
+        [next_match_back, next_match_back, next_match_back, next_match_back],
+        [InRange(10, 11), InRange(5, 6),   InRange(0, 1),   Done]
+    );
+
+    search_asserts!("abcdeab", 'a', "next_reject for ASCII string",
+        [next_reject,   next_reject,   next_match,    next_reject,   next_reject],
+        [InRange(1, 2), InRange(2, 3), InRange(5, 6), InRange(6, 7), Done]
+    );
+
+    search_asserts!("abcdeabcdeabcde", 'a', "next_reject_back for ASCII string",
+        [next_reject_back, next_reject_back, next_match_back, next_reject_back, next_reject_back, next_reject_back],
+        [InRange(14, 15),  InRange(13, 14),  InRange(10, 11), InRange(9, 10),   InRange(8, 9),    InRange(7, 8)]
+    );
+}
+
+// Á, 각, ก, 😀 all end in 0x81
+// 🁀, ᘀ do not end in 0x81 but contain the byte
+// ꁁ has 0x81 as its second and third bytes.
+//
+// The memchr-using implementation of next_match
+// and next_match_back temporarily violate
+// the property that the search is always on a unicode boundary,
+// which is fine as long as this never reaches next() or next_back().
+// So we test if next() is correct after each next_match() as well.
+const STRESS: &str = "Áa🁀bÁꁁfg😁각กᘀ각aÁ각ꁁก😁a";
+
+#[test]
+fn test_stress_indices() {
+    // this isn't really a test, more of documentation on the indices of each character in the stresstest string
+
+    search_asserts!(STRESS, 'x', "Indices of characters in stress test",
+        [next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next],
+        [Rejects(0, 2), // Á
+         Rejects(2, 3), // a
+         Rejects(3, 7), // 🁀
+         Rejects(7, 8), // b
+         Rejects(8, 10), // Á
+         Rejects(10, 13), // ꁁ
+         Rejects(13, 14), // f
+         Rejects(14, 15), // g
+         Rejects(15, 19), // 😀
+         Rejects(19, 22), // 각
+         Rejects(22, 25), // ก
+         Rejects(25, 28), // ᘀ
+         Rejects(28, 31), // 각
+         Rejects(31, 32), // a
+         Rejects(32, 34), // Á
+         Rejects(34, 37), // 각
+         Rejects(37, 40), // ꁁ
+         Rejects(40, 43), // ก
+         Rejects(43, 47), // 😀
+         Rejects(47, 48), // a
+         Done]
+    );
+}
+
+#[test]
+fn test_forward_search_shared_bytes() {
+    search_asserts!(STRESS, 'Á', "Forward search for two-byte Latin character",
+        [next_match,    next_match,     next_match,      next_match],
+        [InRange(0, 2), InRange(8, 10), InRange(32, 34), Done]
+    );
+
+    search_asserts!(STRESS, 'Á', "Forward search for two-byte Latin character; check if next() still works",
+        [next_match,    next,          next_match,     next,             next_match,     next,            next_match],
+        [InRange(0, 2), Rejects(2, 3), InRange(8, 10), Rejects(10, 13), InRange(32, 34), Rejects(34, 37), Done]
+    );
+
+    search_asserts!(STRESS, '각', "Forward search for three-byte Hangul character",
+        [next_match,      next,            next_match,      next_match,      next_match],
+        [InRange(19, 22), Rejects(22, 25), InRange(28, 31), InRange(34, 37), Done]
+    );
+
+    search_asserts!(STRESS, '각', "Forward search for three-byte Hangul character; check if next() still works",
+        [next_match,      next,            next_match,      next,            next_match,      next,            next_match],
+        [InRange(19, 22), Rejects(22, 25), InRange(28, 31), Rejects(31, 32), InRange(34, 37), Rejects(37, 40), Done]
+    );
+
+    search_asserts!(STRESS, 'ก', "Forward search for three-byte Thai character",
+        [next_match,      next,            next_match,      next,            next_match],
+        [InRange(22, 25), Rejects(25, 28), InRange(40, 43), Rejects(43, 47), Done]
+    );
+
+    search_asserts!(STRESS, 'ก', "Forward search for three-byte Thai character; check if next() still works",
+        [next_match,      next,            next_match,      next,            next_match],
+        [InRange(22, 25), Rejects(25, 28), InRange(40, 43), Rejects(43, 47), Done]
+    );
+
+    search_asserts!(STRESS, '😁', "Forward search for four-byte emoji",
+        [next_match,      next,            next_match,      next,            next_match],
+        [InRange(15, 19), Rejects(19, 22), InRange(43, 47), Rejects(47, 48), Done]
+    );
+
+    search_asserts!(STRESS, '😁', "Forward search for four-byte emoji; check if next() still works",
+        [next_match,      next,            next_match,      next,            next_match],
+        [InRange(15, 19), Rejects(19, 22), InRange(43, 47), Rejects(47, 48), Done]
+    );
+
+    search_asserts!(STRESS, 'ꁁ', "Forward search for three-byte Yi character with repeated bytes",
+        [next_match,      next,            next_match,      next,            next_match],
+        [InRange(10, 13), Rejects(13, 14), InRange(37, 40), Rejects(40, 43), Done]
+    );
+
+    search_asserts!(STRESS, 'ꁁ', "Forward search for three-byte Yi character with repeated bytes; check if next() still works",
+        [next_match,      next,            next_match,      next,            next_match],
+        [InRange(10, 13), Rejects(13, 14), InRange(37, 40), Rejects(40, 43), Done]
+    );
+}
+
+#[test]
+fn test_reverse_search_shared_bytes() {
+    search_asserts!(STRESS, 'Á', "Reverse search for two-byte Latin character",
+        [next_match_back, next_match_back, next_match_back, next_match_back],
+        [InRange(32, 34), InRange(8, 10),  InRange(0, 2),   Done]
+    );
+
+    search_asserts!(STRESS, 'Á', "Reverse search for two-byte Latin character; check if next_back() still works",
+        [next_match_back, next_back,       next_match_back, next_back,     next_match_back, next_back],
+        [InRange(32, 34), Rejects(31, 32), InRange(8, 10),  Rejects(7, 8), InRange(0, 2),   Done]
+    );
+
+    search_asserts!(STRESS, '각', "Reverse search for three-byte Hangul character",
+        [next_match_back, next_back,        next_match_back, next_match_back, next_match_back],
+        [InRange(34, 37), Rejects(32, 34), InRange(28, 31),  InRange(19, 22), Done]
+    );
+
+    search_asserts!(STRESS, '각', "Reverse search for three-byte Hangul character; check if next_back() still works",
+        [next_match_back, next_back,       next_match_back, next_back,       next_match_back, next_back,       next_match_back],
+        [InRange(34, 37), Rejects(32, 34), InRange(28, 31), Rejects(25, 28), InRange(19, 22), Rejects(15, 19), Done]
+    );
+
+    search_asserts!(STRESS, 'ก', "Reverse search for three-byte Thai character",
+        [next_match_back, next_back,       next_match_back, next_back,       next_match_back],
+        [InRange(40, 43), Rejects(37, 40), InRange(22, 25), Rejects(19, 22), Done]
+    );
+
+    search_asserts!(STRESS, 'ก', "Reverse search for three-byte Thai character; check if next_back() still works",
+        [next_match_back, next_back,       next_match_back, next_back,       next_match_back],
+        [InRange(40, 43), Rejects(37, 40), InRange(22, 25), Rejects(19, 22), Done]
+    );
+
+    search_asserts!(STRESS, '😁', "Reverse search for four-byte emoji",
+        [next_match_back, next_back,       next_match_back, next_back,       next_match_back],
+        [InRange(43, 47), Rejects(40, 43), InRange(15, 19), Rejects(14, 15), Done]
+    );
+
+    search_asserts!(STRESS, '😁', "Reverse search for four-byte emoji; check if next_back() still works",
+        [next_match_back, next_back,       next_match_back, next_back,       next_match_back],
+        [InRange(43, 47), Rejects(40, 43), InRange(15, 19), Rejects(14, 15), Done]
+    );
+
+    search_asserts!(STRESS, 'ꁁ', "Reverse search for three-byte Yi character with repeated bytes",
+        [next_match_back, next_back,       next_match_back, next_back,      next_match_back],
+        [InRange(37, 40), Rejects(34, 37), InRange(10, 13), Rejects(8, 10), Done]
+    );
+
+    search_asserts!(STRESS, 'ꁁ', "Reverse search for three-byte Yi character with repeated bytes; check if next_back() still works",
+        [next_match_back, next_back,       next_match_back, next_back,      next_match_back],
+        [InRange(37, 40), Rejects(34, 37), InRange(10, 13), Rejects(8, 10), Done]
+    );
+}
author	bors <bors@rust-lang.org>	2018-01-01 19:04:33 +0000
committer	bors <bors@rust-lang.org>	2018-01-01 19:04:33 +0000
commit	b65f0bedd2f22d9661ecb7092f07746dc2ccfb0d (patch)
tree	142a5b0bdf6197e8a87847d881d9b8616494f3ff
parent	5deba220d4c42b5313d7e71731ce5e8698866684 (diff)
parent	5cf55165fae5c8538db5c00e252ad9ba42aaf246 (diff)
download	rust-b65f0bedd2f22d9661ecb7092f07746dc2ccfb0d.tar.gz rust-b65f0bedd2f22d9661ecb7092f07746dc2ccfb0d.zip