From 548bdbaa29e0855a87ac5eec073d83babb72d8f2 Mon Sep 17 00:00:00 2001
From: blake2-ppc <blake2-ppc>
Date: Sun, 18 Aug 2013 13:57:34 +0200
Subject: std::str: Bench test for char iterators

---
 src/libstd/str.rs | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

(limited to 'src/libstd')
diff --git a/src/libstd/str.rs b/src/libstd/str.rs
index a759b8cbd62..3d793bc8e77 100644
--- a/src/libstd/str.rs
+++ b/src/libstd/str.rs
@@ -3267,6 +3267,62 @@ mod tests {
 mod bench {
     use extra::test::BenchHarness;
     use super::*;
+    use prelude::*;
+
+    #[bench]
+    fn char_iterator(bh: &mut BenchHarness) {
+        let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
+        let len = s.char_len();
+
+        do bh.iter {
+            assert_eq!(s.iter().len(), len);
+        }
+    }
+
+    #[bench]
+    fn char_iterator_ascii(bh: &mut BenchHarness) {
+        let s = "Mary had a little lamb, Little lamb
+        Mary had a little lamb, Little lamb
+        Mary had a little lamb, Little lamb
+        Mary had a little lamb, Little lamb
+        Mary had a little lamb, Little lamb
+        Mary had a little lamb, Little lamb";
+        let len = s.char_len();
+
+        do bh.iter {
+            assert_eq!(s.iter().len(), len);
+        }
+    }
+
+    #[bench]
+    fn char_iterator_rev(bh: &mut BenchHarness) {
+        let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
+        let len = s.char_len();
+
+        do bh.iter {
+            assert_eq!(s.rev_iter().len(), len);
+        }
+    }
+
+    #[bench]
+    fn char_offset_iterator(bh: &mut BenchHarness) {
+        let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
+        let len = s.char_len();
+
+        do bh.iter {
+            assert_eq!(s.char_offset_iter().len(), len);
+        }
+    }
+
+    #[bench]
+    fn char_offset_iterator_rev(bh: &mut BenchHarness) {
+        let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
+        let len = s.char_len();
+
+        do bh.iter {
+            assert_eq!(s.char_offset_rev_iter().len(), len);
+        }
+    }
 
     #[bench]
     fn is_utf8_100_ascii(bh: &mut BenchHarness) {
-- 
cgit 1.4.1-3-g733a5


From 4043c70f23bbc883088634e9cf0c3224524a2c5c Mon Sep 17 00:00:00 2001
From: blake2-ppc <blake2-ppc>
Date: Sun, 18 Aug 2013 13:57:34 +0200
Subject: std::str: Small fix for slice

---
 src/libstd/str.rs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'src/libstd')

diff --git a/src/libstd/str.rs b/src/libstd/str.rs
index 3d793bc8e77..07006ba8c15 100644
--- a/src/libstd/str.rs
+++ b/src/libstd/str.rs
@@ -1366,8 +1366,7 @@ impl<'self> StrSlice<'self> for &'self str {
     /// beyond the last character of the string
     #[inline]
     fn slice(&self, begin: uint, end: uint) -> &'self str {
-        assert!(self.is_char_boundary(begin));
-        assert!(self.is_char_boundary(end));
+        assert!(self.is_char_boundary(begin) && self.is_char_boundary(end));
         unsafe { raw::slice_bytes(*self, begin, end) }
     }
 
@@ -1609,6 +1608,7 @@ impl<'self> StrSlice<'self> for &'self str {
 
     /// Returns false if the index points into the middle of a multi-byte
     /// character sequence.
+    #[inline]
     fn is_char_boundary(&self, index: uint) -> bool {
         if index == self.len() { return true; }
         let b = self[index];
@@ -1694,6 +1694,7 @@ impl<'self> StrSlice<'self> for &'self str {
     /// This function can be used to iterate over a unicode string in reverse.
     ///
     /// Returns 0 for next index if called on start index 0.
+    #[inline]
     fn char_range_at_reverse(&self, start: uint) -> CharRange {
         let mut prev = start;
 
-- 
cgit 1.4.1-3-g733a5


From 3cb5b8dc1849c5958c62caf990faf75fcec6b2ea Mon Sep 17 00:00:00 2001
From: blake2-ppc <blake2-ppc>
Date: Sun, 18 Aug 2013 13:57:34 +0200
Subject: std::str: Special case char_range_at_reverse so it is faster

Implement char_range_at_reverse similarly to char_range_at, instead of
re-using that method.
---
 src/libstd/str.rs | 34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

(limited to 'src/libstd')

diff --git a/src/libstd/str.rs b/src/libstd/str.rs
index 07006ba8c15..c5c2150617c 100644
--- a/src/libstd/str.rs
+++ b/src/libstd/str.rs
@@ -27,7 +27,7 @@ use iterator::{Iterator, FromIterator, Extendable};
 use iterator::{Filter, AdditiveIterator, Map};
 use iterator::{Invert, DoubleEndedIterator};
 use libc;
-use num::Zero;
+use num::{Saturating, Zero};
 use option::{None, Option, Some};
 use ptr;
 use ptr::RawPtr;
@@ -1698,21 +1698,29 @@ impl<'self> StrSlice<'self> for &'self str {
     fn char_range_at_reverse(&self, start: uint) -> CharRange {
         let mut prev = start;
 
-        // while there is a previous byte == 10......
-        while prev > 0u && self[prev - 1u] & 192u8 == TAG_CONT_U8 {
-            prev -= 1u;
-        }
+        prev = prev.saturating_sub(1);
+        if self[prev] < 128 { return CharRange{ch: self[prev] as char, next: prev} }
 
-        // now refer to the initial byte of previous char
-        if prev > 0u {
-            prev -= 1u;
-        } else {
-            prev = 0u;
-        }
+        // Multibyte case is a fn to allow char_range_at_reverse to inline cleanly
+        fn multibyte_char_range_at_rev(s: &str, mut i: uint) -> CharRange {
+            // while there is a previous byte == 10......
+            while i > 0 && s[i] & 192u8 == TAG_CONT_U8 {
+                i -= 1u;
+            }
+
+            let mut val = s[i] as uint;
+            let w = UTF8_CHAR_WIDTH[val] as uint;
+            assert!((w != 0));
 
+            val = utf8_first_byte!(val, w);
+            val = utf8_acc_cont_byte!(val, s[i + 1]);
+            if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
+            if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
+
+            return CharRange {ch: val as char, next: i};
+        }
 
-        let ch = self.char_at(prev);
-        return CharRange {ch:ch, next:prev};
+        return multibyte_char_range_at_rev(*self, prev);
     }
 
     /// Plucks the character ending at the `i`th byte of a string
-- 
cgit 1.4.1-3-g733a5


From 8a5889d2a2eb4b2c9d41f6f3991fdd2622933047 Mon Sep 17 00:00:00 2001
From: blake2-ppc <blake2-ppc>
Date: Sun, 18 Aug 2013 13:57:34 +0200
Subject: std::str: Add str::raw::slice_unchecked

Add a function like raw::slice_bytes that does not check slice
boundaries. It is meant for iterator use, where the begin and end
indices are always known to be in range.
---
 src/libstd/str.rs | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'src/libstd')

diff --git a/src/libstd/str.rs b/src/libstd/str.rs
index c5c2150617c..5022e558884 100644
--- a/src/libstd/str.rs
+++ b/src/libstd/str.rs
@@ -847,12 +847,21 @@ pub mod raw {
     /// If end is greater than the length of the string.
     #[inline]
     pub unsafe fn slice_bytes<'a>(s: &'a str, begin: uint, end: uint) -> &'a str {
-        do s.as_imm_buf |sbuf, n| {
-             assert!((begin <= end));
-             assert!((end <= n));
+        assert!(begin <= end);
+        assert!(end <= s.len());
+        slice_unchecked(s, begin, end)
+    }
 
+    /// Takes a bytewise (not UTF-8) slice from a string.
+    ///
+    /// Returns the substring from [`begin`..`end`).
+    ///
+    /// Caller must check slice boundaries!
+    #[inline]
+    pub unsafe fn slice_unchecked<'a>(s: &'a str, begin: uint, end: uint) -> &'a str {
+        do s.as_imm_buf |sbuf, _n| {
              cast::transmute(Slice {
-                 data: ptr::offset(sbuf, begin as int),
+                 data: sbuf.offset_inbounds(begin as int),
                  len: end - begin,
              })
         }
-- 
cgit 1.4.1-3-g733a5


From db3eb7291a3af1b88052f8ad87da79d62bd60b81 Mon Sep 17 00:00:00 2001
From: blake2-ppc <blake2-ppc>
Date: Sun, 18 Aug 2013 13:57:34 +0200
Subject: std::str: Implement CharIterator separately

Let CharIterator be a separate type from CharOffsetIterator (so that
CharIterator can be cloned, for example).

Implement CharOffsetIterator by using the same technique as the method
subslice_offset.
---
 src/libstd/str.rs | 103 +++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 68 insertions(+), 35 deletions(-)

(limited to 'src/libstd')

diff --git a/src/libstd/str.rs b/src/libstd/str.rs
index 5022e558884..ccb7349eefd 100644
--- a/src/libstd/str.rs
+++ b/src/libstd/str.rs
@@ -255,56 +255,94 @@ impl<'self, C: CharEq> CharEq for &'self [C] {
 Section: Iterators
 */
 
-/// External iterator for a string's characters and their byte offsets.
-/// Use with the `std::iterator` module.
+/// External iterator for a string's characters.
 #[deriving(Clone)]
-pub struct CharOffsetIterator<'self> {
-    priv index_front: uint,
-    priv index_back: uint,
+pub struct CharIterator<'self> {
     priv string: &'self str,
 }
 
-impl<'self> Iterator<(uint, char)> for CharOffsetIterator<'self> {
+impl<'self> Iterator<char> for CharIterator<'self> {
     #[inline]
-    fn next(&mut self) -> Option<(uint, char)> {
-        if self.index_front < self.index_back {
-            let CharRange {ch, next} = self.string.char_range_at(self.index_front);
-            let index = self.index_front;
-            self.index_front = next;
-            Some((index, ch))
+    fn next(&mut self) -> Option<char> {
+        if self.string.len() != 0 {
+            let CharRange {ch, next} = self.string.char_range_at(0);
+            unsafe {
+                self.string = raw::slice_unchecked(self.string, next, self.string.len());
+            }
+            Some(ch)
         } else {
             None
         }
     }
+
+    #[inline]
+    fn size_hint(&self) -> (uint, Option<uint>) {
+        (self.string.len().saturating_add(3)/4, Some(self.string.len()))
+    }
 }
 
-impl<'self> DoubleEndedIterator<(uint, char)> for CharOffsetIterator<'self> {
+impl<'self> DoubleEndedIterator<char> for CharIterator<'self> {
     #[inline]
-    fn next_back(&mut self) -> Option<(uint, char)> {
-        if self.index_front < self.index_back {
-            let CharRange {ch, next} = self.string.char_range_at_reverse(self.index_back);
-            self.index_back = next;
-            Some((next, ch))
+    fn next_back(&mut self) -> Option<char> {
+        if self.string.len() != 0 {
+            let CharRange {ch, next} = self.string.char_range_at_reverse(self.string.len());
+            unsafe {
+                self.string = raw::slice_unchecked(self.string, 0, next);
+            }
+            Some(ch)
         } else {
             None
         }
     }
 }
 
-/// External iterator for a string's characters and their byte offsets in reverse order.
-/// Use with the `std::iterator` module.
-pub type CharOffsetRevIterator<'self> =
-    Invert<CharOffsetIterator<'self>>;
 
-/// External iterator for a string's characters.
+/// External iterator for a string's characters and their byte offsets.
 /// Use with the `std::iterator` module.
-pub type CharIterator<'self> =
-    Map<'self, (uint, char), char, CharOffsetIterator<'self>>;
+#[deriving(Clone)]
+pub struct CharOffsetIterator<'self> {
+    priv string: &'self str,
+    priv iter: CharIterator<'self>,
+}
+
+impl<'self> Iterator<(uint, char)> for CharOffsetIterator<'self> {
+    #[inline]
+    fn next(&mut self) -> Option<(uint, char)> {
+        let offset = do self.string.as_imm_buf |a, _| {
+            do self.iter.string.as_imm_buf |b, _| {
+                b as uint - a as uint
+            }
+        };
+        self.iter.next().map_move(|ch| (offset, ch))
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (uint, Option<uint>) {
+        self.iter.size_hint()
+    }
+}
+
+impl<'self> DoubleEndedIterator<(uint, char)> for CharOffsetIterator<'self> {
+    #[inline]
+    fn next_back(&mut self) -> Option<(uint, char)> {
+        self.iter.next_back().map_move(|ch| {
+            let offset = do self.string.as_imm_buf |a, _| {
+                do self.iter.string.as_imm_buf |b, len| {
+                    b as uint - a as uint + len
+                }
+            };
+            (offset, ch)
+        })
+    }
+}
 
 /// External iterator for a string's characters in reverse order.
 /// Use with the `std::iterator` module.
-pub type CharRevIterator<'self> =
-    Invert<Map<'self, (uint, char), char, CharOffsetIterator<'self>>>;
+pub type CharRevIterator<'self> = Invert<CharIterator<'self>>;
+
+/// External iterator for a string's characters and their byte offsets in reverse order.
+/// Use with the `std::iterator` module.
+pub type CharOffsetRevIterator<'self> = Invert<CharOffsetIterator<'self>>;
 
 /// External iterator for a string's bytes.
 /// Use with the `std::iterator` module.
@@ -313,8 +351,7 @@ pub type ByteIterator<'self> =
 
 /// External iterator for a string's bytes in reverse order.
 /// Use with the `std::iterator` module.
-pub type ByteRevIterator<'self> =
-    Invert<Map<'self, &'self u8, u8, vec::VecIterator<'self, u8>>>;
+pub type ByteRevIterator<'self> = Invert<ByteIterator<'self>>;
 
 /// An iterator over the substrings of a string, separated by `sep`.
 #[deriving(Clone)]
@@ -1218,7 +1255,7 @@ impl<'self> StrSlice<'self> for &'self str {
     /// ~~~
     #[inline]
     fn iter(&self) -> CharIterator<'self> {
-        self.char_offset_iter().map(|(_, c)| c)
+        CharIterator{string: *self}
     }
 
     /// An iterator over the characters of `self`, in reverse order.
@@ -1242,11 +1279,7 @@ impl<'self> StrSlice<'self> for &'self str {
     /// An iterator over the characters of `self` and their byte offsets.
     #[inline]
     fn char_offset_iter(&self) -> CharOffsetIterator<'self> {
-        CharOffsetIterator {
-            index_front: 0,
-            index_back: self.len(),
-            string: *self
-        }
+        CharOffsetIterator{string: *self, iter: self.iter()}
     }
 
     /// An iterator over the characters of `self` and their byte offsets.
-- 
cgit 1.4.1-3-g733a5


From 595dd843d7e2e38c08b4e03b79a0531d32d778fb Mon Sep 17 00:00:00 2001
From: blake2-ppc <blake2-ppc>
Date: Sun, 18 Aug 2013 13:57:35 +0200
Subject: std::str: Use CharOffsetIterator in .find() and .rfind()

---
 src/libstd/str.rs | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'src/libstd')

diff --git a/src/libstd/str.rs b/src/libstd/str.rs
index ccb7349eefd..0becd8e722e 100644
--- a/src/libstd/str.rs
+++ b/src/libstd/str.rs
@@ -1790,10 +1790,8 @@ impl<'self> StrSlice<'self> for &'self str {
                 if search.matches(b as char) { return Some(i) }
             }
         } else {
-            let mut index = 0;
-            for c in self.iter() {
+            for (index, c) in self.char_offset_iter() {
                 if search.matches(c) { return Some(index); }
-                index += c.len_utf8_bytes();
             }
         }
 
@@ -1807,15 +1805,14 @@ impl<'self> StrSlice<'self> for &'self str {
     /// `Some` containing the byte index of the last matching character
     /// or `None` if there is no match
     fn rfind<C: CharEq>(&self, search: C) -> Option<uint> {
-        let mut index = self.len();
         if search.only_ascii() {
+            let mut index = self.len();
             for b in self.byte_rev_iter() {
                 index -= 1;
                 if search.matches(b as char) { return Some(index); }
             }
         } else {
-            for c in self.rev_iter() {
-                index -= c.len_utf8_bytes();
+            for (index, c) in self.char_offset_rev_iter() {
                 if search.matches(c) { return Some(index); }
             }
         }
-- 
cgit 1.4.1-3-g733a5


From f33a30e7e8f22a1e438dc5b30959bb80829ee505 Mon Sep 17 00:00:00 2001
From: blake2-ppc <blake2-ppc>
Date: Sun, 18 Aug 2013 13:57:35 +0200
Subject: std::str: Correct docstrings for lack of null terminator in ~str and
 &str

---
 src/libstd/str.rs | 37 +++++++++++++------------------------
 1 file changed, 13 insertions(+), 24 deletions(-)

(limited to 'src/libstd')

diff --git a/src/libstd/str.rs b/src/libstd/str.rs
index 0becd8e722e..8944d0b291e 100644
--- a/src/libstd/str.rs
+++ b/src/libstd/str.rs
@@ -8,13 +8,12 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
-/*!
- * String manipulation
- *
- * Strings are a packed UTF-8 representation of text, stored as null
- * terminated buffers of u8 bytes.  Strings should be indexed in bytes,
- * for efficiency, but UTF-8 unsafe operations should be avoided.
- */
+//! String manipulation
+//!
+//! Strings are a packed UTF-8 representation of text, stored as
+//! buffers of u8 bytes. The buffer is not null terminated.
+//! Strings should be indexed in bytes, for efficiency, but UTF-8 unsafe
+//! operations should be avoided.
 
 use at_vec;
 use cast;
@@ -1772,8 +1771,6 @@ impl<'self> StrSlice<'self> for &'self str {
     }
 
     /// Work with the byte buffer of a string as a byte slice.
-    ///
-    /// The byte slice does not include the null terminator.
     fn as_bytes(&self) -> &'self [u8] {
         unsafe { cast::transmute(*self) }
     }
@@ -1953,10 +1950,7 @@ impl<'self> StrSlice<'self> for &'self str {
 
     /// Work with the byte buffer and length of a slice.
     ///
-    /// The given length is one byte longer than the 'official' indexable
-    /// length of the string. This is to permit probing the byte past the
-    /// indexable area for a null byte, as is the case in slices pointing
-    /// to full strings, or suffixes of them.
+    /// The buffer does not have a null terminator.
     #[inline]
     fn as_imm_buf<T>(&self, f: &fn(*u8, uint) -> T) -> T {
         let v: &[u8] = unsafe { cast::transmute(*self) };
@@ -1979,12 +1973,10 @@ pub trait OwnedStr {
 
     /// Work with the mutable byte buffer and length of a slice.
     ///
-    /// The given length is one byte longer than the 'official' indexable
-    /// length of the string. This is to permit probing the byte past the
-    /// indexable area for a null byte, as is the case in slices pointing
-    /// to full strings, or suffixes of them.
+    /// The buffer does not have a null terminator.
     ///
-    /// Make sure any mutations to this buffer keep this string valid UTF8.
+    /// The caller must make sure any mutations to this buffer keep the string
+    /// valid UTF-8!
     fn as_mut_buf<T>(&mut self, f: &fn(*mut u8, uint) -> T) -> T;
 }
 
@@ -2085,12 +2077,10 @@ impl OwnedStr for ~str {
         new_str
     }
 
-    /// Reserves capacity for exactly `n` bytes in the given string, not including
-    /// the null terminator.
+    /// Reserves capacity for exactly `n` bytes in the given string.
     ///
     /// Assuming single-byte characters, the resulting string will be large
-    /// enough to hold a string of length `n`. To account for the null terminator,
-    /// the underlying buffer will have the size `n` + 1.
+    /// enough to hold a string of length `n`.
     ///
     /// If the capacity for `s` is already equal to or greater than the requested
     /// capacity, then no action is taken.
@@ -2110,8 +2100,7 @@ impl OwnedStr for ~str {
     /// Reserves capacity for at least `n` bytes in the given string.
     ///
     /// Assuming single-byte characters, the resulting string will be large
-    /// enough to hold a string of length `n`. To account for the null terminator,
-    /// the underlying buffer will have the size `n` + 1.
+    /// enough to hold a string of length `n`.
     ///
     /// This function will over-allocate in order to amortize the allocation costs
     /// in scenarios where the caller may need to repeatedly reserve additional
-- 
cgit 1.4.1-3-g733a5


From 8931ad9e52e4f23043eea9cc63039d7e5f1e1efc Mon Sep 17 00:00:00 2001
From: blake2-ppc <blake2-ppc>
Date: Sun, 18 Aug 2013 21:28:04 +0200
Subject: std::str: Only check char boundary for end index in .slice_to()

---
 src/libstd/str.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'src/libstd')

diff --git a/src/libstd/str.rs b/src/libstd/str.rs
index 8944d0b291e..df24d8b20e2 100644
--- a/src/libstd/str.rs
+++ b/src/libstd/str.rs
@@ -1427,7 +1427,8 @@ impl<'self> StrSlice<'self> for &'self str {
     /// out of bounds.
     #[inline]
     fn slice_to(&self, end: uint) -> &'self str {
-        self.slice(0, end)
+        assert!(self.is_char_boundary(end));
+        unsafe { raw::slice_bytes(*self, 0, end) }
     }
 
     /// Returns a slice of the string from the char range
-- 
cgit 1.4.1-3-g733a5


From 5eff3e1bd9d6ed2a58700d5cdde3266856f95271 Mon Sep 17 00:00:00 2001
From: blake2-ppc <blake2-ppc>
Date: Sun, 18 Aug 2013 22:15:47 +0200
Subject: std::str: Use CharOffsetIterator in slice_chars

---
 src/libstd/str.rs | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

(limited to 'src/libstd')

diff --git a/src/libstd/str.rs b/src/libstd/str.rs
index df24d8b20e2..93ece53bd20 100644
--- a/src/libstd/str.rs
+++ b/src/libstd/str.rs
@@ -1438,23 +1438,26 @@ impl<'self> StrSlice<'self> for &'self str {
     /// beyond the last character of the string.
     fn slice_chars(&self, begin: uint, end: uint) -> &'self str {
         assert!(begin <= end);
-        // not sure how to use the iterators for this nicely.
-        let mut position = 0;
         let mut count = 0;
-        let l = self.len();
-        while count < begin && position < l {
-            position = self.char_range_at(position).next;
+        let mut begin_byte = None;
+        let mut end_byte = None;
+
+        // This could be even more efficient by not decoding,
+        // only finding the char boundaries
+        for (idx, _) in self.char_offset_iter() {
+            if count == begin { begin_byte = Some(idx); }
+            if count == end { end_byte = Some(idx); break; }
             count += 1;
         }
-        if count < begin { fail!("Attempted to begin slice_chars beyond end of string") }
-        let start_byte = position;
-        while count < end && position < l {
-            position = self.char_range_at(position).next;
-            count += 1;
-        }
-        if count < end { fail!("Attempted to end slice_chars beyond end of string") }
+        // An index equal to the char count is the end of the string (never visited above)
+        if begin_byte.is_none() && count == begin { begin_byte = Some(self.len()) }
+        if end_byte.is_none() && count == end { end_byte = Some(self.len()) }
 
-        self.slice(start_byte, position)
+        match (begin_byte, end_byte) {
+            (None, _) => fail!("slice_chars: `begin` is beyond end of string"),
+            (_, None) => fail!("slice_chars: `end` is beyond end of string"),
+            (Some(a), Some(b)) => unsafe { raw::slice_bytes(*self, a, b) }
+        }
     }
 
     /// Returns true if `needle` is a prefix of the string.
-- 
cgit 1.4.1-3-g733a5


From 30ab96b27229000d3754e7dee64fc431b5105150 Mon Sep 17 00:00:00 2001
From: blake2-ppc <blake2-ppc>
Date: Mon, 19 Aug 2013 11:18:30 +0200
Subject: std::str: Improve comments for CharIterator

---
 src/libstd/str.rs | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'src/libstd')

diff --git a/src/libstd/str.rs b/src/libstd/str.rs
index 93ece53bd20..7fde1c9f03c 100644
--- a/src/libstd/str.rs
+++ b/src/libstd/str.rs
@@ -255,14 +255,18 @@ Section: Iterators
 */
 
 /// External iterator for a string's characters.
+/// Use with the `std::iterator` module.
 #[deriving(Clone)]
 pub struct CharIterator<'self> {
+    /// The slice remaining to be iterated
     priv string: &'self str,
 }
 
 impl<'self> Iterator<char> for CharIterator<'self> {
     #[inline]
     fn next(&mut self) -> Option<char> {
+        // Decode the next codepoint, then update
+        // the slice to be just the remaining part
         if self.string.len() != 0 {
             let CharRange {ch, next} = self.string.char_range_at(0);
             unsafe {
@@ -300,6 +304,7 @@ impl<'self> DoubleEndedIterator<char> for CharIterator<'self> {
 /// Use with the `std::iterator` module.
 #[deriving(Clone)]
 pub struct CharOffsetIterator<'self> {
+    /// The original string to be iterated
     priv string: &'self str,
     priv iter: CharIterator<'self>,
 }
@@ -307,6 +312,8 @@ pub struct CharOffsetIterator<'self> {
 impl<'self> Iterator<(uint, char)> for CharOffsetIterator<'self> {
     #[inline]
     fn next(&mut self) -> Option<(uint, char)> {
+        // Compute the byte offset by using the pointer offset between
+        // the original string slice and the iterator's remaining part
         let offset = do self.string.as_imm_buf |a, _| {
             do self.iter.string.as_imm_buf |b, _| {
                 b as uint - a as uint
@@ -1281,7 +1288,8 @@ impl<'self> StrSlice<'self> for &'self str {
         CharOffsetIterator{string: *self, iter: self.iter()}
     }
 
-    /// An iterator over the characters of `self` and their byte offsets.
+    /// An iterator over the characters of `self` and their byte offsets,
+    /// in reverse order.
     #[inline]
     fn char_offset_rev_iter(&self) -> CharOffsetRevIterator<'self> {
         self.char_offset_iter().invert()
-- 
cgit 1.4.1-3-g733a5


From 8fe83028870ac6ac48e99a38d2992bedc26ec0d7 Mon Sep 17 00:00:00 2001
From: blake2-ppc <blake2-ppc>
Date: Mon, 19 Aug 2013 15:34:48 +0200
Subject: std::str: Use iterators instead of while loops for CharSplitIterator

Embed an iterator in the CharSplitIterator struct. The choice of iterator
replaces the former bool `only_ascii`, so an enum holds either a byte
offset iterator or a char offset iterator.
---
 src/libstd/str.rs | 78 ++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 45 insertions(+), 33 deletions(-)

(limited to 'src/libstd')

diff --git a/src/libstd/str.rs b/src/libstd/str.rs
index 7fde1c9f03c..d8f723d9c78 100644
--- a/src/libstd/str.rs
+++ b/src/libstd/str.rs
@@ -23,7 +23,7 @@ use clone::Clone;
 use container::{Container, Mutable};
 use iter::Times;
 use iterator::{Iterator, FromIterator, Extendable};
-use iterator::{Filter, AdditiveIterator, Map};
+use iterator::{Filter, AdditiveIterator, Map, Enumerate};
 use iterator::{Invert, DoubleEndedIterator};
 use libc;
 use num::{Saturating, Zero};
@@ -359,9 +359,18 @@ pub type ByteIterator<'self> =
 /// Use with the `std::iterator` module.
 pub type ByteRevIterator<'self> = Invert<ByteIterator<'self>>;
 
+/// An iterator over byte index and either &u8 or char
+#[deriving(Clone)]
+enum OffsetIterator<'self> {
+    // use ByteIterator here when it can be cloned
+    ByteOffset(Enumerate<vec::VecIterator<'self, u8>>),
+    CharOffset(CharOffsetIterator<'self>),
+}
+
 /// An iterator over the substrings of a string, separated by `sep`.
 #[deriving(Clone)]
 pub struct CharSplitIterator<'self,Sep> {
+    priv iter: OffsetIterator<'self>,
     priv string: &'self str,
     priv position: uint,
     priv sep: Sep,
@@ -370,7 +379,6 @@ pub struct CharSplitIterator<'self,Sep> {
     /// Whether an empty string at the end is allowed
     priv allow_trailing_empty: bool,
     priv finished: bool,
-    priv only_ascii: bool
 }
 
 /// An iterator over the words of a string, separated by an sequence of whitespace
@@ -386,39 +394,39 @@ impl<'self, Sep: CharEq> Iterator<&'self str> for CharSplitIterator<'self, Sep>
     fn next(&mut self) -> Option<&'self str> {
         if self.finished { return None }
 
-        let l = self.string.len();
         let start = self.position;
-
-        if self.only_ascii {
-            // this gives a *huge* speed up for splitting on ASCII
-            // characters (e.g. '\n' or ' ')
-            while self.position < l && self.count > 0 {
-                let byte = self.string[self.position];
-
-                if self.sep.matches(byte as char) {
-                    let slice = unsafe { raw::slice_bytes(self.string, start, self.position) };
-                    self.position += 1;
-                    self.count -= 1;
-                    return Some(slice);
-                }
-                self.position += 1;
-            }
-        } else {
-            while self.position < l && self.count > 0 {
-                let CharRange {ch, next} = self.string.char_range_at(self.position);
-
-                if self.sep.matches(ch) {
-                    let slice = unsafe { raw::slice_bytes(self.string, start, self.position) };
-                    self.position = next;
-                    self.count -= 1;
-                    return Some(slice);
-                }
-                self.position = next;
+        let len = self.string.len();
+
+        if self.count > 0 {
+            match self.iter {
+                // this gives a *huge* speed up for splitting on ASCII
+                // characters (e.g. '\n' or ' ')
+                ByteOffset(ref mut iter) =>
+                    for (idx, &byte) in *iter {
+                        if self.sep.matches(byte as char) {
+                            self.position = idx + 1;
+                            self.count -= 1;
+                            return Some(unsafe {
+                                raw::slice_bytes(self.string, start, idx)
+                            })
+                        }
+                    },
+                CharOffset(ref mut iter) =>
+                    for (idx, ch) in *iter {
+                        if self.sep.matches(ch) {
+                            // skip over the separator
+                            self.position = self.string.char_range_at(idx).next;
+                            self.count -= 1;
+                            return Some(unsafe {
+                                raw::slice_bytes(self.string, start, idx)
+                            })
+                        }
+                    },
             }
         }
         self.finished = true;
-        if self.allow_trailing_empty || start < l {
-            Some(unsafe { raw::slice_bytes(self.string, start, l) })
+        if self.allow_trailing_empty || start < len {
+            Some(unsafe { raw::slice_bytes(self.string, start, len) })
         } else {
             None
         }
@@ -1327,15 +1335,19 @@ impl<'self> StrSlice<'self> for &'self str {
     #[inline]
     fn split_options_iter<Sep: CharEq>(&self, sep: Sep, count: uint, allow_trailing_empty: bool)
         -> CharSplitIterator<'self, Sep> {
-        let only_ascii = sep.only_ascii();
+        let iter = if sep.only_ascii() {
+            ByteOffset(self.as_bytes().iter().enumerate())
+        } else {
+            CharOffset(self.char_offset_iter())
+        };
         CharSplitIterator {
+            iter: iter,
             string: *self,
             position: 0,
             sep: sep,
             count: count,
             allow_trailing_empty: allow_trailing_empty,
             finished: false,
-            only_ascii: only_ascii
         }
     }
 
-- 
cgit 1.4.1-3-g733a5


From 93de60e511d15b61a490ed690dee15c923ff9538 Mon Sep 17 00:00:00 2001
From: blake2-ppc <blake2-ppc>
Date: Thu, 22 Aug 2013 00:35:16 +0200
Subject: std::str: Add test for CharIterator .clone()

---
 src/libstd/str.rs | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'src/libstd')

diff --git a/src/libstd/str.rs b/src/libstd/str.rs
index d8f723d9c78..690e1906ae2 100644
--- a/src/libstd/str.rs
+++ b/src/libstd/str.rs
@@ -3139,6 +3139,14 @@ mod tests {
         assert_eq!(pos, v.len());
     }
 
+    #[test]
+    fn test_iterator_clone() {
+        let s = "ศไทย中华Việt Nam";
+        let mut it = s.iter();
+        it.next();
+        assert!(it.zip(it.clone()).all(|(x,y)| x == y));
+    }
+
     #[test]
     fn test_byte_iterator() {
         let s = ~"ศไทย中华Việt Nam";
-- 
cgit 1.4.1-3-g733a5