From 548bdbaa29e0855a87ac5eec073d83babb72d8f2 Mon Sep 17 00:00:00 2001 From: blake2-ppc Date: Sun, 18 Aug 2013 13:57:34 +0200 Subject: std::str: Bench test for char iterators --- src/libstd/str.rs | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'src/libstd') diff --git a/src/libstd/str.rs b/src/libstd/str.rs index a759b8cbd62..3d793bc8e77 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -3267,6 +3267,62 @@ mod tests { mod bench { use extra::test::BenchHarness; use super::*; + use prelude::*; + + #[bench] + fn char_iterator(bh: &mut BenchHarness) { + let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb"; + let len = s.char_len(); + + do bh.iter { + assert_eq!(s.iter().len(), len); + } + } + + #[bench] + fn char_iterator_ascii(bh: &mut BenchHarness) { + let s = "Mary had a little lamb, Little lamb + Mary had a little lamb, Little lamb + Mary had a little lamb, Little lamb + Mary had a little lamb, Little lamb + Mary had a little lamb, Little lamb + Mary had a little lamb, Little lamb"; + let len = s.char_len(); + + do bh.iter { + assert_eq!(s.iter().len(), len); + } + } + + #[bench] + fn char_iterator_rev(bh: &mut BenchHarness) { + let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb"; + let len = s.char_len(); + + do bh.iter { + assert_eq!(s.rev_iter().len(), len); + } + } + + #[bench] + fn char_offset_iterator(bh: &mut BenchHarness) { + let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb"; + let len = s.char_len(); + + do bh.iter { + assert_eq!(s.char_offset_iter().len(), len); + } + } + + #[bench] + fn char_offset_iterator_rev(bh: &mut BenchHarness) { + let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb"; + let len = s.char_len(); + + do bh.iter { + assert_eq!(s.char_offset_rev_iter().len(), len); + } + } #[bench] fn is_utf8_100_ascii(bh: &mut BenchHarness) { -- cgit 1.4.1-3-g733a5 From 4043c70f23bbc883088634e9cf0c3224524a2c5c Mon Sep 17 00:00:00 2001 From: 
blake2-ppc Date: Sun, 18 Aug 2013 13:57:34 +0200 Subject: std::str: Small fix for slice --- src/libstd/str.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'src/libstd') diff --git a/src/libstd/str.rs b/src/libstd/str.rs index 3d793bc8e77..07006ba8c15 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -1366,8 +1366,7 @@ impl<'self> StrSlice<'self> for &'self str { /// beyond the last character of the string #[inline] fn slice(&self, begin: uint, end: uint) -> &'self str { - assert!(self.is_char_boundary(begin)); - assert!(self.is_char_boundary(end)); + assert!(self.is_char_boundary(begin) && self.is_char_boundary(end)); unsafe { raw::slice_bytes(*self, begin, end) } } @@ -1609,6 +1608,7 @@ impl<'self> StrSlice<'self> for &'self str { /// Returns false if the index points into the middle of a multi-byte /// character sequence. + #[inline] fn is_char_boundary(&self, index: uint) -> bool { if index == self.len() { return true; } let b = self[index]; @@ -1694,6 +1694,7 @@ impl<'self> StrSlice<'self> for &'self str { /// This function can be used to iterate over a unicode string in reverse. /// /// Returns 0 for next index if called on start index 0. + #[inline] fn char_range_at_reverse(&self, start: uint) -> CharRange { let mut prev = start; -- cgit 1.4.1-3-g733a5 From 3cb5b8dc1849c5958c62caf990faf75fcec6b2ea Mon Sep 17 00:00:00 2001 From: blake2-ppc Date: Sun, 18 Aug 2013 13:57:34 +0200 Subject: std::str: Special case char_range_at_reverse so it is faster Implement char_range_at_reverse similarly to char_range_at, instead of re-using that method. 
--- src/libstd/str.rs | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'src/libstd') diff --git a/src/libstd/str.rs b/src/libstd/str.rs index 07006ba8c15..c5c2150617c 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -27,7 +27,7 @@ use iterator::{Iterator, FromIterator, Extendable}; use iterator::{Filter, AdditiveIterator, Map}; use iterator::{Invert, DoubleEndedIterator}; use libc; -use num::Zero; +use num::{Saturating, Zero}; use option::{None, Option, Some}; use ptr; use ptr::RawPtr; @@ -1698,21 +1698,29 @@ impl<'self> StrSlice<'self> for &'self str { fn char_range_at_reverse(&self, start: uint) -> CharRange { let mut prev = start; - // while there is a previous byte == 10...... - while prev > 0u && self[prev - 1u] & 192u8 == TAG_CONT_U8 { - prev -= 1u; - } + prev = prev.saturating_sub(1); + if self[prev] < 128 { return CharRange{ch: self[prev] as char, next: prev} } - // now refer to the initial byte of previous char - if prev > 0u { - prev -= 1u; - } else { - prev = 0u; - } + // Multibyte case is a fn to allow char_range_at_reverse to inline cleanly + fn multibyte_char_range_at_rev(s: &str, mut i: uint) -> CharRange { + // while there is a previous byte == 10...... 
+ while i > 0 && s[i] & 192u8 == TAG_CONT_U8 { + i -= 1u; + } + + let mut val = s[i] as uint; + let w = UTF8_CHAR_WIDTH[val] as uint; + assert!((w != 0)); + val = utf8_first_byte!(val, w); + val = utf8_acc_cont_byte!(val, s[i + 1]); + if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); } + if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); } + + return CharRange {ch: val as char, next: i}; + } - let ch = self.char_at(prev); - return CharRange {ch:ch, next:prev}; + return multibyte_char_range_at_rev(*self, prev); } /// Plucks the character ending at the `i`th byte of a string -- cgit 1.4.1-3-g733a5 From 8a5889d2a2eb4b2c9d41f6f3991fdd2622933047 Mon Sep 17 00:00:00 2001 From: blake2-ppc Date: Sun, 18 Aug 2013 13:57:34 +0200 Subject: std::str: Add str::raw::slice_unchecked Add a function like raw::slice_bytes, but it doesn't check slice boundaries. For iterator use where we always know the begin, end indices are in range. --- src/libstd/str.rs | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'src/libstd') diff --git a/src/libstd/str.rs b/src/libstd/str.rs index c5c2150617c..5022e558884 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -847,12 +847,21 @@ pub mod raw { /// If end is greater than the length of the string. #[inline] pub unsafe fn slice_bytes<'a>(s: &'a str, begin: uint, end: uint) -> &'a str { - do s.as_imm_buf |sbuf, n| { - assert!((begin <= end)); - assert!((end <= n)); + assert!(begin <= end); + assert!(end <= s.len()); + slice_unchecked(s, begin, end) + } + /// Takes a bytewise (not UTF-8) slice from a string. + /// + /// Returns the substring from [`begin`..`end`). + /// + /// Caller must check slice boundaries! 
+ #[inline] + pub unsafe fn slice_unchecked<'a>(s: &'a str, begin: uint, end: uint) -> &'a str { + do s.as_imm_buf |sbuf, _n| { cast::transmute(Slice { - data: ptr::offset(sbuf, begin as int), + data: sbuf.offset_inbounds(begin as int), len: end - begin, }) } -- cgit 1.4.1-3-g733a5 From db3eb7291a3af1b88052f8ad87da79d62bd60b81 Mon Sep 17 00:00:00 2001 From: blake2-ppc Date: Sun, 18 Aug 2013 13:57:34 +0200 Subject: std::str: Implement CharIterator separately Let CharIterator be a separate type from CharOffsetIterator (so that CharIterator can be cloned, for example). Implement CharOffsetIterator by using the same technique as the method subslice_offset. --- src/libstd/str.rs | 103 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 68 insertions(+), 35 deletions(-) (limited to 'src/libstd') diff --git a/src/libstd/str.rs b/src/libstd/str.rs index 5022e558884..ccb7349eefd 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -255,56 +255,94 @@ impl<'self, C: CharEq> CharEq for &'self [C] { Section: Iterators */ -/// External iterator for a string's characters and their byte offsets. -/// Use with the `std::iterator` module. +/// External iterator for a string's characters. 
#[deriving(Clone)] -pub struct CharOffsetIterator<'self> { - priv index_front: uint, - priv index_back: uint, +pub struct CharIterator<'self> { priv string: &'self str, } -impl<'self> Iterator<(uint, char)> for CharOffsetIterator<'self> { +impl<'self> Iterator<char> for CharIterator<'self> { #[inline] - fn next(&mut self) -> Option<(uint, char)> { - if self.index_front < self.index_back { - let CharRange {ch, next} = self.string.char_range_at(self.index_front); - let index = self.index_front; - self.index_front = next; - Some((index, ch)) + fn next(&mut self) -> Option<char> { + if self.string.len() != 0 { + let CharRange {ch, next} = self.string.char_range_at(0); + unsafe { + self.string = raw::slice_unchecked(self.string, next, self.string.len()); + } + Some(ch) } else { None } } + + #[inline] + fn size_hint(&self) -> (uint, Option<uint>) { + (self.string.len().saturating_add(3)/4, Some(self.string.len())) + } } -impl<'self> DoubleEndedIterator<(uint, char)> for CharOffsetIterator<'self> { +impl<'self> DoubleEndedIterator<char> for CharIterator<'self> { #[inline] - fn next_back(&mut self) -> Option<(uint, char)> { - if self.index_front < self.index_back { - let CharRange {ch, next} = self.string.char_range_at_reverse(self.index_back); - self.index_back = next; - Some((next, ch)) + fn next_back(&mut self) -> Option<char> { + if self.string.len() != 0 { + let CharRange {ch, next} = self.string.char_range_at_reverse(self.string.len()); + unsafe { + self.string = raw::slice_unchecked(self.string, 0, next); + } + Some(ch) } else { None } } } -/// External iterator for a string's characters and their byte offsets in reverse order. -/// Use with the `std::iterator` module. -pub type CharOffsetRevIterator<'self> = - Invert<CharOffsetIterator<'self>>; -/// External iterator for a string's characters. +/// External iterator for a string's characters and their byte offsets. /// Use with the `std::iterator` module.
-pub type CharIterator<'self> = - Map<'self, (uint, char), char, CharOffsetIterator<'self>>; +#[deriving(Clone)] +pub struct CharOffsetIterator<'self> { + priv string: &'self str, + priv iter: CharIterator<'self>, +} + +impl<'self> Iterator<(uint, char)> for CharOffsetIterator<'self> { + #[inline] + fn next(&mut self) -> Option<(uint, char)> { + let offset = do self.string.as_imm_buf |a, _| { + do self.iter.string.as_imm_buf |b, _| { + b as uint - a as uint + } + }; + self.iter.next().map_move(|ch| (offset, ch)) + } + + #[inline] + fn size_hint(&self) -> (uint, Option<uint>) { + self.iter.size_hint() + } +} + +impl<'self> DoubleEndedIterator<(uint, char)> for CharOffsetIterator<'self> { + #[inline] + fn next_back(&mut self) -> Option<(uint, char)> { + self.iter.next_back().map_move(|ch| { + let offset = do self.string.as_imm_buf |a, _| { + do self.iter.string.as_imm_buf |b, len| { + b as uint - a as uint + len + } + }; + (offset, ch) + }) + } +} /// External iterator for a string's characters in reverse order. /// Use with the `std::iterator` module. -pub type CharRevIterator<'self> = - Invert<Map<'self, (uint, char), char, CharOffsetIterator<'self>>>; +pub type CharRevIterator<'self> = Invert<CharIterator<'self>>; + +/// External iterator for a string's characters and their byte offsets in reverse order. +/// Use with the `std::iterator` module. +pub type CharOffsetRevIterator<'self> = Invert<CharOffsetIterator<'self>>; /// External iterator for a string's bytes. /// Use with the `std::iterator` module. @@ -313,8 +351,7 @@ pub type ByteIterator<'self> = /// External iterator for a string's bytes in reverse order. /// Use with the `std::iterator` module. -pub type ByteRevIterator<'self> = - Invert<Map<'self, &'self u8, u8, vec::VecIterator<'self, u8>>>; +pub type ByteRevIterator<'self> = Invert<ByteIterator<'self>>; /// An iterator over the substrings of a string, separated by `sep`.
#[deriving(Clone)] @@ -1218,7 +1255,7 @@ impl<'self> StrSlice<'self> for &'self str { /// ~~~ #[inline] fn iter(&self) -> CharIterator<'self> { - self.char_offset_iter().map(|(_, c)| c) + CharIterator{string: *self} } /// An iterator over the characters of `self`, in reverse order. @@ -1242,11 +1279,7 @@ impl<'self> StrSlice<'self> for &'self str { /// An iterator over the characters of `self` and their byte offsets. #[inline] fn char_offset_iter(&self) -> CharOffsetIterator<'self> { - CharOffsetIterator { - index_front: 0, - index_back: self.len(), - string: *self - } + CharOffsetIterator{string: *self, iter: self.iter()} } /// An iterator over the characters of `self` and their byte offsets. -- cgit 1.4.1-3-g733a5 From 595dd843d7e2e38c08b4e03b79a0531d32d778fb Mon Sep 17 00:00:00 2001 From: blake2-ppc Date: Sun, 18 Aug 2013 13:57:35 +0200 Subject: std::str: Use CharOffsetIterator in .find() and .rfind() --- src/libstd/str.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'src/libstd') diff --git a/src/libstd/str.rs b/src/libstd/str.rs index ccb7349eefd..0becd8e722e 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -1790,10 +1790,8 @@ impl<'self> StrSlice<'self> for &'self str { if search.matches(b as char) { return Some(i) } } } else { - let mut index = 0; - for c in self.iter() { + for (index, c) in self.char_offset_iter() { if search.matches(c) { return Some(index); } - index += c.len_utf8_bytes(); } } @@ -1807,15 +1805,14 @@ impl<'self> StrSlice<'self> for &'self str { /// `Some` containing the byte index of the last matching character /// or `None` if there is no match fn rfind<C: CharEq>(&self, search: C) -> Option<uint> { - let mut index = self.len(); if search.only_ascii() { + let mut index = self.len(); for b in self.byte_rev_iter() { index -= 1; if search.matches(b as char) { return Some(index); } } } else { - for c in self.rev_iter() { - index -= c.len_utf8_bytes(); + for (index, c) in self.char_offset_rev_iter() { if search.matches(c)
{ return Some(index); } } } -- cgit 1.4.1-3-g733a5 From f33a30e7e8f22a1e438dc5b30959bb80829ee505 Mon Sep 17 00:00:00 2001 From: blake2-ppc Date: Sun, 18 Aug 2013 13:57:35 +0200 Subject: std::str: Correct docstrings for lack of null terminator in ~str and &str --- src/libstd/str.rs | 37 +++++++++++++------------------------ 1 file changed, 13 insertions(+), 24 deletions(-) (limited to 'src/libstd') diff --git a/src/libstd/str.rs b/src/libstd/str.rs index 0becd8e722e..8944d0b291e 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -8,13 +8,12 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -/*! - * String manipulation - * - * Strings are a packed UTF-8 representation of text, stored as null - * terminated buffers of u8 bytes. Strings should be indexed in bytes, - * for efficiency, but UTF-8 unsafe operations should be avoided. - */ +//! String manipulation +//! +//! Strings are a packed UTF-8 representation of text, stored as +//! buffers of u8 bytes. The buffer is not null terminated. +//! Strings should be indexed in bytes, for efficiency, but UTF-8 unsafe +//! operations should be avoided. use at_vec; use cast; @@ -1772,8 +1771,6 @@ impl<'self> StrSlice<'self> for &'self str { } /// Work with the byte buffer of a string as a byte slice. - /// - /// The byte slice does not include the null terminator. fn as_bytes(&self) -> &'self [u8] { unsafe { cast::transmute(*self) } } @@ -1953,10 +1950,7 @@ impl<'self> StrSlice<'self> for &'self str { /// Work with the byte buffer and length of a slice. /// - /// The given length is one byte longer than the 'official' indexable - /// length of the string. This is to permit probing the byte past the - /// indexable area for a null byte, as is the case in slices pointing - /// to full strings, or suffixes of them. + /// The buffer does not have a null terminator. 
#[inline] fn as_imm_buf<T>(&self, f: &fn(*u8, uint) -> T) -> T { let v: &[u8] = unsafe { cast::transmute(*self) }; @@ -1979,12 +1973,10 @@ pub trait OwnedStr { /// Work with the mutable byte buffer and length of a slice. /// - /// The given length is one byte longer than the 'official' indexable - /// length of the string. This is to permit probing the byte past the - /// indexable area for a null byte, as is the case in slices pointing - /// to full strings, or suffixes of them. + /// The buffer does not have a null terminator. /// - /// Make sure any mutations to this buffer keep this string valid UTF8. + /// The caller must make sure any mutations to this buffer keep the string + /// valid UTF-8! fn as_mut_buf<T>(&mut self, f: &fn(*mut u8, uint) -> T) -> T; } @@ -2085,12 +2077,10 @@ impl OwnedStr for ~str { new_str } - /// Reserves capacity for exactly `n` bytes in the given string, not including - /// the null terminator. + /// Reserves capacity for exactly `n` bytes in the given string. /// /// Assuming single-byte characters, the resulting string will be large - /// enough to hold a string of length `n`. To account for the null terminator, - /// the underlying buffer will have the size `n` + 1. + /// enough to hold a string of length `n`. /// /// If the capacity for `s` is already equal to or greater than the requested /// capacity, then no action is taken. @@ -2110,8 +2100,7 @@ impl OwnedStr for ~str { /// Reserves capacity for at least `n` bytes in the given string. /// /// Assuming single-byte characters, the resulting string will be large - /// enough to hold a string of length `n`. To account for the null terminator, - /// the underlying buffer will have the size `n` + 1. + /// enough to hold a string of length `n`.
/// /// This function will over-allocate in order to amortize the allocation costs /// in scenarios where the caller may need to repeatedly reserve additional -- cgit 1.4.1-3-g733a5 From 8931ad9e52e4f23043eea9cc63039d7e5f1e1efc Mon Sep 17 00:00:00 2001 From: blake2-ppc Date: Sun, 18 Aug 2013 21:28:04 +0200 Subject: std::str: Only check char boundary for end index in .slice_to() --- src/libstd/str.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'src/libstd') diff --git a/src/libstd/str.rs b/src/libstd/str.rs index 8944d0b291e..df24d8b20e2 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -1427,7 +1427,8 @@ impl<'self> StrSlice<'self> for &'self str { /// out of bounds. #[inline] fn slice_to(&self, end: uint) -> &'self str { - self.slice(0, end) + assert!(self.is_char_boundary(end)); + unsafe { raw::slice_bytes(*self, 0, end) } } /// Returns a slice of the string from the char range -- cgit 1.4.1-3-g733a5 From 5eff3e1bd9d6ed2a58700d5cdde3266856f95271 Mon Sep 17 00:00:00 2001 From: blake2-ppc Date: Sun, 18 Aug 2013 22:15:47 +0200 Subject: std::str: Use CharOffsetIterator in slice_chars --- src/libstd/str.rs | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'src/libstd') diff --git a/src/libstd/str.rs b/src/libstd/str.rs index df24d8b20e2..93ece53bd20 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -1438,23 +1438,24 @@ impl<'self> StrSlice<'self> for &'self str { /// beyond the last character of the string. fn slice_chars(&self, begin: uint, end: uint) -> &'self str { assert!(begin <= end); - // not sure how to use the iterators for this nicely. 
- let mut position = 0; let mut count = 0; - let l = self.len(); - while count < begin && position < l { - position = self.char_range_at(position).next; + let mut begin_byte = None; + let mut end_byte = None; + + // This could be even more efficient by not decoding, + // only finding the char boundaries + for (idx, _) in self.char_offset_iter() { + if count == begin { begin_byte = Some(idx); } + if count == end { end_byte = Some(idx); break; } count += 1; } - if count < begin { fail!("Attempted to begin slice_chars beyond end of string") } - let start_byte = position; - while count < end && position < l { - position = self.char_range_at(position).next; - count += 1; - } - if count < end { fail!("Attempted to end slice_chars beyond end of string") } + if end_byte.is_none() && count == end { end_byte = Some(self.len()) } - self.slice(start_byte, position) + match (begin_byte, end_byte) { + (None, _) => fail!("slice_chars: `begin` is beyond end of string"), + (_, None) => fail!("slice_chars: `end` is beyond end of string"), + (Some(a), Some(b)) => unsafe { raw::slice_bytes(*self, a, b) } + } } /// Returns true if `needle` is a prefix of the string. -- cgit 1.4.1-3-g733a5 From 30ab96b27229000d3754e7dee64fc431b5105150 Mon Sep 17 00:00:00 2001 From: blake2-ppc Date: Mon, 19 Aug 2013 11:18:30 +0200 Subject: std::str: Improve comments for CharIterator --- src/libstd/str.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'src/libstd') diff --git a/src/libstd/str.rs b/src/libstd/str.rs index 93ece53bd20..7fde1c9f03c 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -255,14 +255,18 @@ Section: Iterators */ /// External iterator for a string's characters. +/// Use with the `std::iterator` module. 
#[deriving(Clone)] pub struct CharIterator<'self> { + /// The slice remaining to be iterated priv string: &'self str, } impl<'self> Iterator<char> for CharIterator<'self> { #[inline] fn next(&mut self) -> Option<char> { + // Decode the next codepoint, then update + // the slice to be just the remaining part if self.string.len() != 0 { let CharRange {ch, next} = self.string.char_range_at(0); unsafe { @@ -300,6 +304,7 @@ impl<'self> DoubleEndedIterator<char> for CharIterator<'self> { /// Use with the `std::iterator` module. #[deriving(Clone)] pub struct CharOffsetIterator<'self> { + /// The original string to be iterated priv string: &'self str, priv iter: CharIterator<'self>, } @@ -307,6 +312,8 @@ pub struct CharOffsetIterator<'self> { impl<'self> Iterator<(uint, char)> for CharOffsetIterator<'self> { #[inline] fn next(&mut self) -> Option<(uint, char)> { + // Compute the byte offset by using the pointer offset between + // the original string slice and the iterator's remaining part let offset = do self.string.as_imm_buf |a, _| { do self.iter.string.as_imm_buf |b, _| { b as uint - a as uint @@ -1281,7 +1288,8 @@ impl<'self> StrSlice<'self> for &'self str { CharOffsetIterator{string: *self, iter: self.iter()} } - /// An iterator over the characters of `self` and their byte offsets. + /// An iterator over the characters of `self` and their byte offsets, + /// in reverse order. #[inline] fn char_offset_rev_iter(&self) -> CharOffsetRevIterator<'self> { self.char_offset_iter().invert() -- cgit 1.4.1-3-g733a5 From 8fe83028870ac6ac48e99a38d2992bedc26ec0d7 Mon Sep 17 00:00:00 2001 From: blake2-ppc Date: Mon, 19 Aug 2013 15:34:48 +0200 Subject: std::str: Use iterators instead of while loops for CharSplitIterator Embed an iterator in the CharSplitIterator struct, and combine that with the former bool `only_ascii`; so use an enum instead.
--- src/libstd/str.rs | 78 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 33 deletions(-) (limited to 'src/libstd') diff --git a/src/libstd/str.rs b/src/libstd/str.rs index 7fde1c9f03c..d8f723d9c78 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -23,7 +23,7 @@ use clone::Clone; use container::{Container, Mutable}; use iter::Times; use iterator::{Iterator, FromIterator, Extendable}; -use iterator::{Filter, AdditiveIterator, Map}; +use iterator::{Filter, AdditiveIterator, Map, Enumerate}; use iterator::{Invert, DoubleEndedIterator}; use libc; use num::{Saturating, Zero}; @@ -359,9 +359,18 @@ pub type ByteIterator<'self> = /// Use with the `std::iterator` module. pub type ByteRevIterator<'self> = Invert<ByteIterator<'self>>; +/// An iterator over byte index and either &u8 or char +#[deriving(Clone)] +enum OffsetIterator<'self> { + // use ByteIterator here when it can be cloned + ByteOffset(Enumerate<vec::VecIterator<'self, u8>>), + CharOffset(CharOffsetIterator<'self>), +} + /// An iterator over the substrings of a string, separated by `sep`. #[deriving(Clone)] pub struct CharSplitIterator<'self,Sep> { + priv iter: OffsetIterator<'self>, priv string: &'self str, priv position: uint, priv sep: Sep, @@ -370,7 +379,6 @@ pub struct CharSplitIterator<'self,Sep> { /// Whether an empty string at the end is allowed priv allow_trailing_empty: bool, priv finished: bool, - priv only_ascii: bool } /// An iterator over the words of a string, separated by an sequence of whitespace @@ -386,39 +394,39 @@ impl<'self, Sep: CharEq> Iterator<&'self str> for CharSplitIterator<'self, Sep> fn next(&mut self) -> Option<&'self str> { if self.finished { return None } - let l = self.string.len(); let start = self.position; - - if self.only_ascii { - // this gives a *huge* speed up for splitting on ASCII - // characters (e.g.
'\n' or ' ') - while self.position < l && self.count > 0 { - let byte = self.string[self.position]; - - if self.sep.matches(byte as char) { - let slice = unsafe { raw::slice_bytes(self.string, start, self.position) }; - self.position += 1; - self.count -= 1; - return Some(slice); - } - self.position += 1; - } - } else { - while self.position < l && self.count > 0 { - let CharRange {ch, next} = self.string.char_range_at(self.position); - - if self.sep.matches(ch) { - let slice = unsafe { raw::slice_bytes(self.string, start, self.position) }; - self.position = next; - self.count -= 1; - return Some(slice); - } - self.position = next; + let len = self.string.len(); + + if self.count > 0 { + match self.iter { + // this gives a *huge* speed up for splitting on ASCII + // characters (e.g. '\n' or ' ') + ByteOffset(ref mut iter) => + for (idx, &byte) in *iter { + if self.sep.matches(byte as char) { + self.position = idx + 1; + self.count -= 1; + return Some(unsafe { + raw::slice_bytes(self.string, start, idx) + }) + } + }, + CharOffset(ref mut iter) => + for (idx, ch) in *iter { + if self.sep.matches(ch) { + // skip over the separator + self.position = self.string.char_range_at(idx).next; + self.count -= 1; + return Some(unsafe { + raw::slice_bytes(self.string, start, idx) + }) + } + }, } } self.finished = true; - if self.allow_trailing_empty || start < l { - Some(unsafe { raw::slice_bytes(self.string, start, l) }) + if self.allow_trailing_empty || start < len { + Some(unsafe { raw::slice_bytes(self.string, start, len) }) } else { None } @@ -1327,15 +1335,19 @@ impl<'self> StrSlice<'self> for &'self str { #[inline] fn split_options_iter<Sep: CharEq>(&self, sep: Sep, count: uint, allow_trailing_empty: bool) -> CharSplitIterator<'self, Sep> { - let only_ascii = sep.only_ascii(); + let iter = if sep.only_ascii() { + ByteOffset(self.as_bytes().iter().enumerate()) + } else { + CharOffset(self.char_offset_iter()) + }; CharSplitIterator { + iter: iter, string: *self, position: 0, sep: sep,
count: count, allow_trailing_empty: allow_trailing_empty, finished: false, - only_ascii: only_ascii } } -- cgit 1.4.1-3-g733a5 From 93de60e511d15b61a490ed690dee15c923ff9538 Mon Sep 17 00:00:00 2001 From: blake2-ppc Date: Thu, 22 Aug 2013 00:35:16 +0200 Subject: std::str: Add test for CharIterator .clone() --- src/libstd/str.rs | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'src/libstd') diff --git a/src/libstd/str.rs b/src/libstd/str.rs index d8f723d9c78..690e1906ae2 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -3139,6 +3139,14 @@ mod tests { assert_eq!(pos, v.len()); } + #[test] + fn test_iterator_clone() { + let s = "ศไทย中华Việt Nam"; + let mut it = s.iter(); + it.next(); + assert!(it.zip(it.clone()).all(|(x,y)| x == y)); + } + #[test] fn test_byte_iterator() { let s = ~"ศไทย中华Việt Nam"; -- cgit 1.4.1-3-g733a5