diff options
Diffstat (limited to 'src/libstd')
| -rw-r--r-- | src/libstd/str.rs | 385 |
1 files changed, 254 insertions, 131 deletions
diff --git a/src/libstd/str.rs b/src/libstd/str.rs index 0dd84fd3443..d5a44201ce6 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -8,13 +8,12 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -/*! - * String manipulation - * - * Strings are a packed UTF-8 representation of text, stored as null - * terminated buffers of u8 bytes. Strings should be indexed in bytes, - * for efficiency, but UTF-8 unsafe operations should be avoided. - */ +//! String manipulation +//! +//! Strings are a packed UTF-8 representation of text, stored as +//! buffers of u8 bytes. The buffer is not null terminated. +//! Strings should be indexed in bytes, for efficiency, but UTF-8 unsafe +//! operations should be avoided. use at_vec; use cast; @@ -24,10 +23,10 @@ use clone::{Clone, DeepClone}; use container::{Container, Mutable}; use iter::Times; use iterator::{Iterator, FromIterator, Extendable}; -use iterator::{Filter, AdditiveIterator, Map}; +use iterator::{Filter, AdditiveIterator, Map, Enumerate}; use iterator::{Invert, DoubleEndedIterator}; use libc; -use num::Zero; +use num::{Saturating, Zero}; use option::{None, Option, Some}; use ptr; use ptr::RawPtr; @@ -255,56 +254,101 @@ impl<'self, C: CharEq> CharEq for &'self [C] { Section: Iterators */ -/// External iterator for a string's characters and their byte offsets. +/// External iterator for a string's characters. /// Use with the `std::iterator` module. #[deriving(Clone)] -pub struct CharOffsetIterator<'self> { - priv index_front: uint, - priv index_back: uint, +pub struct CharIterator<'self> { + /// The slice remaining to be iterated priv string: &'self str, } -impl<'self> Iterator<(uint, char)> for CharOffsetIterator<'self> { +impl<'self> Iterator<char> for CharIterator<'self> { #[inline] - fn next(&mut self) -> Option<(uint, char)> { - if self.index_front < self.index_back { - let CharRange {ch, next} = self.string.char_range_at(self.index_front); - let index = self.index_front; - self.index_front = next; - Some((index, ch)) + fn next(&mut self) -> Option<char> { + // Decode the next codepoint, then update + // the slice to be just the remaining part + if self.string.len() != 0 { + let CharRange {ch, next} = self.string.char_range_at(0); + unsafe { + self.string = raw::slice_unchecked(self.string, next, self.string.len()); + } + Some(ch) } else { None } } + + #[inline] + fn size_hint(&self) -> (uint, Option<uint>) { + (self.string.len().saturating_add(3)/4, Some(self.string.len())) + } } -impl<'self> DoubleEndedIterator<(uint, char)> for CharOffsetIterator<'self> { +impl<'self> DoubleEndedIterator<char> for CharIterator<'self> { #[inline] - fn next_back(&mut self) -> Option<(uint, char)> { - if self.index_front < self.index_back { - let CharRange {ch, next} = self.string.char_range_at_reverse(self.index_back); - self.index_back = next; - Some((next, ch)) + fn next_back(&mut self) -> Option<char> { + if self.string.len() != 0 { + let CharRange {ch, next} = self.string.char_range_at_reverse(self.string.len()); + unsafe { + self.string = raw::slice_unchecked(self.string, 0, next); + } + Some(ch) } else { None } } } -/// External iterator for a string's characters and their byte offsets in reverse order. -/// Use with the `std::iterator` module. -pub type CharOffsetRevIterator<'self> = - Invert<CharOffsetIterator<'self>>; -/// External iterator for a string's characters. +/// External iterator for a string's characters and their byte offsets. /// Use with the `std::iterator` module. -pub type CharIterator<'self> = - Map<'self, (uint, char), char, CharOffsetIterator<'self>>; +#[deriving(Clone)] +pub struct CharOffsetIterator<'self> { + /// The original string to be iterated + priv string: &'self str, + priv iter: CharIterator<'self>, +} + +impl<'self> Iterator<(uint, char)> for CharOffsetIterator<'self> { + #[inline] + fn next(&mut self) -> Option<(uint, char)> { + // Compute the byte offset by using the pointer offset between + // the original string slice and the iterator's remaining part + let offset = do self.string.as_imm_buf |a, _| { + do self.iter.string.as_imm_buf |b, _| { + b as uint - a as uint + } + }; + self.iter.next().map_move(|ch| (offset, ch)) + } + + #[inline] + fn size_hint(&self) -> (uint, Option<uint>) { + self.iter.size_hint() + } +} + +impl<'self> DoubleEndedIterator<(uint, char)> for CharOffsetIterator<'self> { + #[inline] + fn next_back(&mut self) -> Option<(uint, char)> { + self.iter.next_back().map_move(|ch| { + let offset = do self.string.as_imm_buf |a, _| { + do self.iter.string.as_imm_buf |b, len| { + b as uint - a as uint + len + } + }; + (offset, ch) + }) + } +} /// External iterator for a string's characters in reverse order. /// Use with the `std::iterator` module. -pub type CharRevIterator<'self> = - Invert<Map<'self, (uint, char), char, CharOffsetIterator<'self>>>; +pub type CharRevIterator<'self> = Invert<CharIterator<'self>>; + +/// External iterator for a string's characters and their byte offsets in reverse order. +/// Use with the `std::iterator` module. +pub type CharOffsetRevIterator<'self> = Invert<CharOffsetIterator<'self>>; /// External iterator for a string's bytes. /// Use with the `std::iterator` module. @@ -313,12 +357,20 @@ pub type ByteIterator<'self> = /// External iterator for a string's bytes in reverse order. /// Use with the `std::iterator` module. -pub type ByteRevIterator<'self> = - Invert<Map<'self, &'self u8, u8, vec::VecIterator<'self, u8>>>; +pub type ByteRevIterator<'self> = Invert<ByteIterator<'self>>; + +/// An iterator over byte index and either &u8 or char +#[deriving(Clone)] +enum OffsetIterator<'self> { + // use ByteIterator here when it can be cloned + ByteOffset(Enumerate<vec::VecIterator<'self, u8>>), + CharOffset(CharOffsetIterator<'self>), +} /// An iterator over the substrings of a string, separated by `sep`. #[deriving(Clone)] pub struct CharSplitIterator<'self,Sep> { + priv iter: OffsetIterator<'self>, priv string: &'self str, priv position: uint, priv sep: Sep, @@ -327,7 +379,6 @@ pub struct CharSplitIterator<'self,Sep> { /// Whether an empty string at the end is allowed priv allow_trailing_empty: bool, priv finished: bool, - priv only_ascii: bool } /// An iterator over the words of a string, separated by an sequence of whitespace @@ -343,39 +394,39 @@ impl<'self, Sep: CharEq> Iterator<&'self str> for CharSplitIterator<'self, Sep> fn next(&mut self) -> Option<&'self str> { if self.finished { return None } - let l = self.string.len(); let start = self.position; - - if self.only_ascii { - // this gives a *huge* speed up for splitting on ASCII - // characters (e.g. '\n' or ' ') - while self.position < l && self.count > 0 { - let byte = self.string[self.position]; - - if self.sep.matches(byte as char) { - let slice = unsafe { raw::slice_bytes(self.string, start, self.position) }; - self.position += 1; - self.count -= 1; - return Some(slice); - } - self.position += 1; - } - } else { - while self.position < l && self.count > 0 { - let CharRange {ch, next} = self.string.char_range_at(self.position); - - if self.sep.matches(ch) { - let slice = unsafe { raw::slice_bytes(self.string, start, self.position) }; - self.position = next; - self.count -= 1; - return Some(slice); - } - self.position = next; + let len = self.string.len(); + + if self.count > 0 { + match self.iter { + // this gives a *huge* speed up for splitting on ASCII + // characters (e.g. '\n' or ' ') + ByteOffset(ref mut iter) => + for (idx, &byte) in *iter { + if self.sep.matches(byte as char) { + self.position = idx + 1; + self.count -= 1; + return Some(unsafe { + raw::slice_bytes(self.string, start, idx) + }) + } + }, + CharOffset(ref mut iter) => + for (idx, ch) in *iter { + if self.sep.matches(ch) { + // skip over the separator + self.position = self.string.char_range_at(idx).next; + self.count -= 1; + return Some(unsafe { + raw::slice_bytes(self.string, start, idx) + }) + } + }, } } self.finished = true; - if self.allow_trailing_empty || start < l { - Some(unsafe { raw::slice_bytes(self.string, start, l) }) + if self.allow_trailing_empty || start < len { + Some(unsafe { raw::slice_bytes(self.string, start, len) }) } else { None } @@ -938,12 +989,21 @@ pub mod raw { /// If end is greater than the length of the string. #[inline] pub unsafe fn slice_bytes<'a>(s: &'a str, begin: uint, end: uint) -> &'a str { - do s.as_imm_buf |sbuf, n| { - assert!((begin <= end)); - assert!((end <= n)); + assert!(begin <= end); + assert!(end <= s.len()); + slice_unchecked(s, begin, end) + } + /// Takes a bytewise (not UTF-8) slice from a string. + /// + /// Returns the substring from [`begin`..`end`). + /// + /// Caller must check slice boundaries! + #[inline] + pub unsafe fn slice_unchecked<'a>(s: &'a str, begin: uint, end: uint) -> &'a str { + do s.as_imm_buf |sbuf, _n| { cast::transmute(Slice { - data: ptr::offset(sbuf, begin as int), + data: sbuf.offset_inbounds(begin as int), len: end - begin, }) } @@ -1302,7 +1362,7 @@ impl<'self> StrSlice<'self> for &'self str { /// ~~~ #[inline] fn iter(&self) -> CharIterator<'self> { - self.char_offset_iter().map(|(_, c)| c) + CharIterator{string: *self} } /// An iterator over the characters of `self`, in reverse order. @@ -1326,14 +1386,11 @@ impl<'self> StrSlice<'self> for &'self str { /// An iterator over the characters of `self` and their byte offsets. #[inline] fn char_offset_iter(&self) -> CharOffsetIterator<'self> { - CharOffsetIterator { - index_front: 0, - index_back: self.len(), - string: *self - } + CharOffsetIterator{string: *self, iter: self.iter()} } - /// An iterator over the characters of `self` and their byte offsets. + /// An iterator over the characters of `self` and their byte offsets, + /// in reverse order. #[inline] fn char_offset_rev_iter(&self) -> CharOffsetRevIterator<'self> { self.char_offset_iter().invert() @@ -1371,15 +1428,19 @@ impl<'self> StrSlice<'self> for &'self str { #[inline] fn split_options_iter<Sep: CharEq>(&self, sep: Sep, count: uint, allow_trailing_empty: bool) -> CharSplitIterator<'self, Sep> { - let only_ascii = sep.only_ascii(); + let iter = if sep.only_ascii() { + ByteOffset(self.as_bytes().iter().enumerate()) + } else { + CharOffset(self.char_offset_iter()) + }; CharSplitIterator { + iter: iter, string: *self, position: 0, sep: sep, count: count, allow_trailing_empty: allow_trailing_empty, finished: false, - only_ascii: only_ascii } } @@ -1481,8 +1542,7 @@ impl<'self> StrSlice<'self> for &'self str { /// beyond the last character of the string #[inline] fn slice(&self, begin: uint, end: uint) -> &'self str { - assert!(self.is_char_boundary(begin)); - assert!(self.is_char_boundary(end)); + assert!(self.is_char_boundary(begin) && self.is_char_boundary(end)); unsafe { raw::slice_bytes(*self, begin, end) } } @@ -1502,7 +1562,8 @@ impl<'self> StrSlice<'self> for &'self str { /// out of bounds. #[inline] fn slice_to(&self, end: uint) -> &'self str { - self.slice(0, end) + assert!(self.is_char_boundary(end)); + unsafe { raw::slice_bytes(*self, 0, end) } } /// Returns a slice of the string from the char range @@ -1512,23 +1573,24 @@ impl<'self> StrSlice<'self> for &'self str { /// beyond the last character of the string. fn slice_chars(&self, begin: uint, end: uint) -> &'self str { assert!(begin <= end); - // not sure how to use the iterators for this nicely. - let mut position = 0; let mut count = 0; - let l = self.len(); - while count < begin && position < l { - position = self.char_range_at(position).next; - count += 1; - } - if count < begin { fail!("Attempted to begin slice_chars beyond end of string") } - let start_byte = position; - while count < end && position < l { - position = self.char_range_at(position).next; + let mut begin_byte = None; + let mut end_byte = None; + + // This could be even more efficient by not decoding, + // only finding the char boundaries + for (idx, _) in self.char_offset_iter() { + if count == begin { begin_byte = Some(idx); } + if count == end { end_byte = Some(idx); break; } count += 1; } - if count < end { fail!("Attempted to end slice_chars beyond end of string") } + if end_byte.is_none() && count == end { end_byte = Some(self.len()) } - self.slice(start_byte, position) + match (begin_byte, end_byte) { + (None, _) => fail!("slice_chars: `begin` is beyond end of string"), + (_, None) => fail!("slice_chars: `end` is beyond end of string"), + (Some(a), Some(b)) => unsafe { raw::slice_bytes(*self, a, b) } + } } /// Returns true if `needle` is a prefix of the string. @@ -1724,6 +1786,7 @@ impl<'self> StrSlice<'self> for &'self str { /// Returns false if the index points into the middle of a multi-byte /// character sequence. + #[inline] fn is_char_boundary(&self, index: uint) -> bool { if index == self.len() { return true; } let b = self[index]; @@ -1809,24 +1872,33 @@ impl<'self> StrSlice<'self> for &'self str { /// This function can be used to iterate over a unicode string in reverse. /// /// Returns 0 for next index if called on start index 0. + #[inline] fn char_range_at_reverse(&self, start: uint) -> CharRange { let mut prev = start; - // while there is a previous byte == 10...... - while prev > 0u && self[prev - 1u] & 192u8 == TAG_CONT_U8 { - prev -= 1u; - } + prev = prev.saturating_sub(1); + if self[prev] < 128 { return CharRange{ch: self[prev] as char, next: prev} } - // now refer to the initial byte of previous char - if prev > 0u { - prev -= 1u; - } else { - prev = 0u; - } + // Multibyte case is a fn to allow char_range_at_reverse to inline cleanly + fn multibyte_char_range_at_rev(s: &str, mut i: uint) -> CharRange { + // while there is a previous byte == 10...... + while i > 0 && s[i] & 192u8 == TAG_CONT_U8 { + i -= 1u; + } + let mut val = s[i] as uint; + let w = UTF8_CHAR_WIDTH[val] as uint; + assert!((w != 0)); + + val = utf8_first_byte!(val, w); + val = utf8_acc_cont_byte!(val, s[i + 1]); + if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); } + if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); } - let ch = self.char_at(prev); - return CharRange {ch:ch, next:prev}; + return CharRange {ch: val as char, next: i}; + } + + return multibyte_char_range_at_rev(*self, prev); } /// Plucks the character ending at the `i`th byte of a string @@ -1836,8 +1908,6 @@ impl<'self> StrSlice<'self> for &'self str { } /// Work with the byte buffer of a string as a byte slice. - /// - /// The byte slice does not include the null terminator. fn as_bytes(&self) -> &'self [u8] { unsafe { cast::transmute(*self) } } @@ -1854,10 +1924,8 @@ impl<'self> StrSlice<'self> for &'self str { if search.matches(b as char) { return Some(i) } } } else { - let mut index = 0; - for c in self.iter() { + for (index, c) in self.char_offset_iter() { if search.matches(c) { return Some(index); } - index += c.len_utf8_bytes(); } } @@ -1871,15 +1939,14 @@ impl<'self> StrSlice<'self> for &'self str { /// `Some` containing the byte index of the last matching character /// or `None` if there is no match fn rfind<C: CharEq>(&self, search: C) -> Option<uint> { - let mut index = self.len(); if search.only_ascii() { + let mut index = self.len(); for b in self.byte_rev_iter() { index -= 1; if search.matches(b as char) { return Some(index); } } } else { - for c in self.rev_iter() { - index -= c.len_utf8_bytes(); + for (index, c) in self.char_offset_rev_iter() { if search.matches(c) { return Some(index); } } } @@ -2020,10 +2087,7 @@ impl<'self> StrSlice<'self> for &'self str { /// Work with the byte buffer and length of a slice. /// - /// The given length is one byte longer than the 'official' indexable - /// length of the string. This is to permit probing the byte past the - /// indexable area for a null byte, as is the case in slices pointing - /// to full strings, or suffixes of them. + /// The buffer does not have a null terminator. #[inline] fn as_imm_buf<T>(&self, f: &fn(*u8, uint) -> T) -> T { let v: &[u8] = unsafe { cast::transmute(*self) }; @@ -2046,12 +2110,10 @@ pub trait OwnedStr { /// Work with the mutable byte buffer and length of a slice. /// - /// The given length is one byte longer than the 'official' indexable - /// length of the string. This is to permit probing the byte past the - /// indexable area for a null byte, as is the case in slices pointing - /// to full strings, or suffixes of them. + /// The buffer does not have a null terminator. /// - /// Make sure any mutations to this buffer keep this string valid UTF8. + /// The caller must make sure any mutations to this buffer keep the string + /// valid UTF-8! fn as_mut_buf<T>(&mut self, f: &fn(*mut u8, uint) -> T) -> T; } @@ -2152,12 +2214,10 @@ impl OwnedStr for ~str { new_str } - /// Reserves capacity for exactly `n` bytes in the given string, not including - /// the null terminator. + /// Reserves capacity for exactly `n` bytes in the given string. /// /// Assuming single-byte characters, the resulting string will be large - /// enough to hold a string of length `n`. To account for the null terminator, - /// the underlying buffer will have the size `n` + 1. + /// enough to hold a string of length `n`. /// /// If the capacity for `s` is already equal to or greater than the requested /// capacity, then no action is taken. @@ -2177,8 +2237,7 @@ impl OwnedStr for ~str { /// Reserves capacity for at least `n` bytes in the given string. /// /// Assuming single-byte characters, the resulting string will be large - /// enough to hold a string of length `n`. To account for the null terminator, - /// the underlying buffer will have the size `n` + 1. + /// enough to hold a string of length `n`. /// /// This function will over-allocate in order to amortize the allocation costs /// in scenarios where the caller may need to repeatedly reserve additional @@ -3211,6 +3270,14 @@ mod tests { } #[test] + fn test_iterator_clone() { + let s = "ศไทย中华Việt Nam"; + let mut it = s.iter(); + it.next(); + assert!(it.zip(it.clone()).all(|(x,y)| x == y)); + } + + #[test] fn test_byte_iterator() { let s = ~"ศไทย中华Việt Nam"; let v = [ @@ -3425,6 +3492,62 @@ mod tests { mod bench { use extra::test::BenchHarness; use super::*; + use prelude::*; + + #[bench] + fn char_iterator(bh: &mut BenchHarness) { + let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb"; + let len = s.char_len(); + + do bh.iter { + assert_eq!(s.iter().len(), len); + } + } + + #[bench] + fn char_iterator_ascii(bh: &mut BenchHarness) { + let s = "Mary had a little lamb, Little lamb + Mary had a little lamb, Little lamb + Mary had a little lamb, Little lamb + Mary had a little lamb, Little lamb + Mary had a little lamb, Little lamb + Mary had a little lamb, Little lamb"; + let len = s.char_len(); + + do bh.iter { + assert_eq!(s.iter().len(), len); + } + } + + #[bench] + fn char_iterator_rev(bh: &mut BenchHarness) { + let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb"; + let len = s.char_len(); + + do bh.iter { + assert_eq!(s.rev_iter().len(), len); + } + } + + #[bench] + fn char_offset_iterator(bh: &mut BenchHarness) { + let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb"; + let len = s.char_len(); + + do bh.iter { + assert_eq!(s.char_offset_iter().len(), len); + } + } + + #[bench] + fn char_offset_iterator_rev(bh: &mut BenchHarness) { + let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb"; + let len = s.char_len(); + + do bh.iter { + assert_eq!(s.char_offset_rev_iter().len(), len); + } + } #[bench] fn is_utf8_100_ascii(bh: &mut BenchHarness) { |
