diff options
| author | Kevin Ballard <kevin@sb.org> | 2014-02-07 14:58:37 -0800 |
|---|---|---|
| committer | Kevin Ballard <kevin@sb.org> | 2014-02-07 22:31:51 -0800 |
| commit | 28467f5d197d1455e922711a28ea4a19702dbda9 (patch) | |
| tree | bfcd12c078fb6aa9bbd65053efbbd41d758b7778 /src/libstd | |
| parent | dde2e0b3865ba040261d2078db371adbefb32506 (diff) | |
| download | rust-28467f5d197d1455e922711a28ea4a19702dbda9.tar.gz rust-28467f5d197d1455e922711a28ea4a19702dbda9.zip | |
Tweak from_utf8_lossy to return a new MaybeOwned enum
MaybeOwned allows from_utf8_lossy to avoid allocation if there are no invalid bytes in the input.
Diffstat (limited to 'src/libstd')
| -rw-r--r-- | src/libstd/path/mod.rs | 4 |
| -rw-r--r-- | src/libstd/str.rs | 121 |
2 files changed, 96 insertions, 29 deletions
diff --git a/src/libstd/path/mod.rs b/src/libstd/path/mod.rs index 18f28994cba..d5a69b12f2a 100644 --- a/src/libstd/path/mod.rs +++ b/src/libstd/path/mod.rs @@ -508,10 +508,10 @@ impl<'a, P: GenericPath> ToStr for Display<'a, P> { if self.filename { match self.path.filename() { None => ~"", - Some(v) => str::from_utf8_lossy(v) + Some(v) => str::from_utf8_lossy(v).into_owned() } } else { - str::from_utf8_lossy(self.path.as_vec()) + str::from_utf8_lossy(self.path.as_vec()).into_owned() } } } diff --git a/src/libstd/str.rs b/src/libstd/str.rs index 25e15fc1601..204139c5d78 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -729,6 +729,11 @@ Section: Misc /// Determines if a vector of bytes contains valid UTF-8 pub fn is_utf8(v: &[u8]) -> bool { + first_non_utf8_index(v).is_none() +} + +#[inline(always)] +fn first_non_utf8_index(v: &[u8]) -> Option<uint> { let mut i = 0u; let total = v.len(); fn unsafe_get(xs: &[u8], i: uint) -> u8 { @@ -740,10 +745,10 @@ pub fn is_utf8(v: &[u8]) -> bool { i += 1u; } else { let w = utf8_char_width(v_i); - if w == 0u { return false; } + if w == 0u { return Some(i); } let nexti = i + w; - if nexti > total { return false; } + if nexti > total { return Some(i); } // 2-byte encoding is for codepoints \u0080 to \u07ff // first C2 80 last DF BF @@ -766,7 +771,7 @@ pub fn is_utf8(v: &[u8]) -> bool { // UTF8-tail = %x80-BF match w { 2 => if unsafe_get(v, i + 1) & 192u8 != TAG_CONT_U8 { - return false + return Some(i) }, 3 => match (v_i, unsafe_get(v, i + 1), @@ -775,7 +780,7 @@ pub fn is_utf8(v: &[u8]) -> bool { (0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) => (), (0xED , 0x80 .. 0x9F, TAG_CONT_U8) => (), (0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => (), - _ => return false, + _ => return Some(i), }, _ => match (v_i, unsafe_get(v, i + 1), @@ -784,14 +789,14 @@ pub fn is_utf8(v: &[u8]) -> bool { (0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (), (0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (), (0xF4 , 0x80 .. 
0x8F, TAG_CONT_U8, TAG_CONT_U8) => (), - _ => return false, + _ => return Some(i) }, } i = nexti; } } - true + None } /// Determines if a vector of `u16` contains valid UTF-16 @@ -910,6 +915,53 @@ macro_rules! utf8_acc_cont_byte( static TAG_CONT_U8: u8 = 128u8; +/// Enum that represents either a borrowed or an owned string. +#[deriving(Eq,Clone)] +pub enum MaybeOwned<'a> { + /// A borrowed string + Slice(&'a str), + /// An owned string + Owned(~str) +} + +impl<'a> Str for MaybeOwned<'a> { + #[inline] + fn as_slice<'b>(&'b self) -> &'b str { + match *self { + Slice(s) => s, + Owned(ref s) => s.as_slice() + } + } + + #[inline] + fn into_owned(self) -> ~str { + match self { + Slice(s) => s.to_owned(), + Owned(s) => s + } + } +} + +impl<'a> ToStr for MaybeOwned<'a> { + #[inline] + fn to_str(&self) -> ~str { + match *self { + Slice(s) => s.to_str(), + Owned(ref s) => s.clone() + } + } +} + +impl<'a> ::fmt::Show for MaybeOwned<'a> { + #[inline] + fn fmt(mo: &MaybeOwned, f: &mut ::fmt::Formatter) -> ::fmt::Result { + match *mo { + Slice(ref s) => ::fmt::Show::fmt(s, f), + Owned(ref s) => ::fmt::Show::fmt(&s.as_slice(), f) + } + } +} + /// Converts a vector of bytes to a new utf-8 string. /// Any invalid utf-8 sequences are replaced with U+FFFD REPLACEMENT CHARACTER. 
/// @@ -918,12 +970,16 @@ static TAG_CONT_U8: u8 = 128u8; /// ```rust /// let input = bytes!("Hello ", 0xF0, 0x90, 0x80, "World"); /// let output = std::str::from_utf8_lossy(input); -/// assert_eq!(output, ~"Hello \uFFFDWorld"); +/// assert_eq!(output.as_slice(), "Hello \uFFFDWorld"); /// ``` -pub fn from_utf8_lossy(v: &[u8]) -> ~str { +pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> MaybeOwned<'a> { + let firstbad = match first_non_utf8_index(v) { + None => return Slice(unsafe { cast::transmute(v) }), + Some(i) => i + }; + static REPLACEMENT: &'static [u8] = bytes!(0xEF, 0xBF, 0xBD); // U+FFFD in UTF-8 - let mut i = 0u; - let mut lastgood = 0u; + let mut i = firstbad; let total = v.len(); fn unsafe_get(xs: &[u8], i: uint) -> u8 { unsafe { *xs.unsafe_ref(i) } @@ -937,23 +993,32 @@ pub fn from_utf8_lossy(v: &[u8]) -> ~str { } let mut res = with_capacity(total); + if i > 0 { + unsafe { raw::push_bytes(&mut res, v.slice_to(i)) }; + } + + // subseqidx is the index of the first byte of the subsequence we're looking at. + // It's used to copy a bunch of contiguous good codepoints at once instead of copying + // them one by one. + let mut subseqidx = firstbad; + while i < total { let i_ = i; let byte = unsafe_get(v, i); i += 1; - macro_rules! error(() => { + macro_rules! 
error(() => ({ unsafe { - if lastgood != i_ { - raw::push_bytes(&mut res, v.slice(lastgood, i_)); + if subseqidx != i_ { + raw::push_bytes(&mut res, v.slice(subseqidx, i_)); } - lastgood = i; + subseqidx = i; raw::push_bytes(&mut res, REPLACEMENT); } - }) + })) if byte < 128u8 { - // lastgood handles this + // subseqidx handles this } else { let w = utf8_char_width(byte); @@ -1012,8 +1077,10 @@ pub fn from_utf8_lossy(v: &[u8]) -> ~str { } } } - unsafe { raw::push_bytes(&mut res, v.slice(lastgood, total)) }; - res + if subseqidx < total { + unsafe { raw::push_bytes(&mut res, v.slice(subseqidx, total)) }; + } + Owned(res) } /// Unsafe operations @@ -3943,32 +4010,32 @@ mod tests { #[test] fn test_str_from_utf8_lossy() { let xs = bytes!("hello"); - assert_eq!(from_utf8_lossy(xs), ~"hello"); + assert_eq!(from_utf8_lossy(xs), Slice("hello")); let xs = bytes!("ศไทย中华Việt Nam"); - assert_eq!(from_utf8_lossy(xs), ~"ศไทย中华Việt Nam"); + assert_eq!(from_utf8_lossy(xs), Slice("ศไทย中华Việt Nam")); let xs = bytes!("Hello", 0xC2, " There", 0xFF, " Goodbye"); - assert_eq!(from_utf8_lossy(xs), ~"Hello\uFFFD There\uFFFD Goodbye"); + assert_eq!(from_utf8_lossy(xs), Owned(~"Hello\uFFFD There\uFFFD Goodbye")); let xs = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye"); - assert_eq!(from_utf8_lossy(xs), ~"Hello\uFFFD\uFFFD There\uFFFD Goodbye"); + assert_eq!(from_utf8_lossy(xs), Owned(~"Hello\uFFFD\uFFFD There\uFFFD Goodbye")); let xs = bytes!(0xF5, "foo", 0xF5, 0x80, "bar"); - assert_eq!(from_utf8_lossy(xs), ~"\uFFFDfoo\uFFFD\uFFFDbar"); + assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFD\uFFFDbar")); let xs = bytes!(0xF1, "foo", 0xF1, 0x80, "bar", 0xF1, 0x80, 0x80, "baz"); - assert_eq!(from_utf8_lossy(xs), ~"\uFFFDfoo\uFFFDbar\uFFFDbaz"); + assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFDbar\uFFFDbaz")); let xs = bytes!(0xF4, "foo", 0xF4, 0x80, "bar", 0xF4, 0xBF, "baz"); - assert_eq!(from_utf8_lossy(xs), ~"\uFFFDfoo\uFFFDbar\uFFFD\uFFFDbaz"); + 
assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFDbar\uFFFD\uFFFDbaz")); let xs = bytes!(0xF0, 0x80, 0x80, 0x80, "foo", 0xF0, 0x90, 0x80, 0x80, "bar"); - assert_eq!(from_utf8_lossy(xs), ~"\uFFFD\uFFFD\uFFFD\uFFFDfoo\U00010000bar"); + assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFD\uFFFD\uFFFD\uFFFDfoo\U00010000bar")); // surrogates let xs = bytes!(0xED, 0xA0, 0x80, "foo", 0xED, 0xBF, 0xBF, "bar"); - assert_eq!(from_utf8_lossy(xs), ~"\uFFFD\uFFFD\uFFFDfoo\uFFFD\uFFFD\uFFFDbar"); + assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFD\uFFFD\uFFFDfoo\uFFFD\uFFFD\uFFFDbar")); } #[test] |
