diff options
| author | Stepan Koltsov <stepan.koltsov@gmail.com> | 2017-06-12 20:07:54 +0300 |
|---|---|---|
| committer | Stepan Koltsov <stepan.koltsov@gmail.com> | 2017-06-15 20:42:35 +0100 |
| commit | ea149b8571d538fc8bb2117e46161d442aef48a4 (patch) | |
| tree | 3ba3ef157bf2a05918fc5b6f108cd54cf3e24199 /src/liballoc/string.rs | |
| parent | 258ae6dd9b1a8ac97986852fc9f00f7687004ccb (diff) | |
| download | rust-ea149b8571d538fc8bb2117e46161d442aef48a4.tar.gz rust-ea149b8571d538fc8bb2117e46161d442aef48a4.zip | |
Utf8Lossy type with chunks iterator and impl Display and Debug
Diffstat (limited to 'src/liballoc/string.rs')
| -rw-r--r-- | src/liballoc/string.rs | 121 |
1 file changed, 22 insertions, 99 deletions
diff --git a/src/liballoc/string.rs b/src/liballoc/string.rs index 1d98626e90b..2cb81029f95 100644 --- a/src/liballoc/string.rs +++ b/src/liballoc/string.rs @@ -61,8 +61,8 @@ use core::hash; use core::iter::{FromIterator, FusedIterator}; use core::ops::{self, Add, AddAssign, Index, IndexMut}; use core::ptr; -use core::str as core_str; use core::str::pattern::Pattern; +use std_unicode::lossy; use std_unicode::char::{decode_utf16, REPLACEMENT_CHARACTER}; use borrow::{Cow, ToOwned}; @@ -533,111 +533,34 @@ impl String { /// ``` #[stable(feature = "rust1", since = "1.0.0")] pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> Cow<'a, str> { - let mut i; - match str::from_utf8(v) { - Ok(s) => return Cow::Borrowed(s), - Err(e) => i = e.valid_up_to(), - } + let mut iter = lossy::Utf8Lossy::from_bytes(v).chunks(); - const TAG_CONT_U8: u8 = 128; - const REPLACEMENT: &'static [u8] = b"\xEF\xBF\xBD"; // U+FFFD in UTF-8 - let total = v.len(); - fn unsafe_get(xs: &[u8], i: usize) -> u8 { - unsafe { *xs.get_unchecked(i) } - } - fn safe_get(xs: &[u8], i: usize, total: usize) -> u8 { - if i >= total { 0 } else { unsafe_get(xs, i) } - } + let (first_valid, first_broken) = if let Some(chunk) = iter.next() { + let lossy::Utf8LossyChunk { valid, broken } = chunk; + if valid.len() == v.len() { + debug_assert!(broken.is_empty()); + return Cow::Borrowed(valid); + } + (valid, broken) + } else { + return Cow::Borrowed(""); + }; - let mut res = String::with_capacity(total); + const REPLACEMENT: &'static str = "\u{FFFD}"; - if i > 0 { - unsafe { res.as_mut_vec().extend_from_slice(&v[..i]) }; + let mut res = String::with_capacity(v.len()); + res.push_str(first_valid); + if !first_broken.is_empty() { + res.push_str(REPLACEMENT); } - // subseqidx is the index of the first byte of the subsequence we're - // looking at. It's used to copy a bunch of contiguous good codepoints - // at once instead of copying them one by one. 
- let mut subseqidx = i; - - while i < total { - let i_ = i; - let byte = unsafe_get(v, i); - i += 1; - - macro_rules! error { () => ({ - unsafe { - if subseqidx != i_ { - res.as_mut_vec().extend_from_slice(&v[subseqidx..i_]); - } - subseqidx = i; - res.as_mut_vec().extend_from_slice(REPLACEMENT); - } - })} - - if byte < 128 { - // subseqidx handles this - } else { - let w = core_str::utf8_char_width(byte); - - match w { - 2 => { - if safe_get(v, i, total) & 192 != TAG_CONT_U8 { - error!(); - continue; - } - i += 1; - } - 3 => { - match (byte, safe_get(v, i, total)) { - (0xE0, 0xA0...0xBF) => (), - (0xE1...0xEC, 0x80...0xBF) => (), - (0xED, 0x80...0x9F) => (), - (0xEE...0xEF, 0x80...0xBF) => (), - _ => { - error!(); - continue; - } - } - i += 1; - if safe_get(v, i, total) & 192 != TAG_CONT_U8 { - error!(); - continue; - } - i += 1; - } - 4 => { - match (byte, safe_get(v, i, total)) { - (0xF0, 0x90...0xBF) => (), - (0xF1...0xF3, 0x80...0xBF) => (), - (0xF4, 0x80...0x8F) => (), - _ => { - error!(); - continue; - } - } - i += 1; - if safe_get(v, i, total) & 192 != TAG_CONT_U8 { - error!(); - continue; - } - i += 1; - if safe_get(v, i, total) & 192 != TAG_CONT_U8 { - error!(); - continue; - } - i += 1; - } - _ => { - error!(); - continue; - } - } + for lossy::Utf8LossyChunk { valid, broken } in iter { + res.push_str(valid); + if !broken.is_empty() { + res.push_str(REPLACEMENT); } } - if subseqidx < total { - unsafe { res.as_mut_vec().extend_from_slice(&v[subseqidx..total]) }; - } + Cow::Owned(res) } |
