about summary refs log tree commit diff
path: root/src/liballoc/string.rs
diff options
context:
space:
mode:
authorStepan Koltsov <stepan.koltsov@gmail.com>2017-06-12 20:07:54 +0300
committerStepan Koltsov <stepan.koltsov@gmail.com>2017-06-15 20:42:35 +0100
commitea149b8571d538fc8bb2117e46161d442aef48a4 (patch)
tree3ba3ef157bf2a05918fc5b6f108cd54cf3e24199 /src/liballoc/string.rs
parent258ae6dd9b1a8ac97986852fc9f00f7687004ccb (diff)
downloadrust-ea149b8571d538fc8bb2117e46161d442aef48a4.tar.gz
rust-ea149b8571d538fc8bb2117e46161d442aef48a4.zip
Utf8Lossy type with chunks iterator and impl Display and Debug
Diffstat (limited to 'src/liballoc/string.rs')
-rw-r--r--src/liballoc/string.rs121
1 files changed, 22 insertions, 99 deletions
diff --git a/src/liballoc/string.rs b/src/liballoc/string.rs
index 1d98626e90b..2cb81029f95 100644
--- a/src/liballoc/string.rs
+++ b/src/liballoc/string.rs
@@ -61,8 +61,8 @@ use core::hash;
 use core::iter::{FromIterator, FusedIterator};
 use core::ops::{self, Add, AddAssign, Index, IndexMut};
 use core::ptr;
-use core::str as core_str;
 use core::str::pattern::Pattern;
+use std_unicode::lossy;
 use std_unicode::char::{decode_utf16, REPLACEMENT_CHARACTER};
 
 use borrow::{Cow, ToOwned};
@@ -533,111 +533,34 @@ impl String {
     /// ```
     #[stable(feature = "rust1", since = "1.0.0")]
     pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> Cow<'a, str> {
-        let mut i;
-        match str::from_utf8(v) {
-            Ok(s) => return Cow::Borrowed(s),
-            Err(e) => i = e.valid_up_to(),
-        }
+        let mut iter = lossy::Utf8Lossy::from_bytes(v).chunks();
 
-        const TAG_CONT_U8: u8 = 128;
-        const REPLACEMENT: &'static [u8] = b"\xEF\xBF\xBD"; // U+FFFD in UTF-8
-        let total = v.len();
-        fn unsafe_get(xs: &[u8], i: usize) -> u8 {
-            unsafe { *xs.get_unchecked(i) }
-        }
-        fn safe_get(xs: &[u8], i: usize, total: usize) -> u8 {
-            if i >= total { 0 } else { unsafe_get(xs, i) }
-        }
+        let (first_valid, first_broken) = if let Some(chunk) = iter.next() {
+            let lossy::Utf8LossyChunk { valid, broken } = chunk;
+            if valid.len() == v.len() {
+                debug_assert!(broken.is_empty());
+                return Cow::Borrowed(valid);
+            }
+            (valid, broken)
+        } else {
+            return Cow::Borrowed("");
+        };
 
-        let mut res = String::with_capacity(total);
+        const REPLACEMENT: &'static str = "\u{FFFD}";
 
-        if i > 0 {
-            unsafe { res.as_mut_vec().extend_from_slice(&v[..i]) };
+        let mut res = String::with_capacity(v.len());
+        res.push_str(first_valid);
+        if !first_broken.is_empty() {
+            res.push_str(REPLACEMENT);
         }
 
-        // subseqidx is the index of the first byte of the subsequence we're
-        // looking at.  It's used to copy a bunch of contiguous good codepoints
-        // at once instead of copying them one by one.
-        let mut subseqidx = i;
-
-        while i < total {
-            let i_ = i;
-            let byte = unsafe_get(v, i);
-            i += 1;
-
-            macro_rules! error { () => ({
-                unsafe {
-                    if subseqidx != i_ {
-                        res.as_mut_vec().extend_from_slice(&v[subseqidx..i_]);
-                    }
-                    subseqidx = i;
-                    res.as_mut_vec().extend_from_slice(REPLACEMENT);
-                }
-            })}
-
-            if byte < 128 {
-                // subseqidx handles this
-            } else {
-                let w = core_str::utf8_char_width(byte);
-
-                match w {
-                    2 => {
-                        if safe_get(v, i, total) & 192 != TAG_CONT_U8 {
-                            error!();
-                            continue;
-                        }
-                        i += 1;
-                    }
-                    3 => {
-                        match (byte, safe_get(v, i, total)) {
-                            (0xE0, 0xA0...0xBF) => (),
-                            (0xE1...0xEC, 0x80...0xBF) => (),
-                            (0xED, 0x80...0x9F) => (),
-                            (0xEE...0xEF, 0x80...0xBF) => (),
-                            _ => {
-                                error!();
-                                continue;
-                            }
-                        }
-                        i += 1;
-                        if safe_get(v, i, total) & 192 != TAG_CONT_U8 {
-                            error!();
-                            continue;
-                        }
-                        i += 1;
-                    }
-                    4 => {
-                        match (byte, safe_get(v, i, total)) {
-                            (0xF0, 0x90...0xBF) => (),
-                            (0xF1...0xF3, 0x80...0xBF) => (),
-                            (0xF4, 0x80...0x8F) => (),
-                            _ => {
-                                error!();
-                                continue;
-                            }
-                        }
-                        i += 1;
-                        if safe_get(v, i, total) & 192 != TAG_CONT_U8 {
-                            error!();
-                            continue;
-                        }
-                        i += 1;
-                        if safe_get(v, i, total) & 192 != TAG_CONT_U8 {
-                            error!();
-                            continue;
-                        }
-                        i += 1;
-                    }
-                    _ => {
-                        error!();
-                        continue;
-                    }
-                }
+        for lossy::Utf8LossyChunk { valid, broken } in iter {
+            res.push_str(valid);
+            if !broken.is_empty() {
+                res.push_str(REPLACEMENT);
             }
         }
-        if subseqidx < total {
-            unsafe { res.as_mut_vec().extend_from_slice(&v[subseqidx..total]) };
-        }
+
         Cow::Owned(res)
     }