From f87d4a15a82a76e7510629173c366d084f2c02ca Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Thu, 5 Apr 2018 15:55:28 +0200 Subject: Move Utf8Lossy decoder to libcore --- src/libstd_unicode/Cargo.toml | 4 - src/libstd_unicode/lib.rs | 1 - src/libstd_unicode/lossy.rs | 213 -------------------------------------- src/libstd_unicode/tests/lib.rs | 15 --- src/libstd_unicode/tests/lossy.rs | 91 ---------------- 5 files changed, 324 deletions(-) delete mode 100644 src/libstd_unicode/lossy.rs delete mode 100644 src/libstd_unicode/tests/lib.rs delete mode 100644 src/libstd_unicode/tests/lossy.rs (limited to 'src/libstd_unicode') diff --git a/src/libstd_unicode/Cargo.toml b/src/libstd_unicode/Cargo.toml index 283070a0e2c..b1c55c2e4b6 100644 --- a/src/libstd_unicode/Cargo.toml +++ b/src/libstd_unicode/Cargo.toml @@ -9,10 +9,6 @@ path = "lib.rs" test = false bench = false -[[test]] -name = "std_unicode_tests" -path = "tests/lib.rs" - [dependencies] core = { path = "../libcore" } compiler_builtins = { path = "../rustc/compiler_builtins_shim" } diff --git a/src/libstd_unicode/lib.rs b/src/libstd_unicode/lib.rs index cf8c101a2f9..106a2c0f0c5 100644 --- a/src/libstd_unicode/lib.rs +++ b/src/libstd_unicode/lib.rs @@ -45,7 +45,6 @@ mod tables; mod u_str; mod version; pub mod char; -pub mod lossy; #[allow(deprecated)] pub mod str { diff --git a/src/libstd_unicode/lossy.rs b/src/libstd_unicode/lossy.rs deleted file mode 100644 index cc8e93308a5..00000000000 --- a/src/libstd_unicode/lossy.rs +++ /dev/null @@ -1,213 +0,0 @@ -// Copyright 2012-2017 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -use core::str as core_str; -use core::fmt; -use core::fmt::Write; -use char; -use core::mem; - - -/// Lossy UTF-8 string. -#[unstable(feature = "str_internals", issue = "0")] -pub struct Utf8Lossy { - bytes: [u8] -} - -impl Utf8Lossy { - pub fn from_str(s: &str) -> &Utf8Lossy { - Utf8Lossy::from_bytes(s.as_bytes()) - } - - pub fn from_bytes(bytes: &[u8]) -> &Utf8Lossy { - unsafe { mem::transmute(bytes) } - } - - pub fn chunks(&self) -> Utf8LossyChunksIter { - Utf8LossyChunksIter { source: &self.bytes } - } -} - - -/// Iterator over lossy UTF-8 string -#[unstable(feature = "str_internals", issue = "0")] -#[allow(missing_debug_implementations)] -pub struct Utf8LossyChunksIter<'a> { - source: &'a [u8], -} - -#[unstable(feature = "str_internals", issue = "0")] -#[derive(PartialEq, Eq, Debug)] -pub struct Utf8LossyChunk<'a> { - /// Sequence of valid chars. - /// Can be empty between broken UTF-8 chars. - pub valid: &'a str, - /// Single broken char, empty if none. - /// Empty iff iterator item is last. - pub broken: &'a [u8], -} - -impl<'a> Iterator for Utf8LossyChunksIter<'a> { - type Item = Utf8LossyChunk<'a>; - - fn next(&mut self) -> Option> { - if self.source.len() == 0 { - return None; - } - - const TAG_CONT_U8: u8 = 128; - fn unsafe_get(xs: &[u8], i: usize) -> u8 { - unsafe { *xs.get_unchecked(i) } - } - fn safe_get(xs: &[u8], i: usize) -> u8 { - if i >= xs.len() { 0 } else { unsafe_get(xs, i) } - } - - let mut i = 0; - while i < self.source.len() { - let i_ = i; - - let byte = unsafe_get(self.source, i); - i += 1; - - if byte < 128 { - - } else { - let w = core_str::utf8_char_width(byte); - - macro_rules! error { () => ({ - unsafe { - let r = Utf8LossyChunk { - valid: core_str::from_utf8_unchecked(&self.source[0..i_]), - broken: &self.source[i_..i], - }; - self.source = &self.source[i..]; - return Some(r); - } - })} - - match w { - 2 => { - if safe_get(self.source, i) & 192 != TAG_CONT_U8 { - error!(); - } - i += 1; - } - 3 => { - match (byte, safe_get(self.source, i)) { - (0xE0, 0xA0 ... 0xBF) => (), - (0xE1 ... 0xEC, 0x80 ... 0xBF) => (), - (0xED, 0x80 ... 0x9F) => (), - (0xEE ... 0xEF, 0x80 ... 0xBF) => (), - _ => { - error!(); - } - } - i += 1; - if safe_get(self.source, i) & 192 != TAG_CONT_U8 { - error!(); - } - i += 1; - } - 4 => { - match (byte, safe_get(self.source, i)) { - (0xF0, 0x90 ... 0xBF) => (), - (0xF1 ... 0xF3, 0x80 ... 0xBF) => (), - (0xF4, 0x80 ... 0x8F) => (), - _ => { - error!(); - } - } - i += 1; - if safe_get(self.source, i) & 192 != TAG_CONT_U8 { - error!(); - } - i += 1; - if safe_get(self.source, i) & 192 != TAG_CONT_U8 { - error!(); - } - i += 1; - } - _ => { - error!(); - } - } - } - } - - let r = Utf8LossyChunk { - valid: unsafe { core_str::from_utf8_unchecked(self.source) }, - broken: &[], - }; - self.source = &[]; - return Some(r); - } -} - - -impl fmt::Display for Utf8Lossy { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - // If we're the empty string then our iterator won't actually yield - // anything, so perform the formatting manually - if self.bytes.len() == 0 { - return "".fmt(f) - } - - for Utf8LossyChunk { valid, broken } in self.chunks() { - // If we successfully decoded the whole chunk as a valid string then - // we can return a direct formatting of the string which will also - // respect various formatting flags if possible. - if valid.len() == self.bytes.len() { - assert!(broken.is_empty()); - return valid.fmt(f) - } - - f.write_str(valid)?; - if !broken.is_empty() { - f.write_char(char::REPLACEMENT_CHARACTER)?; - } - } - Ok(()) - } -} - -impl fmt::Debug for Utf8Lossy { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.write_char('"')?; - - for Utf8LossyChunk { valid, broken } in self.chunks() { - - // Valid part. - // Here we partially parse UTF-8 again which is suboptimal. - { - let mut from = 0; - for (i, c) in valid.char_indices() { - let esc = c.escape_debug(); - // If char needs escaping, flush backlog so far and write, else skip - if esc.len() != 1 { - f.write_str(&valid[from..i])?; - for c in esc { - f.write_char(c)?; - } - from = i + c.len_utf8(); - } - } - f.write_str(&valid[from..])?; - } - - // Broken parts of string as hex escape. - for &b in broken { - write!(f, "\\x{:02x}", b)?; - } - } - - f.write_char('"') - } -} diff --git a/src/libstd_unicode/tests/lib.rs b/src/libstd_unicode/tests/lib.rs deleted file mode 100644 index 9535ec18763..00000000000 --- a/src/libstd_unicode/tests/lib.rs +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright 2012-2017 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -#![feature(str_internals, unicode)] - -extern crate std_unicode; - -mod lossy; diff --git a/src/libstd_unicode/tests/lossy.rs b/src/libstd_unicode/tests/lossy.rs deleted file mode 100644 index e05d0668556..00000000000 --- a/src/libstd_unicode/tests/lossy.rs +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright 2012-2017 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -use std_unicode::lossy::*; - -#[test] -fn chunks() { - let mut iter = Utf8Lossy::from_bytes(b"hello").chunks(); - assert_eq!(Some(Utf8LossyChunk { valid: "hello", broken: b"", }), iter.next()); - assert_eq!(None, iter.next()); - - let mut iter = Utf8Lossy::from_bytes("ศไทย中华Việt Nam".as_bytes()).chunks(); - assert_eq!(Some(Utf8LossyChunk { valid: "ศไทย中华Việt Nam", broken: b"", }), iter.next()); - assert_eq!(None, iter.next()); - - let mut iter = Utf8Lossy::from_bytes(b"Hello\xC2 There\xFF Goodbye").chunks(); - assert_eq!(Some(Utf8LossyChunk { valid: "Hello", broken: b"\xC2", }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: " There", broken: b"\xFF", }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: " Goodbye", broken: b"", }), iter.next()); - assert_eq!(None, iter.next()); - - let mut iter = Utf8Lossy::from_bytes(b"Hello\xC0\x80 There\xE6\x83 Goodbye").chunks(); - assert_eq!(Some(Utf8LossyChunk { valid: "Hello", broken: b"\xC0", }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\x80", }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: " There", broken: b"\xE6\x83", }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: " Goodbye", broken: b"", }), iter.next()); - assert_eq!(None, iter.next()); - - let mut iter = Utf8Lossy::from_bytes(b"\xF5foo\xF5\x80bar").chunks(); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xF5", }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "foo", broken: b"\xF5", }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\x80", }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "bar", broken: b"", }), iter.next()); - assert_eq!(None, iter.next()); - - let mut iter = Utf8Lossy::from_bytes(b"\xF1foo\xF1\x80bar\xF1\x80\x80baz").chunks(); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xF1", }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "foo", broken: b"\xF1\x80", }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "bar", broken: b"\xF1\x80\x80", }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "baz", broken: b"", }), iter.next()); - assert_eq!(None, iter.next()); - - let mut iter = Utf8Lossy::from_bytes(b"\xF4foo\xF4\x80bar\xF4\xBFbaz").chunks(); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xF4", }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "foo", broken: b"\xF4\x80", }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "bar", broken: b"\xF4", }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xBF", }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "baz", broken: b"", }), iter.next()); - assert_eq!(None, iter.next()); - - let mut iter = Utf8Lossy::from_bytes(b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar").chunks(); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xF0", }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\x80", }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\x80", }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\x80", }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "foo\u{10000}bar", broken: b"", }), iter.next()); - assert_eq!(None, iter.next()); - - // surrogates - let mut iter = Utf8Lossy::from_bytes(b"\xED\xA0\x80foo\xED\xBF\xBFbar").chunks(); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xED", }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xA0", }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\x80", }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "foo", broken: b"\xED", }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xBF", }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "", broken: b"\xBF", }), iter.next()); - assert_eq!(Some(Utf8LossyChunk { valid: "bar", broken: b"", }), iter.next()); - assert_eq!(None, iter.next()); -} - -#[test] -fn display() { - assert_eq!( - "Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye", - &format!("{}", Utf8Lossy::from_bytes(b"Hello\xC0\x80 There\xE6\x83 Goodbye"))); -} - -#[test] -fn debug() { - assert_eq!( - "\"Hello\\xc0\\x80 There\\xe6\\x83 Goodbye\\u{10d4ea}\"", - &format!("{:?}", Utf8Lossy::from_bytes( - b"Hello\xC0\x80 There\xE6\x83 Goodbye\xf4\x8d\x93\xaa"))); -} -- cgit 1.4.1-3-g733a5