diff options
| author | bors <bors@rust-lang.org> | 2015-01-24 19:39:52 +0000 |
|---|---|---|
| committer | bors <bors@rust-lang.org> | 2015-01-24 19:39:52 +0000 |
| commit | bb7cc4eb26e87ec4cb2acdc5bc3a7d25b9c817be (patch) | |
| tree | 04338cf8bf55a8510cd6ab0771698e320beb13e8 /src/libcore | |
| parent | 76fbb3583174ca8856b4e149929839888f503e6b (diff) | |
| parent | c5369ebc7f4791c4e291951751b8964052c7a523 (diff) | |
| download | rust-bb7cc4eb26e87ec4cb2acdc5bc3a7d25b9c817be.tar.gz rust-bb7cc4eb26e87ec4cb2acdc5bc3a7d25b9c817be.zip | |
Auto merge of #21488 - aturon:os-str, r=alexcrichton
Per [RFC 517](https://github.com/rust-lang/rfcs/pull/575/), this commit introduces platform-native strings. The API is essentially as described in the RFC. The WTF-8 implementation is adapted from @SimonSapin's [implementation](https://github.com/SimonSapin/rust-wtf8). To make this work, some encoding and decoding functionality in `libcore` is now exported in a "raw" fashion reusable for WTF-8. These exports are *not* reexported in `std`, nor are they stable.
Diffstat (limited to 'src/libcore')
| -rw-r--r-- | src/libcore/char.rs | 96 | ||||
| -rw-r--r-- | src/libcore/str/mod.rs | 116 |
2 files changed, 125 insertions, 87 deletions
diff --git a/src/libcore/char.rs b/src/libcore/char.rs index caac894c0da..0e6b634bd11 100644 --- a/src/libcore/char.rs +++ b/src/libcore/char.rs @@ -258,49 +258,69 @@ impl CharExt for char { #[inline] #[unstable = "pending decision about Iterator/Writer/Reader"] fn encode_utf8(self, dst: &mut [u8]) -> Option<uint> { - // Marked #[inline] to allow llvm optimizing it away - let code = self as u32; - if code < MAX_ONE_B && dst.len() >= 1 { - dst[0] = code as u8; - Some(1) - } else if code < MAX_TWO_B && dst.len() >= 2 { - dst[0] = (code >> 6u & 0x1F_u32) as u8 | TAG_TWO_B; - dst[1] = (code & 0x3F_u32) as u8 | TAG_CONT; - Some(2) - } else if code < MAX_THREE_B && dst.len() >= 3 { - dst[0] = (code >> 12u & 0x0F_u32) as u8 | TAG_THREE_B; - dst[1] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT; - dst[2] = (code & 0x3F_u32) as u8 | TAG_CONT; - Some(3) - } else if dst.len() >= 4 { - dst[0] = (code >> 18u & 0x07_u32) as u8 | TAG_FOUR_B; - dst[1] = (code >> 12u & 0x3F_u32) as u8 | TAG_CONT; - dst[2] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT; - dst[3] = (code & 0x3F_u32) as u8 | TAG_CONT; - Some(4) - } else { - None - } + encode_utf8_raw(self as u32, dst) } #[inline] #[unstable = "pending decision about Iterator/Writer/Reader"] fn encode_utf16(self, dst: &mut [u16]) -> Option<uint> { - // Marked #[inline] to allow llvm optimizing it away - let mut ch = self as u32; - if (ch & 0xFFFF_u32) == ch && dst.len() >= 1 { - // The BMP falls through (assuming non-surrogate, as it should) - dst[0] = ch as u16; - Some(1) - } else if dst.len() >= 2 { - // Supplementary planes break into surrogates. - ch -= 0x1_0000_u32; - dst[0] = 0xD800_u16 | ((ch >> 10) as u16); - dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16); - Some(2) - } else { - None - } + encode_utf16_raw(self as u32, dst) + } +} + +/// Encodes a raw u32 value as UTF-8 into the provided byte buffer, +/// and then returns the number of bytes written. 
+/// +/// If the buffer is not large enough, nothing will be written into it +/// and a `None` will be returned. +#[inline] +#[unstable] +pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> Option<uint> { + // Marked #[inline] to allow llvm optimizing it away + if code < MAX_ONE_B && dst.len() >= 1 { + dst[0] = code as u8; + Some(1) + } else if code < MAX_TWO_B && dst.len() >= 2 { + dst[0] = (code >> 6u & 0x1F_u32) as u8 | TAG_TWO_B; + dst[1] = (code & 0x3F_u32) as u8 | TAG_CONT; + Some(2) + } else if code < MAX_THREE_B && dst.len() >= 3 { + dst[0] = (code >> 12u & 0x0F_u32) as u8 | TAG_THREE_B; + dst[1] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT; + dst[2] = (code & 0x3F_u32) as u8 | TAG_CONT; + Some(3) + } else if dst.len() >= 4 { + dst[0] = (code >> 18u & 0x07_u32) as u8 | TAG_FOUR_B; + dst[1] = (code >> 12u & 0x3F_u32) as u8 | TAG_CONT; + dst[2] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT; + dst[3] = (code & 0x3F_u32) as u8 | TAG_CONT; + Some(4) + } else { + None + } +} + +/// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer, +/// and then returns the number of `u16`s written. +/// +/// If the buffer is not large enough, nothing will be written into it +/// and a `None` will be returned. +#[inline] +#[unstable] +pub fn encode_utf16_raw(mut ch: u32, dst: &mut [u16]) -> Option<uint> { + // Marked #[inline] to allow llvm optimizing it away + if (ch & 0xFFFF_u32) == ch && dst.len() >= 1 { + // The BMP falls through (assuming non-surrogate, as it should) + dst[0] = ch as u16; + Some(1) + } else if dst.len() >= 2 { + // Supplementary planes break into surrogates. 
+ ch -= 0x1_0000_u32; + dst[0] = 0xD800_u16 | ((ch >> 10) as u16); + dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16); + Some(2) + } else { + None } } diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs index bdac686cb66..1e01da4e41d 100644 --- a/src/libcore/str/mod.rs +++ b/src/libcore/str/mod.rs @@ -305,43 +305,52 @@ fn unwrap_or_0(opt: Option<&u8>) -> u8 { } } +/// Reads the next code point out of a byte iterator (assuming a +/// UTF-8-like encoding). +#[unstable] +pub fn next_code_point(bytes: &mut slice::Iter<u8>) -> Option<u32> { + // Decode UTF-8 + let x = match bytes.next() { + None => return None, + Some(&next_byte) if next_byte < 128 => return Some(next_byte as u32), + Some(&next_byte) => next_byte, + }; + + // Multibyte case follows + // Decode from a byte combination out of: [[[x y] z] w] + // NOTE: Performance is sensitive to the exact formulation here + let init = utf8_first_byte!(x, 2); + let y = unwrap_or_0(bytes.next()); + let mut ch = utf8_acc_cont_byte!(init, y); + if x >= 0xE0 { + // [[x y z] w] case + // 5th bit in 0xE0 .. 
0xEF is always clear, so `init` is still valid + let z = unwrap_or_0(bytes.next()); + let y_z = utf8_acc_cont_byte!((y & CONT_MASK) as u32, z); + ch = init << 12 | y_z; + if x >= 0xF0 { + // [x y z w] case + // use only the lower 3 bits of `init` + let w = unwrap_or_0(bytes.next()); + ch = (init & 7) << 18 | utf8_acc_cont_byte!(y_z, w); + } + } + + Some(ch) +} + #[stable] impl<'a> Iterator for Chars<'a> { type Item = char; #[inline] fn next(&mut self) -> Option<char> { - // Decode UTF-8, using the valid UTF-8 invariant - let x = match self.iter.next() { - None => return None, - Some(&next_byte) if next_byte < 128 => return Some(next_byte as char), - Some(&next_byte) => next_byte, - }; - - // Multibyte case follows - // Decode from a byte combination out of: [[[x y] z] w] - // NOTE: Performance is sensitive to the exact formulation here - let init = utf8_first_byte!(x, 2); - let y = unwrap_or_0(self.iter.next()); - let mut ch = utf8_acc_cont_byte!(init, y); - if x >= 0xE0 { - // [[x y z] w] case - // 5th bit in 0xE0 .. 
0xEF is always clear, so `init` is still valid - let z = unwrap_or_0(self.iter.next()); - let y_z = utf8_acc_cont_byte!((y & CONT_MASK) as u32, z); - ch = init << 12 | y_z; - if x >= 0xF0 { - // [x y z w] case - // use only the lower 3 bits of `init` - let w = unwrap_or_0(self.iter.next()); - ch = (init & 7) << 18 | utf8_acc_cont_byte!(y_z, w); + next_code_point(&mut self.iter).map(|ch| { + // str invariant says `ch` is a valid Unicode Scalar Value + unsafe { + mem::transmute(ch) } - } - - // str invariant says `ch` is a valid Unicode Scalar Value - unsafe { - Some(mem::transmute(ch)) - } + }) } #[inline] @@ -1517,25 +1526,8 @@ impl StrExt for str { #[inline] fn char_range_at(&self, i: uint) -> CharRange { - if self.as_bytes()[i] < 128u8 { - return CharRange {ch: self.as_bytes()[i] as char, next: i + 1 }; - } - - // Multibyte case is a fn to allow char_range_at to inline cleanly - fn multibyte_char_range_at(s: &str, i: uint) -> CharRange { - let mut val = s.as_bytes()[i] as u32; - let w = UTF8_CHAR_WIDTH[val as uint] as uint; - assert!((w != 0)); - - val = utf8_first_byte!(val, w); - val = utf8_acc_cont_byte!(val, s.as_bytes()[i + 1]); - if w > 2 { val = utf8_acc_cont_byte!(val, s.as_bytes()[i + 2]); } - if w > 3 { val = utf8_acc_cont_byte!(val, s.as_bytes()[i + 3]); } - - return CharRange {ch: unsafe { mem::transmute(val) }, next: i + w}; - } - - return multibyte_char_range_at(self, i); + let (c, n) = char_range_at_raw(self.as_bytes(), i); + CharRange { ch: unsafe { mem::transmute(c) }, next: n } } #[inline] @@ -1653,6 +1645,32 @@ impl StrExt for str { fn parse<T: FromStr>(&self) -> Option<T> { FromStr::from_str(self) } } +/// Pluck a code point out of a UTF-8-like byte slice and return the +/// index of the next code point. 
+#[inline] +#[unstable] +pub fn char_range_at_raw(bytes: &[u8], i: uint) -> (u32, usize) { + if bytes[i] < 128u8 { + return (bytes[i] as u32, i + 1); + } + + // Multibyte case is a fn to allow char_range_at to inline cleanly + fn multibyte_char_range_at(bytes: &[u8], i: uint) -> (u32, usize) { + let mut val = bytes[i] as u32; + let w = UTF8_CHAR_WIDTH[val as uint] as uint; + assert!((w != 0)); + + val = utf8_first_byte!(val, w); + val = utf8_acc_cont_byte!(val, bytes[i + 1]); + if w > 2 { val = utf8_acc_cont_byte!(val, bytes[i + 2]); } + if w > 3 { val = utf8_acc_cont_byte!(val, bytes[i + 3]); } + + return (val, i + w); + } + + multibyte_char_range_at(bytes, i) +} + #[stable] impl<'a> Default for &'a str { #[stable] |
