diff options
| author | bors <bors@rust-lang.org> | 2020-05-31 13:39:05 +0000 |
|---|---|---|
| committer | bors <bors@rust-lang.org> | 2020-05-31 13:39:05 +0000 |
| commit | f6072cab136beea206b7cfe79f8d10fccf5af591 (patch) | |
| tree | 11b0ec9ab29833da453b04c7c2ee9e72a3f52a88 /src/libcore | |
| parent | 4b1f86adbe41e8dd4864ca2315f43953dd503bb5 (diff) | |
| parent | cbc73dc2637343f3a0ebba92efc5942e37cc4763 (diff) | |
| download | rust-f6072cab136beea206b7cfe79f8d10fccf5af591.tar.gz rust-f6072cab136beea206b7cfe79f8d10fccf5af591.zip | |
Auto merge of #72813 - RalfJung:rollup-4ko6q8j, r=RalfJung
Rollup of 5 pull requests Successful merges: - #72683 (from_u32_unchecked: check validity, and fix UB in Wtf8) - #72715 (Account for trailing comma when suggesting `where` clauses) - #72745 (generalize Borrow<[T]> for Interned<'tcx, List<T>>) - #72749 (Update stdarch submodule to latest head) - #72781 (Use `LocalDefId` instead of `NodeId` in `resolve_str_path_error`) Failed merges: r? @ghost
Diffstat (limited to 'src/libcore')
| -rw-r--r-- | src/libcore/char/convert.rs | 4 | ||||
| -rw-r--r-- | src/libcore/char/methods.rs | 163 | ||||
| -rw-r--r-- | src/libcore/char/mod.rs | 6 |
3 files changed, 109 insertions, 64 deletions
diff --git a/src/libcore/char/convert.rs b/src/libcore/char/convert.rs index 315020bac58..87c56c4b0a1 100644 --- a/src/libcore/char/convert.rs +++ b/src/libcore/char/convert.rs @@ -99,7 +99,7 @@ pub fn from_u32(i: u32) -> Option<char> { #[inline] #[stable(feature = "char_from_unchecked", since = "1.5.0")] pub unsafe fn from_u32_unchecked(i: u32) -> char { - transmute(i) + if cfg!(debug_assertions) { char::from_u32(i).unwrap() } else { transmute(i) } } #[stable(feature = "char_convert", since = "1.13.0")] @@ -218,7 +218,7 @@ impl TryFrom<u32> for char { Err(CharTryFromError(())) } else { // SAFETY: checked that it's a legal unicode value - Ok(unsafe { from_u32_unchecked(i) }) + Ok(unsafe { transmute(i) }) } } } diff --git a/src/libcore/char/methods.rs b/src/libcore/char/methods.rs index 5c5bc9adb5d..bf09b28ff69 100644 --- a/src/libcore/char/methods.rs +++ b/src/libcore/char/methods.rs @@ -593,16 +593,7 @@ impl char { #[stable(feature = "rust1", since = "1.0.0")] #[inline] pub fn len_utf8(self) -> usize { - let code = self as u32; - if code < MAX_ONE_B { - 1 - } else if code < MAX_TWO_B { - 2 - } else if code < MAX_THREE_B { - 3 - } else { - 4 - } + len_utf8(self as u32) } /// Returns the number of 16-bit code units this `char` would need if @@ -670,36 +661,8 @@ impl char { #[stable(feature = "unicode_encode_char", since = "1.15.0")] #[inline] pub fn encode_utf8(self, dst: &mut [u8]) -> &mut str { - let code = self as u32; - let len = self.len_utf8(); - match (len, &mut dst[..]) { - (1, [a, ..]) => { - *a = code as u8; - } - (2, [a, b, ..]) => { - *a = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; - *b = (code & 0x3F) as u8 | TAG_CONT; - } - (3, [a, b, c, ..]) => { - *a = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; - *b = (code >> 6 & 0x3F) as u8 | TAG_CONT; - *c = (code & 0x3F) as u8 | TAG_CONT; - } - (4, [a, b, c, d, ..]) => { - *a = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; - *b = (code >> 12 & 0x3F) as u8 | TAG_CONT; - *c = (code >> 6 & 0x3F) as u8 | TAG_CONT; - *d = (code & 0x3F) as u8 | TAG_CONT; - } - _ => panic!( - "encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}", - len, - code, - dst.len(), - ), - }; - // SAFETY: We just wrote UTF-8 content in, so converting to str is fine. - unsafe { from_utf8_unchecked_mut(&mut dst[..len]) } + // SAFETY: `char` is not a surrogate, so this is valid UTF-8. + unsafe { from_utf8_unchecked_mut(encode_utf8_raw(self as u32, dst)) } } /// Encodes this character as UTF-16 into the provided `u16` buffer, @@ -739,28 +702,7 @@ impl char { #[stable(feature = "unicode_encode_char", since = "1.15.0")] #[inline] pub fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] { - let mut code = self as u32; - // SAFETY: each arm checks whether there are enough bits to write into - unsafe { - if (code & 0xFFFF) == code && !dst.is_empty() { - // The BMP falls through (assuming non-surrogate, as it should) - *dst.get_unchecked_mut(0) = code as u16; - slice::from_raw_parts_mut(dst.as_mut_ptr(), 1) - } else if dst.len() >= 2 { - // Supplementary planes break into surrogates. - code -= 0x1_0000; - *dst.get_unchecked_mut(0) = 0xD800 | ((code >> 10) as u16); - *dst.get_unchecked_mut(1) = 0xDC00 | ((code as u16) & 0x3FF); - slice::from_raw_parts_mut(dst.as_mut_ptr(), 2) - } else { - panic!( - "encode_utf16: need {} units to encode U+{:X}, but the buffer has {}", - from_u32_unchecked(code).len_utf16(), - code, - dst.len(), - ) - } - } + encode_utf16_raw(self as u32, dst) } /// Returns `true` if this `char` has the `Alphabetic` property. @@ -1673,3 +1615,100 @@ impl char { } } } + +#[inline] +fn len_utf8(code: u32) -> usize { + if code < MAX_ONE_B { + 1 + } else if code < MAX_TWO_B { + 2 + } else if code < MAX_THREE_B { + 3 + } else { + 4 + } +} + +/// Encodes a raw u32 value as UTF-8 into the provided byte buffer, +/// and then returns the subslice of the buffer that contains the encoded character. +/// +/// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range. +/// (Creating a `char` in the surrogate range is UB.) +/// The result is valid [generalized UTF-8] but not valid UTF-8. +/// +/// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8 +/// +/// # Panics +/// +/// Panics if the buffer is not large enough. +/// A buffer of length four is large enough to encode any `char`. +#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")] +#[doc(hidden)] +#[inline] +pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut [u8] { + let len = len_utf8(code); + match (len, &mut dst[..]) { + (1, [a, ..]) => { + *a = code as u8; + } + (2, [a, b, ..]) => { + *a = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; + *b = (code & 0x3F) as u8 | TAG_CONT; + } + (3, [a, b, c, ..]) => { + *a = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; + *b = (code >> 6 & 0x3F) as u8 | TAG_CONT; + *c = (code & 0x3F) as u8 | TAG_CONT; + } + (4, [a, b, c, d, ..]) => { + *a = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; + *b = (code >> 12 & 0x3F) as u8 | TAG_CONT; + *c = (code >> 6 & 0x3F) as u8 | TAG_CONT; + *d = (code & 0x3F) as u8 | TAG_CONT; + } + _ => panic!( + "encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}", + len, + code, + dst.len(), + ), + }; + &mut dst[..len] +} + +/// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer, +/// and then returns the subslice of the buffer that contains the encoded character. +/// +/// Unlike `char::encode_utf16`, this method also handles codepoints in the surrogate range. +/// (Creating a `char` in the surrogate range is UB.) +/// +/// # Panics +/// +/// Panics if the buffer is not large enough. +/// A buffer of length 2 is large enough to encode any `char`. +#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")] +#[doc(hidden)] +#[inline] +pub fn encode_utf16_raw(mut code: u32, dst: &mut [u16]) -> &mut [u16] { + // SAFETY: each arm checks whether there are enough bits to write into + unsafe { + if (code & 0xFFFF) == code && !dst.is_empty() { + // The BMP falls through + *dst.get_unchecked_mut(0) = code as u16; + slice::from_raw_parts_mut(dst.as_mut_ptr(), 1) + } else if dst.len() >= 2 { + // Supplementary planes break into surrogates. + code -= 0x1_0000; + *dst.get_unchecked_mut(0) = 0xD800 | ((code >> 10) as u16); + *dst.get_unchecked_mut(1) = 0xDC00 | ((code as u16) & 0x3FF); + slice::from_raw_parts_mut(dst.as_mut_ptr(), 2) + } else { + panic!( + "encode_utf16: need {} units to encode U+{:X}, but the buffer has {}", + from_u32_unchecked(code).len_utf16(), + code, + dst.len(), + ) + } + } +} diff --git a/src/libcore/char/mod.rs b/src/libcore/char/mod.rs index bf65c31e135..1b4e906e4e4 100644 --- a/src/libcore/char/mod.rs +++ b/src/libcore/char/mod.rs @@ -37,6 +37,12 @@ pub use self::decode::{decode_utf16, DecodeUtf16, DecodeUtf16Error}; #[stable(feature = "unicode_version", since = "1.45.0")] pub use crate::unicode::UNICODE_VERSION; +// perma-unstable re-exports +#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")] +pub use self::methods::encode_utf16_raw; +#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")] +pub use self::methods::encode_utf8_raw; + use crate::fmt::{self, Write}; use crate::iter::FusedIterator; |
