diff options
| author | Ralf Jung <post@ralfj.de> | 2020-05-30 17:13:07 +0200 |
|---|---|---|
| committer | Ralf Jung <post@ralfj.de> | 2020-05-30 17:27:34 +0200 |
| commit | 0fb6e63c0438ace4ad9d496376af955c0baacf04 (patch) | |
| tree | 86be9b89d2b2099983548b5407f970df19a4b8b6 | |
| parent | 9c627c33dde998cfe42bcde07e1c5692370daf63 (diff) | |
| download | rust-0fb6e63c0438ace4ad9d496376af955c0baacf04.tar.gz rust-0fb6e63c0438ace4ad9d496376af955c0baacf04.zip | |
encode_utf8_raw is not always valid UTF-8; clarify comments
| -rw-r--r-- | src/libcore/char/methods.rs | 19 | ||||
| -rw-r--r-- | src/libstd/sys_common/wtf8.rs | 2 |
2 files changed, 13 insertions, 8 deletions
diff --git a/src/libcore/char/methods.rs b/src/libcore/char/methods.rs index b1b3c70efb1..bf09b28ff69 100644 --- a/src/libcore/char/methods.rs +++ b/src/libcore/char/methods.rs @@ -661,7 +661,8 @@ impl char { #[stable(feature = "unicode_encode_char", since = "1.15.0")] #[inline] pub fn encode_utf8(self, dst: &mut [u8]) -> &mut str { - encode_utf8_raw(self as u32, dst) + // SAFETY: `char` is not a surrogate, so this is valid UTF-8. + unsafe { from_utf8_unchecked_mut(encode_utf8_raw(self as u32, dst)) } } /// Encodes this character as UTF-16 into the provided `u16` buffer, @@ -1631,7 +1632,11 @@ fn len_utf8(code: u32) -> usize { /// Encodes a raw u32 value as UTF-8 into the provided byte buffer, /// and then returns the subslice of the buffer that contains the encoded character. /// -/// Unlike `char::encode_utf8`, this method can be called on codepoints in the surrogate range. +/// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range. +/// (Creating a `char` in the surrogate range is UB.) +/// The result is valid [generalized UTF-8] but not valid UTF-8. +/// +/// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8 /// /// # Panics /// @@ -1640,7 +1645,7 @@ fn len_utf8(code: u32) -> usize { #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")] #[doc(hidden)] #[inline] -pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut str { +pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut [u8] { let len = len_utf8(code); match (len, &mut dst[..]) { (1, [a, ..]) => { @@ -1668,14 +1673,14 @@ pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut str { dst.len(), ), }; - // SAFETY: We just wrote UTF-8 content in, so converting to str is fine. - unsafe { from_utf8_unchecked_mut(&mut dst[..len]) } + &mut dst[..len] } /// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer, /// and then returns the subslice of the buffer that contains the encoded character. /// -/// Unlike `char::encode_utf16`, this method can be called on codepoints in the surrogate range. +/// Unlike `char::encode_utf16`, this method also handles codepoints in the surrogate range. +/// (Creating a `char` in the surrogate range is UB.) /// /// # Panics /// @@ -1688,7 +1693,7 @@ pub fn encode_utf16_raw(mut code: u32, dst: &mut [u16]) -> &mut [u16] { // SAFETY: each arm checks whether there are enough bits to write into unsafe { if (code & 0xFFFF) == code && !dst.is_empty() { - // The BMP falls through (assuming non-surrogate, as it should) + // The BMP falls through *dst.get_unchecked_mut(0) = code as u16; slice::from_raw_parts_mut(dst.as_mut_ptr(), 1) } else if dst.len() >= 2 { diff --git a/src/libstd/sys_common/wtf8.rs b/src/libstd/sys_common/wtf8.rs index 9f589c93ae5..ccb54b7e68d 100644 --- a/src/libstd/sys_common/wtf8.rs +++ b/src/libstd/sys_common/wtf8.rs @@ -202,7 +202,7 @@ impl Wtf8Buf { /// This does **not** include the WTF-8 concatenation check. fn push_code_point_unchecked(&mut self, code_point: CodePoint) { let mut bytes = [0; 4]; - let bytes = char::encode_utf8_raw(code_point.value, &mut bytes).as_bytes(); + let bytes = char::encode_utf8_raw(code_point.value, &mut bytes); self.bytes.extend_from_slice(bytes) } |
