diff options
| author | ltdk <usr@ltdk.xyz> | 2025-08-13 18:54:48 -0400 |
|---|---|---|
| committer | ltdk <usr@ltdk.xyz> | 2025-08-20 20:31:33 -0400 |
| commit | 7c81a067ea4cfd289d30c3903ac60b113f481c87 (patch) | |
| tree | 468625ed2a63e3f813900b5c65a650f027f9b872 | |
| parent | 2914291e09cb13aab64207f9e11f2aaf74de3904 (diff) | |
| download | rust-7c81a067ea4cfd289d30c3903ac60b113f481c87.tar.gz rust-7c81a067ea4cfd289d30c3903ac60b113f481c87.zip | |
Diff-massaging commit
| -rw-r--r-- | library/alloc/src/wtf8/mod.rs | 75 | ||||
| -rw-r--r-- | library/core/src/wtf8.rs | 82 |
2 files changed, 70 insertions, 87 deletions
diff --git a/library/alloc/src/wtf8/mod.rs b/library/alloc/src/wtf8/mod.rs index 95d317a5efb..047994adc44 100644 --- a/library/alloc/src/wtf8/mod.rs +++ b/library/alloc/src/wtf8/mod.rs @@ -451,53 +451,46 @@ impl Extend<CodePoint> for Wtf8Buf { } } -// helps diff -mod wtf8 { - use super::*; - - /// Creates an owned `Wtf8Buf` from a borrowed `Wtf8`. - pub(super) fn to_owned(slice: &Wtf8) -> Wtf8Buf { - Wtf8Buf { bytes: slice.as_bytes().to_vec(), is_known_utf8: false } - } +/// Creates an owned `Wtf8Buf` from a borrowed `Wtf8`. +pub(super) fn to_owned(slice: &Wtf8) -> Wtf8Buf { + Wtf8Buf { bytes: slice.as_bytes().to_vec(), is_known_utf8: false } +} - /// Lossily converts the string to UTF-8. - /// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8. - /// - /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”). - /// - /// This only copies the data if necessary (if it contains any surrogate). - pub(super) fn to_string_lossy(slice: &Wtf8) -> Cow<'_, str> { - let Some((surrogate_pos, _)) = slice.next_surrogate(0) else { - return Cow::Borrowed(unsafe { str::from_utf8_unchecked(slice.as_bytes()) }); - }; - let wtf8_bytes = slice.as_bytes(); - let mut utf8_bytes = Vec::with_capacity(slice.len()); - utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]); - utf8_bytes.extend_from_slice("\u{FFFD}".as_bytes()); - let mut pos = surrogate_pos + 3; - loop { - match slice.next_surrogate(pos) { - Some((surrogate_pos, _)) => { - utf8_bytes.extend_from_slice(&wtf8_bytes[pos..surrogate_pos]); - utf8_bytes.extend_from_slice("\u{FFFD}".as_bytes()); - pos = surrogate_pos + 3; - } - None => { - utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]); - return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) }); - } +/// Lossily converts the string to UTF-8. +/// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8. +/// +/// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”). 
+/// +/// This only copies the data if necessary (if it contains any surrogate). +pub(super) fn to_string_lossy(slice: &Wtf8) -> Cow<'_, str> { + let Some((surrogate_pos, _)) = slice.next_surrogate(0) else { + return Cow::Borrowed(unsafe { str::from_utf8_unchecked(slice.as_bytes()) }); + }; + let wtf8_bytes = slice.as_bytes(); + let mut utf8_bytes = Vec::with_capacity(slice.len()); + utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]); + utf8_bytes.extend_from_slice("\u{FFFD}".as_bytes()); + let mut pos = surrogate_pos + 3; + loop { + match slice.next_surrogate(pos) { + Some((surrogate_pos, _)) => { + utf8_bytes.extend_from_slice(&wtf8_bytes[pos..surrogate_pos]); + utf8_bytes.extend_from_slice("\u{FFFD}".as_bytes()); + pos = surrogate_pos + 3; + } + None => { + utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]); + return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) }); } } } - - #[inline] - pub(super) fn clone_into(slice: &Wtf8, buf: &mut Wtf8Buf) { - buf.is_known_utf8 = false; - slice.as_bytes().clone_into(&mut buf.bytes); - } } -use self::wtf8::{to_owned, to_string_lossy, clone_into}; +#[inline] +pub(super) fn clone_into(slice: &Wtf8, buf: &mut Wtf8Buf) { + buf.is_known_utf8 = false; + slice.as_bytes().clone_into(&mut buf.bytes); +} #[cfg(not(test))] impl Wtf8 { diff --git a/library/core/src/wtf8.rs b/library/core/src/wtf8.rs index 5631993dea2..de0dfa560a3 100644 --- a/library/core/src/wtf8.rs +++ b/library/core/src/wtf8.rs @@ -345,16 +345,6 @@ impl Wtf8 { pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool { self.bytes.eq_ignore_ascii_case(&other.bytes) } - - #[inline] - pub fn is_code_point_boundary(&self, index: usize) -> bool { - is_code_point_boundary(self, index) - } - - #[inline] - pub fn check_utf8_boundary(&self, index: usize) { - check_utf8_boundary(self, index) - } } /// Returns a slice of the given string for the byte range \[`begin`..`end`). 
@@ -435,44 +425,44 @@ fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 { 0xD800 | (second_byte as u16 & 0x3F) << 6 | third_byte as u16 & 0x3F } -// helps diff to be unindented - -/// Copied from str::is_char_boundary -#[inline] -pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool { - if index == 0 { - return true; - } - match slice.bytes.get(index) { - None => index == slice.len(), - Some(&b) => (b as i8) >= -0x40, +impl Wtf8 { + /// Copied from str::is_char_boundary + #[inline] + pub fn is_code_point_boundary(&self, index: usize) -> bool { + if index == 0 { + return true; + } + match self.bytes.get(index) { + None => index == self.len(), + Some(&b) => (b as i8) >= -0x40, + } } -} -/// Verify that `index` is at the edge of either a valid UTF-8 codepoint -/// (i.e. a codepoint that's not a surrogate) or of the whole string. -/// -/// These are the cases currently permitted by `OsStr::slice_encoded_bytes`. -/// Splitting between surrogates is valid as far as WTF-8 is concerned, but -/// we do not permit it in the public API because WTF-8 is considered an -/// implementation detail. -#[track_caller] -#[inline] -pub fn check_utf8_boundary(slice: &Wtf8, index: usize) { - if index == 0 { - return; - } - match slice.bytes.get(index) { - Some(0xED) => (), // Might be a surrogate - Some(&b) if (b as i8) >= -0x40 => return, - Some(_) => panic!("byte index {index} is not a codepoint boundary"), - None if index == slice.len() => return, - None => panic!("byte index {index} is out of bounds"), - } - if slice.bytes[index + 1] >= 0xA0 { - // There's a surrogate after index. Now check before index. - if index >= 3 && slice.bytes[index - 3] == 0xED && slice.bytes[index - 2] >= 0xA0 { - panic!("byte index {index} lies between surrogate codepoints"); + /// Verify that `index` is at the edge of either a valid UTF-8 codepoint + /// (i.e. a codepoint that's not a surrogate) or of the whole string. 
+ /// + /// These are the cases currently permitted by `OsStr::slice_encoded_bytes`. + /// Splitting between surrogates is valid as far as WTF-8 is concerned, but + /// we do not permit it in the public API because WTF-8 is considered an + /// implementation detail. + #[track_caller] + #[inline] + pub fn check_utf8_boundary(&self, index: usize) { + if index == 0 { + return; + } + match self.bytes.get(index) { + Some(0xED) => (), // Might be a surrogate + Some(&b) if (b as i8) >= -0x40 => return, + Some(_) => panic!("byte index {index} is not a codepoint boundary"), + None if index == self.len() => return, + None => panic!("byte index {index} is out of bounds"), + } + if self.bytes[index + 1] >= 0xA0 { + // There's a surrogate after index. Now check before index. + if index >= 3 && self.bytes[index - 3] == 0xED && self.bytes[index - 2] >= 0xA0 { + panic!("byte index {index} lies between surrogate codepoints"); + } } } } |
