about summary refs log tree commit diff
diff options
context:
space:
mode:
authorltdk <usr@ltdk.xyz>2025-08-13 18:54:48 -0400
committerltdk <usr@ltdk.xyz>2025-08-20 20:31:33 -0400
commit7c81a067ea4cfd289d30c3903ac60b113f481c87 (patch)
tree468625ed2a63e3f813900b5c65a650f027f9b872
parent2914291e09cb13aab64207f9e11f2aaf74de3904 (diff)
downloadrust-7c81a067ea4cfd289d30c3903ac60b113f481c87.tar.gz
rust-7c81a067ea4cfd289d30c3903ac60b113f481c87.zip
Diff-massaging commit
-rw-r--r--library/alloc/src/wtf8/mod.rs75
-rw-r--r--library/core/src/wtf8.rs82
2 files changed, 70 insertions, 87 deletions
diff --git a/library/alloc/src/wtf8/mod.rs b/library/alloc/src/wtf8/mod.rs
index 95d317a5efb..047994adc44 100644
--- a/library/alloc/src/wtf8/mod.rs
+++ b/library/alloc/src/wtf8/mod.rs
@@ -451,53 +451,46 @@ impl Extend<CodePoint> for Wtf8Buf {
     }
 }
 
-// helps diff
-mod wtf8 {
-    use super::*;
-
-    /// Creates an owned `Wtf8Buf` from a borrowed `Wtf8`.
-    pub(super) fn to_owned(slice: &Wtf8) -> Wtf8Buf {
-        Wtf8Buf { bytes: slice.as_bytes().to_vec(), is_known_utf8: false }
-    }
+/// Creates an owned `Wtf8Buf` from a borrowed `Wtf8`.
+pub(super) fn to_owned(slice: &Wtf8) -> Wtf8Buf {
+    Wtf8Buf { bytes: slice.as_bytes().to_vec(), is_known_utf8: false }
+}
 
-    /// Lossily converts the string to UTF-8.
-    /// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8.
-    ///
-    /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”).
-    ///
-    /// This only copies the data if necessary (if it contains any surrogate).
-    pub(super) fn to_string_lossy(slice: &Wtf8) -> Cow<'_, str> {
-        let Some((surrogate_pos, _)) = slice.next_surrogate(0) else {
-            return Cow::Borrowed(unsafe { str::from_utf8_unchecked(slice.as_bytes()) });
-        };
-        let wtf8_bytes = slice.as_bytes();
-        let mut utf8_bytes = Vec::with_capacity(slice.len());
-        utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]);
-        utf8_bytes.extend_from_slice("\u{FFFD}".as_bytes());
-        let mut pos = surrogate_pos + 3;
-        loop {
-            match slice.next_surrogate(pos) {
-                Some((surrogate_pos, _)) => {
-                    utf8_bytes.extend_from_slice(&wtf8_bytes[pos..surrogate_pos]);
-                    utf8_bytes.extend_from_slice("\u{FFFD}".as_bytes());
-                    pos = surrogate_pos + 3;
-                }
-                None => {
-                    utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]);
-                    return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) });
-                }
+/// Lossily converts the string to UTF-8.
+/// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8.
+///
+/// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”).
+///
+/// This only copies the data if necessary (if it contains any surrogate).
+pub(super) fn to_string_lossy(slice: &Wtf8) -> Cow<'_, str> {
+    let Some((surrogate_pos, _)) = slice.next_surrogate(0) else {
+        return Cow::Borrowed(unsafe { str::from_utf8_unchecked(slice.as_bytes()) });
+    };
+    let wtf8_bytes = slice.as_bytes();
+    let mut utf8_bytes = Vec::with_capacity(slice.len());
+    utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]);
+    utf8_bytes.extend_from_slice("\u{FFFD}".as_bytes());
+    let mut pos = surrogate_pos + 3;
+    loop {
+        match slice.next_surrogate(pos) {
+            Some((surrogate_pos, _)) => {
+                utf8_bytes.extend_from_slice(&wtf8_bytes[pos..surrogate_pos]);
+                utf8_bytes.extend_from_slice("\u{FFFD}".as_bytes());
+                pos = surrogate_pos + 3;
+            }
+            None => {
+                utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]);
+                return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) });
             }
         }
     }
-
-    #[inline]
-    pub(super) fn clone_into(slice: &Wtf8, buf: &mut Wtf8Buf) {
-        buf.is_known_utf8 = false;
-        slice.as_bytes().clone_into(&mut buf.bytes);
-    }
 }
 
-use self::wtf8::{to_owned, to_string_lossy, clone_into};
+#[inline]
+pub(super) fn clone_into(slice: &Wtf8, buf: &mut Wtf8Buf) {
+    buf.is_known_utf8 = false;
+    slice.as_bytes().clone_into(&mut buf.bytes);
+}
 
 #[cfg(not(test))]
 impl Wtf8 {
diff --git a/library/core/src/wtf8.rs b/library/core/src/wtf8.rs
index 5631993dea2..de0dfa560a3 100644
--- a/library/core/src/wtf8.rs
+++ b/library/core/src/wtf8.rs
@@ -345,16 +345,6 @@ impl Wtf8 {
     pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool {
         self.bytes.eq_ignore_ascii_case(&other.bytes)
     }
-
-    #[inline]
-    pub fn is_code_point_boundary(&self, index: usize) -> bool {
-        is_code_point_boundary(self, index)
-    }
-
-    #[inline]
-    pub fn check_utf8_boundary(&self, index: usize) {
-        check_utf8_boundary(self, index)
-    }
 }
 
 /// Returns a slice of the given string for the byte range \[`begin`..`end`).
@@ -435,44 +425,44 @@ fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 {
     0xD800 | (second_byte as u16 & 0x3F) << 6 | third_byte as u16 & 0x3F
 }
 
-// helps diff to be unindented
-
-/// Copied from str::is_char_boundary
-#[inline]
-pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool {
-    if index == 0 {
-        return true;
-    }
-    match slice.bytes.get(index) {
-        None => index == slice.len(),
-        Some(&b) => (b as i8) >= -0x40,
+impl Wtf8 {
+    /// Copied from str::is_char_boundary
+    #[inline]
+    pub fn is_code_point_boundary(&self, index: usize) -> bool {
+        if index == 0 {
+            return true;
+        }
+        match self.bytes.get(index) {
+            None => index == self.len(),
+            Some(&b) => (b as i8) >= -0x40,
+        }
     }
-}
 
-/// Verify that `index` is at the edge of either a valid UTF-8 codepoint
-/// (i.e. a codepoint that's not a surrogate) or of the whole string.
-///
-/// These are the cases currently permitted by `OsStr::slice_encoded_bytes`.
-/// Splitting between surrogates is valid as far as WTF-8 is concerned, but
-/// we do not permit it in the public API because WTF-8 is considered an
-/// implementation detail.
-#[track_caller]
-#[inline]
-pub fn check_utf8_boundary(slice: &Wtf8, index: usize) {
-    if index == 0 {
-        return;
-    }
-    match slice.bytes.get(index) {
-        Some(0xED) => (), // Might be a surrogate
-        Some(&b) if (b as i8) >= -0x40 => return,
-        Some(_) => panic!("byte index {index} is not a codepoint boundary"),
-        None if index == slice.len() => return,
-        None => panic!("byte index {index} is out of bounds"),
-    }
-    if slice.bytes[index + 1] >= 0xA0 {
-        // There's a surrogate after index. Now check before index.
-        if index >= 3 && slice.bytes[index - 3] == 0xED && slice.bytes[index - 2] >= 0xA0 {
-            panic!("byte index {index} lies between surrogate codepoints");
+    /// Verify that `index` is at the edge of either a valid UTF-8 codepoint
+    /// (i.e. a codepoint that's not a surrogate) or of the whole string.
+    ///
+    /// These are the cases currently permitted by `OsStr::slice_encoded_bytes`.
+    /// Splitting between surrogates is valid as far as WTF-8 is concerned, but
+    /// we do not permit it in the public API because WTF-8 is considered an
+    /// implementation detail.
+    #[track_caller]
+    #[inline]
+    pub fn check_utf8_boundary(&self, index: usize) {
+        if index == 0 {
+            return;
+        }
+        match self.bytes.get(index) {
+            Some(0xED) => (), // Might be a surrogate
+            Some(&b) if (b as i8) >= -0x40 => return,
+            Some(_) => panic!("byte index {index} is not a codepoint boundary"),
+            None if index == self.len() => return,
+            None => panic!("byte index {index} is out of bounds"),
+        }
+        if self.bytes[index + 1] >= 0xA0 {
+            // There's a surrogate after index. Now check before index.
+            if index >= 3 && self.bytes[index - 3] == 0xED && self.bytes[index - 2] >= 0xA0 {
+                panic!("byte index {index} lies between surrogate codepoints");
+            }
         }
     }
 }