about summary refs log tree commit diff
diff options
context:
space:
mode:
authorMatthias Krüger <matthias.krueger@famsik.de>2025-02-19 21:16:12 +0100
committerGitHub <noreply@github.com>2025-02-19 21:16:12 +0100
commit3964bb131b3b7a24f20803d8efe3bd1d76fc5abc (patch)
tree7c6b96ea3cb13b1739ffd57af16669d8968e9c5e
parentc29cc600fdc7f6c673242c340126dddfdb040413 (diff)
parent05e4175d79a7e8f2fc11c3d2ae70bb31a354f130 (diff)
downloadrust-3964bb131b3b7a24f20803d8efe3bd1d76fc5abc.tar.gz
rust-3964bb131b3b7a24f20803d8efe3bd1d76fc5abc.zip
Rollup merge of #137155 - thaliaarchi:wtf8-organize, r=ChrisDenton
Organize `OsString`/`OsStr` shims

Synchronize the `bytes.rs` and `wtf8.rs` shims for `OsString`/`OsStr` so they're easier to diff between each other. This is mostly ordering items the same between the two. I tried to minimize moves and went for the average locations between the files.

With them in the same order, it is clear that `FromInner<_>` is not implemented for `bytes::Buf` and `Clone::clone_from` is not implemented for `wtf8::Buf`, but they are for the other. Fix that.

I added #[inline] to all inherent methods of the `OsString`/`OsStr` shims, because it seemed that was already the rough pattern. `bytes.rs` has more inlining than `wtf8.rs`, so I added the corresponding ones to `wtf8.rs`. Then, the common missing ones have no discernible pattern to me. They're not divided by non-allocating/allocating. Perhaps the pattern is that UTF-8 validation isn't inlined? Since these types are merely the inner values in `OsStr`/`OsString`, I put inline on all methods and let those public types dictate inlining. I have not inspected codegen or run benchmarks.

Also, touch up some (private) documentation comments.

r? ``````@ChrisDenton``````
-rw-r--r--library/std/src/sys/os_str/bytes.rs86
-rw-r--r--library/std/src/sys/os_str/wtf8.rs107
-rw-r--r--library/std/src/sys_common/wtf8.rs70
3 files changed, 154 insertions, 109 deletions
diff --git a/library/std/src/sys/os_str/bytes.rs b/library/std/src/sys/os_str/bytes.rs
index 5b65d862be1..1d337694944 100644
--- a/library/std/src/sys/os_str/bytes.rs
+++ b/library/std/src/sys/os_str/bytes.rs
@@ -8,7 +8,7 @@ use crate::collections::TryReserveError;
 use crate::fmt::Write;
 use crate::rc::Rc;
 use crate::sync::Arc;
-use crate::sys_common::{AsInner, IntoInner};
+use crate::sys_common::{AsInner, FromInner, IntoInner};
 use crate::{fmt, mem, str};
 
 #[cfg(test)]
@@ -25,6 +25,37 @@ pub struct Slice {
     pub inner: [u8],
 }
 
+impl IntoInner<Vec<u8>> for Buf {
+    fn into_inner(self) -> Vec<u8> {
+        self.inner
+    }
+}
+
+impl FromInner<Vec<u8>> for Buf {
+    fn from_inner(inner: Vec<u8>) -> Self {
+        Buf { inner }
+    }
+}
+
+impl AsInner<[u8]> for Buf {
+    #[inline]
+    fn as_inner(&self) -> &[u8] {
+        &self.inner
+    }
+}
+
+impl fmt::Debug for Buf {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        fmt::Debug::fmt(self.as_slice(), f)
+    }
+}
+
+impl fmt::Display for Buf {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        fmt::Display::fmt(self.as_slice(), f)
+    }
+}
+
 impl fmt::Debug for Slice {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         fmt::Debug::fmt(&self.inner.utf8_chunks().debug(), f)
@@ -55,18 +86,6 @@ impl fmt::Display for Slice {
     }
 }
 
-impl fmt::Debug for Buf {
-    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
-        fmt::Debug::fmt(self.as_slice(), formatter)
-    }
-}
-
-impl fmt::Display for Buf {
-    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
-        fmt::Display::fmt(self.as_slice(), formatter)
-    }
-}
-
 impl Clone for Buf {
     #[inline]
     fn clone(&self) -> Self {
@@ -79,19 +98,6 @@ impl Clone for Buf {
     }
 }
 
-impl IntoInner<Vec<u8>> for Buf {
-    fn into_inner(self) -> Vec<u8> {
-        self.inner
-    }
-}
-
-impl AsInner<[u8]> for Buf {
-    #[inline]
-    fn as_inner(&self) -> &[u8] {
-        &self.inner
-    }
-}
-
 impl Buf {
     #[inline]
     pub fn into_encoded_bytes(self) -> Vec<u8> {
@@ -103,6 +109,12 @@ impl Buf {
         Self { inner: s }
     }
 
+    #[inline]
+    pub fn into_string(self) -> Result<String, Buf> {
+        String::from_utf8(self.inner).map_err(|p| Buf { inner: p.into_bytes() })
+    }
+
+    #[inline]
     pub fn from_string(s: String) -> Buf {
         Buf { inner: s.into_bytes() }
     }
@@ -123,6 +135,11 @@ impl Buf {
     }
 
     #[inline]
+    pub fn push_slice(&mut self, s: &Slice) {
+        self.inner.extend_from_slice(&s.inner)
+    }
+
+    #[inline]
     pub fn reserve(&mut self, additional: usize) {
         self.inner.reserve(additional)
     }
@@ -157,7 +174,7 @@ impl Buf {
         // SAFETY: Slice just wraps [u8],
         // and &*self.inner is &[u8], therefore
         // transmuting &[u8] to &Slice is safe.
-        unsafe { mem::transmute(&*self.inner) }
+        unsafe { mem::transmute(self.inner.as_slice()) }
     }
 
     #[inline]
@@ -165,15 +182,7 @@ impl Buf {
         // SAFETY: Slice just wraps [u8],
         // and &mut *self.inner is &mut [u8], therefore
         // transmuting &mut [u8] to &mut Slice is safe.
-        unsafe { mem::transmute(&mut *self.inner) }
-    }
-
-    pub fn into_string(self) -> Result<String, Buf> {
-        String::from_utf8(self.inner).map_err(|p| Buf { inner: p.into_bytes() })
-    }
-
-    pub fn push_slice(&mut self, s: &Slice) {
-        self.inner.extend_from_slice(&s.inner)
+        unsafe { mem::transmute(self.inner.as_mut_slice()) }
     }
 
     #[inline]
@@ -278,18 +287,22 @@ impl Slice {
         unsafe { Slice::from_encoded_bytes_unchecked(s.as_bytes()) }
     }
 
+    #[inline]
     pub fn to_str(&self) -> Result<&str, crate::str::Utf8Error> {
         str::from_utf8(&self.inner)
     }
 
+    #[inline]
     pub fn to_string_lossy(&self) -> Cow<'_, str> {
         String::from_utf8_lossy(&self.inner)
     }
 
+    #[inline]
     pub fn to_owned(&self) -> Buf {
         Buf { inner: self.inner.to_vec() }
     }
 
+    #[inline]
     pub fn clone_into(&self, buf: &mut Buf) {
         self.inner.clone_into(&mut buf.inner)
     }
@@ -300,6 +313,7 @@ impl Slice {
         unsafe { mem::transmute(boxed) }
     }
 
+    #[inline]
     pub fn empty_box() -> Box<Slice> {
         let boxed: Box<[u8]> = Default::default();
         unsafe { mem::transmute(boxed) }
diff --git a/library/std/src/sys/os_str/wtf8.rs b/library/std/src/sys/os_str/wtf8.rs
index a4ad5966afe..19728d33990 100644
--- a/library/std/src/sys/os_str/wtf8.rs
+++ b/library/std/src/sys/os_str/wtf8.rs
@@ -10,11 +10,16 @@ use crate::sys_common::wtf8::{Wtf8, Wtf8Buf, check_utf8_boundary};
 use crate::sys_common::{AsInner, FromInner, IntoInner};
 use crate::{fmt, mem};
 
-#[derive(Clone, Hash)]
+#[derive(Hash)]
 pub struct Buf {
     pub inner: Wtf8Buf,
 }
 
+#[repr(transparent)]
+pub struct Slice {
+    pub inner: Wtf8,
+}
+
 impl IntoInner<Wtf8Buf> for Buf {
     fn into_inner(self) -> Wtf8Buf {
         self.inner
@@ -35,31 +40,38 @@ impl AsInner<Wtf8> for Buf {
 }
 
 impl fmt::Debug for Buf {
-    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
-        fmt::Debug::fmt(self.as_slice(), formatter)
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        fmt::Debug::fmt(self.as_slice(), f)
     }
 }
 
 impl fmt::Display for Buf {
-    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
-        fmt::Display::fmt(self.as_slice(), formatter)
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        fmt::Display::fmt(self.as_slice(), f)
     }
 }
 
-#[repr(transparent)]
-pub struct Slice {
-    pub inner: Wtf8,
-}
-
 impl fmt::Debug for Slice {
-    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
-        fmt::Debug::fmt(&self.inner, formatter)
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        fmt::Debug::fmt(&self.inner, f)
     }
 }
 
 impl fmt::Display for Slice {
-    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
-        fmt::Display::fmt(&self.inner, formatter)
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        fmt::Display::fmt(&self.inner, f)
+    }
+}
+
+impl Clone for Buf {
+    #[inline]
+    fn clone(&self) -> Self {
+        Buf { inner: self.inner.clone() }
+    }
+
+    #[inline]
+    fn clone_from(&mut self, source: &Self) {
+        self.inner.clone_from(&source.inner)
     }
 }
 
@@ -74,62 +86,57 @@ impl Buf {
         unsafe { Self { inner: Wtf8Buf::from_bytes_unchecked(s) } }
     }
 
-    pub fn with_capacity(capacity: usize) -> Buf {
-        Buf { inner: Wtf8Buf::with_capacity(capacity) }
-    }
-
-    pub fn clear(&mut self) {
-        self.inner.clear()
-    }
-
-    pub fn capacity(&self) -> usize {
-        self.inner.capacity()
+    #[inline]
+    pub fn into_string(self) -> Result<String, Buf> {
+        self.inner.into_string().map_err(|buf| Buf { inner: buf })
     }
 
+    #[inline]
     pub fn from_string(s: String) -> Buf {
         Buf { inner: Wtf8Buf::from_string(s) }
     }
 
-    pub fn as_slice(&self) -> &Slice {
-        // SAFETY: Slice is just a wrapper for Wtf8,
-        // and self.inner.as_slice() returns &Wtf8.
-        // Therefore, transmuting &Wtf8 to &Slice is safe.
-        unsafe { mem::transmute(self.inner.as_slice()) }
+    #[inline]
+    pub fn with_capacity(capacity: usize) -> Buf {
+        Buf { inner: Wtf8Buf::with_capacity(capacity) }
     }
 
-    pub fn as_mut_slice(&mut self) -> &mut Slice {
-        // SAFETY: Slice is just a wrapper for Wtf8,
-        // and self.inner.as_mut_slice() returns &mut Wtf8.
-        // Therefore, transmuting &mut Wtf8 to &mut Slice is safe.
-        // Additionally, care should be taken to ensure the slice
-        // is always valid Wtf8.
-        unsafe { mem::transmute(self.inner.as_mut_slice()) }
+    #[inline]
+    pub fn clear(&mut self) {
+        self.inner.clear()
     }
 
-    pub fn into_string(self) -> Result<String, Buf> {
-        self.inner.into_string().map_err(|buf| Buf { inner: buf })
+    #[inline]
+    pub fn capacity(&self) -> usize {
+        self.inner.capacity()
     }
 
+    #[inline]
     pub fn push_slice(&mut self, s: &Slice) {
         self.inner.push_wtf8(&s.inner)
     }
 
+    #[inline]
     pub fn reserve(&mut self, additional: usize) {
         self.inner.reserve(additional)
     }
 
+    #[inline]
     pub fn try_reserve(&mut self, additional: usize) -> Result<(), TryReserveError> {
         self.inner.try_reserve(additional)
     }
 
+    #[inline]
     pub fn reserve_exact(&mut self, additional: usize) {
         self.inner.reserve_exact(additional)
     }
 
+    #[inline]
     pub fn try_reserve_exact(&mut self, additional: usize) -> Result<(), TryReserveError> {
         self.inner.try_reserve_exact(additional)
     }
 
+    #[inline]
     pub fn shrink_to_fit(&mut self) {
         self.inner.shrink_to_fit()
     }
@@ -140,6 +147,24 @@ impl Buf {
     }
 
     #[inline]
+    pub fn as_slice(&self) -> &Slice {
+        // SAFETY: Slice is just a wrapper for Wtf8,
+        // and self.inner.as_slice() returns &Wtf8.
+        // Therefore, transmuting &Wtf8 to &Slice is safe.
+        unsafe { mem::transmute(self.inner.as_slice()) }
+    }
+
+    #[inline]
+    pub fn as_mut_slice(&mut self) -> &mut Slice {
+        // SAFETY: Slice is just a wrapper for Wtf8,
+        // and self.inner.as_mut_slice() returns &mut Wtf8.
+        // Therefore, transmuting &mut Wtf8 to &mut Slice is safe.
+        // Additionally, care should be taken to ensure the slice
+        // is always valid Wtf8.
+        unsafe { mem::transmute(self.inner.as_mut_slice()) }
+    }
+
+    #[inline]
     pub fn leak<'a>(self) -> &'a mut Slice {
         unsafe { mem::transmute(self.inner.leak()) }
     }
@@ -194,6 +219,7 @@ impl Slice {
     }
 
     #[track_caller]
+    #[inline]
     pub fn check_public_boundary(&self, index: usize) {
         check_utf8_boundary(&self.inner, index);
     }
@@ -203,18 +229,22 @@ impl Slice {
         unsafe { mem::transmute(Wtf8::from_str(s)) }
     }
 
+    #[inline]
     pub fn to_str(&self) -> Result<&str, crate::str::Utf8Error> {
         self.inner.as_str()
     }
 
+    #[inline]
     pub fn to_string_lossy(&self) -> Cow<'_, str> {
         self.inner.to_string_lossy()
     }
 
+    #[inline]
     pub fn to_owned(&self) -> Buf {
         Buf { inner: self.inner.to_owned() }
     }
 
+    #[inline]
     pub fn clone_into(&self, buf: &mut Buf) {
         self.inner.clone_into(&mut buf.inner)
     }
@@ -224,6 +254,7 @@ impl Slice {
         unsafe { mem::transmute(self.inner.into_box()) }
     }
 
+    #[inline]
     pub fn empty_box() -> Box<Slice> {
         unsafe { mem::transmute(Wtf8::empty_box()) }
     }
diff --git a/library/std/src/sys_common/wtf8.rs b/library/std/src/sys_common/wtf8.rs
index a7704a65fc9..952c39132b0 100644
--- a/library/std/src/sys_common/wtf8.rs
+++ b/library/std/src/sys_common/wtf8.rs
@@ -156,9 +156,12 @@ impl ops::DerefMut for Wtf8Buf {
     }
 }
 
-/// Format the string with double quotes,
-/// and surrogates as `\u` followed by four hexadecimal digits.
-/// Example: `"a\u{D800}"` for a string with code points [U+0061, U+D800]
+/// Formats the string in double quotes, with characters escaped according to
+/// [`char::escape_debug`] and unpaired surrogates represented as `\u{xxxx}`,
+/// where each `x` is a hexadecimal digit.
+///
+/// For example, the code units [U+0061, U+D800, U+000A] are formatted as
+/// `"a\u{D800}\n"`.
 impl fmt::Debug for Wtf8Buf {
     #[inline]
     fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
@@ -181,7 +184,7 @@ impl Wtf8Buf {
 
     /// Creates a WTF-8 string from a WTF-8 byte vec.
     ///
-    /// Since the byte vec is not checked for valid WTF-8, this functions is
+    /// Since the byte vec is not checked for valid WTF-8, this function is
     /// marked unsafe.
     #[inline]
     pub unsafe fn from_bytes_unchecked(value: Vec<u8>) -> Wtf8Buf {
@@ -205,7 +208,7 @@ impl Wtf8Buf {
     /// Since WTF-8 is a superset of UTF-8, this always succeeds.
     #[inline]
     pub fn from_str(s: &str) -> Wtf8Buf {
-        Wtf8Buf { bytes: <[_]>::to_vec(s.as_bytes()), is_known_utf8: true }
+        Wtf8Buf { bytes: s.as_bytes().to_vec(), is_known_utf8: true }
     }
 
     pub fn clear(&mut self) {
@@ -237,8 +240,9 @@ impl Wtf8Buf {
         string
     }
 
-    /// Copied from String::push
+    /// Appends the given `char` to the end of this string.
     /// This does **not** include the WTF-8 concatenation check or `is_known_utf8` check.
+    /// Copied from String::push.
     fn push_code_point_unchecked(&mut self, code_point: CodePoint) {
         let mut bytes = [0; MAX_LEN_UTF8];
         let bytes = encode_utf8_raw(code_point.value, &mut bytes);
@@ -264,16 +268,16 @@ impl Wtf8Buf {
     ///
     /// # Panics
     ///
-    /// Panics if the new capacity overflows `usize`.
+    /// Panics if the new capacity exceeds `isize::MAX` bytes.
     #[inline]
     pub fn reserve(&mut self, additional: usize) {
         self.bytes.reserve(additional)
     }
 
-    /// Tries to reserve capacity for at least `additional` more length units
-    /// in the given `Wtf8Buf`. The `Wtf8Buf` may reserve more space to avoid
-    /// frequent reallocations. After calling `try_reserve`, capacity will be
-    /// greater than or equal to `self.len() + additional`. Does nothing if
+    /// Tries to reserve capacity for at least `additional` more bytes to be
+    /// inserted in the given `Wtf8Buf`. The `Wtf8Buf` may reserve more space to
+    /// avoid frequent reallocations. After calling `try_reserve`, capacity will
+    /// be greater than or equal to `self.len() + additional`. Does nothing if
     /// capacity is already sufficient. This method preserves the contents even
     /// if an error occurs.
     ///
@@ -291,8 +295,8 @@ impl Wtf8Buf {
         self.bytes.reserve_exact(additional)
     }
 
-    /// Tries to reserve the minimum capacity for exactly `additional`
-    /// length units in the given `Wtf8Buf`. After calling
+    /// Tries to reserve the minimum capacity for exactly `additional` more
+    /// bytes to be inserted in the given `Wtf8Buf`. After calling
     /// `try_reserve_exact`, capacity will be greater than or equal to
     /// `self.len() + additional` if it returns `Ok(())`.
     /// Does nothing if the capacity is already sufficient.
@@ -440,22 +444,17 @@ impl Wtf8Buf {
     ///
     /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”)
     pub fn into_string_lossy(mut self) -> String {
-        // Fast path: If we already have UTF-8, we can return it immediately.
-        if self.is_known_utf8 {
-            return unsafe { String::from_utf8_unchecked(self.bytes) };
-        }
-
-        let mut pos = 0;
-        loop {
-            match self.next_surrogate(pos) {
-                Some((surrogate_pos, _)) => {
-                    pos = surrogate_pos + 3;
-                    self.bytes[surrogate_pos..pos]
-                        .copy_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
-                }
-                None => return unsafe { String::from_utf8_unchecked(self.bytes) },
+        if !self.is_known_utf8 {
+            let mut pos = 0;
+            while let Some((surrogate_pos, _)) = self.next_surrogate(pos) {
+                pos = surrogate_pos + 3;
+                // Surrogates and the replacement character are all 3 bytes, so
+                // they can substituted in-place.
+                self.bytes[surrogate_pos..pos]
+                    .copy_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
             }
         }
+        unsafe { String::from_utf8_unchecked(self.bytes) }
     }
 
     /// Converts this `Wtf8Buf` into a boxed `Wtf8`.
@@ -535,9 +534,9 @@ impl AsInner<[u8]> for Wtf8 {
     }
 }
 
-/// Format the slice with double quotes,
-/// and surrogates as `\u` followed by four hexadecimal digits.
-/// Example: `"a\u{D800}"` for a slice with code points [U+0061, U+D800]
+/// Formats the string in double quotes, with characters escaped according to
+/// [`char::escape_debug`] and unpaired surrogates represented as `\u{xxxx}`,
+/// where each `x` is a hexadecimal digit.
 impl fmt::Debug for Wtf8 {
     fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
         fn write_str_escaped(f: &mut fmt::Formatter<'_>, s: &str) -> fmt::Result {
@@ -562,6 +561,8 @@ impl fmt::Debug for Wtf8 {
     }
 }
 
+/// Formats the string with unpaired surrogates substituted with the replacement
+/// character, U+FFFD.
 impl fmt::Display for Wtf8 {
     fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
         let wtf8_bytes = &self.bytes;
@@ -672,9 +673,8 @@ impl Wtf8 {
     ///
     /// This only copies the data if necessary (if it contains any surrogate).
     pub fn to_string_lossy(&self) -> Cow<'_, str> {
-        let surrogate_pos = match self.next_surrogate(0) {
-            None => return Cow::Borrowed(unsafe { str::from_utf8_unchecked(&self.bytes) }),
-            Some((pos, _)) => pos,
+        let Some((surrogate_pos, _)) = self.next_surrogate(0) else {
+            return Cow::Borrowed(unsafe { str::from_utf8_unchecked(&self.bytes) });
         };
         let wtf8_bytes = &self.bytes;
         let mut utf8_bytes = Vec::with_capacity(self.len());
@@ -964,7 +964,7 @@ pub struct Wtf8CodePoints<'a> {
     bytes: slice::Iter<'a, u8>,
 }
 
-impl<'a> Iterator for Wtf8CodePoints<'a> {
+impl Iterator for Wtf8CodePoints<'_> {
     type Item = CodePoint;
 
     #[inline]
@@ -990,7 +990,7 @@ pub struct EncodeWide<'a> {
 
 // Copied from libunicode/u_str.rs
 #[stable(feature = "rust1", since = "1.0.0")]
-impl<'a> Iterator for EncodeWide<'a> {
+impl Iterator for EncodeWide<'_> {
     type Item = u16;
 
     #[inline]