about summary refs log tree commit diff
diff options
context:
space:
mode:
author León Orell Valerian Liehr <me@fmease.dev> 2025-02-26 04:15:02 +0100
committer GitHub <noreply@github.com> 2025-02-26 04:15:02 +0100
commit e121dcffbe28792619c3ed5227ec21ce5f0e4e34 (patch)
tree ee3205bd5ae4f0be01b9d5fe3a70e8ca13963e76
parent 1cdd38666b705d38a41ac56a34b8a564ec329690 (diff)
parent eb14652770451022d424a1c301a4416514464932 (diff)
download rust-e121dcffbe28792619c3ed5227ec21ce5f0e4e34.tar.gz
rust-e121dcffbe28792619c3ed5227ec21ce5f0e4e34.zip
Rollup merge of #137154 - thaliaarchi:wtf8-fast-paths, r=ChrisDenton
Add UTF-8 validation fast paths in `Wtf8Buf`

This adds two more fast paths for UTF-8 validation in `Wtf8Buf`, making use of the `is_known_utf8` flag added in https://github.com/rust-lang/rust/pull/96869 (Optimize `Wtf8Buf::into_string` for the case where it contains UTF-8).

r? `@ChrisDenton`
-rw-r--r-- library/std/src/sys/os_str/wtf8.rs 4
-rw-r--r-- library/std/src/sys_common/wtf8.rs 26
2 files changed, 27 insertions, 3 deletions
diff --git a/library/std/src/sys/os_str/wtf8.rs b/library/std/src/sys/os_str/wtf8.rs
index 19728d33990..8acec6f949f 100644
--- a/library/std/src/sys/os_str/wtf8.rs
+++ b/library/std/src/sys/os_str/wtf8.rs
@@ -41,13 +41,13 @@ impl AsInner<Wtf8> for Buf {
 
 impl fmt::Debug for Buf {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        fmt::Debug::fmt(self.as_slice(), f)
+        fmt::Debug::fmt(&self.inner, f)
     }
 }
 
 impl fmt::Display for Buf {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        fmt::Display::fmt(self.as_slice(), f)
+        fmt::Display::fmt(&self.inner, f)
     }
 }
 
diff --git a/library/std/src/sys_common/wtf8.rs b/library/std/src/sys_common/wtf8.rs
index 952c39132b0..f9ec112b197 100644
--- a/library/std/src/sys_common/wtf8.rs
+++ b/library/std/src/sys_common/wtf8.rs
@@ -169,6 +169,18 @@ impl fmt::Debug for Wtf8Buf {
     }
 }
 
+/// Formats the string with unpaired surrogates substituted with the replacement
+/// character, U+FFFD.
+impl fmt::Display for Wtf8Buf {
+    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
+        if let Some(s) = self.as_known_utf8() {
+            fmt::Display::fmt(s, formatter)
+        } else {
+            fmt::Display::fmt(&**self, formatter)
+        }
+    }
+}
+
 impl Wtf8Buf {
     /// Creates a new, empty WTF-8 string.
     #[inline]
@@ -262,6 +274,18 @@ impl Wtf8Buf {
         unsafe { Wtf8::from_mut_bytes_unchecked(&mut self.bytes) }
     }
 
+    /// Converts the string to UTF-8 without validation, if it was created from
+    /// valid UTF-8.
+    #[inline]
+    fn as_known_utf8(&self) -> Option<&str> {
+        if self.is_known_utf8 {
+            // SAFETY: The buffer is known to be valid UTF-8.
+            Some(unsafe { str::from_utf8_unchecked(self.as_bytes()) })
+        } else {
+            None
+        }
+    }
+
     /// Reserves capacity for at least `additional` more bytes to be inserted
     /// in the given `Wtf8Buf`.
     /// The collection may reserve more space to avoid frequent reallocations.
@@ -364,7 +388,7 @@ impl Wtf8Buf {
             _ => {
                 // If we'll be pushing a string containing a surrogate, we may
                 // no longer have UTF-8.
-                if other.next_surrogate(0).is_some() {
+                if self.is_known_utf8 && other.next_surrogate(0).is_some() {
                     self.is_known_utf8 = false;
                 }