diff options
| author | León Orell Valerian Liehr <me@fmease.dev> | 2025-02-26 04:15:02 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-02-26 04:15:02 +0100 |
| commit | e121dcffbe28792619c3ed5227ec21ce5f0e4e34 (patch) | |
| tree | ee3205bd5ae4f0be01b9d5fe3a70e8ca13963e76 | |
| parent | 1cdd38666b705d38a41ac56a34b8a564ec329690 (diff) | |
| parent | eb14652770451022d424a1c301a4416514464932 (diff) | |
| download | rust-e121dcffbe28792619c3ed5227ec21ce5f0e4e34.tar.gz rust-e121dcffbe28792619c3ed5227ec21ce5f0e4e34.zip | |
Rollup merge of #137154 - thaliaarchi:wtf8-fast-paths, r=ChrisDenton
Add UTF-8 validation fast paths in `Wtf8Buf` This adds two more fast paths for UTF-8 validation in `Wtf8Buf`, making use of the `is_known_utf8` flag added in https://github.com/rust-lang/rust/pull/96869 (Optimize `Wtf8Buf::into_string` for the case where it contains UTF-8). r? `@ChrisDenton`
| -rw-r--r-- | library/std/src/sys/os_str/wtf8.rs | 4 | ||||
| -rw-r--r-- | library/std/src/sys_common/wtf8.rs | 26 |
2 files changed, 27 insertions, 3 deletions
diff --git a/library/std/src/sys/os_str/wtf8.rs b/library/std/src/sys/os_str/wtf8.rs index 19728d33990..8acec6f949f 100644 --- a/library/std/src/sys/os_str/wtf8.rs +++ b/library/std/src/sys/os_str/wtf8.rs @@ -41,13 +41,13 @@ impl AsInner<Wtf8> for Buf { impl fmt::Debug for Buf { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Debug::fmt(self.as_slice(), f) + fmt::Debug::fmt(&self.inner, f) } } impl fmt::Display for Buf { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Display::fmt(self.as_slice(), f) + fmt::Display::fmt(&self.inner, f) } } diff --git a/library/std/src/sys_common/wtf8.rs b/library/std/src/sys_common/wtf8.rs index 952c39132b0..f9ec112b197 100644 --- a/library/std/src/sys_common/wtf8.rs +++ b/library/std/src/sys_common/wtf8.rs @@ -169,6 +169,18 @@ impl fmt::Debug for Wtf8Buf { } } +/// Formats the string with unpaired surrogates substituted with the replacement +/// character, U+FFFD. +impl fmt::Display for Wtf8Buf { + fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + if let Some(s) = self.as_known_utf8() { + fmt::Display::fmt(s, formatter) + } else { + fmt::Display::fmt(&**self, formatter) + } + } +} + impl Wtf8Buf { /// Creates a new, empty WTF-8 string. #[inline] @@ -262,6 +274,18 @@ impl Wtf8Buf { unsafe { Wtf8::from_mut_bytes_unchecked(&mut self.bytes) } } + /// Converts the string to UTF-8 without validation, if it was created from + /// valid UTF-8. + #[inline] + fn as_known_utf8(&self) -> Option<&str> { + if self.is_known_utf8 { + // SAFETY: The buffer is known to be valid UTF-8. + Some(unsafe { str::from_utf8_unchecked(self.as_bytes()) }) + } else { + None + } + } + /// Reserves capacity for at least `additional` more bytes to be inserted /// in the given `Wtf8Buf`. /// The collection may reserve more space to avoid frequent reallocations. @@ -364,7 +388,7 @@ impl Wtf8Buf { _ => { // If we'll be pushing a string containing a surrogate, we may // no longer have UTF-8. - if other.next_surrogate(0).is_some() { + if self.is_known_utf8 && other.next_surrogate(0).is_some() { self.is_known_utf8 = false; } |
