From b94c5a169b588b9d97c014e6dcef18a1acb1f715 Mon Sep 17 00:00:00 2001 From: okaneco <47607823+okaneco@users.noreply.github.com> Date: Sun, 25 Aug 2024 02:41:56 -0400 Subject: Avoid re-validating UTF-8 in `FromUtf8Error::into_utf8_lossy` Refactor `into_utf8_lossy` to copy valid UTF-8 bytes into the buffer, avoiding double validation of bytes. Add tests that mirror the `String::from_utf8_lossy` tests --- library/alloc/src/string.rs | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'library/alloc/src/string.rs') diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs index 6daab5bc73a..837e97dac5c 100644 --- a/library/alloc/src/string.rs +++ b/library/alloc/src/string.rs @@ -2081,7 +2081,31 @@ impl FromUtf8Error { #[cfg(not(no_global_oom_handling))] #[unstable(feature = "string_from_utf8_lossy_owned", issue = "129436")] pub fn into_utf8_lossy(self) -> String { - String::from_utf8_lossy_owned(self.bytes) + const REPLACEMENT: &str = "\u{FFFD}"; + + let mut res = { + let mut v = Vec::with_capacity(self.bytes.len()); + + // `Utf8Error::valid_up_to` returns the maximum index of validated + // UTF-8 bytes. Copy the valid bytes into the output buffer. + v.extend_from_slice(&self.bytes[..self.error.valid_up_to()]); + + // SAFETY: This is safe because the only bytes present in the buffer + // were validated as UTF-8 by the call to `String::from_utf8` which + // produced this `FromUtf8Error`. + unsafe { String::from_utf8_unchecked(v) } + }; + + let iter = self.bytes[self.error.valid_up_to()..].utf8_chunks(); + + for chunk in iter { + res.push_str(chunk.valid()); + if !chunk.invalid().is_empty() { + res.push_str(REPLACEMENT); + } + } + + res } /// Returns the bytes that were attempted to convert to a `String`. -- cgit 1.4.1-3-g733a5