about summary refs log tree commit diff
diff options
context:
space:
mode:
authorokaneco <47607823+okaneco@users.noreply.github.com>2024-08-25 02:41:56 -0400
committerokaneco <47607823+okaneco@users.noreply.github.com>2024-09-20 20:56:07 -0400
commitb94c5a169b588b9d97c014e6dcef18a1acb1f715 (patch)
tree4e850b26126e12831a813157b3b2e47dc909db91
parent8c2c9a9ef527edb453bcee638d34beb0e7b4616b (diff)
downloadrust-b94c5a169b588b9d97c014e6dcef18a1acb1f715.tar.gz
rust-b94c5a169b588b9d97c014e6dcef18a1acb1f715.zip
Avoid re-validating UTF-8 in `FromUtf8Error::into_utf8_lossy`
Refactor `into_utf8_lossy` to copy valid UTF-8 bytes into the buffer,
avoiding double validation of bytes.
Add tests that mirror the `String::from_utf8_lossy` tests
-rw-r--r--library/alloc/src/string.rs26
-rw-r--r--library/alloc/tests/lib.rs1
-rw-r--r--library/alloc/tests/string.rs37
3 files changed, 63 insertions, 1 deletions
diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs
index 6daab5bc73a..837e97dac5c 100644
--- a/library/alloc/src/string.rs
+++ b/library/alloc/src/string.rs
@@ -2081,7 +2081,31 @@ impl FromUtf8Error {
     #[cfg(not(no_global_oom_handling))]
     #[unstable(feature = "string_from_utf8_lossy_owned", issue = "129436")]
     pub fn into_utf8_lossy(self) -> String {
-        String::from_utf8_lossy_owned(self.bytes)
+        const REPLACEMENT: &str = "\u{FFFD}";
+
+        let mut res = {
+            let mut v = Vec::with_capacity(self.bytes.len());
+
+            // `Utf8Error::valid_up_to` returns the maximum index of validated
+            // UTF-8 bytes. Copy the valid bytes into the output buffer.
+            v.extend_from_slice(&self.bytes[..self.error.valid_up_to()]);
+
+            // SAFETY: This is safe because the only bytes present in the buffer
+            // were validated as UTF-8 by the call to `String::from_utf8` which
+            // produced this `FromUtf8Error`.
+            unsafe { String::from_utf8_unchecked(v) }
+        };
+
+        let iter = self.bytes[self.error.valid_up_to()..].utf8_chunks();
+
+        for chunk in iter {
+            res.push_str(chunk.valid());
+            if !chunk.invalid().is_empty() {
+                res.push_str(REPLACEMENT);
+            }
+        }
+
+        res
     }
 
     /// Returns the bytes that were attempted to convert to a `String`.
diff --git a/library/alloc/tests/lib.rs b/library/alloc/tests/lib.rs
index ffc9a233b66..1d07a7690da 100644
--- a/library/alloc/tests/lib.rs
+++ b/library/alloc/tests/lib.rs
@@ -28,6 +28,7 @@
 #![feature(iter_next_chunk)]
 #![feature(round_char_boundary)]
 #![feature(slice_partition_dedup)]
+#![feature(string_from_utf8_lossy_owned)]
 #![feature(string_remove_matches)]
 #![feature(const_btree_len)]
 #![feature(const_trait_impl)]
diff --git a/library/alloc/tests/string.rs b/library/alloc/tests/string.rs
index dc03c4860e8..d996c55f946 100644
--- a/library/alloc/tests/string.rs
+++ b/library/alloc/tests/string.rs
@@ -115,6 +115,43 @@ fn test_from_utf8_lossy() {
 }
 
 #[test]
+fn test_fromutf8error_into_lossy() {
+    fn func(input: &[u8]) -> String {
+        String::from_utf8(input.to_owned()).unwrap_or_else(|e| e.into_utf8_lossy())
+    }
+
+    let xs = b"hello";
+    let ys = "hello".to_owned();
+    assert_eq!(func(xs), ys);
+
+    let xs = "ศไทย中华Việt Nam".as_bytes();
+    let ys = "ศไทย中华Việt Nam".to_owned();
+    assert_eq!(func(xs), ys);
+
+    let xs = b"Hello\xC2 There\xFF Goodbye";
+    assert_eq!(func(xs), "Hello\u{FFFD} There\u{FFFD} Goodbye".to_owned());
+
+    let xs = b"Hello\xC0\x80 There\xE6\x83 Goodbye";
+    assert_eq!(func(xs), "Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye".to_owned());
+
+    let xs = b"\xF5foo\xF5\x80bar";
+    assert_eq!(func(xs), "\u{FFFD}foo\u{FFFD}\u{FFFD}bar".to_owned());
+
+    let xs = b"\xF1foo\xF1\x80bar\xF1\x80\x80baz";
+    assert_eq!(func(xs), "\u{FFFD}foo\u{FFFD}bar\u{FFFD}baz".to_owned());
+
+    let xs = b"\xF4foo\xF4\x80bar\xF4\xBFbaz";
+    assert_eq!(func(xs), "\u{FFFD}foo\u{FFFD}bar\u{FFFD}\u{FFFD}baz".to_owned());
+
+    let xs = b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar";
+    assert_eq!(func(xs), "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}foo\u{10000}bar".to_owned());
+
+    // surrogates
+    let xs = b"\xED\xA0\x80foo\xED\xBF\xBFbar";
+    assert_eq!(func(xs), "\u{FFFD}\u{FFFD}\u{FFFD}foo\u{FFFD}\u{FFFD}\u{FFFD}bar".to_owned());
+}
+
+#[test]
 fn test_from_utf16() {
     let pairs = [
         (