From b94c5a169b588b9d97c014e6dcef18a1acb1f715 Mon Sep 17 00:00:00 2001 From: okaneco <47607823+okaneco@users.noreply.github.com> Date: Sun, 25 Aug 2024 02:41:56 -0400 Subject: Avoid re-validating UTF-8 in `FromUtf8Error::into_utf8_lossy` Refactor `into_utf8_lossy` to copy valid UTF-8 bytes into the buffer, avoiding double validation of bytes. Add tests that mirror the `String::from_utf8_lossy` tests --- library/alloc/tests/string.rs | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'library/alloc/tests/string.rs') diff --git a/library/alloc/tests/string.rs b/library/alloc/tests/string.rs index dc03c4860e8..d996c55f946 100644 --- a/library/alloc/tests/string.rs +++ b/library/alloc/tests/string.rs @@ -114,6 +114,43 @@ fn test_from_utf8_lossy() { ); } +#[test] +fn test_fromutf8error_into_lossy() { + fn func(input: &[u8]) -> String { + String::from_utf8(input.to_owned()).unwrap_or_else(|e| e.into_utf8_lossy()) + } + + let xs = b"hello"; + let ys = "hello".to_owned(); + assert_eq!(func(xs), ys); + + let xs = "ศไทย中华Việt Nam".as_bytes(); + let ys = "ศไทย中华Việt Nam".to_owned(); + assert_eq!(func(xs), ys); + + let xs = b"Hello\xC2 There\xFF Goodbye"; + assert_eq!(func(xs), "Hello\u{FFFD} There\u{FFFD} Goodbye".to_owned()); + + let xs = b"Hello\xC0\x80 There\xE6\x83 Goodbye"; + assert_eq!(func(xs), "Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye".to_owned()); + + let xs = b"\xF5foo\xF5\x80bar"; + assert_eq!(func(xs), "\u{FFFD}foo\u{FFFD}\u{FFFD}bar".to_owned()); + + let xs = b"\xF1foo\xF1\x80bar\xF1\x80\x80baz"; + assert_eq!(func(xs), "\u{FFFD}foo\u{FFFD}bar\u{FFFD}baz".to_owned()); + + let xs = b"\xF4foo\xF4\x80bar\xF4\xBFbaz"; + assert_eq!(func(xs), "\u{FFFD}foo\u{FFFD}bar\u{FFFD}\u{FFFD}baz".to_owned()); + + let xs = b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar"; + assert_eq!(func(xs), "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}foo\u{10000}bar".to_owned()); + + // surrogates + let xs = b"\xED\xA0\x80foo\xED\xBF\xBFbar"; + assert_eq!(func(xs), "\u{FFFD}\u{FFFD}\u{FFFD}foo\u{FFFD}\u{FFFD}\u{FFFD}bar".to_owned()); +} + #[test] fn test_from_utf16() { let pairs = [ -- cgit 1.4.1-3-g733a5