about summary refs log tree commit diff
diff options
context:
space:
mode:
author Andrew Tribick <ajtribick@googlemail.com> 2023-07-20 21:52:33 +0200
committer Andrew Tribick <ajtribick@googlemail.com> 2023-07-20 21:52:33 +0200
commit e6fa5c18b56806aff5525c67f851a250bd8089f7 (patch)
tree 9e472fd2293dcb842d7e636c42c4a7944a62032b
parent 06a53ddc0bd3a50f9bcf2f7c373011dc7869f59f (diff)
download rust-e6fa5c18b56806aff5525c67f851a250bd8089f7.tar.gz
rust-e6fa5c18b56806aff5525c67f851a250bd8089f7.zip
Fix size_hint for EncodeUtf16
-rw-r--r-- library/alloc/tests/str.rs 22
-rw-r--r-- library/core/src/str/iter.rs 19
2 files changed, 36 insertions, 5 deletions
diff --git a/library/alloc/tests/str.rs b/library/alloc/tests/str.rs
index 82c1a9f9ad7..8a4b4ac4e8d 100644
--- a/library/alloc/tests/str.rs
+++ b/library/alloc/tests/str.rs
@@ -1739,6 +1739,28 @@ fn test_utf16_code_units() {
 }
 
 #[test]
+fn test_utf16_size_hint() {
+    assert_eq!("".encode_utf16().size_hint(), (0, Some(0)));
+    assert_eq!("123".encode_utf16().size_hint(), (1, Some(3)));
+    assert_eq!("1234".encode_utf16().size_hint(), (2, Some(4)));
+    assert_eq!("12345678".encode_utf16().size_hint(), (3, Some(8)));
+
+    fn hint_vec(src: &str) -> Vec<(usize, Option<usize>)> {
+        let mut it = src.encode_utf16();
+        let mut result = Vec::new();
+        result.push(it.size_hint());
+        while it.next().is_some() {
+            result.push(it.size_hint())
+        }
+        result
+    }
+
+    assert_eq!(hint_vec("12"), [(1, Some(2)), (1, Some(1)), (0, Some(0))]);
+    assert_eq!(hint_vec("\u{101234}"), [(2, Some(4)), (1, Some(1)), (0, Some(0))]);
+    assert_eq!(hint_vec("\u{101234}a"), [(2, Some(5)), (2, Some(2)), (1, Some(1)), (0, Some(0))]);
+}
+
+#[test]
 fn starts_with_in_unicode() {
     assert!(!"├── Cargo.toml".starts_with("# "));
 }
diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs
index 772c3605562..133167a7067 100644
--- a/library/core/src/str/iter.rs
+++ b/library/core/src/str/iter.rs
@@ -1439,11 +1439,20 @@ impl<'a> Iterator for EncodeUtf16<'a> {
 
     #[inline]
     fn size_hint(&self) -> (usize, Option<usize>) {
-        let (low, high) = self.chars.size_hint();
-        // every char gets either one u16 or two u16,
-        // so this iterator is between 1 or 2 times as
-        // long as the underlying iterator.
-        (low, high.and_then(|n| n.checked_mul(2)))
+        let len = self.chars.iter.len();
+        // The highest bytes:code units ratio occurs for 3-byte sequences, so
+        // use this to determine the lower bound for the hint. The lowest
+        // ratio is for 1-byte sequences, so use this for the upper bound.
+        // `(len + 2)` can't overflow, because we know that the `slice::Iter`
+        // belongs to a slice in memory which has a maximum length of
+        // `isize::MAX` (that's well below `usize::MAX`)
+        if self.extra == 0 {
+            ((len + 2) / 3, Some(len))
+        } else {
+            // We're in the middle of a surrogate pair, so add the remaining
+            // surrogate to the bounds.
+            ((len + 2) / 3 + 1, Some(len + 1))
+        }
     }
 }