[breaking-change] std: change `encode_utf{8,16}()` to take a buffer and return a slice

Both methods panic if the buffer is too small to hold the encoded character.
author: tormol <t.b.moltu@lyse.net> 2016-09-08 13:54:39 +0200
committer: tormol <t.b.moltu@lyse.net> 2016-09-28 09:03:30 +0200
commit: 13a2dd96fe824cc5d61e94ed380db0114efdd014 (patch)
tree: 808a2f28e42625a2e5bc4a88abf2d46c34727b32
parent: a059cb2f3344c0a9efae17dde3d0e16a55ce93db (diff)
download: rust-13a2dd96fe824cc5d61e94ed380db0114efdd014.tar.gz
rust-13a2dd96fe824cc5d61e94ed380db0114efdd014.zip
9 files changed, 163 insertions, 188 deletions
diff --git a/src/libcollections/string.rs b/src/libcollections/string.rs
index cff0308d4af..e4930ae3572 100644
--- a/src/libcollections/string.rs
+++ b/src/libcollections/string.rs
@@ -975,7 +975,7 @@ impl String {
     pub fn push(&mut self, ch: char) {
         match ch.len_utf8() {
             1 => self.vec.push(ch as u8),
-            _ => self.vec.extend_from_slice(ch.encode_utf8().as_slice()),
+            _ => self.vec.extend_from_slice(ch.encode_utf8(&mut [0;4]).as_bytes()),
         }
     }
 
@@ -1131,10 +1131,11 @@ impl String {
         let len = self.len();
         assert!(idx <= len);
         assert!(self.is_char_boundary(idx));
-        let bits = ch.encode_utf8();
+        let mut bits = [0; 4];
+        let bits = ch.encode_utf8(&mut bits).as_bytes();
 
         unsafe {
-            self.insert_bytes(idx, bits.as_slice());
+            self.insert_bytes(idx, bits);
         }
     }
 
diff --git a/src/libcollectionstest/str.rs b/src/libcollectionstest/str.rs
index 62e164a569a..560895f721b 100644
--- a/src/libcollectionstest/str.rs
+++ b/src/libcollectionstest/str.rs
@@ -786,9 +786,9 @@ fn test_rev_iterator() {
 
 #[test]
 fn test_chars_decoding() {
+    let mut bytes = [0; 4];
     for c in (0..0x110000).filter_map(::std::char::from_u32) {
-        let bytes = c.encode_utf8();
-        let s = ::std::str::from_utf8(bytes.as_slice()).unwrap();
+        let s = c.encode_utf8(&mut bytes);
         if Some(c) != s.chars().next() {
             panic!("character {:x}={} does not decode correctly", c as u32, c);
         }
@@ -797,9 +797,9 @@ fn test_chars_decoding() {
 
 #[test]
 fn test_chars_rev_decoding() {
+    let mut bytes = [0; 4];
     for c in (0..0x110000).filter_map(::std::char::from_u32) {
-        let bytes = c.encode_utf8();
-        let s = ::std::str::from_utf8(bytes.as_slice()).unwrap();
+        let s = c.encode_utf8(&mut bytes);
         if Some(c) != s.chars().rev().next() {
             panic!("character {:x}={} does not decode correctly", c as u32, c);
         }
diff --git a/src/libcore/char.rs b/src/libcore/char.rs
index a21d1229d35..26d28049a47 100644
--- a/src/libcore/char.rs
+++ b/src/libcore/char.rs
@@ -18,6 +18,7 @@
 use char_private::is_printable;
 use convert::TryFrom;
 use fmt;
+use slice;
 use iter::FusedIterator;
 use mem::transmute;
 
@@ -327,9 +328,9 @@ pub trait CharExt {
     #[stable(feature = "core", since = "1.6.0")]
     fn len_utf16(self) -> usize;
     #[unstable(feature = "unicode", issue = "27784")]
-    fn encode_utf8(self) -> EncodeUtf8;
+    fn encode_utf8(self, dst: &mut [u8]) -> &mut str;
     #[unstable(feature = "unicode", issue = "27784")]
-    fn encode_utf16(self) -> EncodeUtf16;
+    fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16];
 }
 
 #[stable(feature = "core", since = "1.6.0")]
@@ -419,47 +420,59 @@ impl CharExt for char {
     }
 
     #[inline]
-    fn encode_utf8(self) -> EncodeUtf8 {
+    fn encode_utf8(self, dst: &mut [u8]) -> &mut str {
         let code = self as u32;
-        let mut buf = [0; 4];
-        let pos = if code < MAX_ONE_B {
-            buf[3] = code as u8;
-            3
-        } else if code < MAX_TWO_B {
-            buf[2] = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
-            buf[3] = (code & 0x3F) as u8 | TAG_CONT;
-            2
-        } else if code < MAX_THREE_B {
-            buf[1] = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
-            buf[2] = (code >>  6 & 0x3F) as u8 | TAG_CONT;
-            buf[3] = (code & 0x3F) as u8 | TAG_CONT;
-            1
-        } else {
-            buf[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
-            buf[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT;
-            buf[2] = (code >>  6 & 0x3F) as u8 | TAG_CONT;
-            buf[3] = (code & 0x3F) as u8 | TAG_CONT;
-            0
-        };
-        EncodeUtf8 { buf: buf, pos: pos }
+        unsafe {
+            let len =
+            if code < MAX_ONE_B && !dst.is_empty() {
+                *dst.get_unchecked_mut(0) = code as u8;
+                1
+            } else if code < MAX_TWO_B && dst.len() >= 2 {
+                *dst.get_unchecked_mut(0) = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
+                *dst.get_unchecked_mut(1) = (code & 0x3F) as u8 | TAG_CONT;
+                2
+            } else if code < MAX_THREE_B && dst.len() >= 3  {
+                *dst.get_unchecked_mut(0) = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
+                *dst.get_unchecked_mut(1) = (code >>  6 & 0x3F) as u8 | TAG_CONT;
+                *dst.get_unchecked_mut(2) = (code & 0x3F) as u8 | TAG_CONT;
+                3
+            } else if dst.len() >= 4 {
+                *dst.get_unchecked_mut(0) = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
+                *dst.get_unchecked_mut(1) = (code >> 12 & 0x3F) as u8 | TAG_CONT;
+                *dst.get_unchecked_mut(2) = (code >>  6 & 0x3F) as u8 | TAG_CONT;
+                *dst.get_unchecked_mut(3) = (code & 0x3F) as u8 | TAG_CONT;
+                4
+            } else {
+                panic!("encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}",
+                    from_u32_unchecked(code).len_utf8(),
+                    code,
+                    dst.len())
+            };
+            transmute(slice::from_raw_parts_mut(dst.as_mut_ptr(), len))
+        }
     }
 
     #[inline]
-    fn encode_utf16(self) -> EncodeUtf16 {
-        let mut buf = [0; 2];
+    fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] {
         let mut code = self as u32;
-        let pos = if (code & 0xFFFF) == code {
-            // The BMP falls through (assuming non-surrogate, as it should)
-            buf[1] = code as u16;
-            1
-        } else {
-            // Supplementary planes break into surrogates.
-            code -= 0x1_0000;
-            buf[0] = 0xD800 | ((code >> 10) as u16);
-            buf[1] = 0xDC00 | ((code as u16) & 0x3FF);
-            0
-        };
-        EncodeUtf16 { buf: buf, pos: pos }
+        unsafe {
+            if (code & 0xFFFF) == code && !dst.is_empty() {
+                // The BMP falls through (assuming non-surrogate, as it should)
+                *dst.get_unchecked_mut(0) = code as u16;
+                slice::from_raw_parts_mut(dst.as_mut_ptr(), 1)
+            } else if dst.len() >= 2 {
+                // Supplementary planes break into surrogates.
+                code -= 0x1_0000;
+                *dst.get_unchecked_mut(0) = 0xD800 | ((code >> 10) as u16);
+                *dst.get_unchecked_mut(1) = 0xDC00 | ((code as u16) & 0x3FF);
+                slice::from_raw_parts_mut(dst.as_mut_ptr(), 2)
+            } else {
+                panic!("encode_utf16: need {} units to encode U+{:X}, but the buffer has {}",
+                    from_u32_unchecked(code).len_utf16(),
+                    code,
+                    dst.len())
+            }
+        }
     }
 }
 
@@ -702,88 +715,7 @@ impl ExactSizeIterator for EscapeDebug { }
 #[unstable(feature = "fused", issue = "35602")]
 impl FusedIterator for EscapeDebug {}
 
-/// An iterator over `u8` entries represending the UTF-8 encoding of a `char`
-/// value.
-///
-/// Constructed via the `.encode_utf8()` method on `char`.
-#[unstable(feature = "unicode", issue = "27784")]
-#[derive(Debug)]
-pub struct EncodeUtf8 {
-    buf: [u8; 4],
-    pos: usize,
-}
-
-impl EncodeUtf8 {
-    /// Returns the remaining bytes of this iterator as a slice.
-    #[unstable(feature = "unicode", issue = "27784")]
-    pub fn as_slice(&self) -> &[u8] {
-        &self.buf[self.pos..]
-    }
-}
-
-#[unstable(feature = "unicode", issue = "27784")]
-impl Iterator for EncodeUtf8 {
-    type Item = u8;
-
-    fn next(&mut self) -> Option<u8> {
-        if self.pos == self.buf.len() {
-            None
-        } else {
-            let ret = Some(self.buf[self.pos]);
-            self.pos += 1;
-            ret
-        }
-    }
-
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        self.as_slice().iter().size_hint()
-    }
-}
-
-#[unstable(feature = "fused", issue = "35602")]
-impl FusedIterator for EncodeUtf8 {}
-
-/// An iterator over `u16` entries represending the UTF-16 encoding of a `char`
-/// value.
-///
-/// Constructed via the `.encode_utf16()` method on `char`.
-#[unstable(feature = "unicode", issue = "27784")]
-#[derive(Debug)]
-pub struct EncodeUtf16 {
-    buf: [u16; 2],
-    pos: usize,
-}
-
-impl EncodeUtf16 {
-    /// Returns the remaining bytes of this iterator as a slice.
-    #[unstable(feature = "unicode", issue = "27784")]
-    pub fn as_slice(&self) -> &[u16] {
-        &self.buf[self.pos..]
-    }
-}
-
-
-#[unstable(feature = "unicode", issue = "27784")]
-impl Iterator for EncodeUtf16 {
-    type Item = u16;
-
-    fn next(&mut self) -> Option<u16> {
-        if self.pos == self.buf.len() {
-            None
-        } else {
-            let ret = Some(self.buf[self.pos]);
-            self.pos += 1;
-            ret
-        }
-    }
-
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        self.as_slice().iter().size_hint()
-    }
-}
 
-#[unstable(feature = "fused", issue = "35602")]
-impl FusedIterator for EncodeUtf16 {}
 
 /// An iterator over an iterator of bytes of the characters the bytes represent
 /// as UTF-8
diff --git a/src/libcore/fmt/mod.rs b/src/libcore/fmt/mod.rs
index 8342d663cdc..5d7f41556c2 100644
--- a/src/libcore/fmt/mod.rs
+++ b/src/libcore/fmt/mod.rs
@@ -97,9 +97,7 @@ pub trait Write {
     /// This function will return an instance of `Error` on error.
     #[stable(feature = "fmt_write_char", since = "1.1.0")]
     fn write_char(&mut self, c: char) -> Result {
-        self.write_str(unsafe {
-            str::from_utf8_unchecked(c.encode_utf8().as_slice())
-        })
+        self.write_str(c.encode_utf8(&mut [0; 4]))
     }
 
     /// Glue for usage of the `write!` macro with implementors of this trait.
@@ -924,9 +922,7 @@ impl<'a> Formatter<'a> {
         // Writes the sign if it exists, and then the prefix if it was requested
         let write_prefix = |f: &mut Formatter| {
             if let Some(c) = sign {
-                f.buf.write_str(unsafe {
-                    str::from_utf8_unchecked(c.encode_utf8().as_slice())
-                })?;
+                f.buf.write_str(c.encode_utf8(&mut [0; 4]))?;
             }
             if prefixed { f.buf.write_str(prefix) }
             else { Ok(()) }
@@ -1032,10 +1028,8 @@ impl<'a> Formatter<'a> {
             rt::v1::Alignment::Center => (padding / 2, (padding + 1) / 2),
         };
 
-        let fill = self.fill.encode_utf8();
-        let fill = unsafe {
-            str::from_utf8_unchecked(fill.as_slice())
-        };
+        let mut fill = [0; 4];
+        let fill = self.fill.encode_utf8(&mut fill);
 
         for _ in 0..pre_pad {
             self.buf.write_str(fill)?;
@@ -1435,9 +1429,7 @@ impl Display for char {
         if f.width.is_none() && f.precision.is_none() {
             f.write_char(*self)
         } else {
-            f.pad(unsafe {
-                str::from_utf8_unchecked(self.encode_utf8().as_slice())
-            })
+            f.pad(self.encode_utf8(&mut [0; 4]))
         }
     }
 }
diff --git a/src/libcoretest/char.rs b/src/libcoretest/char.rs
index 199437a431e..7da0b6902f2 100644
--- a/src/libcoretest/char.rs
+++ b/src/libcoretest/char.rs
@@ -8,7 +8,7 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
-use std::char;
+use std::{char,str};
 use std::convert::TryFrom;
 
 #[test]
@@ -248,10 +248,12 @@ fn test_escape_unicode() {
 #[test]
 fn test_encode_utf8() {
     fn check(input: char, expect: &[u8]) {
-        assert_eq!(input.encode_utf8().as_slice(), expect);
-        for (a, b) in input.encode_utf8().zip(expect) {
-            assert_eq!(a, *b);
-        }
+        let mut buf = [0; 4];
+        let ptr = buf.as_ptr();
+        let s = input.encode_utf8(&mut buf);
+        assert_eq!(s.as_ptr() as usize, ptr as usize);
+        assert!(str::from_utf8(s.as_bytes()).is_ok());
+        assert_eq!(s.as_bytes(), expect);
     }
 
     check('x', &[0x78]);
@@ -263,10 +265,11 @@ fn test_encode_utf8() {
 #[test]
 fn test_encode_utf16() {
     fn check(input: char, expect: &[u16]) {
-        assert_eq!(input.encode_utf16().as_slice(), expect);
-        for (a, b) in input.encode_utf16().zip(expect) {
-            assert_eq!(a, *b);
-        }
+        let mut buf = [0; 2];
+        let ptr = buf.as_mut_ptr();
+        let b = input.encode_utf16(&mut buf);
+        assert_eq!(b.as_mut_ptr() as usize, ptr as usize);
+        assert_eq!(b, expect);
     }
 
     check('x', &[0x0078]);
diff --git a/src/librustc_unicode/char.rs b/src/librustc_unicode/char.rs
index 5a0c27d9c60..702d7d8b4b2 100644
--- a/src/librustc_unicode/char.rs
+++ b/src/librustc_unicode/char.rs
@@ -37,7 +37,7 @@ use tables::{conversions, derived_property, general_category, property};
 #[stable(feature = "rust1", since = "1.0.0")]
 pub use core::char::{MAX, from_digit, from_u32, from_u32_unchecked};
 #[stable(feature = "rust1", since = "1.0.0")]
-pub use core::char::{EncodeUtf16, EncodeUtf8, EscapeDebug, EscapeDefault, EscapeUnicode};
+pub use core::char::{EscapeDebug, EscapeDefault, EscapeUnicode};
 
 // unstable reexports
 #[unstable(feature = "try_from", issue = "33417")]
@@ -435,50 +435,96 @@ impl char {
         C::len_utf16(self)
     }
 
-    /// Returns an iterator over the bytes of this character as UTF-8.
+    /// Encodes this character as UTF-8 into the provided byte buffer,
+    /// and then returns the subslice of the buffer that contains the encoded character.
     ///
-    /// The returned iterator also has an `as_slice()` method to view the
-    /// encoded bytes as a byte slice.
+    /// # Panics
+    ///
+    /// Panics if the buffer is not large enough.
+    /// A buffer of length four is large enough to encode any `char`.
     ///
     /// # Examples
     ///
+    /// In both of these examples, 'ß' takes two bytes to encode.
+    ///
     /// ```
     /// #![feature(unicode)]
     ///
-    /// let iterator = 'ß'.encode_utf8();
-    /// assert_eq!(iterator.as_slice(), [0xc3, 0x9f]);
+    /// let mut b = [0; 2];
     ///
-    /// for (i, byte) in iterator.enumerate() {
-    ///     println!("byte {}: {:x}", i, byte);
-    /// }
+    /// let result = 'ß'.encode_utf8(&mut b);
+    ///
+    /// assert_eq!(result, "ß");
+    ///
+    /// assert_eq!(result.len(), 2);
+    /// ```
+    ///
+    /// A buffer that's too small:
+    ///
+    /// ```
+    /// #![feature(unicode)]
+    /// use std::thread;
+    ///
+    /// let result = thread::spawn(|| {
+    ///     let mut b = [0; 1];
+    ///
+    ///     // this panics
+    ///     'ß'.encode_utf8(&mut b);
+    /// }).join();
+    ///
+    /// assert!(result.is_err());
     /// ```
-    #[unstable(feature = "unicode", issue = "27784")]
+    #[unstable(feature = "unicode",
+               reason = "pending decision about Iterator/Writer/Reader",
+               issue = "27784")]
     #[inline]
-    pub fn encode_utf8(self) -> EncodeUtf8 {
-        C::encode_utf8(self)
+    pub fn encode_utf8(self, dst: &mut [u8]) -> &mut str {
+        C::encode_utf8(self, dst)
     }
 
-    /// Returns an iterator over the `u16` entries of this character as UTF-16.
+    /// Encodes this character as UTF-16 into the provided `u16` buffer,
+    /// and then returns the subslice of the buffer that contains the encoded character.
     ///
-    /// The returned iterator also has an `as_slice()` method to view the
-    /// encoded form as a slice.
+    /// # Panics
+    ///
+    /// Panics if the buffer is not large enough.
+    /// A buffer of length 2 is large enough to encode any `char`.
     ///
     /// # Examples
     ///
+    /// In both of these examples, '𝕊' takes two `u16`s to encode.
+    ///
     /// ```
     /// #![feature(unicode)]
     ///
-    /// let iterator = '𝕊'.encode_utf16();
-    /// assert_eq!(iterator.as_slice(), [0xd835, 0xdd4a]);
+    /// let mut b = [0; 2];
     ///
-    /// for (i, val) in iterator.enumerate() {
-    ///     println!("entry {}: {:x}", i, val);
-    /// }
+    /// let result = '𝕊'.encode_utf16(&mut b);
+    ///
+    /// assert_eq!(result.len(), 2);
     /// ```
-    #[unstable(feature = "unicode", issue = "27784")]
+    ///
+    /// A buffer that's too small:
+    ///
+    /// ```
+    /// #![feature(unicode)]
+    /// use std::thread;
+    ///
+    /// let result = thread::spawn(|| {
+    ///     let mut b = [0; 1];
+    ///
+    ///     // this panics
+    ///     '𝕊'.encode_utf16(&mut b);
+    /// }).join();
+    ///
+    /// assert!(result.is_err());
+    /// ```
+    #[unstable(feature = "unicode",
+               reason = "pending decision about Iterator/Writer/Reader",
+               issue = "27784")]
     #[inline]
-    pub fn encode_utf16(self) -> EncodeUtf16 {
-        C::encode_utf16(self)
+    pub fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] {
+        C::encode_utf16(self, dst)
     }
 
     /// Returns true if this `char` is an alphabetic code point, and false if not.
diff --git a/src/librustc_unicode/u_str.rs b/src/librustc_unicode/u_str.rs
index eb5b6feeb7e..19e419e37a0 100644
--- a/src/librustc_unicode/u_str.rs
+++ b/src/librustc_unicode/u_str.rs
@@ -157,13 +157,13 @@ impl<I> Iterator for Utf16Encoder<I>
             return Some(tmp);
         }
 
+        let mut buf = [0; 2];
         self.chars.next().map(|ch| {
-            let n = CharExt::encode_utf16(ch);
-            let n = n.as_slice();
-            if n.len() == 2 {
-                self.extra = n[1];
+            let n = CharExt::encode_utf16(ch, &mut buf).len();
+            if n == 2 {
+                self.extra = buf[1];
             }
-            n[0]
+            buf[0]
         })
     }
 
diff --git a/src/libserialize/json.rs b/src/libserialize/json.rs
index 6ccc0be41bc..5e25c61bae9 100644
--- a/src/libserialize/json.rs
+++ b/src/libserialize/json.rs
@@ -433,9 +433,7 @@ fn escape_str(wr: &mut fmt::Write, v: &str) -> EncodeResult {
 }
 
 fn escape_char(writer: &mut fmt::Write, v: char) -> EncodeResult {
-    escape_str(writer, unsafe {
-        str::from_utf8_unchecked(v.encode_utf8().as_slice())
-    })
+    escape_str(writer, v.encode_utf8(&mut [0; 4]))
 }
 
 fn spaces(wr: &mut fmt::Write, mut n: usize) -> EncodeResult {
diff --git a/src/libstd/sys/common/wtf8.rs b/src/libstd/sys/common/wtf8.rs
index 8d357aa78c9..0a94ff1e958 100644
--- a/src/libstd/sys/common/wtf8.rs
+++ b/src/libstd/sys/common/wtf8.rs
@@ -206,10 +206,12 @@ impl Wtf8Buf {
     /// Copied from String::push
     /// This does **not** include the WTF-8 concatenation check.
     fn push_code_point_unchecked(&mut self, code_point: CodePoint) {
-        let bytes = unsafe {
-            char::from_u32_unchecked(code_point.value).encode_utf8()
+        let c = unsafe {
+            char::from_u32_unchecked(code_point.value)
         };
-        self.bytes.extend_from_slice(bytes.as_slice());
+        let mut bytes = [0; 4];
+        let bytes = c.encode_utf8(&mut bytes).as_bytes();
+        self.bytes.extend_from_slice(bytes)
     }
 
     #[inline]
@@ -738,15 +740,16 @@ impl<'a> Iterator for EncodeWide<'a> {
             return Some(tmp);
         }
 
+        let mut buf = [0; 2];
         self.code_points.next().map(|code_point| {
-            let n = unsafe {
-                char::from_u32_unchecked(code_point.value).encode_utf16()
+            let c = unsafe {
+                char::from_u32_unchecked(code_point.value)
             };
-            let n = n.as_slice();
-            if n.len() == 2 {
-                self.extra = n[1];
+            let n = c.encode_utf16(&mut buf).len();
+            if n == 2 {
+                self.extra = buf[1];
             }
-            n[0]
+            buf[0]
         })
     }
author	tormol <t.b.moltu@lyse.net>	2016-09-08 13:54:39 +0200
committer	tormol <t.b.moltu@lyse.net>	2016-09-28 09:03:30 +0200
commit	13a2dd96fe824cc5d61e94ed380db0114efdd014 (patch)
tree	808a2f28e42625a2e5bc4a88abf2d46c34727b32
parent	a059cb2f3344c0a9efae17dde3d0e16a55ce93db (diff)
download	rust-13a2dd96fe824cc5d61e94ed380db0114efdd014.tar.gz rust-13a2dd96fe824cc5d61e94ed380db0114efdd014.zip