about summary refs log tree commit diff
path: root/src/libunicode
diff options
context:
space:
mode:
author: Alex Crichton <alex@alexcrichton.com> 2015-03-05 18:23:57 -0800
committer: Alex Crichton <alex@alexcrichton.com> 2015-03-10 15:08:31 -0700
commit: 0f6a0b58f9dbc3a741abd898f2d06a8ba78a938d (patch)
tree: 645a80f459c7da59405fc2dc8874a954b4aeb007 /src/libunicode
parent: 2574009af0ff70dc233beab246db8f2d715be2cb (diff)
download: rust-0f6a0b58f9dbc3a741abd898f2d06a8ba78a938d.tar.gz
rust-0f6a0b58f9dbc3a741abd898f2d06a8ba78a938d.zip
std: Stabilize more of the `char` module
This commit performs another pass over the `std::char` module for stabilization.
Some minor cleanup is performed such as migrating documentation from libcore to
libunicode (where the `std`-facing trait resides) as well as a slight
reorganization in libunicode itself. Otherwise, the stability modifications made
are:

* `char::from_digit` is now stable
* `CharExt::is_digit` is now stable
* `CharExt::to_digit` is now stable
* `CharExt::to_{lower,upper}case` are now stable after being modified to return
  an iterator over characters. While the implementation today has not changed,
  this should allow us to implement the full set of case conversions in Unicode,
  where some characters can map to multiple characters when doing an upper or
  lower case mapping.
* `StrExt::to_{lower,upper}case` were added as unstable as a convenience, so
  that you do not have to worry about characters expanding to more characters
  when you just want to convert a whole string to upper or lower case.

This is a breaking change due to the change in the signatures of the
`CharExt::to_{upper,lower}case` methods. Code can be updated to use functions
like `flat_map` or `collect` to handle the difference.

[breaking-change]
Diffstat (limited to 'src/libunicode')
-rw-r--r-- src/libunicode/char.rs (renamed from src/libunicode/u_char.rs) 294
-rw-r--r-- src/libunicode/lib.rs 31
-rw-r--r-- src/libunicode/u_str.rs 2
3 files changed, 225 insertions, 102 deletions
diff --git a/src/libunicode/u_char.rs b/src/libunicode/char.rs
index c0f45ca4d72..bcc2820e381 100644
--- a/src/libunicode/u_char.rs
+++ b/src/libunicode/char.rs
@@ -8,16 +8,39 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
-//! Unicode-intensive `char` methods along with the `core` methods.
+//! Character manipulation (`char` type, Unicode Scalar Value)
 //!
-//! These methods implement functionality for `char` that requires knowledge of
-//! Unicode definitions, including normalization, categorization, and display information.
+//! This module provides the `CharExt` trait, as well as its
+//! implementation for the primitive `char` type, in order to allow
+//! basic character manipulation.
+//!
+//! A `char` actually represents a
+//! *[Unicode Scalar
+//! Value](http://www.unicode.org/glossary/#unicode_scalar_value)*, as it can
+//! contain any Unicode code point except high-surrogate and low-surrogate code
+//! points.
+//!
+//! As such, only values in the ranges \[0x0,0xD7FF\] and \[0xE000,0x10FFFF\]
+//! (inclusive) are allowed. A `char` can always be safely cast to a `u32`;
+//! however the converse is not always true due to the above range limits
+//! and, as such, should be performed via the `from_u32` function.
+
+#![stable(feature = "rust1", since = "1.0.0")]
+#![doc(primitive = "char")]
 
-use core::char;
 use core::char::CharExt as C;
-use core::option::Option;
+use core::option::Option::{self, Some};
+use core::iter::Iterator;
 use tables::{derived_property, property, general_category, conversions, charwidth};
 
+// stable reexports
+pub use core::char::{MAX, from_u32, from_digit, EscapeUnicode, EscapeDefault};
+
+// unstable reexports
+pub use normalize::{decompose_canonical, decompose_compatible, compose};
+pub use tables::normalization::canonical_combining_class;
+pub use tables::UNICODE_VERSION;
+
 /// Functionality for manipulating `char`.
 #[stable(feature = "rust1", since = "1.0.0")]
 pub trait CharExt {
@@ -34,8 +57,17 @@ pub trait CharExt {
     /// # Panics
     ///
     /// Panics if given a radix > 36.
-    #[unstable(feature = "unicode",
-               reason = "pending integer conventions")]
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let c = '1';
+    ///
+    /// assert!(c.is_digit(10));
+    ///
+    /// assert!('f'.is_digit(16));
+    /// ```
+    #[stable(feature = "rust1", since = "1.0.0")]
     fn is_digit(self, radix: u32) -> bool;
 
     /// Converts a character to the corresponding digit.
@@ -49,18 +81,56 @@ pub trait CharExt {
     /// # Panics
     ///
     /// Panics if given a radix outside the range [0..36].
-    #[unstable(feature = "unicode",
-               reason = "pending integer conventions")]
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let c = '1';
+    ///
+    /// assert_eq!(c.to_digit(10), Some(1));
+    ///
+    /// assert_eq!('f'.to_digit(16), Some(15));
+    /// ```
+    #[stable(feature = "rust1", since = "1.0.0")]
     fn to_digit(self, radix: u32) -> Option<u32>;
 
-    /// Returns an iterator that yields the hexadecimal Unicode escape
-    /// of a character, as `char`s.
+    /// Returns an iterator that yields the hexadecimal Unicode escape of a
+    /// character, as `char`s.
     ///
     /// All characters are escaped with Rust syntax of the form `\\u{NNNN}`
     /// where `NNNN` is the shortest hexadecimal representation of the code
     /// point.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// for i in '❤'.escape_unicode() {
+    ///     println!("{}", i);
+    /// }
+    /// ```
+    ///
+    /// This prints:
+    ///
+    /// ```text
+    /// \
+    /// u
+    /// {
+    /// 2
+    /// 7
+    /// 6
+    /// 4
+    /// }
+    /// ```
+    ///
+    /// Collecting into a `String`:
+    ///
+    /// ```
+    /// let heart: String = '❤'.escape_unicode().collect();
+    ///
+    /// assert_eq!(heart, r"\u{2764}");
+    /// ```
     #[stable(feature = "rust1", since = "1.0.0")]
-    fn escape_unicode(self) -> char::EscapeUnicode;
+    fn escape_unicode(self) -> EscapeUnicode;
 
     /// Returns an iterator that yields the 'default' ASCII and
     /// C++11-like literal escape of a character, as `char`s.
@@ -74,33 +144,118 @@ pub trait CharExt {
     ///   escaped.
     /// * Any other chars in the range [0x20,0x7e] are not escaped.
     /// * Any other chars are given hex Unicode escapes; see `escape_unicode`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// for i in '"'.escape_default() {
+    ///     println!("{}", i);
+    /// }
+    /// ```
+    ///
+    /// This prints:
+    ///
+    /// ```text
+    /// \
+    /// "
+    /// ```
+    ///
+    /// Collecting into a `String`:
+    ///
+    /// ```
+    /// let quote: String = '"'.escape_default().collect();
+    ///
+    /// assert_eq!(quote, "\\\"");
+    /// ```
     #[stable(feature = "rust1", since = "1.0.0")]
-    fn escape_default(self) -> char::EscapeDefault;
+    fn escape_default(self) -> EscapeDefault;
 
-    /// Returns the amount of bytes this character would need if encoded in
+    /// Returns the number of bytes this character would need if encoded in
     /// UTF-8.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let n = 'ß'.len_utf8();
+    ///
+    /// assert_eq!(n, 2);
+    /// ```
     #[stable(feature = "rust1", since = "1.0.0")]
     fn len_utf8(self) -> usize;
 
-    /// Returns the amount of bytes this character would need if encoded in
-    /// UTF-16.
+    /// Returns the number of 16-bit code units this character would need if
+    /// encoded in UTF-16.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let n = 'ß'.len_utf16();
+    ///
+    /// assert_eq!(n, 1);
+    /// ```
     #[stable(feature = "rust1", since = "1.0.0")]
     fn len_utf16(self) -> usize;
 
-    /// Encodes this character as UTF-8 into the provided byte buffer,
-    /// and then returns the number of bytes written.
+    /// Encodes this character as UTF-8 into the provided byte buffer, and then
+    /// returns the number of bytes written.
+    ///
+    /// If the buffer is not large enough, nothing will be written into it and a
+    /// `None` will be returned. A buffer of length four is large enough to
+    /// encode any `char`.
+    ///
+    /// # Examples
+    ///
+    /// In both of these examples, 'ß' takes two bytes to encode.
+    ///
+    /// ```
+    /// let mut b = [0; 2];
+    ///
+    /// let result = 'ß'.encode_utf8(&mut b);
     ///
-    /// If the buffer is not large enough, nothing will be written into it
-    /// and a `None` will be returned.
+    /// assert_eq!(result, Some(2));
+    /// ```
+    ///
+    /// A buffer that's too small:
+    ///
+    /// ```
+    /// let mut b = [0; 1];
+    ///
+    /// let result = 'ß'.encode_utf8(&mut b);
+    ///
+    /// assert_eq!(result, None);
+    /// ```
     #[unstable(feature = "unicode",
                reason = "pending decision about Iterator/Writer/Reader")]
     fn encode_utf8(self, dst: &mut [u8]) -> Option<usize>;
 
-    /// Encodes this character as UTF-16 into the provided `u16` buffer,
-    /// and then returns the number of `u16`s written.
+    /// Encodes this character as UTF-16 into the provided `u16` buffer, and
+    /// then returns the number of `u16`s written.
+    ///
+    /// If the buffer is not large enough, nothing will be written into it and a
+    /// `None` will be returned. A buffer of length 2 is large enough to encode
+    /// any `char`.
+    ///
+    /// # Examples
     ///
-    /// If the buffer is not large enough, nothing will be written into it
-    /// and a `None` will be returned.
+    /// In both of these examples, 'ß' takes one `u16` to encode.
+    ///
+    /// ```
+    /// let mut b = [0; 1];
+    ///
+    /// let result = 'ß'.encode_utf16(&mut b);
+    ///
+    /// assert_eq!(result, Some(1));
+    /// ```
+    ///
+    /// A buffer that's too small:
+    ///
+    /// ```
+    /// let mut b = [0; 0];
+    ///
+    /// let result = 'ß'.encode_utf16(&mut b);
+    ///
+    /// assert_eq!(result, None);
+    /// ```
     #[unstable(feature = "unicode",
                reason = "pending decision about Iterator/Writer/Reader")]
     fn encode_utf16(self, dst: &mut [u16]) -> Option<usize>;
@@ -175,35 +330,35 @@ pub trait CharExt {
     ///
     /// # Return value
     ///
-    /// Returns the lowercase equivalent of the character, or the character
-    /// itself if no conversion is possible.
-    #[unstable(feature = "unicode",
-               reason = "pending case transformation decisions")]
-    fn to_lowercase(self) -> char;
+    /// Returns an iterator which yields the characters corresponding to the
+    /// lowercase equivalent of the character. If no conversion is possible then
+    /// the input character is returned.
+    #[stable(feature = "rust1", since = "1.0.0")]
+    fn to_lowercase(self) -> ToLowercase;
 
     /// Converts a character to its uppercase equivalent.
     ///
     /// The case-folding performed is the common or simple mapping: it maps
-    /// one Unicode codepoint (one character in Rust) to its uppercase
-    /// equivalent according to the Unicode database [1]. The additional
-    /// [`SpecialCasing.txt`] is not considered here, as it expands to multiple
-    /// codepoints in some cases.
+    /// one Unicode codepoint to its uppercase equivalent according to the
+    /// Unicode database [1]. The additional [`SpecialCasing.txt`] is not yet
+    /// considered here, but the iterator returned will soon support this form
+    /// of case folding.
     ///
     /// A full reference can be found here [2].
     ///
     /// # Return value
     ///
-    /// Returns the uppercase equivalent of the character, or the character
-    /// itself if no conversion was made.
+    /// Returns an iterator which yields the characters corresponding to the
+    /// uppercase equivalent of the character. If no conversion is possible then
+    /// the input character is returned.
     ///
     /// [1]: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
     ///
     /// [`SpecialCasing`.txt`]: ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt
     ///
     /// [2]: http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
-    #[unstable(feature = "unicode",
-               reason = "pending case transformation decisions")]
-    fn to_uppercase(self) -> char;
+    #[stable(feature = "rust1", since = "1.0.0")]
+    fn to_uppercase(self) -> ToUppercase;
 
     /// Returns this character's displayed width in columns, or `None` if it is a
     /// control character other than `'\x00'`.
@@ -221,28 +376,15 @@ pub trait CharExt {
 
 #[stable(feature = "rust1", since = "1.0.0")]
 impl CharExt for char {
-    #[unstable(feature = "unicode",
-               reason = "pending integer conventions")]
     fn is_digit(self, radix: u32) -> bool { C::is_digit(self, radix) }
-    #[unstable(feature = "unicode",
-               reason = "pending integer conventions")]
     fn to_digit(self, radix: u32) -> Option<u32> { C::to_digit(self, radix) }
-    #[stable(feature = "rust1", since = "1.0.0")]
-    fn escape_unicode(self) -> char::EscapeUnicode { C::escape_unicode(self) }
-    #[stable(feature = "rust1", since = "1.0.0")]
-    fn escape_default(self) -> char::EscapeDefault { C::escape_default(self) }
-    #[stable(feature = "rust1", since = "1.0.0")]
+    fn escape_unicode(self) -> EscapeUnicode { C::escape_unicode(self) }
+    fn escape_default(self) -> EscapeDefault { C::escape_default(self) }
     fn len_utf8(self) -> usize { C::len_utf8(self) }
-    #[stable(feature = "rust1", since = "1.0.0")]
     fn len_utf16(self) -> usize { C::len_utf16(self) }
-    #[unstable(feature = "unicode",
-               reason = "pending decision about Iterator/Writer/Reader")]
     fn encode_utf8(self, dst: &mut [u8]) -> Option<usize> { C::encode_utf8(self, dst) }
-    #[unstable(feature = "unicode",
-               reason = "pending decision about Iterator/Writer/Reader")]
     fn encode_utf16(self, dst: &mut [u16]) -> Option<usize> { C::encode_utf16(self, dst) }
 
-    #[stable(feature = "rust1", since = "1.0.0")]
     fn is_alphabetic(self) -> bool {
         match self {
             'a' ... 'z' | 'A' ... 'Z' => true,
@@ -251,15 +393,10 @@ impl CharExt for char {
         }
     }
 
-    #[unstable(feature = "unicode",
-               reason = "mainly needed for compiler internals")]
     fn is_xid_start(self) -> bool { derived_property::XID_Start(self) }
 
-    #[unstable(feature = "unicode",
-               reason = "mainly needed for compiler internals")]
     fn is_xid_continue(self) -> bool { derived_property::XID_Continue(self) }
 
-    #[stable(feature = "rust1", since = "1.0.0")]
     fn is_lowercase(self) -> bool {
         match self {
             'a' ... 'z' => true,
@@ -268,7 +405,6 @@ impl CharExt for char {
         }
     }
 
-    #[stable(feature = "rust1", since = "1.0.0")]
     fn is_uppercase(self) -> bool {
         match self {
             'A' ... 'Z' => true,
@@ -277,7 +413,6 @@ impl CharExt for char {
         }
     }
 
-    #[stable(feature = "rust1", since = "1.0.0")]
     fn is_whitespace(self) -> bool {
         match self {
             ' ' | '\x09' ... '\x0d' => true,
@@ -286,15 +421,12 @@ impl CharExt for char {
         }
     }
 
-    #[stable(feature = "rust1", since = "1.0.0")]
     fn is_alphanumeric(self) -> bool {
         self.is_alphabetic() || self.is_numeric()
     }
 
-    #[stable(feature = "rust1", since = "1.0.0")]
     fn is_control(self) -> bool { general_category::Cc(self) }
 
-    #[stable(feature = "rust1", since = "1.0.0")]
     fn is_numeric(self) -> bool {
         match self {
             '0' ... '9' => true,
@@ -303,15 +435,35 @@ impl CharExt for char {
         }
     }
 
-    #[unstable(feature = "unicode",
-               reason = "pending case transformation decisions")]
-    fn to_lowercase(self) -> char { conversions::to_lower(self) }
+    fn to_lowercase(self) -> ToLowercase {
+        ToLowercase(Some(conversions::to_lower(self)))
+    }
 
-    #[unstable(feature = "unicode",
-               reason = "pending case transformation decisions")]
-    fn to_uppercase(self) -> char { conversions::to_upper(self) }
+    fn to_uppercase(self) -> ToUppercase {
+        ToUppercase(Some(conversions::to_upper(self)))
+    }
 
-    #[unstable(feature = "unicode",
-               reason = "needs expert opinion. is_cjk flag stands out as ugly")]
     fn width(self, is_cjk: bool) -> Option<usize> { charwidth::width(self, is_cjk) }
 }
+
+/// An iterator over the lowercase mapping of a given character, returned from
+/// the `to_lowercase` method on characters.
+#[stable(feature = "rust1", since = "1.0.0")]
+pub struct ToLowercase(Option<char>);
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl Iterator for ToLowercase {
+    type Item = char;
+    fn next(&mut self) -> Option<char> { self.0.take() }
+}
+
+/// An iterator over the uppercase mapping of a given character, returned from
+/// the `to_uppercase` method on characters.
+#[stable(feature = "rust1", since = "1.0.0")]
+pub struct ToUppercase(Option<char>);
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl Iterator for ToUppercase {
+    type Item = char;
+    fn next(&mut self) -> Option<char> { self.0.take() }
+}
diff --git a/src/libunicode/lib.rs b/src/libunicode/lib.rs
index 2095b6921c8..fadf91f33bc 100644
--- a/src/libunicode/lib.rs
+++ b/src/libunicode/lib.rs
@@ -42,37 +42,8 @@ pub use tables::regex;
 
 mod normalize;
 mod tables;
-mod u_char;
 mod u_str;
-
-// re-export char so that std et al see it correctly
-/// Character manipulation (`char` type, Unicode Scalar Value)
-///
-/// This module provides the `CharExt` trait, as well as its
-/// implementation for the primitive `char` type, in order to allow
-/// basic character manipulation.
-///
-/// A `char` actually represents a
-/// *[Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value)*,
-/// as it can contain any Unicode code point except high-surrogate and
-/// low-surrogate code points.
-///
-/// As such, only values in the ranges \[0x0,0xD7FF\] and \[0xE000,0x10FFFF\]
-/// (inclusive) are allowed. A `char` can always be safely cast to a `u32`;
-/// however the converse is not always true due to the above range limits
-/// and, as such, should be performed via the `from_u32` function.
-#[stable(feature = "rust1", since = "1.0.0")]
-#[doc(primitive = "char")]
-pub mod char {
-    pub use core::char::{MAX, from_u32, from_digit};
-
-    pub use normalize::{decompose_canonical, decompose_compatible, compose};
-
-    pub use tables::normalization::canonical_combining_class;
-    pub use tables::UNICODE_VERSION;
-
-    pub use u_char::CharExt;
-}
+pub mod char;
 
 pub mod str {
     pub use u_str::{UnicodeStr, Words, Graphemes, GraphemeIndices};
diff --git a/src/libunicode/u_str.rs b/src/libunicode/u_str.rs
index 57439addeaa..9b3f4b0521d 100644
--- a/src/libunicode/u_str.rs
+++ b/src/libunicode/u_str.rs
@@ -26,7 +26,7 @@ use core::num::Int;
 use core::slice;
 use core::str::Split;
 
-use u_char::CharExt as UCharExt; // conflicts with core::prelude::CharExt
+use char::CharExt as UCharExt; // conflicts with core::prelude::CharExt
 use tables::grapheme::GraphemeCat;
 
 /// An iterator over the words of a string, separated by a sequence of whitespace