diff options
| author | bors <bors@rust-lang.org> | 2015-01-05 06:45:39 +0000 |
|---|---|---|
| committer | bors <bors@rust-lang.org> | 2015-01-05 06:45:39 +0000 |
| commit | 1f732ef53d54ccfc3e7728390ffbcea8a696ecee (patch) | |
| tree | 29ced952bc9e6f49640ccc33974209afc805ff91 /src/libunicode | |
| parent | ed22606c8382822efc555f72f895c560289a5c70 (diff) | |
| parent | 990a79f097e8e74308bfec6d72dcdbb769a7973b (diff) | |
| download | rust-1f732ef53d54ccfc3e7728390ffbcea8a696ecee.tar.gz rust-1f732ef53d54ccfc3e7728390ffbcea8a696ecee.zip | |
auto merge of #20395 : huonw/rust/char-stab-2, r=aturon
cc #19260 The casing transformations are left unstable (it is highly likely to be better to adopt the proper non-1-to-1 case mappings, per #20333) as are `is_xid_*`. I've got a little todo list in the last commit of things I thought about/was told about that I haven't yet handled (I'd also like some feedback).
Diffstat (limited to 'src/libunicode')
| -rw-r--r-- | src/libunicode/lib.rs | 12 | ||||
| -rw-r--r-- | src/libunicode/tables.rs | 2 | ||||
| -rw-r--r-- | src/libunicode/u_char.rs | 133 | ||||
| -rw-r--r-- | src/libunicode/u_str.rs | 6 |
4 files changed, 136 insertions, 17 deletions
diff --git a/src/libunicode/lib.rs b/src/libunicode/lib.rs index 72e9ce2bcaf..a3884d0c86e 100644 --- a/src/libunicode/lib.rs +++ b/src/libunicode/lib.rs @@ -44,9 +44,9 @@ mod u_str; // re-export char so that std et al see it correctly /// Character manipulation (`char` type, Unicode Scalar Value) /// -/// This module provides the `Char` and `UnicodeChar` traits, as well as their -/// implementation for the primitive `char` type, in order to allow basic character -/// manipulation. +/// This module provides the `CharExt` trait, as well as its +/// implementation for the primitive `char` type, in order to allow +/// basic character manipulation. /// /// A `char` actually represents a /// *[Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value)*, @@ -58,16 +58,14 @@ mod u_str; /// however the converse is not always true due to the above range limits /// and, as such, should be performed via the `from_u32` function.. pub mod char { - pub use core::char::{MAX, from_u32}; - pub use core::char::{from_digit}; - pub use core::char::Char; + pub use core::char::{MAX, from_u32, from_digit}; pub use normalize::{decompose_canonical, decompose_compatible, compose}; pub use tables::normalization::canonical_combining_class; pub use tables::UNICODE_VERSION; - pub use u_char::UnicodeChar; + pub use u_char::CharExt; } pub mod str { diff --git a/src/libunicode/tables.rs b/src/libunicode/tables.rs index e3550810010..c755ea93184 100644 --- a/src/libunicode/tables.rs +++ b/src/libunicode/tables.rs @@ -13,7 +13,7 @@ #![allow(missing_docs, non_upper_case_globals, non_snake_case)] /// The version of [Unicode](http://www.unicode.org/) -/// that the `UnicodeChar` and `UnicodeStrPrelude` traits are based on. +/// that the unicode parts of `CharExt` and `UnicodeStrPrelude` traits are based on. pub const UNICODE_VERSION: (uint, uint, uint) = (7, 0, 0); fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool { diff --git a/src/libunicode/u_char.rs b/src/libunicode/u_char.rs index 9c356801604..5693c222de1 100644 --- a/src/libunicode/u_char.rs +++ b/src/libunicode/u_char.rs @@ -8,19 +8,102 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -//! Unicode-intensive `char` methods. +//! Unicode-intensive `char` methods along with the `core` methods. //! //! These methods implement functionality for `char` that requires knowledge of //! Unicode definitions, including normalization, categorization, and display information. +use core::char; +use core::char::CharExt as C; use core::option::Option; use tables::{derived_property, property, general_category, conversions, charwidth}; -/// Useful functions for Unicode characters. -#[experimental = "pending prelude organization"] -pub trait UnicodeChar { +/// Functionality for manipulating `char`. +#[stable] +pub trait CharExt { + /// Checks if a `char` parses as a numeric digit in the given radix. + /// + /// Compared to `is_numeric()`, this function only recognizes the characters + /// `0-9`, `a-z` and `A-Z`. + /// + /// # Return value + /// + /// Returns `true` if `c` is a valid digit under `radix`, and `false` + /// otherwise. + /// + /// # Panics + /// + /// Panics if given a radix > 36. + #[unstable = "pending integer conventions"] + fn is_digit(self, radix: uint) -> bool; + + /// Converts a character to the corresponding digit. + /// + /// # Return value + /// + /// If `c` is between '0' and '9', the corresponding value between 0 and + /// 9. If `c` is 'a' or 'A', 10. If `c` is 'b' or 'B', 11, etc. Returns + /// none if the character does not refer to a digit in the given radix. + /// + /// # Panics + /// + /// Panics if given a radix outside the range [0..36]. + #[unstable = "pending integer conventions"] + fn to_digit(self, radix: uint) -> Option<uint>; + + /// Returns an iterator that yields the hexadecimal Unicode escape + /// of a character, as `char`s. + /// + /// All characters are escaped with Rust syntax of the form `\\u{NNNN}` + /// where `NNNN` is the shortest hexadecimal representation of the code + /// point. + #[stable] + fn escape_unicode(self) -> char::EscapeUnicode; + + /// Returns an iterator that yields the 'default' ASCII and + /// C++11-like literal escape of a character, as `char`s. + /// + /// The default is chosen with a bias toward producing literals that are + /// legal in a variety of languages, including C++11 and similar C-family + /// languages. The exact rules are: + /// + /// * Tab, CR and LF are escaped as '\t', '\r' and '\n' respectively. + /// * Single-quote, double-quote and backslash chars are backslash- + /// escaped. + /// * Any other chars in the range [0x20,0x7e] are not escaped. + /// * Any other chars are given hex Unicode escapes; see `escape_unicode`. + #[stable] + fn escape_default(self) -> char::EscapeDefault; + + /// Returns the amount of bytes this character would need if encoded in + /// UTF-8. + #[stable] + fn len_utf8(self) -> uint; + + /// Returns the amount of bytes this character would need if encoded in + /// UTF-16. + #[stable] + fn len_utf16(self) -> uint; + + /// Encodes this character as UTF-8 into the provided byte buffer, + /// and then returns the number of bytes written. + /// + /// If the buffer is not large enough, nothing will be written into it + /// and a `None` will be returned. + #[unstable = "pending decision about Iterator/Writer/Reader"] + fn encode_utf8(self, dst: &mut [u8]) -> Option<uint>; + + /// Encodes this character as UTF-16 into the provided `u16` buffer, + /// and then returns the number of `u16`s written. + /// + /// If the buffer is not large enough, nothing will be written into it + /// and a `None` will be returned. + #[unstable = "pending decision about Iterator/Writer/Reader"] + fn encode_utf16(self, dst: &mut [u16]) -> Option<uint>; + /// Returns whether the specified character is considered a Unicode /// alphabetic code point. + #[stable] fn is_alphabetic(self) -> bool; /// Returns whether the specified character satisfies the 'XID_Start' @@ -29,6 +112,7 @@ pub trait UnicodeChar { /// 'XID_Start' is a Unicode Derived Property specified in /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications), /// mostly similar to ID_Start but modified for closure under NFKx. + #[experimental = "mainly needed for compiler internals"] fn is_xid_start(self) -> bool; /// Returns whether the specified `char` satisfies the 'XID_Continue' @@ -37,38 +121,45 @@ pub trait UnicodeChar { /// 'XID_Continue' is a Unicode Derived Property specified in /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications), /// mostly similar to 'ID_Continue' but modified for closure under NFKx. + #[experimental = "mainly needed for compiler internals"] fn is_xid_continue(self) -> bool; /// Indicates whether a character is in lowercase. /// /// This is defined according to the terms of the Unicode Derived Core /// Property `Lowercase`. + #[stable] fn is_lowercase(self) -> bool; /// Indicates whether a character is in uppercase. /// /// This is defined according to the terms of the Unicode Derived Core /// Property `Uppercase`. + #[stable] fn is_uppercase(self) -> bool; /// Indicates whether a character is whitespace. /// /// Whitespace is defined in terms of the Unicode Property `White_Space`. + #[stable] fn is_whitespace(self) -> bool; /// Indicates whether a character is alphanumeric. /// /// Alphanumericness is defined in terms of the Unicode General Categories /// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'. + #[stable] fn is_alphanumeric(self) -> bool; /// Indicates whether a character is a control code point. /// /// Control code points are defined in terms of the Unicode General /// Category `Cc`. + #[stable] fn is_control(self) -> bool; /// Indicates whether the character is numeric (Nd, Nl, or No). + #[stable] fn is_numeric(self) -> bool; /// Converts a character to its lowercase equivalent. @@ -80,6 +171,7 @@ pub trait UnicodeChar { /// /// Returns the lowercase equivalent of the character, or the character /// itself if no conversion is possible. + #[experimental = "pending case transformation decisions"] fn to_lowercase(self) -> char; /// Converts a character to its uppercase equivalent. @@ -102,6 +194,7 @@ pub trait UnicodeChar { /// [`SpecialCasing`.txt`]: ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt /// /// [2]: http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992 + #[experimental = "pending case transformation decisions"] fn to_uppercase(self) -> char; /// Returns this character's displayed width in columns, or `None` if it is a @@ -117,8 +210,26 @@ pub trait UnicodeChar { fn width(self, is_cjk: bool) -> Option<uint>; } -#[experimental = "pending prelude organization"] -impl UnicodeChar for char { +#[stable] +impl CharExt for char { + #[unstable = "pending integer conventions"] + fn is_digit(self, radix: uint) -> bool { C::is_digit(self, radix) } + #[unstable = "pending integer conventions"] + fn to_digit(self, radix: uint) -> Option<uint> { C::to_digit(self, radix) } + #[stable] + fn escape_unicode(self) -> char::EscapeUnicode { C::escape_unicode(self) } + #[stable] + fn escape_default(self) -> char::EscapeDefault { C::escape_default(self) } + #[stable] + fn len_utf8(self) -> uint { C::len_utf8(self) } + #[stable] + fn len_utf16(self) -> uint { C::len_utf16(self) } + #[unstable = "pending decision about Iterator/Writer/Reader"] + fn encode_utf8(self, dst: &mut [u8]) -> Option<uint> { C::encode_utf8(self, dst) } + #[unstable = "pending decision about Iterator/Writer/Reader"] + fn encode_utf16(self, dst: &mut [u16]) -> Option<uint> { C::encode_utf16(self, dst) } + + #[stable] fn is_alphabetic(self) -> bool { match self { 'a' ... 'z' | 'A' ... 'Z' => true, @@ -127,10 +238,13 @@ impl UnicodeChar for char { } } + #[experimental = "mainly needed for compiler internals"] fn is_xid_start(self) -> bool { derived_property::XID_Start(self) } + #[experimental = "mainly needed for compiler internals"] fn is_xid_continue(self) -> bool { derived_property::XID_Continue(self) } + #[stable] fn is_lowercase(self) -> bool { match self { 'a' ... 'z' => true, @@ -139,6 +253,7 @@ impl UnicodeChar for char { } } + #[stable] fn is_uppercase(self) -> bool { match self { 'A' ... 'Z' => true, @@ -147,6 +262,7 @@ impl UnicodeChar for char { } } + #[stable] fn is_whitespace(self) -> bool { match self { ' ' | '\x09' ... '\x0d' => true, @@ -155,12 +271,15 @@ impl UnicodeChar for char { } } + #[stable] fn is_alphanumeric(self) -> bool { self.is_alphabetic() || self.is_numeric() } + #[stable] fn is_control(self) -> bool { general_category::Cc(self) } + #[stable] fn is_numeric(self) -> bool { match self { '0' ... '9' => true, @@ -169,8 +288,10 @@ impl UnicodeChar for char { } } + #[experimental = "pending case transformation decisions"] fn to_lowercase(self) -> char { conversions::to_lower(self) } + #[experimental = "pending case transformation decisions"] fn to_uppercase(self) -> char { conversions::to_upper(self) } #[experimental = "needs expert opinion. is_cjk flag stands out as ugly"] diff --git a/src/libunicode/u_str.rs b/src/libunicode/u_str.rs index 1b0c4171134..90949437774 100644 --- a/src/libunicode/u_str.rs +++ b/src/libunicode/u_str.rs @@ -13,7 +13,7 @@ //! Unicode-intensive string manipulations. //! //! This module provides functionality to `str` that requires the Unicode methods provided by the -//! UnicodeChar trait. +//! unicode parts of the CharExt trait. use self::GraphemeState::*; use core::prelude::*; @@ -26,7 +26,7 @@ use core::num::Int; use core::slice; use core::str::Split; -use u_char::UnicodeChar; +use u_char::CharExt as UCharExt; // conflicts with core::prelude::CharExt use tables::grapheme::GraphemeCat; /// An iterator over the words of a string, separated by a sequence of whitespace @@ -529,7 +529,7 @@ impl<I> Iterator for Utf16Encoder<I> where I: Iterator<Item=char> { let mut buf = [0u16; 2]; self.chars.next().map(|ch| { - let n = ch.encode_utf16(buf.as_mut_slice()).unwrap_or(0); + let n = CharExt::encode_utf16(ch, buf.as_mut_slice()).unwrap_or(0); if n == 2 { self.extra = buf[1]; } buf[0] }) |
