//! A character type. //! //! The `char` type represents a single character. More specifically, since //! 'character' isn't a well-defined concept in Unicode, `char` is a '[Unicode //! scalar value]', which is similar to, but not the same as, a '[Unicode code //! point]'. //! //! [Unicode scalar value]: http://www.unicode.org/glossary/#unicode_scalar_value //! [Unicode code point]: http://www.unicode.org/glossary/#code_point //! //! This module exists for technical reasons, the primary documentation for //! `char` is directly on [the `char` primitive type](../../std/primitive.char.html) //! itself. //! //! This module is the home of the iterator implementations for the iterators //! implemented on `char`, as well as some useful constants and conversion //! functions that convert various types to `char`. #![allow(non_snake_case)] #![stable(feature = "core_char", since = "1.2.0")] mod convert; mod decode; mod methods; // stable re-exports #[stable(feature = "char_from_unchecked", since = "1.5.0")] pub use self::convert::from_u32_unchecked; #[stable(feature = "try_from", since = "1.34.0")] pub use self::convert::CharTryFromError; #[stable(feature = "char_from_str", since = "1.20.0")] pub use self::convert::ParseCharError; #[stable(feature = "rust1", since = "1.0.0")] pub use self::convert::{from_digit, from_u32}; #[stable(feature = "decode_utf16", since = "1.9.0")] pub use self::decode::{decode_utf16, DecodeUtf16, DecodeUtf16Error}; // unstable re-exports #[unstable(feature = "unicode_version", issue = "49726")] pub use crate::unicode::tables::UNICODE_VERSION; #[unstable(feature = "unicode_version", issue = "49726")] pub use crate::unicode::version::UnicodeVersion; use crate::fmt::{self, Write}; use crate::iter::FusedIterator; // UTF-8 ranges and tags for encoding characters const TAG_CONT: u8 = 0b1000_0000; const TAG_TWO_B: u8 = 0b1100_0000; const TAG_THREE_B: u8 = 0b1110_0000; const TAG_FOUR_B: u8 = 0b1111_0000; const MAX_ONE_B: u32 = 0x80; const MAX_TWO_B: u32 = 0x800; const MAX_THREE_B: u32 = 0x10000; /* Lu Uppercase_Letter an uppercase letter Ll Lowercase_Letter a lowercase letter Lt Titlecase_Letter a digraphic character, with first part uppercase Lm Modifier_Letter a modifier letter Lo Other_Letter other letters, including syllables and ideographs Mn Nonspacing_Mark a nonspacing combining mark (zero advance width) Mc Spacing_Mark a spacing combining mark (positive advance width) Me Enclosing_Mark an enclosing combining mark Nd Decimal_Number a decimal digit Nl Letter_Number a letterlike numeric character No Other_Number a numeric character of other type Pc Connector_Punctuation a connecting punctuation mark, like a tie Pd Dash_Punctuation a dash or hyphen punctuation mark Ps Open_Punctuation an opening punctuation mark (of a pair) Pe Close_Punctuation a closing punctuation mark (of a pair) Pi Initial_Punctuation an initial quotation mark Pf Final_Punctuation a final quotation mark Po Other_Punctuation a punctuation mark of other type Sm Math_Symbol a symbol of primarily mathematical use Sc Currency_Symbol a currency sign Sk Modifier_Symbol a non-letterlike modifier symbol So Other_Symbol a symbol of other type Zs Space_Separator a space character (of various non-zero widths) Zl Line_Separator U+2028 LINE SEPARATOR only Zp Paragraph_Separator U+2029 PARAGRAPH SEPARATOR only Cc Control a C0 or C1 control code Cf Format a format control character Cs Surrogate a surrogate code point Co Private_Use a private-use character Cn Unassigned a reserved unassigned code point or a noncharacter */ /// The highest valid code point a `char` can have. /// /// A [`char`] is a [Unicode Scalar Value], which means that it is a [Code /// Point], but only ones within a certain range. `MAX` is the highest valid /// code point that's a valid [Unicode Scalar Value]. /// /// [`char`]: ../../std/primitive.char.html /// [Unicode Scalar Value]: http://www.unicode.org/glossary/#unicode_scalar_value /// [Code Point]: http://www.unicode.org/glossary/#code_point #[stable(feature = "rust1", since = "1.0.0")] pub const MAX: char = '\u{10ffff}'; /// `U+FFFD REPLACEMENT CHARACTER` (�) is used in Unicode to represent a /// decoding error. /// /// It can occur, for example, when giving ill-formed UTF-8 bytes to /// [`String::from_utf8_lossy`](../../std/string/struct.String.html#method.from_utf8_lossy). #[stable(feature = "decode_utf16", since = "1.9.0")] pub const REPLACEMENT_CHARACTER: char = '\u{FFFD}'; /// Returns an iterator that yields the hexadecimal Unicode escape of a /// character, as `char`s. /// /// This `struct` is created by the [`escape_unicode`] method on [`char`]. See /// its documentation for more. /// /// [`escape_unicode`]: ../../std/primitive.char.html#method.escape_unicode /// [`char`]: ../../std/primitive.char.html #[derive(Clone, Debug)] #[stable(feature = "rust1", since = "1.0.0")] pub struct EscapeUnicode { c: char, state: EscapeUnicodeState, // The index of the next hex digit to be printed (0 if none), // i.e., the number of remaining hex digits to be printed; // increasing from the least significant digit: 0x543210 hex_digit_idx: usize, } // The enum values are ordered so that their representation is the // same as the remaining length (besides the hexadecimal digits). This // likely makes `len()` a single load from memory) and inline-worth. #[derive(Clone, Debug)] enum EscapeUnicodeState { Done, RightBrace, Value, LeftBrace, Type, Backslash, } #[stable(feature = "rust1", since = "1.0.0")] impl Iterator for EscapeUnicode { type Item = char; fn next(&mut self) -> Option { match self.state { EscapeUnicodeState::Backslash => { self.state = EscapeUnicodeState::Type; Some('\\') } EscapeUnicodeState::Type => { self.state = EscapeUnicodeState::LeftBrace; Some('u') } EscapeUnicodeState::LeftBrace => { self.state = EscapeUnicodeState::Value; Some('{') } EscapeUnicodeState::Value => { let hex_digit = ((self.c as u32) >> (self.hex_digit_idx * 4)) & 0xf; let c = from_digit(hex_digit, 16).unwrap(); if self.hex_digit_idx == 0 { self.state = EscapeUnicodeState::RightBrace; } else { self.hex_digit_idx -= 1; } Some(c) } EscapeUnicodeState::RightBrace => { self.state = EscapeUnicodeState::Done; Some('}') } EscapeUnicodeState::Done => None, } } #[inline] fn size_hint(&self) -> (usize, Option) { let n = self.len(); (n, Some(n)) } #[inline] fn count(self) -> usize { self.len() } fn last(self) -> Option { match self.state { EscapeUnicodeState::Done => None, EscapeUnicodeState::RightBrace | EscapeUnicodeState::Value | EscapeUnicodeState::LeftBrace | EscapeUnicodeState::Type | EscapeUnicodeState::Backslash => Some('}'), } } } #[stable(feature = "exact_size_escape", since = "1.11.0")] impl ExactSizeIterator for EscapeUnicode { #[inline] fn len(&self) -> usize { // The match is a single memory access with no branching self.hex_digit_idx + match self.state { EscapeUnicodeState::Done => 0, EscapeUnicodeState::RightBrace => 1, EscapeUnicodeState::Value => 2, EscapeUnicodeState::LeftBrace => 3, EscapeUnicodeState::Type => 4, EscapeUnicodeState::Backslash => 5, } } } #[stable(feature = "fused", since = "1.26.0")] impl FusedIterator for EscapeUnicode {} #[stable(feature = "char_struct_display", since = "1.16.0")] impl fmt::Display for EscapeUnicode { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { for c in self.clone() { f.write_char(c)?; } Ok(()) } } /// An iterator that yields the literal escape code of a `char`. /// /// This `struct` is created by the [`escape_default`] method on [`char`]. See /// its documentation for more. /// /// [`escape_default`]: ../../std/primitive.char.html#method.escape_default /// [`char`]: ../../std/primitive.char.html #[derive(Clone, Debug)] #[stable(feature = "rust1", since = "1.0.0")] pub struct EscapeDefault { state: EscapeDefaultState, } #[derive(Clone, Debug)] enum EscapeDefaultState { Done, Char(char), Backslash(char), Unicode(EscapeUnicode), } #[stable(feature = "rust1", since = "1.0.0")] impl Iterator for EscapeDefault { type Item = char; fn next(&mut self) -> Option { match self.state { EscapeDefaultState::Backslash(c) => { self.state = EscapeDefaultState::Char(c); Some('\\') } EscapeDefaultState::Char(c) => { self.state = EscapeDefaultState::Done; Some(c) } EscapeDefaultState::Done => None, EscapeDefaultState::Unicode(ref mut iter) => iter.next(), } } #[inline] fn size_hint(&self) -> (usize, Option) { let n = self.len(); (n, Some(n)) } #[inline] fn count(self) -> usize { self.len() } fn nth(&mut self, n: usize) -> Option { match self.state { EscapeDefaultState::Backslash(c) if n == 0 => { self.state = EscapeDefaultState::Char(c); Some('\\') } EscapeDefaultState::Backslash(c) if n == 1 => { self.state = EscapeDefaultState::Done; Some(c) } EscapeDefaultState::Backslash(_) => { self.state = EscapeDefaultState::Done; None } EscapeDefaultState::Char(c) => { self.state = EscapeDefaultState::Done; if n == 0 { Some(c) } else { None } } EscapeDefaultState::Done => None, EscapeDefaultState::Unicode(ref mut i) => i.nth(n), } } fn last(self) -> Option { match self.state { EscapeDefaultState::Unicode(iter) => iter.last(), EscapeDefaultState::Done => None, EscapeDefaultState::Backslash(c) | EscapeDefaultState::Char(c) => Some(c), } } } #[stable(feature = "exact_size_escape", since = "1.11.0")] impl ExactSizeIterator for EscapeDefault { fn len(&self) -> usize { match self.state { EscapeDefaultState::Done => 0, EscapeDefaultState::Char(_) => 1, EscapeDefaultState::Backslash(_) => 2, EscapeDefaultState::Unicode(ref iter) => iter.len(), } } } #[stable(feature = "fused", since = "1.26.0")] impl FusedIterator for EscapeDefault {} #[stable(feature = "char_struct_display", since = "1.16.0")] impl fmt::Display for EscapeDefault { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { for c in self.clone() { f.write_char(c)?; } Ok(()) } } /// An iterator that yields the literal escape code of a `char`. /// /// This `struct` is created by the [`escape_debug`] method on [`char`]. See its /// documentation for more. /// /// [`escape_debug`]: ../../std/primitive.char.html#method.escape_debug /// [`char`]: ../../std/primitive.char.html #[stable(feature = "char_escape_debug", since = "1.20.0")] #[derive(Clone, Debug)] pub struct EscapeDebug(EscapeDefault); #[stable(feature = "char_escape_debug", since = "1.20.0")] impl Iterator for EscapeDebug { type Item = char; fn next(&mut self) -> Option { self.0.next() } fn size_hint(&self) -> (usize, Option) { self.0.size_hint() } } #[stable(feature = "char_escape_debug", since = "1.20.0")] impl ExactSizeIterator for EscapeDebug {} #[stable(feature = "fused", since = "1.26.0")] impl FusedIterator for EscapeDebug {} #[stable(feature = "char_escape_debug", since = "1.20.0")] impl fmt::Display for EscapeDebug { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Display::fmt(&self.0, f) } } /// Returns an iterator that yields the lowercase equivalent of a `char`. /// /// This `struct` is created by the [`to_lowercase`] method on [`char`]. See /// its documentation for more. /// /// [`to_lowercase`]: ../../std/primitive.char.html#method.to_lowercase /// [`char`]: ../../std/primitive.char.html #[stable(feature = "rust1", since = "1.0.0")] #[derive(Debug, Clone)] pub struct ToLowercase(CaseMappingIter); #[stable(feature = "rust1", since = "1.0.0")] impl Iterator for ToLowercase { type Item = char; fn next(&mut self) -> Option { self.0.next() } fn size_hint(&self) -> (usize, Option) { self.0.size_hint() } } #[stable(feature = "fused", since = "1.26.0")] impl FusedIterator for ToLowercase {} #[stable(feature = "exact_size_case_mapping_iter", since = "1.35.0")] impl ExactSizeIterator for ToLowercase {} /// Returns an iterator that yields the uppercase equivalent of a `char`. /// /// This `struct` is created by the [`to_uppercase`] method on [`char`]. See /// its documentation for more. /// /// [`to_uppercase`]: ../../std/primitive.char.html#method.to_uppercase /// [`char`]: ../../std/primitive.char.html #[stable(feature = "rust1", since = "1.0.0")] #[derive(Debug, Clone)] pub struct ToUppercase(CaseMappingIter); #[stable(feature = "rust1", since = "1.0.0")] impl Iterator for ToUppercase { type Item = char; fn next(&mut self) -> Option { self.0.next() } fn size_hint(&self) -> (usize, Option) { self.0.size_hint() } } #[stable(feature = "fused", since = "1.26.0")] impl FusedIterator for ToUppercase {} #[stable(feature = "exact_size_case_mapping_iter", since = "1.35.0")] impl ExactSizeIterator for ToUppercase {} #[derive(Debug, Clone)] enum CaseMappingIter { Three(char, char, char), Two(char, char), One(char), Zero, } impl CaseMappingIter { fn new(chars: [char; 3]) -> CaseMappingIter { if chars[2] == '\0' { if chars[1] == '\0' { CaseMappingIter::One(chars[0]) // Including if chars[0] == '\0' } else { CaseMappingIter::Two(chars[0], chars[1]) } } else { CaseMappingIter::Three(chars[0], chars[1], chars[2]) } } } impl Iterator for CaseMappingIter { type Item = char; fn next(&mut self) -> Option { match *self { CaseMappingIter::Three(a, b, c) => { *self = CaseMappingIter::Two(b, c); Some(a) } CaseMappingIter::Two(b, c) => { *self = CaseMappingIter::One(c); Some(b) } CaseMappingIter::One(c) => { *self = CaseMappingIter::Zero; Some(c) } CaseMappingIter::Zero => None, } } fn size_hint(&self) -> (usize, Option) { let size = match self { CaseMappingIter::Three(..) => 3, CaseMappingIter::Two(..) => 2, CaseMappingIter::One(_) => 1, CaseMappingIter::Zero => 0, }; (size, Some(size)) } } impl fmt::Display for CaseMappingIter { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match *self { CaseMappingIter::Three(a, b, c) => { f.write_char(a)?; f.write_char(b)?; f.write_char(c) } CaseMappingIter::Two(b, c) => { f.write_char(b)?; f.write_char(c) } CaseMappingIter::One(c) => f.write_char(c), CaseMappingIter::Zero => Ok(()), } } } #[stable(feature = "char_struct_display", since = "1.16.0")] impl fmt::Display for ToLowercase { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Display::fmt(&self.0, f) } } #[stable(feature = "char_struct_display", since = "1.16.0")] impl fmt::Display for ToUppercase { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Display::fmt(&self.0, f) } }