about summary refs log tree commit diff
diff options
context:
space:
mode:
authorbors <bors@rust-lang.org>2023-05-20 04:43:17 +0000
committerbors <bors@rust-lang.org>2023-05-20 04:43:17 +0000
commite86fd62b6b198584a47798eb271d4b54c4dc96ec (patch)
tree72cfb864600ac1437dd46be482005b597cf56270
parent7047d97e012d5ed90cf2837c361150bc149837b4 (diff)
parent28449daa2241410d4c384c5608b87fe7c85ffae6 (diff)
downloadrust-e86fd62b6b198584a47798eb271d4b54c4dc96ec.tar.gz
rust-e86fd62b6b198584a47798eb271d4b54c4dc96ec.zip
Auto merge of #111524 - scottmcm:escape-using-ascii, r=cuviper
`ascii::Char`-ify the escaping code in `core`

This means that `EscapeIterInner::as_str` no longer needs unsafe code, because the type system ensures the internal buffer is only ASCII, and thus valid UTF-8.

Come to think of it, this also gives it a (non-guaranteed) niche.

cc `@BurntSushi` as potentially interested
`ascii::Char` tracking issue: #110998
-rw-r--r--library/core/src/ascii.rs2
-rw-r--r--library/core/src/char/methods.rs24
-rw-r--r--library/core/src/char/mod.rs23
-rw-r--r--library/core/src/escape.rs65
-rw-r--r--library/core/src/lib.rs1
5 files changed, 65 insertions, 50 deletions
diff --git a/library/core/src/ascii.rs b/library/core/src/ascii.rs
index 7fd14a7e1ea..ef8e4d098ed 100644
--- a/library/core/src/ascii.rs
+++ b/library/core/src/ascii.rs
@@ -91,7 +91,7 @@ pub struct EscapeDefault(escape::EscapeIterInner<4>);
 /// ```
 #[stable(feature = "rust1", since = "1.0.0")]
 pub fn escape_default(c: u8) -> EscapeDefault {
-    let mut data = [0; 4];
+    let mut data = [Char::Null; 4];
     let range = escape::escape_ascii_into(&mut data, c);
     EscapeDefault(escape::EscapeIterInner::new(data, range))
 }
diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs
index 1dfa9c34db1..515b8d20ead 100644
--- a/library/core/src/char/methods.rs
+++ b/library/core/src/char/methods.rs
@@ -392,13 +392,13 @@ impl char {
     #[inline]
     pub(crate) fn escape_debug_ext(self, args: EscapeDebugExtArgs) -> EscapeDebug {
         match self {
-            '\0' => EscapeDebug::backslash(b'0'),
-            '\t' => EscapeDebug::backslash(b't'),
-            '\r' => EscapeDebug::backslash(b'r'),
-            '\n' => EscapeDebug::backslash(b'n'),
-            '\\' => EscapeDebug::backslash(b'\\'),
-            '"' if args.escape_double_quote => EscapeDebug::backslash(b'"'),
-            '\'' if args.escape_single_quote => EscapeDebug::backslash(b'\''),
+            '\0' => EscapeDebug::backslash(ascii::Char::Digit0),
+            '\t' => EscapeDebug::backslash(ascii::Char::SmallT),
+            '\r' => EscapeDebug::backslash(ascii::Char::SmallR),
+            '\n' => EscapeDebug::backslash(ascii::Char::SmallN),
+            '\\' => EscapeDebug::backslash(ascii::Char::ReverseSolidus),
+            '\"' if args.escape_double_quote => EscapeDebug::backslash(ascii::Char::QuotationMark),
+            '\'' if args.escape_single_quote => EscapeDebug::backslash(ascii::Char::Apostrophe),
             _ if args.escape_grapheme_extended && self.is_grapheme_extended() => {
                 EscapeDebug::from_unicode(self.escape_unicode())
             }
@@ -503,11 +503,11 @@ impl char {
     #[inline]
     pub fn escape_default(self) -> EscapeDefault {
         match self {
-            '\t' => EscapeDefault::backslash(b't'),
-            '\r' => EscapeDefault::backslash(b'r'),
-            '\n' => EscapeDefault::backslash(b'n'),
-            '\\' | '\'' | '"' => EscapeDefault::backslash(self as u8),
-            '\x20'..='\x7e' => EscapeDefault::printable(self as u8),
+            '\t' => EscapeDefault::backslash(ascii::Char::SmallT),
+            '\r' => EscapeDefault::backslash(ascii::Char::SmallR),
+            '\n' => EscapeDefault::backslash(ascii::Char::SmallN),
+            '\\' | '\'' | '"' => EscapeDefault::backslash(self.as_ascii().unwrap()),
+            '\x20'..='\x7e' => EscapeDefault::printable(self.as_ascii().unwrap()),
             _ => EscapeDefault::from_unicode(self.escape_unicode()),
         }
     }
diff --git a/library/core/src/char/mod.rs b/library/core/src/char/mod.rs
index e186db7052c..5c42912874c 100644
--- a/library/core/src/char/mod.rs
+++ b/library/core/src/char/mod.rs
@@ -38,6 +38,7 @@ pub use self::methods::encode_utf16_raw;
 #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
 pub use self::methods::encode_utf8_raw;
 
+use crate::ascii;
 use crate::error::Error;
 use crate::escape;
 use crate::fmt::{self, Write};
@@ -152,7 +153,7 @@ pub struct EscapeUnicode(escape::EscapeIterInner<10>);
 
 impl EscapeUnicode {
     fn new(chr: char) -> Self {
-        let mut data = [0; 10];
+        let mut data = [ascii::Char::Null; 10];
         let range = escape::escape_unicode_into(&mut data, chr);
         Self(escape::EscapeIterInner::new(data, range))
     }
@@ -218,14 +219,14 @@ impl fmt::Display for EscapeUnicode {
 pub struct EscapeDefault(escape::EscapeIterInner<10>);
 
 impl EscapeDefault {
-    fn printable(chr: u8) -> Self {
-        let data = [chr, 0, 0, 0, 0, 0, 0, 0, 0, 0];
-        Self(escape::EscapeIterInner::new(data, 0..1))
+    fn printable(chr: ascii::Char) -> Self {
+        let data = [chr];
+        Self(escape::EscapeIterInner::from_array(data))
     }
 
-    fn backslash(chr: u8) -> Self {
-        let data = [b'\\', chr, 0, 0, 0, 0, 0, 0, 0, 0];
-        Self(escape::EscapeIterInner::new(data, 0..2))
+    fn backslash(chr: ascii::Char) -> Self {
+        let data = [ascii::Char::ReverseSolidus, chr];
+        Self(escape::EscapeIterInner::from_array(data))
     }
 
     fn from_unicode(esc: EscapeUnicode) -> Self {
@@ -307,9 +308,9 @@ impl EscapeDebug {
         Self(EscapeDebugInner::Char(chr))
     }
 
-    fn backslash(chr: u8) -> Self {
-        let data = [b'\\', chr, 0, 0, 0, 0, 0, 0, 0, 0];
-        let iter = escape::EscapeIterInner::new(data, 0..2);
+    fn backslash(chr: ascii::Char) -> Self {
+        let data = [ascii::Char::ReverseSolidus, chr];
+        let iter = escape::EscapeIterInner::from_array(data);
         Self(EscapeDebugInner::Bytes(iter))
     }
 
@@ -318,7 +319,7 @@ impl EscapeDebug {
     }
 
     fn clear(&mut self) {
-        let bytes = escape::EscapeIterInner::new([0; 10], 0..0);
+        let bytes = escape::EscapeIterInner::from_array([]);
         self.0 = EscapeDebugInner::Bytes(bytes);
     }
 }
diff --git a/library/core/src/escape.rs b/library/core/src/escape.rs
index 20ac3cf027f..3d471419bb8 100644
--- a/library/core/src/escape.rs
+++ b/library/core/src/escape.rs
@@ -1,34 +1,41 @@
 //! Helper code for character escaping.
 
+use crate::ascii;
 use crate::num::NonZeroUsize;
 use crate::ops::Range;
 
-const HEX_DIGITS: [u8; 16] = *b"0123456789abcdef";
+const HEX_DIGITS: [ascii::Char; 16] = *b"0123456789abcdef".as_ascii().unwrap();
 
 /// Escapes a byte into provided buffer; returns length of escaped
 /// representation.
-pub(crate) fn escape_ascii_into(output: &mut [u8; 4], byte: u8) -> Range<u8> {
+pub(crate) fn escape_ascii_into(output: &mut [ascii::Char; 4], byte: u8) -> Range<u8> {
+    #[inline]
+    fn backslash(a: ascii::Char) -> ([ascii::Char; 4], u8) {
+        ([ascii::Char::ReverseSolidus, a, ascii::Char::Null, ascii::Char::Null], 2)
+    }
+
     let (data, len) = match byte {
-        b'\t' => ([b'\\', b't', 0, 0], 2),
-        b'\r' => ([b'\\', b'r', 0, 0], 2),
-        b'\n' => ([b'\\', b'n', 0, 0], 2),
-        b'\\' => ([b'\\', b'\\', 0, 0], 2),
-        b'\'' => ([b'\\', b'\'', 0, 0], 2),
-        b'"' => ([b'\\', b'"', 0, 0], 2),
-        b'\x20'..=b'\x7e' => ([byte, 0, 0, 0], 1),
-        _ => {
+        b'\t' => backslash(ascii::Char::SmallT),
+        b'\r' => backslash(ascii::Char::SmallR),
+        b'\n' => backslash(ascii::Char::SmallN),
+        b'\\' => backslash(ascii::Char::ReverseSolidus),
+        b'\'' => backslash(ascii::Char::Apostrophe),
+        b'\"' => backslash(ascii::Char::QuotationMark),
+        _ => if let Some(a) = byte.as_ascii() && !byte.is_ascii_control() {
+            ([a, ascii::Char::Null, ascii::Char::Null, ascii::Char::Null], 1)
+        } else {
             let hi = HEX_DIGITS[usize::from(byte >> 4)];
             let lo = HEX_DIGITS[usize::from(byte & 0xf)];
-            ([b'\\', b'x', hi, lo], 4)
+            ([ascii::Char::ReverseSolidus, ascii::Char::SmallX, hi, lo], 4)
         }
     };
     *output = data;
-    0..(len as u8)
+    0..len
 }
 
 /// Escapes a character into provided buffer using `\u{NNNN}` representation.
-pub(crate) fn escape_unicode_into(output: &mut [u8; 10], ch: char) -> Range<u8> {
-    output[9] = b'}';
+pub(crate) fn escape_unicode_into(output: &mut [ascii::Char; 10], ch: char) -> Range<u8> {
+    output[9] = ascii::Char::RightCurlyBracket;
 
     let ch = ch as u32;
     output[3] = HEX_DIGITS[((ch >> 20) & 15) as usize];
@@ -41,7 +48,8 @@ pub(crate) fn escape_unicode_into(output: &mut [u8; 10], ch: char) -> Range<u8>
     // or-ing 1 ensures that for ch==0 the code computes that one digit should
     // be printed.
     let start = (ch | 1).leading_zeros() as usize / 4 - 2;
-    output[start..start + 3].copy_from_slice(b"\\u{");
+    const UNICODE_ESCAPE_PREFIX: &[ascii::Char; 3] = b"\\u{".as_ascii().unwrap();
+    output[start..][..3].copy_from_slice(UNICODE_ESCAPE_PREFIX);
 
     (start as u8)..10
 }
@@ -52,29 +60,34 @@ pub(crate) fn escape_unicode_into(output: &mut [u8; 10], ch: char) -> Range<u8>
 /// limited to u8 to reduce size of the structure.
 #[derive(Clone, Debug)]
 pub(crate) struct EscapeIterInner<const N: usize> {
-    // Invariant: data[alive] is all ASCII.
-    pub(crate) data: [u8; N],
+    // The element type ensures this is always ASCII, and thus also valid UTF-8.
+    pub(crate) data: [ascii::Char; N],
 
     // Invariant: alive.start <= alive.end <= N.
     pub(crate) alive: Range<u8>,
 }
 
 impl<const N: usize> EscapeIterInner<N> {
-    pub fn new(data: [u8; N], alive: Range<u8>) -> Self {
+    pub fn new(data: [ascii::Char; N], alive: Range<u8>) -> Self {
         const { assert!(N < 256) };
         debug_assert!(alive.start <= alive.end && usize::from(alive.end) <= N, "{alive:?}");
-        let this = Self { data, alive };
-        debug_assert!(this.as_bytes().is_ascii(), "Expected ASCII, got {:?}", this.as_bytes());
-        this
+        Self { data, alive }
+    }
+
+    pub fn from_array<const M: usize>(array: [ascii::Char; M]) -> Self {
+        const { assert!(M <= N) };
+
+        let mut data = [ascii::Char::Null; N];
+        data[..M].copy_from_slice(&array);
+        Self::new(data, 0..M as u8)
     }
 
-    fn as_bytes(&self) -> &[u8] {
+    pub fn as_ascii(&self) -> &[ascii::Char] {
         &self.data[usize::from(self.alive.start)..usize::from(self.alive.end)]
     }
 
     pub fn as_str(&self) -> &str {
-        // SAFETY: self.data[self.alive] is all ASCII characters.
-        unsafe { crate::str::from_utf8_unchecked(self.as_bytes()) }
+        self.as_ascii().as_str()
     }
 
     pub fn len(&self) -> usize {
@@ -82,11 +95,11 @@ impl<const N: usize> EscapeIterInner<N> {
     }
 
     pub fn next(&mut self) -> Option<u8> {
-        self.alive.next().map(|i| self.data[usize::from(i)])
+        self.alive.next().map(|i| self.data[usize::from(i)].as_u8())
     }
 
     pub fn next_back(&mut self) -> Option<u8> {
-        self.alive.next_back().map(|i| self.data[usize::from(i)])
+        self.alive.next_back().map(|i| self.data[usize::from(i)].as_u8())
     }
 
     pub fn advance_by(&mut self, n: usize) -> Result<(), NonZeroUsize> {
diff --git a/library/core/src/lib.rs b/library/core/src/lib.rs
index 0af04fac909..6c419eb16f3 100644
--- a/library/core/src/lib.rs
+++ b/library/core/src/lib.rs
@@ -216,6 +216,7 @@
 #![feature(intra_doc_pointers)]
 #![feature(intrinsics)]
 #![feature(lang_items)]
+#![feature(let_chains)]
 #![feature(link_llvm_intrinsics)]
 #![feature(macro_metavar_expr)]
 #![feature(min_specialization)]