about summary refs log tree commit diff
diff options
context:
space:
mode:
author Matthias Krüger <matthias.krueger@famsik.de> 2024-06-14 12:23:35 +0200
committer GitHub <noreply@github.com> 2024-06-14 12:23:35 +0200
commit 20ca54b6a6bb735ddd0eb0ee8aa70d455e35ba18 (patch)
tree 0c6dbbb13b95185a4d9e5986d97c029d4de2c311
parent d2ad293851dc8e14a61355d0358490b77efae8cb (diff)
parent 7ddc89e893ebb6c60af4fe92c439c4a60c9118dd (diff)
download rust-20ca54b6a6bb735ddd0eb0ee8aa70d455e35ba18.tar.gz
rust-20ca54b6a6bb735ddd0eb0ee8aa70d455e35ba18.zip
Rollup merge of #123769 - dtolnay:literal, r=fee1-dead
Improve escaping of byte, byte str, and c str proc-macro literals

This PR changes the behavior of `proc_macro::Literal::byte_character` (https://github.com/rust-lang/rust/issues/115268), `byte_string`, and `c_string` (https://github.com/rust-lang/rust/issues/119750) to improve their choice of escape sequences. 3 categories of changes are made:

1. Never use `\x00`. Always prefer `\0`, which is supported in all the same places.

2. Never escape `'` inside double quotes or `"` inside single quotes: `\'` is produced only in single-quoted literals and `\"` only in double-quoted literals.

3. Never use `\x` for valid UTF-8 in literals that permit `\u`.

The second commit adds tests covering these cases, asserting the **old** behavior.

The third commit implements the behavior change and simultaneously updates the tests to assert the **new** behavior.
-rw-r--r-- library/proc_macro/src/escape.rs 57
-rw-r--r-- library/proc_macro/src/lib.rs 51
-rw-r--r-- tests/ui/proc-macro/auxiliary/api/literal.rs 83
-rw-r--r-- tests/ui/proc-macro/auxiliary/api/mod.rs 4
-rw-r--r-- tests/ui/proc-macro/auxiliary/api/parse.rs 58
5 files changed, 179 insertions, 74 deletions
diff --git a/library/proc_macro/src/escape.rs b/library/proc_macro/src/escape.rs
new file mode 100644
index 00000000000..87a4d1d50fd
--- /dev/null
+++ b/library/proc_macro/src/escape.rs
@@ -0,0 +1,57 @@
+#[derive(Copy, Clone)]
+pub(crate) struct EscapeOptions {
+    /// Produce \'.
+    pub escape_single_quote: bool,
+    /// Produce \".
+    pub escape_double_quote: bool,
+    /// Produce \x escapes for non-ASCII, and use \x rather than \u for ASCII
+    /// control characters.
+    pub escape_nonascii: bool,
+}
+
+pub(crate) fn escape_bytes(bytes: &[u8], opt: EscapeOptions) -> String {
+    let mut repr = String::new();
+
+    if opt.escape_nonascii {
+        for &byte in bytes {
+            escape_single_byte(byte, opt, &mut repr);
+        }
+    } else {
+        let mut chunks = bytes.utf8_chunks();
+        while let Some(chunk) = chunks.next() {
+            for ch in chunk.valid().chars() {
+                escape_single_char(ch, opt, &mut repr);
+            }
+            for &byte in chunk.invalid() {
+                escape_single_byte(byte, opt, &mut repr);
+            }
+        }
+    }
+
+    repr
+}
+
+fn escape_single_byte(byte: u8, opt: EscapeOptions, repr: &mut String) {
+    if byte == b'\0' {
+        repr.push_str("\\0");
+    } else if (byte == b'\'' && !opt.escape_single_quote)
+        || (byte == b'"' && !opt.escape_double_quote)
+    {
+        repr.push(byte as char);
+    } else {
+        // Escapes \t, \r, \n, \\, \', \", and uses \x## for non-ASCII and
+        // for ASCII control characters.
+        repr.extend(byte.escape_ascii().map(char::from));
+    }
+}
+
+fn escape_single_char(ch: char, opt: EscapeOptions, repr: &mut String) {
+    if (ch == '\'' && !opt.escape_single_quote) || (ch == '"' && !opt.escape_double_quote) {
+        repr.push(ch);
+    } else {
+        // Escapes \0, \t, \r, \n, \\, \', \", and uses \u{...} for
+        // non-printable characters and for Grapheme_Extend characters, which
+        // includes things like U+0300 "Combining Grave Accent".
+        repr.extend(ch.escape_debug());
+    }
+}
diff --git a/library/proc_macro/src/lib.rs b/library/proc_macro/src/lib.rs
index 3d7d36b27e5..581d7e3efe3 100644
--- a/library/proc_macro/src/lib.rs
+++ b/library/proc_macro/src/lib.rs
@@ -43,10 +43,12 @@
 pub mod bridge;
 
 mod diagnostic;
+mod escape;
 
 #[unstable(feature = "proc_macro_diagnostic", issue = "54140")]
 pub use diagnostic::{Diagnostic, Level, MultiSpan};
 
+use crate::escape::{escape_bytes, EscapeOptions};
 use std::ffi::CStr;
 use std::ops::{Range, RangeBounds};
 use std::path::PathBuf;
@@ -1356,40 +1358,61 @@ impl Literal {
     /// String literal.
     #[stable(feature = "proc_macro_lib2", since = "1.29.0")]
     pub fn string(string: &str) -> Literal {
-        let quoted = format!("{:?}", string);
-        assert!(quoted.starts_with('"') && quoted.ends_with('"'));
-        let symbol = &quoted[1..quoted.len() - 1];
-        Literal::new(bridge::LitKind::Str, symbol, None)
+        let escape = EscapeOptions {
+            escape_single_quote: false,
+            escape_double_quote: true,
+            escape_nonascii: false,
+        };
+        let repr = escape_bytes(string.as_bytes(), escape);
+        Literal::new(bridge::LitKind::Str, &repr, None)
     }
 
     /// Character literal.
     #[stable(feature = "proc_macro_lib2", since = "1.29.0")]
     pub fn character(ch: char) -> Literal {
-        let quoted = format!("{:?}", ch);
-        assert!(quoted.starts_with('\'') && quoted.ends_with('\''));
-        let symbol = &quoted[1..quoted.len() - 1];
-        Literal::new(bridge::LitKind::Char, symbol, None)
+        let escape = EscapeOptions {
+            escape_single_quote: true,
+            escape_double_quote: false,
+            escape_nonascii: false,
+        };
+        let repr = escape_bytes(ch.encode_utf8(&mut [0u8; 4]).as_bytes(), escape);
+        Literal::new(bridge::LitKind::Char, &repr, None)
     }
 
     /// Byte character literal.
     #[stable(feature = "proc_macro_byte_character", since = "1.79.0")]
     pub fn byte_character(byte: u8) -> Literal {
-        let string = [byte].escape_ascii().to_string();
-        Literal::new(bridge::LitKind::Byte, &string, None)
+        let escape = EscapeOptions {
+            escape_single_quote: true,
+            escape_double_quote: false,
+            escape_nonascii: true,
+        };
+        let repr = escape_bytes(&[byte], escape);
+        Literal::new(bridge::LitKind::Byte, &repr, None)
     }
 
     /// Byte string literal.
     #[stable(feature = "proc_macro_lib2", since = "1.29.0")]
     pub fn byte_string(bytes: &[u8]) -> Literal {
-        let string = bytes.escape_ascii().to_string();
-        Literal::new(bridge::LitKind::ByteStr, &string, None)
+        let escape = EscapeOptions {
+            escape_single_quote: false,
+            escape_double_quote: true,
+            escape_nonascii: true,
+        };
+        let repr = escape_bytes(bytes, escape);
+        Literal::new(bridge::LitKind::ByteStr, &repr, None)
     }
 
     /// C string literal.
     #[stable(feature = "proc_macro_c_str_literals", since = "1.79.0")]
     pub fn c_string(string: &CStr) -> Literal {
-        let string = string.to_bytes().escape_ascii().to_string();
-        Literal::new(bridge::LitKind::CStr, &string, None)
+        let escape = EscapeOptions {
+            escape_single_quote: false,
+            escape_double_quote: true,
+            escape_nonascii: false,
+        };
+        let repr = escape_bytes(string.to_bytes(), escape);
+        Literal::new(bridge::LitKind::CStr, &repr, None)
     }
 
     /// Returns the span encompassing this literal.
diff --git a/tests/ui/proc-macro/auxiliary/api/literal.rs b/tests/ui/proc-macro/auxiliary/api/literal.rs
new file mode 100644
index 00000000000..7109340bb64
--- /dev/null
+++ b/tests/ui/proc-macro/auxiliary/api/literal.rs
@@ -0,0 +1,83 @@
+// ignore-tidy-linelength
+
+use proc_macro::Literal;
+
+pub fn test() {
+    test_display_literal();
+    test_parse_literal();
+}
+
+fn test_display_literal() {
+    assert_eq!(Literal::isize_unsuffixed(-10).to_string(), "-10");
+    assert_eq!(Literal::isize_suffixed(-10).to_string(), "-10isize");
+    assert_eq!(Literal::f32_unsuffixed(-10.0).to_string(), "-10.0");
+    assert_eq!(Literal::f32_suffixed(-10.0).to_string(), "-10f32");
+    assert_eq!(Literal::f64_unsuffixed(-10.0).to_string(), "-10.0");
+    assert_eq!(Literal::f64_suffixed(-10.0).to_string(), "-10f64");
+    assert_eq!(
+        Literal::f64_unsuffixed(1e100).to_string(),
+        "10000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0",
+    );
+
+    assert_eq!(Literal::string("aA").to_string(), r#"  "aA"  "#.trim());
+    assert_eq!(Literal::string("\t").to_string(), r#"  "\t"  "#.trim());
+    assert_eq!(Literal::string("❤").to_string(), r#"  "❤"  "#.trim());
+    assert_eq!(Literal::string("'").to_string(), r#"  "'"  "#.trim());
+    assert_eq!(Literal::string("\"").to_string(), r#"  "\""  "#.trim());
+    assert_eq!(Literal::string("\0").to_string(), r#"  "\0"  "#.trim());
+    assert_eq!(Literal::string("\u{1}").to_string(), r#"  "\u{1}"  "#.trim());
+
+    assert_eq!(Literal::byte_string(b"aA").to_string(), r#"  b"aA"  "#.trim());
+    assert_eq!(Literal::byte_string(b"\t").to_string(), r#"  b"\t"  "#.trim());
+    assert_eq!(Literal::byte_string(b"'").to_string(), r#"  b"'"  "#.trim());
+    assert_eq!(Literal::byte_string(b"\"").to_string(), r#"  b"\""  "#.trim());
+    assert_eq!(Literal::byte_string(b"\0").to_string(), r#"  b"\0"  "#.trim());
+    assert_eq!(Literal::byte_string(b"\x01").to_string(), r#"  b"\x01"  "#.trim());
+
+    assert_eq!(Literal::c_string(c"aA").to_string(), r#"  c"aA"  "#.trim());
+    assert_eq!(Literal::c_string(c"\t").to_string(), r#"  c"\t"  "#.trim());
+    assert_eq!(Literal::c_string(c"❤").to_string(), r#"  c"❤"  "#.trim());
+    assert_eq!(Literal::c_string(c"\'").to_string(), r#"  c"'"  "#.trim());
+    assert_eq!(Literal::c_string(c"\"").to_string(), r#"  c"\""  "#.trim());
+    assert_eq!(Literal::c_string(c"\x7f\xff\xfe\u{333}").to_string(), r#"  c"\u{7f}\xff\xfe\u{333}"  "#.trim());
+
+    assert_eq!(Literal::character('a').to_string(), r#"  'a'  "#.trim());
+    assert_eq!(Literal::character('\t').to_string(), r#"  '\t'  "#.trim());
+    assert_eq!(Literal::character('❤').to_string(), r#"  '❤'  "#.trim());
+    assert_eq!(Literal::character('\'').to_string(), r#"  '\''  "#.trim());
+    assert_eq!(Literal::character('"').to_string(), r#"  '"'  "#.trim());
+    assert_eq!(Literal::character('\0').to_string(), r#"  '\0'  "#.trim());
+    assert_eq!(Literal::character('\u{1}').to_string(), r#"  '\u{1}'  "#.trim());
+
+    assert_eq!(Literal::byte_character(b'a').to_string(), r#"  b'a'  "#.trim());
+    assert_eq!(Literal::byte_character(b'\t').to_string(), r#"  b'\t'  "#.trim());
+    assert_eq!(Literal::byte_character(b'\'').to_string(), r#"  b'\''  "#.trim());
+    assert_eq!(Literal::byte_character(b'"').to_string(), r#"  b'"'  "#.trim());
+    assert_eq!(Literal::byte_character(0).to_string(), r#"  b'\0'  "#.trim());
+    assert_eq!(Literal::byte_character(1).to_string(), r#"  b'\x01'  "#.trim());
+}
+
+fn test_parse_literal() {
+    assert_eq!("1".parse::<Literal>().unwrap().to_string(), "1");
+    assert_eq!("1.0".parse::<Literal>().unwrap().to_string(), "1.0");
+    assert_eq!("'a'".parse::<Literal>().unwrap().to_string(), "'a'");
+    assert_eq!("b'a'".parse::<Literal>().unwrap().to_string(), "b'a'");
+    assert_eq!("\"\n\"".parse::<Literal>().unwrap().to_string(), "\"\n\"");
+    assert_eq!("b\"\"".parse::<Literal>().unwrap().to_string(), "b\"\"");
+    assert_eq!("c\"\"".parse::<Literal>().unwrap().to_string(), "c\"\"");
+    assert_eq!("r##\"\"##".parse::<Literal>().unwrap().to_string(), "r##\"\"##");
+    assert_eq!("10ulong".parse::<Literal>().unwrap().to_string(), "10ulong");
+    assert_eq!("-10ulong".parse::<Literal>().unwrap().to_string(), "-10ulong");
+
+    assert!("true".parse::<Literal>().is_err());
+    assert!(".8".parse::<Literal>().is_err());
+    assert!("0 1".parse::<Literal>().is_err());
+    assert!("'a".parse::<Literal>().is_err());
+    assert!(" 0".parse::<Literal>().is_err());
+    assert!("0 ".parse::<Literal>().is_err());
+    assert!("/* comment */0".parse::<Literal>().is_err());
+    assert!("0/* comment */".parse::<Literal>().is_err());
+    assert!("0// comment".parse::<Literal>().is_err());
+    assert!("- 10".parse::<Literal>().is_err());
+    assert!("-'x'".parse::<Literal>().is_err());
+}
diff --git a/tests/ui/proc-macro/auxiliary/api/mod.rs b/tests/ui/proc-macro/auxiliary/api/mod.rs
index 45ef6922d28..e0a381cb6c1 100644
--- a/tests/ui/proc-macro/auxiliary/api/mod.rs
+++ b/tests/ui/proc-macro/auxiliary/api/mod.rs
@@ -10,7 +10,7 @@
 extern crate proc_macro;
 
 mod cmp;
-mod parse;
+mod literal;
 
 use proc_macro::TokenStream;
 
@@ -19,7 +19,7 @@ pub fn run(input: TokenStream) -> TokenStream {
     assert!(input.is_empty());
 
     cmp::test();
-    parse::test();
+    literal::test();
 
     TokenStream::new()
 }
diff --git a/tests/ui/proc-macro/auxiliary/api/parse.rs b/tests/ui/proc-macro/auxiliary/api/parse.rs
deleted file mode 100644
index 801c616c804..00000000000
--- a/tests/ui/proc-macro/auxiliary/api/parse.rs
+++ /dev/null
@@ -1,58 +0,0 @@
-// ignore-tidy-linelength
-
-use proc_macro::Literal;
-
-pub fn test() {
-    test_display_literal();
-    test_parse_literal();
-}
-
-fn test_display_literal() {
-    assert_eq!(Literal::isize_unsuffixed(-10).to_string(), "-10");
-    assert_eq!(Literal::isize_suffixed(-10).to_string(), "-10isize");
-    assert_eq!(Literal::f32_unsuffixed(-10.0).to_string(), "-10.0");
-    assert_eq!(Literal::f32_suffixed(-10.0).to_string(), "-10f32");
-    assert_eq!(Literal::f64_unsuffixed(-10.0).to_string(), "-10.0");
-    assert_eq!(Literal::f64_suffixed(-10.0).to_string(), "-10f64");
-    assert_eq!(
-        Literal::f64_unsuffixed(1e100).to_string(),
-        "10000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0",
-    );
-
-    assert_eq!(Literal::string("a \t ❤ ' \" \u{1}").to_string(), "\"a \\t ❤ ' \\\" \\u{1}\"",);
-    assert_eq!(Literal::c_string(c"\'\"\x7f\u{7fff}").to_string(), r#"c"\'\"\x7f\xe7\xbf\xbf""#);
-    assert_eq!(Literal::character('a').to_string(), "'a'");
-    assert_eq!(Literal::character('\t').to_string(), "'\\t'");
-    assert_eq!(Literal::character('❤').to_string(), "'❤'");
-    assert_eq!(Literal::character('\'').to_string(), "'\\''");
-    assert_eq!(Literal::character('"').to_string(), "'\"'");
-    assert_eq!(Literal::character('\u{1}').to_string(), "'\\u{1}'");
-
-    assert_eq!(Literal::byte_character(b'a').to_string(), "b'a'");
-    assert_eq!(Literal::byte_character(0).to_string(), "b'\\x00'");
-}
-
-fn test_parse_literal() {
-    assert_eq!("1".parse::<Literal>().unwrap().to_string(), "1");
-    assert_eq!("1.0".parse::<Literal>().unwrap().to_string(), "1.0");
-    assert_eq!("'a'".parse::<Literal>().unwrap().to_string(), "'a'");
-    assert_eq!("b'a'".parse::<Literal>().unwrap().to_string(), "b'a'");
-    assert_eq!("\"\n\"".parse::<Literal>().unwrap().to_string(), "\"\n\"");
-    assert_eq!("b\"\"".parse::<Literal>().unwrap().to_string(), "b\"\"");
-    assert_eq!("c\"\"".parse::<Literal>().unwrap().to_string(), "c\"\"");
-    assert_eq!("r##\"\"##".parse::<Literal>().unwrap().to_string(), "r##\"\"##");
-    assert_eq!("10ulong".parse::<Literal>().unwrap().to_string(), "10ulong");
-    assert_eq!("-10ulong".parse::<Literal>().unwrap().to_string(), "-10ulong");
-
-    assert!("true".parse::<Literal>().is_err());
-    assert!(".8".parse::<Literal>().is_err());
-    assert!("0 1".parse::<Literal>().is_err());
-    assert!("'a".parse::<Literal>().is_err());
-    assert!(" 0".parse::<Literal>().is_err());
-    assert!("0 ".parse::<Literal>().is_err());
-    assert!("/* comment */0".parse::<Literal>().is_err());
-    assert!("0/* comment */".parse::<Literal>().is_err());
-    assert!("0// comment".parse::<Literal>().is_err());
-    assert!("- 10".parse::<Literal>().is_err());
-    assert!("-'x'".parse::<Literal>().is_err());
-}