about summary refs log tree commit diff
diff options
context:
space:
mode:
authorEsteban Küber <esteban@kuber.com.ar>2024-07-18 20:02:08 +0000
committerEsteban Küber <esteban@kuber.com.ar>2024-07-18 20:08:43 +0000
commit9dffe9573be86ac7302fb2f27219ef8f9c78dea7 (patch)
treeaf2758a3014b0ca805c82298b4bdcc0698388cb6
parent2d7795dfb942d49d447837ef43db89216de15696 (diff)
downloadrust-9dffe9573be86ac7302fb2f27219ef8f9c78dea7.tar.gz
rust-9dffe9573be86ac7302fb2f27219ef8f9c78dea7.zip
Make unicode text flow control chars visible as �
We already point these out quite aggressively, telling people not to use them, but would normally be rendered as nothing. Having them visible will make it easier for people to actually deal with them.

```
error: unicode codepoint changing visible direction of text present in literal
  --> $DIR/unicode-control-codepoints.rs:26:22
   |
LL |     println!("{:?}", '�');
   |                      ^-^
   |                      ||
   |                      |'\u{202e}'
   |                      this literal contains an invisible unicode text flow control codepoint
   |
   = note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on the screen
   = help: if their presence wasn't intentional, you can remove them
help: if you want to keep them but make them visible in your source code, you can escape them
   |
LL |     println!("{:?}", '\u{202e}');
   |                       ~~~~~~~~
```

vs the previous

```
error: unicode codepoint changing visible direction of text present in literal
  --> $DIR/unicode-control-codepoints.rs:26:22
   |
LL |     println!("{:?}", '');
   |                      ^-
   |                      ||
   |                      |'\u{202e}'
   |                      this literal contains an invisible unicode text flow control codepoint
   |
   = note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on the screen
   = help: if their presence wasn't intentional, you can remove them
help: if you want to keep them but make them visible in your source code, you can escape them
   |
LL |     println!("{:?}", '\u{202e}');
   |                       ~~~~~~~~
```
-rw-r--r--compiler/rustc_errors/src/emitter.rs21
-rw-r--r--compiler/rustc_span/src/lib.rs3
-rw-r--r--tests/ui/parser/unicode-control-codepoints.stderr98
3 files changed, 62 insertions, 60 deletions
diff --git a/compiler/rustc_errors/src/emitter.rs b/compiler/rustc_errors/src/emitter.rs
index 16fa0ff7a2d..58220c65490 100644
--- a/compiler/rustc_errors/src/emitter.rs
+++ b/compiler/rustc_errors/src/emitter.rs
@@ -2558,18 +2558,19 @@ fn num_decimal_digits(num: usize) -> usize {
 }
 
 // We replace some characters so the CLI output is always consistent and underlines aligned.
+// Keep the following list in sync with `rustc_span::char_width`.
 const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
-    ('\t', "    "),   // We do our own tab replacement
+    ('\t', "    "),    // We do our own tab replacement
     ('\u{200D}', ""), // Replace ZWJ with nothing for consistent terminal output of grapheme clusters.
-    ('\u{202A}', ""), // The following unicode text flow control characters are inconsistently
-    ('\u{202B}', ""), // supported across CLIs and can cause confusion due to the bytes on disk
-    ('\u{202D}', ""), // not corresponding to the visible source code, so we replace them always.
-    ('\u{202E}', ""),
-    ('\u{2066}', ""),
-    ('\u{2067}', ""),
-    ('\u{2068}', ""),
-    ('\u{202C}', ""),
-    ('\u{2069}', ""),
+    ('\u{202A}', "�"), // The following unicode text flow control characters are inconsistently
+    ('\u{202B}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk
+    ('\u{202D}', "�"), // not corresponding to the visible source code, so we replace them always.
+    ('\u{202E}', "�"),
+    ('\u{2066}', "�"),
+    ('\u{2067}', "�"),
+    ('\u{2068}', "�"),
+    ('\u{202C}', "�"),
+    ('\u{2069}', "�"),
     // In terminals without Unicode support the following will be garbled, but in *all* terminals
     // the underlying codepoint will be as well. We could gate this replacement behind a "unicode
     // support" gate.
diff --git a/compiler/rustc_span/src/lib.rs b/compiler/rustc_span/src/lib.rs
index ea57c1ce818..7c8ac3be4be 100644
--- a/compiler/rustc_span/src/lib.rs
+++ b/compiler/rustc_span/src/lib.rs
@@ -2093,7 +2093,8 @@ pub fn char_width(ch: char) -> usize {
         | '\u{000E}' | '\u{000F}' | '\u{0010}' | '\u{0011}' | '\u{0012}' | '\u{0013}'
         | '\u{0014}' | '\u{0015}' | '\u{0016}' | '\u{0017}' | '\u{0018}' | '\u{0019}'
         | '\u{001A}' | '\u{001B}' | '\u{001C}' | '\u{001D}' | '\u{001E}' | '\u{001F}'
-        | '\u{007F}' => 1,
+        | '\u{007F}' | '\u{202A}' | '\u{202B}' | '\u{202D}' | '\u{202E}' | '\u{2066}'
+        | '\u{2067}' | '\u{2068}' | '\u{202C}' | '\u{2069}' => 1,
         _ => unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1),
     }
 }
diff --git a/tests/ui/parser/unicode-control-codepoints.stderr b/tests/ui/parser/unicode-control-codepoints.stderr
index fc071a94191..28de4ae72ab 100644
--- a/tests/ui/parser/unicode-control-codepoints.stderr
+++ b/tests/ui/parser/unicode-control-codepoints.stderr
@@ -17,78 +17,78 @@ LL |     println!("{:?}", b"us\u{202B}e\u{202A}r");
 error: non-ASCII character in byte string literal
   --> $DIR/unicode-control-codepoints.rs:16:26
    |
-LL |     println!("{:?}", b"/* } if isAdmin  begin admins only ");
+LL |     println!("{:?}", b"/*� } �if isAdmin� � begin admins only ");
    |                          ^ must be ASCII but is '\u{202e}'
    |
 help: if you meant to use the UTF-8 encoding of '\u{202e}', use \xHH escapes
    |
-LL |     println!("{:?}", b"/*\xE2\x80\xAE } if isAdmin  begin admins only ");
+LL |     println!("{:?}", b"/*\xE2\x80\xAE } �if isAdmin� � begin admins only ");
    |                          ~~~~~~~~~~~~
 
 error: non-ASCII character in byte string literal
   --> $DIR/unicode-control-codepoints.rs:16:30
    |
-LL |     println!("{:?}", b"/* } if isAdmin  begin admins only ");
-   |                             ^ must be ASCII but is '\u{2066}'
+LL |     println!("{:?}", b"/*� } �if isAdmin� � begin admins only ");
+   |                              ^ must be ASCII but is '\u{2066}'
    |
 help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes
    |
-LL |     println!("{:?}", b"/* } \xE2\x81\xA6if isAdmin  begin admins only ");
-   |                             ~~~~~~~~~~~~
+LL |     println!("{:?}", b"/*� } \xE2\x81\xA6if isAdmin� � begin admins only ");
+   |                              ~~~~~~~~~~~~
 
 error: non-ASCII character in byte string literal
   --> $DIR/unicode-control-codepoints.rs:16:41
    |
-LL |     println!("{:?}", b"/* } if isAdmin  begin admins only ");
-   |                                       ^ must be ASCII but is '\u{2069}'
+LL |     println!("{:?}", b"/*� } �if isAdmin� � begin admins only ");
+   |                                         ^ must be ASCII but is '\u{2069}'
    |
 help: if you meant to use the UTF-8 encoding of '\u{2069}', use \xHH escapes
    |
-LL |     println!("{:?}", b"/* } if isAdmin\xE2\x81\xA9  begin admins only ");
-   |                                       ~~~~~~~~~~~~
+LL |     println!("{:?}", b"/*� } �if isAdmin\xE2\x81\xA9 � begin admins only ");
+   |                                         ~~~~~~~~~~~~
 
 error: non-ASCII character in byte string literal
   --> $DIR/unicode-control-codepoints.rs:16:43
    |
-LL |     println!("{:?}", b"/* } if isAdmin  begin admins only ");
-   |                                        ^ must be ASCII but is '\u{2066}'
+LL |     println!("{:?}", b"/*� } �if isAdmin� � begin admins only ");
+   |                                           ^ must be ASCII but is '\u{2066}'
    |
 help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes
    |
-LL |     println!("{:?}", b"/* } if isAdmin \xE2\x81\xA6 begin admins only ");
-   |                                        ~~~~~~~~~~~~
+LL |     println!("{:?}", b"/*� } �if isAdmin� \xE2\x81\xA6 begin admins only ");
+   |                                           ~~~~~~~~~~~~
 
 error: non-ASCII character in raw byte string literal
   --> $DIR/unicode-control-codepoints.rs:21:29
    |
-LL |     println!("{:?}", br##"/* } if isAdmin  begin admins only "##);
+LL |     println!("{:?}", br##"/*� } �if isAdmin� � begin admins only "##);
    |                             ^ must be ASCII but is '\u{202e}'
 
 error: non-ASCII character in raw byte string literal
   --> $DIR/unicode-control-codepoints.rs:21:33
    |
-LL |     println!("{:?}", br##"/* } if isAdmin  begin admins only "##);
-   |                                ^ must be ASCII but is '\u{2066}'
+LL |     println!("{:?}", br##"/*� } �if isAdmin� � begin admins only "##);
+   |                                 ^ must be ASCII but is '\u{2066}'
 
 error: non-ASCII character in raw byte string literal
   --> $DIR/unicode-control-codepoints.rs:21:44
    |
-LL |     println!("{:?}", br##"/* } if isAdmin  begin admins only "##);
-   |                                          ^ must be ASCII but is '\u{2069}'
+LL |     println!("{:?}", br##"/*� } �if isAdmin� � begin admins only "##);
+   |                                            ^ must be ASCII but is '\u{2069}'
 
 error: non-ASCII character in raw byte string literal
   --> $DIR/unicode-control-codepoints.rs:21:46
    |
-LL |     println!("{:?}", br##"/* } if isAdmin  begin admins only "##);
-   |                                           ^ must be ASCII but is '\u{2066}'
+LL |     println!("{:?}", br##"/*� } �if isAdmin� � begin admins only "##);
+   |                                              ^ must be ASCII but is '\u{2066}'
 
 error: unicode codepoint changing visible direction of text present in comment
   --> $DIR/unicode-control-codepoints.rs:2:5
    |
-LL |     // if access_level != "user" { // Check if admin
-   |     ^^^^^^^^^^^^^^^^^^^^^^^^^--^^^^^^^^^^^^^^^^^^^^^
-   |     |                        ||
-   |     |                        |'\u{202a}'
+LL |     // if access_level != "us�e�r" { // Check if admin
+   |     ^^^^^^^^^^^^^^^^^^^^^^^^^-^-^^^^^^^^^^^^^^^^^^^^^^
+   |     |                        | |
+   |     |                        | '\u{202a}'
    |     |                        '\u{202b}'
    |     this comment contains invisible unicode text flow control codepoints
    |
@@ -99,12 +99,12 @@ LL |     // if access_level != "user" { // Check if admin
 error: unicode codepoint changing visible direction of text present in comment
   --> $DIR/unicode-control-codepoints.rs:30:1
    |
-LL | //"/* } if isAdmin  begin admins only */"
-   | ^^^^^-^^-^^^^^^^^^--^^^^^^^^^^^^^^^^^^^^^
-   | |    |  |         ||
-   | |    |  |         |'\u{2066}'
-   | |    |  |         '\u{2069}'
-   | |    |  '\u{2066}'
+LL | //"/*� } �if isAdmin� � begin admins only */"
+   | ^^^^^-^^^-^^^^^^^^^^-^-^^^^^^^^^^^^^^^^^^^^^^
+   | |    |   |          | |
+   | |    |   |          | '\u{2066}'
+   | |    |   |          '\u{2069}'
+   | |    |   '\u{2066}'
    | |    '\u{202e}'
    | this comment contains invisible unicode text flow control codepoints
    |
@@ -114,12 +114,12 @@ LL | //"/* } if isAdmin  begin admins only */"
 error: unicode codepoint changing visible direction of text present in literal
   --> $DIR/unicode-control-codepoints.rs:11:22
    |
-LL |     println!("{:?}", "/* } if isAdmin  begin admins only ");
-   |                      ^^^-^^-^^^^^^^^^--^^^^^^^^^^^^^^^^^^^
-   |                      |  |  |         ||
-   |                      |  |  |         |'\u{2066}'
-   |                      |  |  |         '\u{2069}'
-   |                      |  |  '\u{2066}'
+LL |     println!("{:?}", "/*� } �if isAdmin� � begin admins only ");
+   |                      ^^^-^^^-^^^^^^^^^^-^-^^^^^^^^^^^^^^^^^^^^
+   |                      |  |   |          | |
+   |                      |  |   |          | '\u{2066}'
+   |                      |  |   |          '\u{2069}'
+   |                      |  |   '\u{2066}'
    |                      |  '\u{202e}'
    |                      this literal contains invisible unicode text flow control codepoints
    |
@@ -134,12 +134,12 @@ LL |     println!("{:?}", "/*\u{202e} } \u{2066}if isAdmin\u{2069} \u{2066} begi
 error: unicode codepoint changing visible direction of text present in literal
   --> $DIR/unicode-control-codepoints.rs:14:22
    |
-LL |     println!("{:?}", r##"/* } if isAdmin  begin admins only "##);
-   |                      ^^^^^^-^^-^^^^^^^^^--^^^^^^^^^^^^^^^^^^^^^
-   |                      |     |  |         ||
-   |                      |     |  |         |'\u{2066}'
-   |                      |     |  |         '\u{2069}'
-   |                      |     |  '\u{2066}'
+LL |     println!("{:?}", r##"/*� } �if isAdmin� � begin admins only "##);
+   |                      ^^^^^^-^^^-^^^^^^^^^^-^-^^^^^^^^^^^^^^^^^^^^^^
+   |                      |     |   |          | |
+   |                      |     |   |          | '\u{2066}'
+   |                      |     |   |          '\u{2069}'
+   |                      |     |   '\u{2066}'
    |                      |     '\u{202e}'
    |                      this literal contains invisible unicode text flow control codepoints
    |
@@ -153,8 +153,8 @@ LL |     println!("{:?}", r##"/*\u{202e} } \u{2066}if isAdmin\u{2069} \u{2066} b
 error: unicode codepoint changing visible direction of text present in literal
   --> $DIR/unicode-control-codepoints.rs:26:22
    |
-LL |     println!("{:?}", '');
-   |                      ^-
+LL |     println!("{:?}", '�');
+   |                      ^-^
    |                      ||
    |                      |'\u{202e}'
    |                      this literal contains an invisible unicode text flow control codepoint
@@ -169,8 +169,8 @@ LL |     println!("{:?}", '\u{202e}');
 error: unicode codepoint changing visible direction of text present in doc comment
   --> $DIR/unicode-control-codepoints.rs:33:1
    |
-LL | /**  ''); */fn foo() {}
-   | ^^^^^^^^^^^^ this doc comment contains an invisible unicode text flow control codepoint
+LL | /**  '�'); */fn foo() {}
+   | ^^^^^^^^^^^^^ this doc comment contains an invisible unicode text flow control codepoint
    |
    = note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on the screen
    = note: if their presence wasn't intentional, you can remove them
@@ -181,8 +181,8 @@ error: unicode codepoint changing visible direction of text present in doc comme
    |
 LL | / /**
 LL | |  *
-LL | |  *  ''); */fn bar() {}
-   | |___________^ this doc comment contains an invisible unicode text flow control codepoint
+LL | |  *  '�'); */fn bar() {}
+   | |____________^ this doc comment contains an invisible unicode text flow control codepoint
    |
    = note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on the screen
    = note: if their presence wasn't intentional, you can remove them