about summary refs log tree commit diff
path: root/compiler/rustc_errors
diff options
context:
space:
mode:
author Matthias Krüger <matthias.krueger@famsik.de> 2024-07-25 04:43:19 +0200
committer GitHub <noreply@github.com> 2024-07-25 04:43:19 +0200
commit cce2db06c0206949c5d19d52472fd480ca2d60c0 (patch)
tree 3c09a9deca25e52b631e67f034c87f5ad16154d1 /compiler/rustc_errors
parent cfc5f25b3d7c2f9fa37d0165085cdd4120939716 (diff)
parent 9bd7680b2e0f1f5680b04fdb2401bad3e082fa38 (diff)
download rust-cce2db06c0206949c5d19d52472fd480ca2d60c0.tar.gz
rust-cce2db06c0206949c5d19d52472fd480ca2d60c0.zip
Rollup merge of #127528 - estebank:ascii-control-chars, r=oli-obk
Replace ASCII control chars with Unicode Control Pictures

Replace ASCII control chars like `CR` with Unicode Control Pictures like `␍`:

```
error: bare CR not allowed in doc-comment
  --> $DIR/lex-bare-cr-string-literal-doc-comment.rs:3:32
   |
LL | /// doc comment with bare CR: '␍'
   |                                ^
```

Centralize the checking of unicode char width for the purposes of CLI display in one place. Account for the new replacements. Remove unneeded tracking of "zero-width" unicode chars, as we calculate these in the `SourceMap` as needed now.
Diffstat (limited to 'compiler/rustc_errors')
-rw-r--r-- compiler/rustc_errors/Cargo.toml 1
-rw-r--r-- compiler/rustc_errors/src/emitter.rs 77
2 files changed, 51 insertions, 27 deletions
diff --git a/compiler/rustc_errors/Cargo.toml b/compiler/rustc_errors/Cargo.toml
index cc114fdcd8c..2fff9f2de50 100644
--- a/compiler/rustc_errors/Cargo.toml
+++ b/compiler/rustc_errors/Cargo.toml
@@ -26,7 +26,6 @@ serde_json = "1.0.59"
 termcolor = "1.2.0"
 termize = "0.1.1"
 tracing = "0.1"
-unicode-width = "0.1.4"
 # tidy-alphabetical-end
 
 [target.'cfg(windows)'.dependencies.windows]
diff --git a/compiler/rustc_errors/src/emitter.rs b/compiler/rustc_errors/src/emitter.rs
index aa47ca16676..58220c65490 100644
--- a/compiler/rustc_errors/src/emitter.rs
+++ b/compiler/rustc_errors/src/emitter.rs
@@ -8,7 +8,7 @@
 //! The output types are defined in `rustc_session::config::ErrorOutputType`.
 
 use rustc_span::source_map::SourceMap;
-use rustc_span::{FileLines, FileName, SourceFile, Span};
+use rustc_span::{char_width, FileLines, FileName, SourceFile, Span};
 
 use crate::snippet::{
     Annotation, AnnotationColumn, AnnotationType, Line, MultilineAnnotation, Style, StyledString,
@@ -677,10 +677,7 @@ impl HumanEmitter {
             .skip(left)
             .take_while(|ch| {
                 // Make sure that the trimming on the right will fall within the terminal width.
-                // FIXME: `unicode_width` sometimes disagrees with terminals on how wide a `char`
-                // is. For now, just accept that sometimes the code line will be longer than
-                // desired.
-                let next = unicode_width::UnicodeWidthChar::width(*ch).unwrap_or(1);
+                let next = char_width(*ch);
                 if taken + next > right - left {
                     return false;
                 }
@@ -742,11 +739,7 @@ impl HumanEmitter {
         let left = margin.left(source_string.len());
 
         // Account for unicode characters of width !=0 that were removed.
-        let left = source_string
-            .chars()
-            .take(left)
-            .map(|ch| unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1))
-            .sum();
+        let left = source_string.chars().take(left).map(|ch| char_width(ch)).sum();
 
         self.draw_line(
             buffer,
@@ -2039,7 +2032,7 @@ impl HumanEmitter {
                     let sub_len: usize =
                         if is_whitespace_addition { &part.snippet } else { part.snippet.trim() }
                             .chars()
-                            .map(|ch| unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1))
+                            .map(|ch| char_width(ch))
                             .sum();
 
                     let offset: isize = offsets
@@ -2076,11 +2069,8 @@ impl HumanEmitter {
                     }
 
                     // length of the code after substitution
-                    let full_sub_len = part
-                        .snippet
-                        .chars()
-                        .map(|ch| unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1))
-                        .sum::<usize>() as isize;
+                    let full_sub_len =
+                        part.snippet.chars().map(|ch| char_width(ch)).sum::<usize>() as isize;
 
                     // length of the code to be substituted
                     let snippet_len = span_end_pos as isize - span_start_pos as isize;
@@ -2568,18 +2558,53 @@ fn num_decimal_digits(num: usize) -> usize {
 }
 
 // We replace some characters so the CLI output is always consistent and underlines aligned.
+// Keep the following list in sync with `rustc_span::char_width`.
 const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
-    ('\t', "    "),   // We do our own tab replacement
+    ('\t', "    "),    // We do our own tab replacement
     ('\u{200D}', ""), // Replace ZWJ with nothing for consistent terminal output of grapheme clusters.
-    ('\u{202A}', ""), // The following unicode text flow control characters are inconsistently
-    ('\u{202B}', ""), // supported across CLIs and can cause confusion due to the bytes on disk
-    ('\u{202D}', ""), // not corresponding to the visible source code, so we replace them always.
-    ('\u{202E}', ""),
-    ('\u{2066}', ""),
-    ('\u{2067}', ""),
-    ('\u{2068}', ""),
-    ('\u{202C}', ""),
-    ('\u{2069}', ""),
+    ('\u{202A}', "�"), // The following unicode text flow control characters are inconsistently
+    ('\u{202B}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk
+    ('\u{202D}', "�"), // not corresponding to the visible source code, so we replace them always.
+    ('\u{202E}', "�"),
+    ('\u{2066}', "�"),
+    ('\u{2067}', "�"),
+    ('\u{2068}', "�"),
+    ('\u{202C}', "�"),
+    ('\u{2069}', "�"),
+    // In terminals without Unicode support the following will be garbled, but in *all* terminals
+    // the underlying codepoint will be as well. We could gate this replacement behind a "unicode
+    // support" gate.
+    ('\u{0000}', "␀"),
+    ('\u{0001}', "␁"),
+    ('\u{0002}', "␂"),
+    ('\u{0003}', "␃"),
+    ('\u{0004}', "␄"),
+    ('\u{0005}', "␅"),
+    ('\u{0006}', "␆"),
+    ('\u{0007}', "␇"),
+    ('\u{0008}', "␈"),
+    ('\u{000B}', "␋"),
+    ('\u{000C}', "␌"),
+    ('\u{000D}', "␍"),
+    ('\u{000E}', "␎"),
+    ('\u{000F}', "␏"),
+    ('\u{0010}', "␐"),
+    ('\u{0011}', "␑"),
+    ('\u{0012}', "␒"),
+    ('\u{0013}', "␓"),
+    ('\u{0014}', "␔"),
+    ('\u{0015}', "␕"),
+    ('\u{0016}', "␖"),
+    ('\u{0017}', "␗"),
+    ('\u{0018}', "␘"),
+    ('\u{0019}', "␙"),
+    ('\u{001A}', "␚"),
+    ('\u{001B}', "␛"),
+    ('\u{001C}', "␜"),
+    ('\u{001D}', "␝"),
+    ('\u{001E}', "␞"),
+    ('\u{001F}', "␟"),
+    ('\u{007F}', "␡"),
 ];
 
 fn normalize_whitespace(str: &str) -> String {