Change output normalization logic to be linear against size of output

Scan strings to be normalized for printing in a linear scan and collect the resulting `String` only once. Use a binary search when looking for chars to be replaced, instead of a `HashMap::get`.
author: Esteban Küber <esteban@kuber.com.ar> 2024-07-25 18:38:28 +0000
committer: Esteban Küber <esteban@kuber.com.ar> 2024-07-29 18:40:58 +0000
commit: 6d9ee6ee26c296d9361d5f0fb479b2462687fe4e (patch)
tree: ee792787028161e6d1554df8dc7ed38e951081a6 /compiler/rustc_errors/src
parent: 4db3d12e6f395babed53dee1d209a5c8699a5ae6 (diff)
download: rust-6d9ee6ee26c296d9361d5f0fb479b2462687fe4e.tar.gz
rust-6d9ee6ee26c296d9361d5f0fb479b2462687fe4e.zip
1 files changed, 52 insertions, 46 deletions
diff --git a/compiler/rustc_errors/src/emitter.rs b/compiler/rustc_errors/src/emitter.rs
index 73908e58085..d186996040b 100644
--- a/compiler/rustc_errors/src/emitter.rs
+++ b/compiler/rustc_errors/src/emitter.rs
@@ -16,6 +16,7 @@ use std::iter;
 use std::path::Path;
 
 use derive_setters::Setters;
+use either::Either;
 use rustc_data_structures::fx::{FxHashMap, FxIndexMap, FxIndexSet};
 use rustc_data_structures::sync::{DynSend, IntoDynSyncSend, Lrc};
 use rustc_error_messages::{FluentArgs, SpanLabel};
@@ -2559,60 +2560,65 @@ fn num_decimal_digits(num: usize) -> usize {
 
 // We replace some characters so the CLI output is always consistent and underlines aligned.
 // Keep the following list in sync with `rustc_span::char_width`.
+// ATTENTION: keep lexicographically sorted so that the binary search will work
 const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
-    ('\t', "    "),    // We do our own tab replacement
-    ('\u{200D}', ""), // Replace ZWJ with nothing for consistent terminal output of grapheme clusters.
-    ('\u{202A}', "�"), // The following unicode text flow control characters are inconsistently
-    ('\u{202B}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk
-    ('\u{202D}', "�"), // not corresponding to the visible source code, so we replace them always.
-    ('\u{202E}', "�"),
+    // In terminals without Unicode support the following will be garbled, but in *all* terminals
+    // the underlying codepoint will be as well. We could gate this replacement behind a "unicode
+    // support" gate.
+    ('\0', "␀"),
+    ('\u{1}', "␁"),
+    ('\u{2}', "␂"),
+    ('\u{3}', "␃"),
+    ('\u{4}', "␄"),
+    ('\u{5}', "␅"),
+    ('\u{6}', "␆"),
+    ('\u{7}', "␇"),
+    ('\u{8}', "␈"),
+    ('\t', "    "), // We do our own tab replacement
+    ('\u{b}', "␋"),
+    ('\u{c}', "␌"),
+    ('\r', "␍"),
+    ('\u{e}', "␎"),
+    ('\u{f}', "␏"),
+    ('\u{10}', "␐"),
+    ('\u{11}', "␑"),
+    ('\u{12}', "␒"),
+    ('\u{13}', "␓"),
+    ('\u{14}', "␔"),
+    ('\u{15}', "␕"),
+    ('\u{16}', "␖"),
+    ('\u{17}', "␗"),
+    ('\u{18}', "␘"),
+    ('\u{19}', "␙"),
+    ('\u{1a}', "␚"),
+    ('\u{1b}', "␛"),
+    ('\u{1c}', "␜"),
+    ('\u{1d}', "␝"),
+    ('\u{1e}', "␞"),
+    ('\u{1f}', "␟"),
+    ('\u{7f}', "␡"),
+    ('\u{200d}', ""), // Replace ZWJ for consistent terminal output of grapheme clusters.
+    ('\u{202a}', "�"), // The following unicode text flow control characters are inconsistently
+    ('\u{202b}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk
+    ('\u{202c}', "�"), // not corresponding to the visible source code, so we replace them always.
+    ('\u{202d}', "�"),
+    ('\u{202e}', "�"),
     ('\u{2066}', "�"),
     ('\u{2067}', "�"),
     ('\u{2068}', "�"),
-    ('\u{202C}', "�"),
     ('\u{2069}', "�"),
-    // In terminals without Unicode support the following will be garbled, but in *all* terminals
-    // the underlying codepoint will be as well. We could gate this replacement behind a "unicode
-    // support" gate.
-    ('\u{0000}', "␀"),
-    ('\u{0001}', "␁"),
-    ('\u{0002}', "␂"),
-    ('\u{0003}', "␃"),
-    ('\u{0004}', "␄"),
-    ('\u{0005}', "␅"),
-    ('\u{0006}', "␆"),
-    ('\u{0007}', "␇"),
-    ('\u{0008}', "␈"),
-    ('\u{000B}', "␋"),
-    ('\u{000C}', "␌"),
-    ('\u{000D}', "␍"),
-    ('\u{000E}', "␎"),
-    ('\u{000F}', "␏"),
-    ('\u{0010}', "␐"),
-    ('\u{0011}', "␑"),
-    ('\u{0012}', "␒"),
-    ('\u{0013}', "␓"),
-    ('\u{0014}', "␔"),
-    ('\u{0015}', "␕"),
-    ('\u{0016}', "␖"),
-    ('\u{0017}', "␗"),
-    ('\u{0018}', "␘"),
-    ('\u{0019}', "␙"),
-    ('\u{001A}', "␚"),
-    ('\u{001B}', "␛"),
-    ('\u{001C}', "␜"),
-    ('\u{001D}', "␝"),
-    ('\u{001E}', "␞"),
-    ('\u{001F}', "␟"),
-    ('\u{007F}', "␡"),
 ];
 
 fn normalize_whitespace(str: &str) -> String {
-    let mut s = str.to_string();
-    for (c, replacement) in OUTPUT_REPLACEMENTS {
-        s = s.replace(*c, replacement);
-    }
-    s
+    // Scan the input string for a character in the ordered table above. If it's present, replace
+    // it with its alternative string (it can be more than 1 char!). Otherwise, retain the input
+    // char. At the end, allocate all chars into a string in one operation.
+    str.chars()
+        .flat_map(|c| match OUTPUT_REPLACEMENTS.binary_search_by_key(&c, |(k, _)| *k) {
+            Ok(i) => Either::Left(OUTPUT_REPLACEMENTS[i].1.chars()),
+            _ => Either::Right([c].into_iter()),
+        })
+        .collect()
 }
 
 fn draw_col_separator(buffer: &mut StyledBuffer, line: usize, col: usize) {
author	Esteban Küber <esteban@kuber.com.ar>	2024-07-25 18:38:28 +0000
committer	Esteban Küber <esteban@kuber.com.ar>	2024-07-29 18:40:58 +0000
commit	6d9ee6ee26c296d9361d5f0fb479b2462687fe4e (patch)
tree	ee792787028161e6d1554df8dc7ed38e951081a6 /compiler/rustc_errors/src
parent	4db3d12e6f395babed53dee1d209a5c8699a5ae6 (diff)
download	rust-6d9ee6ee26c296d9361d5f0fb479b2462687fe4e.tar.gz rust-6d9ee6ee26c296d9361d5f0fb479b2462687fe4e.zip