diff options
Diffstat (limited to 'compiler/rustc_parse/src/lexer')
| -rw-r--r-- | compiler/rustc_parse/src/lexer/mod.rs | 40 | ||||
| -rw-r--r-- | compiler/rustc_parse/src/lexer/unescape_error_reporting.rs | 20 |
2 files changed, 52 insertions, 8 deletions
diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 1e65cc27154..8e90f73b44e 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -4,7 +4,9 @@ use rustc_ast::tokenstream::{Spacing, TokenStream}; use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError, PResult}; use rustc_lexer::unescape::{self, Mode}; use rustc_lexer::{Base, DocStyle, RawStrError}; -use rustc_session::lint::builtin::RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX; +use rustc_session::lint::builtin::{ + TEXT_DIRECTION_CODEPOINT_IN_COMMENT, RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, +}; use rustc_session::lint::BuiltinLintDiagnostics; use rustc_session::parse::ParseSess; use rustc_span::symbol::{sym, Symbol}; @@ -129,6 +131,28 @@ impl<'a> StringReader<'a> { .struct_span_fatal(self.mk_sp(from_pos, to_pos), &format!("{}: {}", m, escaped_char(c))) } + /// Detect usages of Unicode codepoints changing the direction of the text on screen and loudly + /// complain about it. + fn lint_unicode_text_flow(&self, start: BytePos) { + // Opening delimiter of the length 2 is not included into the comment text. + let content_start = start + BytePos(2); + let content = self.str_from(content_start); + let span = self.mk_sp(start, self.pos); + const UNICODE_TEXT_FLOW_CHARS: &[char] = &[ + '\u{202A}', '\u{202B}', '\u{202D}', '\u{202E}', '\u{2066}', '\u{2067}', '\u{2068}', + '\u{202C}', '\u{2069}', + ]; + if content.contains(UNICODE_TEXT_FLOW_CHARS) { + self.sess.buffer_lint_with_diagnostic( + &TEXT_DIRECTION_CODEPOINT_IN_COMMENT, + span, + ast::CRATE_NODE_ID, + "unicode codepoint changing visible direction of text present in comment", + BuiltinLintDiagnostics::UnicodeTextFlow(span, content.to_string()), + ); + } + } + /// Turns simple `rustc_lexer::TokenKind` enum into a rich /// `rustc_ast::TokenKind`. This turns strings into interned /// symbols and runs additional validation. @@ -136,7 +160,12 @@ impl<'a> StringReader<'a> { Some(match token { rustc_lexer::TokenKind::LineComment { doc_style } => { // Skip non-doc comments - let doc_style = doc_style?; + let doc_style = if let Some(doc_style) = doc_style { + doc_style + } else { + self.lint_unicode_text_flow(start); + return None; + }; // Opening delimiter of the length 3 is not included into the symbol. let content_start = start + BytePos(3); @@ -158,7 +187,12 @@ impl<'a> StringReader<'a> { } // Skip non-doc comments - let doc_style = doc_style?; + let doc_style = if let Some(doc_style) = doc_style { + doc_style + } else { + self.lint_unicode_text_flow(start); + return None; + }; // Opening delimiter of the length 3 and closing delimiter of the length 2 // are not included into the symbol. diff --git a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs index aa6b424ce2b..e26d094a0e2 100644 --- a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs +++ b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs @@ -154,12 +154,17 @@ pub(crate) fn emit_unescape_error( assert!(mode.is_bytes()); let (c, span) = last_char(); let mut err = handler.struct_span_err(span, "non-ASCII character in byte constant"); - err.span_label(span, "byte constant must be ASCII"); + let postfix = if unicode_width::UnicodeWidthChar::width(c).unwrap_or(1) == 0 { + format!(" but is {:?}", c) + } else { + String::new() + }; + err.span_label(span, &format!("byte constant must be ASCII{}", postfix)); if (c as u32) <= 0xFF { err.span_suggestion( span, &format!( - "if you meant to use the unicode code point for '{}', use a \\xHH escape", + "if you meant to use the unicode code point for {:?}, use a \\xHH escape", c ), format!("\\x{:X}", c as u32), @@ -173,7 +178,7 @@ pub(crate) fn emit_unescape_error( err.span_suggestion( span, &format!( - "if you meant to use the UTF-8 encoding of '{}', use \\xHH escapes", + "if you meant to use the UTF-8 encoding of {:?}, use \\xHH escapes", c ), utf8.as_bytes() @@ -187,10 +192,15 @@ pub(crate) fn emit_unescape_error( } EscapeError::NonAsciiCharInByteString => { assert!(mode.is_bytes()); - let (_c, span) = last_char(); + let (c, span) = last_char(); + let postfix = if unicode_width::UnicodeWidthChar::width(c).unwrap_or(1) == 0 { + format!(" but is {:?}", c) + } else { + String::new() + }; handler .struct_span_err(span, "raw byte string must be ASCII") - .span_label(span, "must be ASCII") + .span_label(span, &format!("must be ASCII{}", postfix)) .emit(); } EscapeError::OutOfRangeHexEscape => { |
