diff options
| author | Mazdak Farrokhzad <twingoow@gmail.com> | 2019-10-15 22:48:13 +0200 |
|---|---|---|
| committer | Mazdak Farrokhzad <twingoow@gmail.com> | 2019-11-10 03:57:18 +0100 |
| commit | 4ae2728fa8052915414127dce28245eb8f70842a (patch) | |
| tree | 27cc54d90904091e4dc9bf7ae5fa3b41be4b6187 /src/libsyntax/parse/lexer | |
| parent | be023ebe850261c6bb202a02a686827d821c3697 (diff) | |
| download | rust-4ae2728fa8052915414127dce28245eb8f70842a.tar.gz rust-4ae2728fa8052915414127dce28245eb8f70842a.zip | |
move syntax::parse -> librustc_parse
also move MACRO_ARGUMENTS -> librustc_parse
Diffstat (limited to 'src/libsyntax/parse/lexer')
| -rw-r--r-- | src/libsyntax/parse/lexer/mod.rs | 643 | ||||
| -rw-r--r-- | src/libsyntax/parse/lexer/tokentrees.rs | 280 | ||||
| -rw-r--r-- | src/libsyntax/parse/lexer/unescape_error_reporting.rs | 215 | ||||
| -rw-r--r-- | src/libsyntax/parse/lexer/unicode_chars.rs | 392 |
4 files changed, 0 insertions, 1530 deletions
diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs deleted file mode 100644 index f2d5ff3440e..00000000000 --- a/src/libsyntax/parse/lexer/mod.rs +++ /dev/null @@ -1,643 +0,0 @@ -use crate::token::{self, Token, TokenKind}; -use crate::sess::ParseSess; -use crate::symbol::{sym, Symbol}; -use crate::util::comments; - -use errors::{FatalError, DiagnosticBuilder}; -use syntax_pos::{BytePos, Pos, Span}; -use rustc_lexer::Base; -use rustc_lexer::unescape; - -use std::char; -use std::convert::TryInto; -use rustc_data_structures::sync::Lrc; -use log::debug; - -mod tokentrees; -mod unicode_chars; -mod unescape_error_reporting; -use unescape_error_reporting::{emit_unescape_error, push_escaped_char}; - -#[derive(Clone, Debug)] -pub struct UnmatchedBrace { - pub expected_delim: token::DelimToken, - pub found_delim: Option<token::DelimToken>, - pub found_span: Span, - pub unclosed_span: Option<Span>, - pub candidate_span: Option<Span>, -} - -pub struct StringReader<'a> { - sess: &'a ParseSess, - /// Initial position, read-only. - start_pos: BytePos, - /// The absolute offset within the source_map of the current character. - // FIXME(#64197): `pub` is needed by tests for now. - pub pos: BytePos, - /// Stop reading src at this index. - end_src_index: usize, - /// Source text to tokenize. - src: Lrc<String>, - override_span: Option<Span>, -} - -impl<'a> StringReader<'a> { - pub fn new(sess: &'a ParseSess, - source_file: Lrc<syntax_pos::SourceFile>, - override_span: Option<Span>) -> Self { - if source_file.src.is_none() { - sess.span_diagnostic.bug(&format!("cannot lex `source_file` without source: {}", - source_file.name)); - } - - let src = (*source_file.src.as_ref().unwrap()).clone(); - - StringReader { - sess, - start_pos: source_file.start_pos, - pos: source_file.start_pos, - end_src_index: src.len(), - src, - override_span, - } - } - - pub fn retokenize(sess: &'a ParseSess, mut span: Span) -> Self { - let begin = sess.source_map().lookup_byte_offset(span.lo()); - let end = sess.source_map().lookup_byte_offset(span.hi()); - - // Make the range zero-length if the span is invalid. - if begin.sf.start_pos != end.sf.start_pos { - span = span.shrink_to_lo(); - } - - let mut sr = StringReader::new(sess, begin.sf, None); - - // Seek the lexer to the right byte range. - sr.end_src_index = sr.src_index(span.hi()); - - sr - } - - - fn mk_sp(&self, lo: BytePos, hi: BytePos) -> Span { - self.override_span.unwrap_or_else(|| Span::with_root_ctxt(lo, hi)) - } - - /// Returns the next token, including trivia like whitespace or comments. - /// - /// `Err(())` means that some errors were encountered, which can be - /// retrieved using `buffer_fatal_errors`. - pub fn next_token(&mut self) -> Token { - let start_src_index = self.src_index(self.pos); - let text: &str = &self.src[start_src_index..self.end_src_index]; - - if text.is_empty() { - let span = self.mk_sp(self.pos, self.pos); - return Token::new(token::Eof, span); - } - - { - let is_beginning_of_file = self.pos == self.start_pos; - if is_beginning_of_file { - if let Some(shebang_len) = rustc_lexer::strip_shebang(text) { - let start = self.pos; - self.pos = self.pos + BytePos::from_usize(shebang_len); - - let sym = self.symbol_from(start + BytePos::from_usize("#!".len())); - let kind = token::Shebang(sym); - - let span = self.mk_sp(start, self.pos); - return Token::new(kind, span); - } - } - } - - let token = rustc_lexer::first_token(text); - - let start = self.pos; - self.pos = self.pos + BytePos::from_usize(token.len); - - debug!("try_next_token: {:?}({:?})", token.kind, self.str_from(start)); - - // This could use `?`, but that makes code significantly (10-20%) slower. - // https://github.com/rust-lang/rust/issues/37939 - let kind = self.cook_lexer_token(token.kind, start); - - let span = self.mk_sp(start, self.pos); - Token::new(kind, span) - } - - /// Report a fatal lexical error with a given span. - fn fatal_span(&self, sp: Span, m: &str) -> FatalError { - self.sess.span_diagnostic.span_fatal(sp, m) - } - - /// Report a lexical error with a given span. - fn err_span(&self, sp: Span, m: &str) { - self.sess.span_diagnostic.struct_span_err(sp, m).emit(); - } - - - /// Report a fatal error spanning [`from_pos`, `to_pos`). - fn fatal_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) -> FatalError { - self.fatal_span(self.mk_sp(from_pos, to_pos), m) - } - - /// Report a lexical error spanning [`from_pos`, `to_pos`). - fn err_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) { - self.err_span(self.mk_sp(from_pos, to_pos), m) - } - - fn struct_span_fatal(&self, from_pos: BytePos, to_pos: BytePos, m: &str) - -> DiagnosticBuilder<'a> - { - self.sess.span_diagnostic.struct_span_fatal(self.mk_sp(from_pos, to_pos), m) - } - - fn struct_fatal_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) - -> DiagnosticBuilder<'a> - { - let mut m = m.to_string(); - m.push_str(": "); - push_escaped_char(&mut m, c); - - self.sess.span_diagnostic.struct_span_fatal(self.mk_sp(from_pos, to_pos), &m[..]) - } - - /// Turns simple `rustc_lexer::TokenKind` enum into a rich - /// `libsyntax::TokenKind`. This turns strings into interned - /// symbols and runs additional validation. - fn cook_lexer_token( - &self, - token: rustc_lexer::TokenKind, - start: BytePos, - ) -> TokenKind { - match token { - rustc_lexer::TokenKind::LineComment => { - let string = self.str_from(start); - // comments with only more "/"s are not doc comments - let tok = if comments::is_line_doc_comment(string) { - self.forbid_bare_cr(start, string, "bare CR not allowed in doc-comment"); - token::DocComment(Symbol::intern(string)) - } else { - token::Comment - }; - - tok - } - rustc_lexer::TokenKind::BlockComment { terminated } => { - let string = self.str_from(start); - // block comments starting with "/**" or "/*!" are doc-comments - // but comments with only "*"s between two "/"s are not - let is_doc_comment = comments::is_block_doc_comment(string); - - if !terminated { - let msg = if is_doc_comment { - "unterminated block doc-comment" - } else { - "unterminated block comment" - }; - let last_bpos = self.pos; - self.fatal_span_(start, last_bpos, msg).raise(); - } - - let tok = if is_doc_comment { - self.forbid_bare_cr(start, - string, - "bare CR not allowed in block doc-comment"); - token::DocComment(Symbol::intern(string)) - } else { - token::Comment - }; - - tok - } - rustc_lexer::TokenKind::Whitespace => token::Whitespace, - rustc_lexer::TokenKind::Ident | rustc_lexer::TokenKind::RawIdent => { - let is_raw_ident = token == rustc_lexer::TokenKind::RawIdent; - let mut ident_start = start; - if is_raw_ident { - ident_start = ident_start + BytePos(2); - } - // FIXME: perform NFKC normalization here. (Issue #2253) - let sym = self.symbol_from(ident_start); - if is_raw_ident { - let span = self.mk_sp(start, self.pos); - if !sym.can_be_raw() { - self.err_span(span, &format!("`{}` cannot be a raw identifier", sym)); - } - self.sess.raw_identifier_spans.borrow_mut().push(span); - } - token::Ident(sym, is_raw_ident) - } - rustc_lexer::TokenKind::Literal { kind, suffix_start } => { - let suffix_start = start + BytePos(suffix_start as u32); - let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind); - let suffix = if suffix_start < self.pos { - let string = self.str_from(suffix_start); - if string == "_" { - self.sess.span_diagnostic - .struct_span_warn(self.mk_sp(suffix_start, self.pos), - "underscore literal suffix is not allowed") - .warn("this was previously accepted by the compiler but is \ - being phased out; it will become a hard error in \ - a future release!") - .note("for more information, see issue #42326 \ - <https://github.com/rust-lang/rust/issues/42326>") - .emit(); - None - } else { - Some(Symbol::intern(string)) - } - } else { - None - }; - token::Literal(token::Lit { kind, symbol, suffix }) - } - rustc_lexer::TokenKind::Lifetime { starts_with_number } => { - // Include the leading `'` in the real identifier, for macro - // expansion purposes. See #12512 for the gory details of why - // this is necessary. - let lifetime_name = self.str_from(start); - if starts_with_number { - self.err_span_( - start, - self.pos, - "lifetimes cannot start with a number", - ); - } - let ident = Symbol::intern(lifetime_name); - token::Lifetime(ident) - } - rustc_lexer::TokenKind::Semi => token::Semi, - rustc_lexer::TokenKind::Comma => token::Comma, - rustc_lexer::TokenKind::Dot => token::Dot, - rustc_lexer::TokenKind::OpenParen => token::OpenDelim(token::Paren), - rustc_lexer::TokenKind::CloseParen => token::CloseDelim(token::Paren), - rustc_lexer::TokenKind::OpenBrace => token::OpenDelim(token::Brace), - rustc_lexer::TokenKind::CloseBrace => token::CloseDelim(token::Brace), - rustc_lexer::TokenKind::OpenBracket => token::OpenDelim(token::Bracket), - rustc_lexer::TokenKind::CloseBracket => token::CloseDelim(token::Bracket), - rustc_lexer::TokenKind::At => token::At, - rustc_lexer::TokenKind::Pound => token::Pound, - rustc_lexer::TokenKind::Tilde => token::Tilde, - rustc_lexer::TokenKind::Question => token::Question, - rustc_lexer::TokenKind::Colon => token::Colon, - rustc_lexer::TokenKind::Dollar => token::Dollar, - rustc_lexer::TokenKind::Eq => token::Eq, - rustc_lexer::TokenKind::Not => token::Not, - rustc_lexer::TokenKind::Lt => token::Lt, - rustc_lexer::TokenKind::Gt => token::Gt, - rustc_lexer::TokenKind::Minus => token::BinOp(token::Minus), - rustc_lexer::TokenKind::And => token::BinOp(token::And), - rustc_lexer::TokenKind::Or => token::BinOp(token::Or), - rustc_lexer::TokenKind::Plus => token::BinOp(token::Plus), - rustc_lexer::TokenKind::Star => token::BinOp(token::Star), - rustc_lexer::TokenKind::Slash => token::BinOp(token::Slash), - rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret), - rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent), - - rustc_lexer::TokenKind::Unknown => { - let c = self.str_from(start).chars().next().unwrap(); - let mut err = self.struct_fatal_span_char(start, - self.pos, - "unknown start of token", - c); - // FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs, - // instead of keeping a table in `check_for_substitution`into the token. Ideally, - // this should be inside `rustc_lexer`. However, we should first remove compound - // tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it, - // as there will be less overall work to do this way. - let token = unicode_chars::check_for_substitution(self, start, c, &mut err) - .unwrap_or_else(|| token::Unknown(self.symbol_from(start))); - err.emit(); - token - } - } - } - - fn cook_lexer_literal( - &self, - start: BytePos, - suffix_start: BytePos, - kind: rustc_lexer::LiteralKind - ) -> (token::LitKind, Symbol) { - match kind { - rustc_lexer::LiteralKind::Char { terminated } => { - if !terminated { - self.fatal_span_(start, suffix_start, - "unterminated character literal".into()) - .raise() - } - let content_start = start + BytePos(1); - let content_end = suffix_start - BytePos(1); - self.validate_char_escape(content_start, content_end); - let id = self.symbol_from_to(content_start, content_end); - (token::Char, id) - }, - rustc_lexer::LiteralKind::Byte { terminated } => { - if !terminated { - self.fatal_span_(start + BytePos(1), suffix_start, - "unterminated byte constant".into()) - .raise() - } - let content_start = start + BytePos(2); - let content_end = suffix_start - BytePos(1); - self.validate_byte_escape(content_start, content_end); - let id = self.symbol_from_to(content_start, content_end); - (token::Byte, id) - }, - rustc_lexer::LiteralKind::Str { terminated } => { - if !terminated { - self.fatal_span_(start, suffix_start, - "unterminated double quote string".into()) - .raise() - } - let content_start = start + BytePos(1); - let content_end = suffix_start - BytePos(1); - self.validate_str_escape(content_start, content_end); - let id = self.symbol_from_to(content_start, content_end); - (token::Str, id) - } - rustc_lexer::LiteralKind::ByteStr { terminated } => { - if !terminated { - self.fatal_span_(start + BytePos(1), suffix_start, - "unterminated double quote byte string".into()) - .raise() - } - let content_start = start + BytePos(2); - let content_end = suffix_start - BytePos(1); - self.validate_byte_str_escape(content_start, content_end); - let id = self.symbol_from_to(content_start, content_end); - (token::ByteStr, id) - } - rustc_lexer::LiteralKind::RawStr { n_hashes, started, terminated } => { - if !started { - self.report_non_started_raw_string(start); - } - if !terminated { - self.report_unterminated_raw_string(start, n_hashes) - } - let n_hashes: u16 = self.restrict_n_hashes(start, n_hashes); - let n = u32::from(n_hashes); - let content_start = start + BytePos(2 + n); - let content_end = suffix_start - BytePos(1 + n); - self.validate_raw_str_escape(content_start, content_end); - let id = self.symbol_from_to(content_start, content_end); - (token::StrRaw(n_hashes), id) - } - rustc_lexer::LiteralKind::RawByteStr { n_hashes, started, terminated } => { - if !started { - self.report_non_started_raw_string(start); - } - if !terminated { - self.report_unterminated_raw_string(start, n_hashes) - } - let n_hashes: u16 = self.restrict_n_hashes(start, n_hashes); - let n = u32::from(n_hashes); - let content_start = start + BytePos(3 + n); - let content_end = suffix_start - BytePos(1 + n); - self.validate_raw_byte_str_escape(content_start, content_end); - let id = self.symbol_from_to(content_start, content_end); - (token::ByteStrRaw(n_hashes), id) - } - rustc_lexer::LiteralKind::Int { base, empty_int } => { - if empty_int { - self.err_span_(start, suffix_start, "no valid digits found for number"); - (token::Integer, sym::integer(0)) - } else { - self.validate_int_literal(base, start, suffix_start); - (token::Integer, self.symbol_from_to(start, suffix_start)) - } - }, - rustc_lexer::LiteralKind::Float { base, empty_exponent } => { - if empty_exponent { - let mut err = self.struct_span_fatal( - start, self.pos, - "expected at least one digit in exponent" - ); - err.emit(); - } - - match base { - Base::Hexadecimal => { - self.err_span_(start, suffix_start, - "hexadecimal float literal is not supported") - } - Base::Octal => { - self.err_span_(start, suffix_start, - "octal float literal is not supported") - } - Base::Binary => { - self.err_span_(start, suffix_start, - "binary float literal is not supported") - } - _ => () - } - - let id = self.symbol_from_to(start, suffix_start); - (token::Float, id) - }, - } - } - - #[inline] - fn src_index(&self, pos: BytePos) -> usize { - (pos - self.start_pos).to_usize() - } - - /// Slice of the source text from `start` up to but excluding `self.pos`, - /// meaning the slice does not include the character `self.ch`. - fn str_from(&self, start: BytePos) -> &str - { - self.str_from_to(start, self.pos) - } - - /// Creates a Symbol from a given offset to the current offset. - fn symbol_from(&self, start: BytePos) -> Symbol { - debug!("taking an ident from {:?} to {:?}", start, self.pos); - Symbol::intern(self.str_from(start)) - } - - /// As symbol_from, with an explicit endpoint. - fn symbol_from_to(&self, start: BytePos, end: BytePos) -> Symbol { - debug!("taking an ident from {:?} to {:?}", start, end); - Symbol::intern(self.str_from_to(start, end)) - } - - /// Slice of the source text spanning from `start` up to but excluding `end`. - fn str_from_to(&self, start: BytePos, end: BytePos) -> &str - { - &self.src[self.src_index(start)..self.src_index(end)] - } - - fn forbid_bare_cr(&self, start: BytePos, s: &str, errmsg: &str) { - let mut idx = 0; - loop { - idx = match s[idx..].find('\r') { - None => break, - Some(it) => idx + it + 1 - }; - self.err_span_(start + BytePos(idx as u32 - 1), - start + BytePos(idx as u32), - errmsg); - } - } - - fn report_non_started_raw_string(&self, start: BytePos) -> ! { - let bad_char = self.str_from(start).chars().last().unwrap(); - self - .struct_fatal_span_char( - start, - self.pos, - "found invalid character; only `#` is allowed \ - in raw string delimitation", - bad_char, - ) - .emit(); - FatalError.raise() - } - - fn report_unterminated_raw_string(&self, start: BytePos, n_hashes: usize) -> ! { - let mut err = self.struct_span_fatal( - start, start, - "unterminated raw string", - ); - err.span_label( - self.mk_sp(start, start), - "unterminated raw string", - ); - - if n_hashes > 0 { - err.note(&format!("this raw string should be terminated with `\"{}`", - "#".repeat(n_hashes as usize))); - } - - err.emit(); - FatalError.raise() - } - - fn restrict_n_hashes(&self, start: BytePos, n_hashes: usize) -> u16 { - match n_hashes.try_into() { - Ok(n_hashes) => n_hashes, - Err(_) => { - self.fatal_span_(start, - self.pos, - "too many `#` symbols: raw strings may be \ - delimited by up to 65535 `#` symbols").raise(); - } - } - } - - fn validate_char_escape(&self, content_start: BytePos, content_end: BytePos) { - let lit = self.str_from_to(content_start, content_end); - if let Err((off, err)) = unescape::unescape_char(lit) { - emit_unescape_error( - &self.sess.span_diagnostic, - lit, - self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)), - unescape::Mode::Char, - 0..off, - err, - ) - } - } - - fn validate_byte_escape(&self, content_start: BytePos, content_end: BytePos) { - let lit = self.str_from_to(content_start, content_end); - if let Err((off, err)) = unescape::unescape_byte(lit) { - emit_unescape_error( - &self.sess.span_diagnostic, - lit, - self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)), - unescape::Mode::Byte, - 0..off, - err, - ) - } - } - - fn validate_str_escape(&self, content_start: BytePos, content_end: BytePos) { - let lit = self.str_from_to(content_start, content_end); - unescape::unescape_str(lit, &mut |range, c| { - if let Err(err) = c { - emit_unescape_error( - &self.sess.span_diagnostic, - lit, - self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)), - unescape::Mode::Str, - range, - err, - ) - } - }) - } - - fn validate_raw_str_escape(&self, content_start: BytePos, content_end: BytePos) { - let lit = self.str_from_to(content_start, content_end); - unescape::unescape_raw_str(lit, &mut |range, c| { - if let Err(err) = c { - emit_unescape_error( - &self.sess.span_diagnostic, - lit, - self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)), - unescape::Mode::Str, - range, - err, - ) - } - }) - } - - fn validate_raw_byte_str_escape(&self, content_start: BytePos, content_end: BytePos) { - let lit = self.str_from_to(content_start, content_end); - unescape::unescape_raw_byte_str(lit, &mut |range, c| { - if let Err(err) = c { - emit_unescape_error( - &self.sess.span_diagnostic, - lit, - self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)), - unescape::Mode::ByteStr, - range, - err, - ) - } - }) - } - - fn validate_byte_str_escape(&self, content_start: BytePos, content_end: BytePos) { - let lit = self.str_from_to(content_start, content_end); - unescape::unescape_byte_str(lit, &mut |range, c| { - if let Err(err) = c { - emit_unescape_error( - &self.sess.span_diagnostic, - lit, - self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)), - unescape::Mode::ByteStr, - range, - err, - ) - } - }) - } - - fn validate_int_literal(&self, base: Base, content_start: BytePos, content_end: BytePos) { - let base = match base { - Base::Binary => 2, - Base::Octal => 8, - _ => return, - }; - let s = self.str_from_to(content_start + BytePos(2), content_end); - for (idx, c) in s.char_indices() { - let idx = idx as u32; - if c != '_' && c.to_digit(base).is_none() { - let lo = content_start + BytePos(2 + idx); - let hi = content_start + BytePos(2 + idx + c.len_utf8() as u32); - self.err_span_(lo, hi, - &format!("invalid digit for a base {} literal", base)); - - } - } - } -} diff --git a/src/libsyntax/parse/lexer/tokentrees.rs b/src/libsyntax/parse/lexer/tokentrees.rs deleted file mode 100644 index 2b056434d4d..00000000000 --- a/src/libsyntax/parse/lexer/tokentrees.rs +++ /dev/null @@ -1,280 +0,0 @@ -use rustc_data_structures::fx::FxHashMap; -use syntax_pos::Span; - -use super::{StringReader, UnmatchedBrace}; - -use crate::print::pprust::token_to_string; -use crate::token::{self, Token}; -use crate::tokenstream::{DelimSpan, IsJoint::{self, *}, TokenStream, TokenTree, TreeAndJoint}; - -use errors::PResult; - -impl<'a> StringReader<'a> { - crate fn into_token_trees(self) -> (PResult<'a, TokenStream>, Vec<UnmatchedBrace>) { - let mut tt_reader = TokenTreesReader { - string_reader: self, - token: Token::dummy(), - joint_to_prev: Joint, - open_braces: Vec::new(), - unmatched_braces: Vec::new(), - matching_delim_spans: Vec::new(), - last_unclosed_found_span: None, - last_delim_empty_block_spans: FxHashMap::default() - }; - let res = tt_reader.parse_all_token_trees(); - (res, tt_reader.unmatched_braces) - } -} - -struct TokenTreesReader<'a> { - string_reader: StringReader<'a>, - token: Token, - joint_to_prev: IsJoint, - /// Stack of open delimiters and their spans. Used for error message. - open_braces: Vec<(token::DelimToken, Span)>, - unmatched_braces: Vec<UnmatchedBrace>, - /// The type and spans for all braces - /// - /// Used only for error recovery when arriving to EOF with mismatched braces. - matching_delim_spans: Vec<(token::DelimToken, Span, Span)>, - last_unclosed_found_span: Option<Span>, - last_delim_empty_block_spans: FxHashMap<token::DelimToken, Span> -} - -impl<'a> TokenTreesReader<'a> { - // Parse a stream of tokens into a list of `TokenTree`s, up to an `Eof`. - fn parse_all_token_trees(&mut self) -> PResult<'a, TokenStream> { - let mut buf = TokenStreamBuilder::default(); - - self.real_token(); - while self.token != token::Eof { - buf.push(self.parse_token_tree()?); - } - - Ok(buf.into_token_stream()) - } - - // Parse a stream of tokens into a list of `TokenTree`s, up to a `CloseDelim`. - fn parse_token_trees_until_close_delim(&mut self) -> TokenStream { - let mut buf = TokenStreamBuilder::default(); - loop { - if let token::CloseDelim(..) = self.token.kind { - return buf.into_token_stream(); - } - - match self.parse_token_tree() { - Ok(tree) => buf.push(tree), - Err(mut e) => { - e.emit(); - return buf.into_token_stream(); - } - } - } - } - - fn parse_token_tree(&mut self) -> PResult<'a, TreeAndJoint> { - let sm = self.string_reader.sess.source_map(); - match self.token.kind { - token::Eof => { - let msg = "this file contains an un-closed delimiter"; - let mut err = self.string_reader.sess.span_diagnostic - .struct_span_err(self.token.span, msg); - for &(_, sp) in &self.open_braces { - err.span_label(sp, "un-closed delimiter"); - self.unmatched_braces.push(UnmatchedBrace { - expected_delim: token::DelimToken::Brace, - found_delim: None, - found_span: self.token.span, - unclosed_span: Some(sp), - candidate_span: None, - }); - } - - if let Some((delim, _)) = self.open_braces.last() { - if let Some((_, open_sp, close_sp)) = self.matching_delim_spans.iter() - .filter(|(d, open_sp, close_sp)| { - if let Some(close_padding) = sm.span_to_margin(*close_sp) { - if let Some(open_padding) = sm.span_to_margin(*open_sp) { - return delim == d && close_padding != open_padding; - } - } - false - }).next() // these are in reverse order as they get inserted on close, but - { // we want the last open/first close - err.span_label( - *open_sp, - "this delimiter might not be properly closed...", - ); - err.span_label( - *close_sp, - "...as it matches this but it has different indentation", - ); - } - } - Err(err) - }, - token::OpenDelim(delim) => { - // The span for beginning of the delimited section - let pre_span = self.token.span; - - // Parse the open delimiter. - self.open_braces.push((delim, self.token.span)); - self.real_token(); - - // Parse the token trees within the delimiters. - // We stop at any delimiter so we can try to recover if the user - // uses an incorrect delimiter. - let tts = self.parse_token_trees_until_close_delim(); - - // Expand to cover the entire delimited token tree - let delim_span = DelimSpan::from_pair(pre_span, self.token.span); - - match self.token.kind { - // Correct delimiter. - token::CloseDelim(d) if d == delim => { - let (open_brace, open_brace_span) = self.open_braces.pop().unwrap(); - let close_brace_span = self.token.span; - - if tts.is_empty() { - let empty_block_span = open_brace_span.to(close_brace_span); - self.last_delim_empty_block_spans.insert(delim, empty_block_span); - } - - if self.open_braces.len() == 0 { - // Clear up these spans to avoid suggesting them as we've found - // properly matched delimiters so far for an entire block. - self.matching_delim_spans.clear(); - } else { - self.matching_delim_spans.push( - (open_brace, open_brace_span, close_brace_span), - ); - } - // Parse the close delimiter. - self.real_token(); - } - // Incorrect delimiter. - token::CloseDelim(other) => { - let mut unclosed_delimiter = None; - let mut candidate = None; - if self.last_unclosed_found_span != Some(self.token.span) { - // do not complain about the same unclosed delimiter multiple times - self.last_unclosed_found_span = Some(self.token.span); - // This is a conservative error: only report the last unclosed - // delimiter. The previous unclosed delimiters could actually be - // closed! The parser just hasn't gotten to them yet. - if let Some(&(_, sp)) = self.open_braces.last() { - unclosed_delimiter = Some(sp); - }; - if let Some(current_padding) = sm.span_to_margin(self.token.span) { - for (brace, brace_span) in &self.open_braces { - if let Some(padding) = sm.span_to_margin(*brace_span) { - // high likelihood of these two corresponding - if current_padding == padding && brace == &other { - candidate = Some(*brace_span); - } - } - } - } - let (tok, _) = self.open_braces.pop().unwrap(); - self.unmatched_braces.push(UnmatchedBrace { - expected_delim: tok, - found_delim: Some(other), - found_span: self.token.span, - unclosed_span: unclosed_delimiter, - candidate_span: candidate, - }); - } else { - self.open_braces.pop(); - } - - // If the incorrect delimiter matches an earlier opening - // delimiter, then don't consume it (it can be used to - // close the earlier one). Otherwise, consume it. - // E.g., we try to recover from: - // fn foo() { - // bar(baz( - // } // Incorrect delimiter but matches the earlier `{` - if !self.open_braces.iter().any(|&(b, _)| b == other) { - self.real_token(); - } - } - token::Eof => { - // Silently recover, the EOF token will be seen again - // and an error emitted then. Thus we don't pop from - // self.open_braces here. - }, - _ => {} - } - - Ok(TokenTree::Delimited( - delim_span, - delim, - tts.into() - ).into()) - }, - token::CloseDelim(delim) => { - // An unexpected closing delimiter (i.e., there is no - // matching opening delimiter). - let token_str = token_to_string(&self.token); - let msg = format!("unexpected close delimiter: `{}`", token_str); - let mut err = self.string_reader.sess.span_diagnostic - .struct_span_err(self.token.span, &msg); - - if let Some(span) = self.last_delim_empty_block_spans.remove(&delim) { - err.span_label( - span, - "this block is empty, you might have not meant to close it" - ); - } - err.span_label(self.token.span, "unexpected close delimiter"); - Err(err) - }, - _ => { - let tt = TokenTree::Token(self.token.take()); - self.real_token(); - let is_joint = self.joint_to_prev == Joint && self.token.is_op(); - Ok((tt, if is_joint { Joint } else { NonJoint })) - } - } - } - - fn real_token(&mut self) { - self.joint_to_prev = Joint; - loop { - let token = self.string_reader.next_token(); - match token.kind { - token::Whitespace | token::Comment | token::Shebang(_) | token::Unknown(_) => { - self.joint_to_prev = NonJoint; - } - _ => { - self.token = token; - return; - } - } - } - } -} - -#[derive(Default)] -struct TokenStreamBuilder { - buf: Vec<TreeAndJoint>, -} - -impl TokenStreamBuilder { - fn push(&mut self, (tree, joint): TreeAndJoint) { - if let Some((TokenTree::Token(prev_token), Joint)) = self.buf.last() { - if let TokenTree::Token(token) = &tree { - if let Some(glued) = prev_token.glue(token) { - self.buf.pop(); - self.buf.push((TokenTree::Token(glued), joint)); - return; - } - } - } - self.buf.push((tree, joint)) - } - - fn into_token_stream(self) -> TokenStream { - TokenStream::new(self.buf) - } -} diff --git a/src/libsyntax/parse/lexer/unescape_error_reporting.rs b/src/libsyntax/parse/lexer/unescape_error_reporting.rs deleted file mode 100644 index 5565015179c..00000000000 --- a/src/libsyntax/parse/lexer/unescape_error_reporting.rs +++ /dev/null @@ -1,215 +0,0 @@ -//! Utilities for rendering escape sequence errors as diagnostics. - -use std::ops::Range; -use std::iter::once; - -use rustc_lexer::unescape::{EscapeError, Mode}; -use syntax_pos::{Span, BytePos}; - -use crate::errors::{Handler, Applicability}; - -pub(crate) fn emit_unescape_error( - handler: &Handler, - // interior part of the literal, without quotes - lit: &str, - // full span of the literal, including quotes - span_with_quotes: Span, - mode: Mode, - // range of the error inside `lit` - range: Range<usize>, - error: EscapeError, -) { - log::debug!("emit_unescape_error: {:?}, {:?}, {:?}, {:?}, {:?}", - lit, span_with_quotes, mode, range, error); - let span = { - let Range { start, end } = range; - let (start, end) = (start as u32, end as u32); - let lo = span_with_quotes.lo() + BytePos(start + 1); - let hi = lo + BytePos(end - start); - span_with_quotes - .with_lo(lo) - .with_hi(hi) - }; - let last_char = || { - let c = lit[range.clone()].chars().rev().next().unwrap(); - let span = span.with_lo(span.hi() - BytePos(c.len_utf8() as u32)); - (c, span) - }; - match error { - EscapeError::LoneSurrogateUnicodeEscape => { - handler.struct_span_err(span, "invalid unicode character escape") - .help("unicode escape must not be a surrogate") - .emit(); - } - EscapeError::OutOfRangeUnicodeEscape => { - handler.struct_span_err(span, "invalid unicode character escape") - .help("unicode escape must be at most 10FFFF") - .emit(); - } - EscapeError::MoreThanOneChar => { - let msg = if mode.is_bytes() { - "if you meant to write a byte string literal, use double quotes" - } else { - "if you meant to write a `str` literal, use double quotes" - }; - - handler - .struct_span_err( - span_with_quotes, - "character literal may only contain one codepoint", - ) - .span_suggestion( - span_with_quotes, - msg, - format!("\"{}\"", lit), - Applicability::MachineApplicable, - ).emit() - } - EscapeError::EscapeOnlyChar => { - let (c, _span) = last_char(); - - let mut msg = if mode.is_bytes() { - "byte constant must be escaped: " - } else { - "character constant must be escaped: " - }.to_string(); - push_escaped_char(&mut msg, c); - - handler.span_err(span, msg.as_str()) - } - EscapeError::BareCarriageReturn => { - let msg = if mode.in_double_quotes() { - "bare CR not allowed in string, use \\r instead" - } else { - "character constant must be escaped: \\r" - }; - handler.span_err(span, msg); - } - EscapeError::BareCarriageReturnInRawString => { - assert!(mode.in_double_quotes()); - let msg = "bare CR not allowed in raw string"; - handler.span_err(span, msg); - } - EscapeError::InvalidEscape => { - let (c, span) = last_char(); - - let label = if mode.is_bytes() { - "unknown byte escape" - } else { - "unknown character escape" - }; - let mut msg = label.to_string(); - msg.push_str(": "); - push_escaped_char(&mut msg, c); - - let mut diag = handler.struct_span_err(span, msg.as_str()); - diag.span_label(span, label); - if c == '{' || c == '}' && !mode.is_bytes() { - diag.help("if used in a formatting string, \ - curly braces are escaped with `{{` and `}}`"); - } else if c == '\r' { - diag.help("this is an isolated carriage return; \ - consider checking your editor and version control settings"); - } - diag.emit(); - } - EscapeError::TooShortHexEscape => { - handler.span_err(span, "numeric character escape is too short") - } - EscapeError::InvalidCharInHexEscape | EscapeError::InvalidCharInUnicodeEscape => { - let (c, span) = last_char(); - - let mut msg = if error == EscapeError::InvalidCharInHexEscape { - "invalid character in numeric character escape: " - } else { - "invalid character in unicode escape: " - }.to_string(); - push_escaped_char(&mut msg, c); - - handler.span_err(span, msg.as_str()) - } - EscapeError::NonAsciiCharInByte => { - assert!(mode.is_bytes()); - let (_c, span) = last_char(); - handler.span_err(span, "byte constant must be ASCII. \ - Use a \\xHH escape for a non-ASCII byte") - } - EscapeError::NonAsciiCharInByteString => { - assert!(mode.is_bytes()); - let (_c, span) = last_char(); - handler.span_err(span, "raw byte string must be ASCII") - } - EscapeError::OutOfRangeHexEscape => { - handler.span_err(span, "this form of character escape may only be used \ - with characters in the range [\\x00-\\x7f]") - } - EscapeError::LeadingUnderscoreUnicodeEscape => { - let (_c, span) = last_char(); - handler.span_err(span, "invalid start of unicode escape") - } - EscapeError::OverlongUnicodeEscape => { - handler.span_err(span, "overlong unicode escape (must have at most 6 hex digits)") - } - EscapeError::UnclosedUnicodeEscape => { - handler.span_err(span, "unterminated unicode escape (needed a `}`)") - } - EscapeError::NoBraceInUnicodeEscape => { - let msg = "incorrect unicode escape sequence"; - let mut diag = handler.struct_span_err(span, msg); - - let mut suggestion = "\\u{".to_owned(); - let mut suggestion_len = 0; - let (c, char_span) = last_char(); - let chars = once(c).chain(lit[range.end..].chars()); - for c in chars.take(6).take_while(|c| c.is_digit(16)) { - suggestion.push(c); - suggestion_len += c.len_utf8(); - } - - if suggestion_len > 0 { - suggestion.push('}'); - let lo = char_span.lo(); - let hi = lo + BytePos(suggestion_len as u32); - diag.span_suggestion( - span.with_lo(lo).with_hi(hi), - "format of unicode escape sequences uses braces", - suggestion, - Applicability::MaybeIncorrect, - ); - } else { - diag.span_label(span, msg); - diag.help( - "format of unicode escape sequences is `\\u{...}`", - ); - } - - diag.emit(); - } - EscapeError::UnicodeEscapeInByte => { - handler.span_err(span, "unicode escape sequences cannot be used \ - as a byte or in a byte string") - } - EscapeError::EmptyUnicodeEscape => { - handler.span_err(span, "empty unicode escape (must have at least 1 hex digit)") - } - EscapeError::ZeroChars => { - handler.span_err(span, "empty character literal") - } - EscapeError::LoneSlash => { - handler.span_err(span, "invalid trailing slash in literal") - } - } -} - -/// Pushes a character to a message string for error reporting -pub(crate) fn push_escaped_char(msg: &mut String, c: char) { - match c { - '\u{20}'..='\u{7e}' => { - // Don't escape \, ' or " for user-facing messages - msg.push(c); - } - _ => { - msg.extend(c.escape_default()); - } - } -} diff --git a/src/libsyntax/parse/lexer/unicode_chars.rs b/src/libsyntax/parse/lexer/unicode_chars.rs deleted file mode 100644 index 6eb995b61d3..00000000000 --- a/src/libsyntax/parse/lexer/unicode_chars.rs +++ /dev/null @@ -1,392 +0,0 @@ -// Characters and their corresponding confusables were collected from -// http://www.unicode.org/Public/security/10.0.0/confusables.txt - -use super::StringReader; -use errors::{Applicability, DiagnosticBuilder}; -use syntax_pos::{BytePos, Pos, Span, symbol::kw}; -use crate::token; - -#[rustfmt::skip] // for line breaks -const UNICODE_ARRAY: &[(char, &str, char)] = &[ - (' ', "Line Separator", ' '), - (' ', "Paragraph Separator", ' '), - (' ', "Ogham Space mark", ' '), - (' ', "En Quad", ' '), - (' ', "Em Quad", ' '), - (' ', "En Space", ' '), - (' ', "Em Space", ' '), - (' ', "Three-Per-Em Space", ' '), - (' ', "Four-Per-Em Space", ' '), - (' ', "Six-Per-Em Space", ' '), - (' ', "Punctuation Space", ' '), - (' ', "Thin Space", ' '), - (' ', "Hair Space", ' '), - (' ', "Medium Mathematical Space", ' '), - (' ', "No-Break Space", ' '), - (' ', "Figure Space", ' '), - (' ', "Narrow No-Break Space", ' '), - (' ', "Ideographic Space", ' '), - - ('ߺ', "Nko Lajanyalan", '_'), - ('﹍', "Dashed Low Line", '_'), - ('﹎', "Centreline Low Line", '_'), - ('﹏', "Wavy Low Line", '_'), - ('_', "Fullwidth Low Line", '_'), - - ('‐', "Hyphen", '-'), - ('‑', "Non-Breaking Hyphen", '-'), - ('‒', "Figure Dash", '-'), - ('–', "En Dash", '-'), - ('—', "Em Dash", '-'), - ('﹘', "Small Em Dash", '-'), - ('۔', "Arabic Full Stop", '-'), - ('⁃', "Hyphen Bullet", '-'), - ('˗', "Modifier Letter Minus Sign", '-'), - ('−', "Minus Sign", '-'), - ('➖', "Heavy Minus Sign", '-'), - ('Ⲻ', "Coptic Letter Dialect-P Ni", '-'), - ('ー', "Katakana-Hiragana Prolonged Sound Mark", '-'), - ('-', "Fullwidth Hyphen-Minus", '-'), - ('―', "Horizontal Bar", '-'), - ('─', "Box Drawings Light Horizontal", '-'), - ('━', "Box Drawings Heavy Horizontal", '-'), - ('㇐', "CJK Stroke H", '-'), - ('ꟷ', "Latin Epigraphic Letter Sideways I", '-'), - ('ᅳ', "Hangul Jungseong Eu", '-'), - ('ㅡ', "Hangul Letter Eu", '-'), - ('一', "CJK Unified Ideograph-4E00", '-'), - ('⼀', "Kangxi Radical One", '-'), - - ('؍', "Arabic Date Separator", ','), - ('٫', "Arabic Decimal Separator", ','), - ('‚', "Single Low-9 Quotation Mark", ','), - ('¸', "Cedilla", ','), - ('ꓹ', "Lisu Letter Tone Na Po", ','), - (',', "Fullwidth Comma", ','), - - (';', "Greek Question Mark", ';'), - (';', "Fullwidth Semicolon", ';'), - ('︔', "Presentation Form For Vertical Semicolon", ';'), - - ('ः', "Devanagari Sign Visarga", ':'), - ('ઃ', "Gujarati Sign Visarga", ':'), - (':', "Fullwidth Colon", ':'), - ('։', "Armenian Full Stop", ':'), - ('܃', "Syriac Supralinear Colon", ':'), - ('܄', "Syriac Sublinear Colon", ':'), - ('᛬', "Runic Multiple Punctuation", ':'), - ('︰', "Presentation Form For Vertical Two Dot Leader", ':'), - ('᠃', "Mongolian Full Stop", ':'), - ('᠉', "Mongolian Manchu Full Stop", ':'), - ('⁚', "Two Dot Punctuation", ':'), - ('׃', "Hebrew Punctuation Sof Pasuq", ':'), - ('˸', "Modifier Letter Raised Colon", ':'), - ('꞉', "Modifier Letter Colon", ':'), - ('∶', "Ratio", ':'), - ('ː', "Modifier Letter Triangular Colon", ':'), - ('ꓽ', "Lisu Letter Tone Mya Jeu", ':'), - ('︓', "Presentation Form For Vertical Colon", ':'), - - ('!', "Fullwidth Exclamation Mark", '!'), - ('ǃ', "Latin Letter Retroflex Click", '!'), - ('ⵑ', "Tifinagh Letter Tuareg Yang", '!'), - ('︕', "Presentation Form For Vertical Exclamation Mark", '!'), - - ('ʔ', "Latin Letter Glottal Stop", '?'), - ('Ɂ', "Latin Capital Letter Glottal Stop", '?'), - ('ॽ', "Devanagari Letter Glottal Stop", '?'), - ('Ꭾ', "Cherokee Letter He", '?'), - ('ꛫ', "Bamum Letter Ntuu", '?'), - ('?', "Fullwidth Question Mark", '?'), - ('︖', "Presentation Form For Vertical Question Mark", '?'), - - ('𝅭', "Musical Symbol Combining Augmentation Dot", '.'), - ('․', "One Dot Leader", '.'), - ('܁', "Syriac Supralinear Full Stop", '.'), - ('܂', "Syriac Sublinear Full Stop", '.'), - ('꘎', "Vai Full Stop", '.'), - ('𐩐', "Kharoshthi Punctuation Dot", '.'), - ('٠', "Arabic-Indic Digit Zero", '.'), - ('۰', "Extended Arabic-Indic Digit Zero", '.'), - ('ꓸ', "Lisu Letter Tone Mya Ti", '.'), - ('·', "Middle Dot", '.'), - ('・', "Katakana Middle Dot", '.'), - ('・', "Halfwidth Katakana Middle Dot", '.'), - ('᛫', "Runic Single Punctuation", '.'), - ('·', "Greek Ano Teleia", '.'), - ('⸱', "Word Separator Middle Dot", '.'), - ('𐄁', "Aegean Word Separator Dot", '.'), - ('•', "Bullet", '.'), - ('‧', "Hyphenation Point", '.'), - ('∙', "Bullet Operator", '.'), - ('⋅', "Dot Operator", '.'), - ('ꞏ', "Latin Letter Sinological Dot", '.'), - ('ᐧ', "Canadian Syllabics Final Middle Dot", '.'), - ('ᐧ', "Canadian Syllabics Final Middle Dot", '.'), - ('.', "Fullwidth Full Stop", '.'), - ('。', "Ideographic Full Stop", '.'), - ('︒', "Presentation Form For Vertical Ideographic Full Stop", '.'), - - ('՝', "Armenian Comma", '\''), - (''', "Fullwidth Apostrophe", '\''), - ('‘', "Left Single Quotation Mark", '\''), - ('’', "Right Single Quotation Mark", '\''), - ('‛', "Single High-Reversed-9 Quotation Mark", '\''), - ('′', "Prime", '\''), - ('‵', "Reversed Prime", '\''), - ('՚', "Armenian Apostrophe", '\''), - ('׳', "Hebrew Punctuation Geresh", '\''), - ('`', "Grave Accent", '\''), - ('`', "Greek Varia", '\''), - ('`', "Fullwidth Grave Accent", '\''), - ('´', "Acute Accent", '\''), - ('΄', "Greek Tonos", '\''), - ('´', "Greek Oxia", '\''), - ('᾽', "Greek Koronis", '\''), - ('᾿', "Greek Psili", '\''), - ('῾', "Greek Dasia", '\''), - ('ʹ', "Modifier Letter Prime", '\''), - ('ʹ', "Greek Numeral Sign", '\''), - ('ˈ', "Modifier Letter Vertical Line", '\''), - ('ˊ', "Modifier Letter Acute Accent", '\''), - ('ˋ', "Modifier Letter Grave Accent", '\''), - ('˴', "Modifier Letter Middle Grave Accent", '\''), - ('ʻ', "Modifier Letter Turned Comma", '\''), - ('ʽ', "Modifier Letter Reversed Comma", '\''), - ('ʼ', "Modifier Letter Apostrophe", '\''), - ('ʾ', "Modifier Letter Right Half Ring", '\''), - ('ꞌ', "Latin Small Letter Saltillo", '\''), - ('י', "Hebrew Letter Yod", '\''), - ('ߴ', "Nko High Tone Apostrophe", '\''), - ('ߵ', "Nko Low Tone Apostrophe", '\''), - ('ᑊ', "Canadian Syllabics West-Cree P", '\''), - ('ᛌ', "Runic Letter Short-Twig-Sol S", '\''), - ('𖽑', "Miao Sign Aspiration", '\''), - ('𖽒', "Miao Sign Reformed Voicing", '\''), - - ('᳓', "Vedic Sign Nihshvasa", '"'), - ('"', "Fullwidth Quotation Mark", '"'), - ('“', "Left Double Quotation Mark", '"'), - ('”', "Right Double Quotation Mark", '"'), - ('‟', "Double High-Reversed-9 Quotation Mark", '"'), - ('″', "Double Prime", '"'), - ('‶', "Reversed Double Prime", '"'), - ('〃', "Ditto Mark", '"'), - ('״', "Hebrew Punctuation Gershayim", '"'), - ('˝', "Double Acute Accent", '"'), - ('ʺ', "Modifier Letter Double Prime", '"'), - ('˶', "Modifier Letter Middle Double Acute Accent", '"'), - ('˵', "Modifier Letter Middle Double Grave Accent", '"'), - ('ˮ', "Modifier Letter Double Apostrophe", '"'), - ('ײ', "Hebrew Ligature Yiddish Double Yod", '"'), - ('❞', "Heavy Double Comma Quotation Mark Ornament", '"'), - ('❝', "Heavy Double Turned Comma Quotation Mark Ornament", '"'), - - ('(', "Fullwidth Left Parenthesis", '('), - ('❨', "Medium Left Parenthesis Ornament", '('), - ('﴾', "Ornate Left Parenthesis", '('), - - (')', "Fullwidth Right Parenthesis", ')'), - ('❩', "Medium Right Parenthesis Ornament", ')'), - ('﴿', "Ornate Right Parenthesis", ')'), - - ('[', "Fullwidth Left Square Bracket", '['), - ('❲', "Light Left Tortoise Shell Bracket Ornament", '['), - ('「', "Left Corner Bracket", '['), - ('『', "Left White Corner Bracket", '['), - ('【', "Left Black Lenticular Bracket", '['), - ('〔', "Left Tortoise Shell Bracket", '['), - ('〖', "Left White Lenticular Bracket", '['), - ('〘', "Left White Tortoise Shell Bracket", '['), - ('〚', "Left White Square Bracket", '['), - - (']', "Fullwidth Right Square Bracket", ']'), - ('❳', "Light Right Tortoise Shell Bracket Ornament", ']'), - ('」', "Right Corner Bracket", ']'), - ('』', "Right White Corner Bracket", ']'), - ('】', "Right Black Lenticular Bracket", ']'), - ('〕', "Right Tortoise Shell Bracket", ']'), - ('〗', "Right White Lenticular Bracket", ']'), - ('〙', "Right White Tortoise Shell Bracket", ']'), - ('〛', "Right White Square Bracket", ']'), - - ('❴', "Medium Left Curly Bracket Ornament", '{'), - ('𝄔', "Musical Symbol Brace", '{'), - ('{', "Fullwidth Left Curly Bracket", '{'), - - ('❵', "Medium Right Curly Bracket Ornament", '}'), - ('}', "Fullwidth Right Curly Bracket", '}'), - - ('⁎', "Low Asterisk", '*'), - ('٭', "Arabic Five Pointed Star", '*'), - ('∗', "Asterisk Operator", '*'), - ('𐌟', "Old Italic Letter Ess", '*'), - ('*', "Fullwidth Asterisk", '*'), - - ('᜵', "Philippine Single Punctuation", '/'), - ('⁁', "Caret Insertion Point", '/'), - ('∕', "Division Slash", '/'), - ('⁄', "Fraction Slash", '/'), - ('╱', "Box Drawings Light Diagonal Upper Right To Lower Left", '/'), - ('⟋', "Mathematical Rising Diagonal", '/'), - ('⧸', "Big Solidus", '/'), - ('𝈺', "Greek Instrumental Notation Symbol-47", '/'), - ('㇓', "CJK Stroke Sp", '/'), - ('〳', "Vertical Kana Repeat Mark Upper Half", '/'), - ('Ⳇ', "Coptic Capital Letter Old Coptic Esh", '/'), - ('ノ', "Katakana Letter No", '/'), - ('丿', "CJK Unified Ideograph-4E3F", '/'), - ('⼃', "Kangxi Radical Slash", '/'), - ('/', "Fullwidth Solidus", '/'), - - ('\', "Fullwidth Reverse Solidus", '\\'), - ('﹨', "Small Reverse Solidus", '\\'), - ('∖', "Set Minus", '\\'), - ('⟍', "Mathematical Falling Diagonal", '\\'), - ('⧵', "Reverse Solidus Operator", '\\'), - ('⧹', "Big Reverse Solidus", '\\'), - ('⧹', "Greek Vocal Notation Symbol-16", '\\'), - ('⧹', "Greek Instrumental Symbol-48", '\\'), - ('㇔', "CJK Stroke D", '\\'), - ('丶', "CJK Unified Ideograph-4E36", '\\'), - ('⼂', "Kangxi Radical Dot", '\\'), - ('、', "Ideographic Comma", '\\'), - ('ヽ', "Katakana Iteration Mark", '\\'), - - ('ꝸ', "Latin Small Letter Um", '&'), - ('&', "Fullwidth Ampersand", '&'), - - ('᛭', "Runic Cross Punctuation", '+'), - ('➕', "Heavy Plus Sign", '+'), - ('𐊛', "Lycian Letter H", '+'), - ('﬩', "Hebrew Letter Alternative Plus Sign", '+'), - ('+', "Fullwidth Plus Sign", '+'), - - ('‹', "Single Left-Pointing Angle Quotation Mark", '<'), - ('❮', "Heavy Left-Pointing Angle Quotation Mark Ornament", '<'), - ('˂', "Modifier Letter Left Arrowhead", '<'), - ('𝈶', "Greek Instrumental Symbol-40", '<'), - ('ᐸ', "Canadian Syllabics Pa", '<'), - ('ᚲ', "Runic Letter Kauna", '<'), - ('❬', "Medium Left-Pointing Angle Bracket Ornament", '<'), - ('⟨', "Mathematical Left Angle Bracket", '<'), - ('〈', "Left-Pointing Angle Bracket", '<'), - ('〈', "Left Angle Bracket", '<'), - ('㇛', "CJK Stroke Pd", '<'), - ('く', "Hiragana Letter Ku", '<'), - ('𡿨', "CJK Unified Ideograph-21FE8", '<'), - ('《', "Left Double Angle Bracket", '<'), - ('<', "Fullwidth Less-Than Sign", '<'), - - ('᐀', "Canadian Syllabics Hyphen", '='), - ('⹀', "Double Hyphen", '='), - ('゠', "Katakana-Hiragana Double Hyphen", '='), - ('꓿', "Lisu Punctuation Full Stop", '='), - ('=', "Fullwidth Equals Sign", '='), - - ('›', "Single Right-Pointing Angle Quotation Mark", '>'), - ('❯', "Heavy Right-Pointing Angle Quotation Mark Ornament", '>'), - ('˃', "Modifier Letter Right Arrowhead", '>'), - ('𝈷', "Greek Instrumental Symbol-42", '>'), - ('ᐳ', "Canadian Syllabics Po", '>'), - ('𖼿', "Miao Letter Archaic Zza", '>'), - ('❭', "Medium Right-Pointing Angle Bracket Ornament", '>'), - ('⟩', "Mathematical Right Angle Bracket", '>'), - ('〉', "Right-Pointing Angle Bracket", '>'), - ('〉', "Right Angle Bracket", '>'), - ('》', "Right Double Angle Bracket", '>'), - ('>', "Fullwidth Greater-Than Sign", '>'), -]; - -// FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs, instead of -// keeping the substitution token in this table. Ideally, this should be inside `rustc_lexer`. -// However, we should first remove compound tokens like `<<` from `rustc_lexer`, and then add -// fancier error recovery to it, as there will be less overall work to do this way. -const ASCII_ARRAY: &[(char, &str, Option<token::TokenKind>)] = &[ - (' ', "Space", Some(token::Whitespace)), - ('_', "Underscore", Some(token::Ident(kw::Underscore, false))), - ('-', "Minus/Hyphen", Some(token::BinOp(token::Minus))), - (',', "Comma", Some(token::Comma)), - (';', "Semicolon", Some(token::Semi)), - (':', "Colon", Some(token::Colon)), - ('!', "Exclamation Mark", Some(token::Not)), - ('?', "Question Mark", Some(token::Question)), - ('.', "Period", Some(token::Dot)), - ('(', "Left Parenthesis", Some(token::OpenDelim(token::Paren))), - (')', "Right Parenthesis", Some(token::CloseDelim(token::Paren))), - ('[', "Left Square Bracket", Some(token::OpenDelim(token::Bracket))), - (']', "Right Square Bracket", Some(token::CloseDelim(token::Bracket))), - ('{', "Left Curly Brace", Some(token::OpenDelim(token::Brace))), - ('}', "Right Curly Brace", Some(token::CloseDelim(token::Brace))), - ('*', "Asterisk", Some(token::BinOp(token::Star))), - ('/', "Slash", Some(token::BinOp(token::Slash))), - ('\\', "Backslash", None), - ('&', "Ampersand", Some(token::BinOp(token::And))), - ('+', "Plus Sign", Some(token::BinOp(token::Plus))), - ('<', "Less-Than Sign", Some(token::Lt)), - ('=', "Equals Sign", Some(token::Eq)), - ('>', "Greater-Than Sign", Some(token::Gt)), - // FIXME: Literals are already lexed by this point, so we can't recover gracefully just by - // spitting the correct token out. - ('\'', "Single Quote", None), - ('"', "Quotation Mark", None), -]; - -crate fn check_for_substitution<'a>( - reader: &StringReader<'a>, - pos: BytePos, - ch: char, - err: &mut DiagnosticBuilder<'a>, -) -> Option<token::TokenKind> { - let (u_name, ascii_char) = match UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch) { - Some(&(_u_char, u_name, ascii_char)) => (u_name, ascii_char), - None => return None, - }; - - let span = Span::with_root_ctxt(pos, pos + Pos::from_usize(ch.len_utf8())); - - let (ascii_name, token) = match ASCII_ARRAY.iter().find(|&&(c, _, _)| c == ascii_char) { - Some((_ascii_char, ascii_name, token)) => (ascii_name, token), - None => { - let msg = format!("substitution character not found for '{}'", ch); - reader.sess.span_diagnostic.span_bug_no_panic(span, &msg); - return None; - } - }; - - // special help suggestion for "directed" double quotes - if let Some(s) = peek_delimited(&reader.src[reader.src_index(pos)..], '“', '”') { - let msg = format!( - "Unicode characters '“' (Left Double Quotation Mark) and \ - '”' (Right Double Quotation Mark) look like '{}' ({}), but are not", - ascii_char, ascii_name - ); - err.span_suggestion( - Span::with_root_ctxt( - pos, - pos + Pos::from_usize('“'.len_utf8() + s.len() + '”'.len_utf8()), - ), - &msg, - format!("\"{}\"", s), - Applicability::MaybeIncorrect, - ); - } else { - let msg = format!( - "Unicode character '{}' ({}) looks like '{}' ({}), but it is not", - ch, u_name, ascii_char, ascii_name - ); - err.span_suggestion(span, &msg, ascii_char.to_string(), Applicability::MaybeIncorrect); - } - token.clone() -} - -/// Extract string if found at current position with given delimiters -fn peek_delimited(text: &str, from_ch: char, to_ch: char) -> Option<&str> { - let mut chars = text.chars(); - let first_char = chars.next()?; - if first_char != from_ch { - return None; - } - let last_char_idx = chars.as_str().find(to_ch)?; - Some(&chars.as_str()[..last_char_idx]) -} |
