From f5b285906e45d0fd031a1433cdb7ab3c7be92650 Mon Sep 17 00:00:00 2001 From: Esteban Kรผber Date: Wed, 17 Jul 2019 11:40:36 -0700 Subject: Handle more cases of typos misinterpreted as type ascription --- src/libsyntax/parse/diagnostics.rs | 91 ++++++++++++++++++++------------------ src/libsyntax/parse/parser.rs | 45 +++++++++++-------- 2 files changed, 75 insertions(+), 61 deletions(-) (limited to 'src/libsyntax/parse') diff --git a/src/libsyntax/parse/diagnostics.rs b/src/libsyntax/parse/diagnostics.rs index 0e88a0ee289..190672acfcf 100644 --- a/src/libsyntax/parse/diagnostics.rs +++ b/src/libsyntax/parse/diagnostics.rs @@ -2,6 +2,7 @@ use crate::ast::{ self, Arg, BinOpKind, BindingMode, BlockCheckMode, Expr, ExprKind, Ident, Item, ItemKind, Mutability, Pat, PatKind, PathSegment, QSelf, Ty, TyKind, VariantData, }; +use crate::feature_gate::{feature_err, UnstableFeatures}; use crate::parse::{SeqSep, PResult, Parser, ParseSess}; use crate::parse::parser::{BlockMode, PathStyle, SemiColonMode, TokenType, TokenExpectType}; use crate::parse::token::{self, TokenKind}; @@ -365,9 +366,53 @@ impl<'a> Parser<'a> { err.span_label(self.token.span, "unexpected token"); } } + self.maybe_annotate_with_ascription(&mut err, false); Err(err) } + pub fn maybe_annotate_with_ascription( + &self, + err: &mut DiagnosticBuilder<'_>, + maybe_expected_semicolon: bool, + ) { + if let Some((sp, likely_path)) = self.last_type_ascription { + let cm = self.sess.source_map(); + let next_pos = cm.lookup_char_pos(self.token.span.lo()); + let op_pos = cm.lookup_char_pos(sp.hi()); + + if likely_path { + err.span_suggestion( + sp, + "maybe write a path separator here", + "::".to_string(), + match self.sess.unstable_features { + UnstableFeatures::Disallow => Applicability::MachineApplicable, + _ => Applicability::MaybeIncorrect, + }, + ); + } else if op_pos.line != next_pos.line && maybe_expected_semicolon { + err.span_suggestion( + sp, + "try using a semicolon", + ";".to_string(), + 
Applicability::MaybeIncorrect, + ); + } else if let UnstableFeatures::Disallow = self.sess.unstable_features { + err.span_label(sp, "tried to parse a type due to this"); + } else { + err.span_label(sp, "tried to parse a type due to this type ascription"); + } + if let UnstableFeatures::Disallow = self.sess.unstable_features { + // Give extra information about type ascription only if it's a nightly compiler. + } else { + err.note("`#![feature(type_ascription)]` lets you annotate an expression with a \ + type: `: `"); + err.note("for more information, see \ + https://github.com/rust-lang/rust/issues/23416"); + } + } + } + /// Eats and discards tokens until one of `kets` is encountered. Respects token trees, /// passes through any errors encountered. Used for error recovery. crate fn eat_to_tokens(&mut self, kets: &[&TokenKind]) { @@ -556,7 +601,7 @@ impl<'a> Parser<'a> { .collect::>(); if !discriminant_spans.is_empty() && has_fields { - let mut err = crate::feature_gate::feature_err( + let mut err = feature_err( sess, sym::arbitrary_enum_discriminant, discriminant_spans.clone(), @@ -887,47 +932,9 @@ impl<'a> Parser<'a> { self.look_ahead(2, |t| t.is_ident()) || self.look_ahead(1, |t| t == &token::Colon) && // `foo:bar:baz` self.look_ahead(2, |t| t.is_ident()) || - self.look_ahead(1, |t| t == &token::ModSep) && // `foo:bar::baz` - self.look_ahead(2, |t| t.is_ident()) - } - - crate fn bad_type_ascription( - &self, - err: &mut DiagnosticBuilder<'a>, - lhs_span: Span, - cur_op_span: Span, - next_sp: Span, - maybe_path: bool, - ) { - err.span_label(self.token.span, "expecting a type here because of type ascription"); - let cm = self.sess.source_map(); - let next_pos = cm.lookup_char_pos(next_sp.lo()); - let op_pos = cm.lookup_char_pos(cur_op_span.hi()); - if op_pos.line != next_pos.line { - err.span_suggestion( - cur_op_span, - "try using a semicolon", - ";".to_string(), - Applicability::MaybeIncorrect, - ); - } else { - if maybe_path { - err.span_suggestion( - 
cur_op_span, - "maybe you meant to write a path separator here", - "::".to_string(), - Applicability::MaybeIncorrect, - ); - } else { - err.note("`#![feature(type_ascription)]` lets you annotate an \ - expression with a type: `: `") - .span_note( - lhs_span, - "this expression expects an ascribed type after the colon", - ) - .help("this might be indicative of a syntax error elsewhere"); - } - } + self.look_ahead(1, |t| t == &token::ModSep) && + (self.look_ahead(2, |t| t.is_ident()) || // `foo:bar::baz` + self.look_ahead(2, |t| t == &token::Lt)) // `foo:bar::` } crate fn recover_seq_parse_error( diff --git a/src/libsyntax/parse/parser.rs b/src/libsyntax/parse/parser.rs index 1d4d02c7325..41cfb5bece3 100644 --- a/src/libsyntax/parse/parser.rs +++ b/src/libsyntax/parse/parser.rs @@ -239,6 +239,7 @@ pub struct Parser<'a> { /// error. crate unclosed_delims: Vec, crate last_unexpected_token_span: Option, + crate last_type_ascription: Option<(Span, bool /* likely path typo */)>, /// If present, this `Parser` is not parsing Rust code but rather a macro call. crate subparser_name: Option<&'static str>, } @@ -502,6 +503,7 @@ impl<'a> Parser<'a> { max_angle_bracket_count: 0, unclosed_delims: Vec::new(), last_unexpected_token_span: None, + last_type_ascription: None, subparser_name, }; @@ -1422,7 +1424,10 @@ impl<'a> Parser<'a> { } } else { let msg = format!("expected type, found {}", self.this_token_descr()); - return Err(self.fatal(&msg)); + let mut err = self.fatal(&msg); + err.span_label(self.token.span, "expected type"); + self.maybe_annotate_with_ascription(&mut err, true); + return Err(err); }; let span = lo.to(self.prev_span); @@ -2823,10 +2828,11 @@ impl<'a> Parser<'a> { } /// Parses an associative expression with operators of at least `min_prec` precedence. 
- fn parse_assoc_expr_with(&mut self, - min_prec: usize, - lhs: LhsExpr) - -> PResult<'a, P> { + fn parse_assoc_expr_with( + &mut self, + min_prec: usize, + lhs: LhsExpr, + ) -> PResult<'a, P> { let mut lhs = if let LhsExpr::AlreadyParsed(expr) = lhs { expr } else { @@ -2840,9 +2846,13 @@ impl<'a> Parser<'a> { self.parse_prefix_expr(attrs)? } }; + let last_type_ascription_set = self.last_type_ascription.is_some(); match (self.expr_is_complete(&lhs), AssocOp::from_token(&self.token)) { (true, None) => { + if last_type_ascription_set { + self.last_type_ascription = None; + } // Semi-statement forms are odd. See https://github.com/rust-lang/rust/issues/29071 return Ok(lhs); } @@ -2857,12 +2867,18 @@ impl<'a> Parser<'a> { // If the next token is a keyword, then the tokens above *are* unambiguously incorrect: // `if x { a } else { b } && if y { c } else { d }` if !self.look_ahead(1, |t| t.is_reserved_ident()) => { + if last_type_ascription_set { + self.last_type_ascription = None; + } // These cases are ambiguous and can't be identified in the parser alone let sp = self.sess.source_map().start_point(self.token.span); self.sess.ambiguous_block_expr_parse.borrow_mut().insert(sp, lhs.span); return Ok(lhs); } (true, Some(ref op)) if !op.can_continue_expr_unambiguously() => { + if last_type_ascription_set { + self.last_type_ascription = None; + } return Ok(lhs); } (true, Some(_)) => { @@ -2921,21 +2937,9 @@ impl<'a> Parser<'a> { continue } else if op == AssocOp::Colon { let maybe_path = self.could_ascription_be_path(&lhs.node); - let next_sp = self.token.span; + self.last_type_ascription = Some((self.prev_span, maybe_path)); - lhs = match self.parse_assoc_op_cast(lhs, lhs_span, ExprKind::Type) { - Ok(lhs) => lhs, - Err(mut err) => { - self.bad_type_ascription( - &mut err, - lhs_span, - cur_op_span, - next_sp, - maybe_path, - ); - return Err(err); - } - }; + lhs = self.parse_assoc_op_cast(lhs, lhs_span, ExprKind::Type)?; continue } else if op == AssocOp::DotDot || op == 
AssocOp::DotDotEq { // If we didnโ€™t have to handle `x..`/`x..=`, it would be pretty easy to @@ -3020,6 +3024,9 @@ impl<'a> Parser<'a> { if let Fixity::None = fixity { break } } + if last_type_ascription_set { + self.last_type_ascription = None; + } Ok(lhs) } -- cgit 1.4.1-3-g733a5 From 9dbe2e77b34f5321976ee3b26ca008ad8d574faf Mon Sep 17 00:00:00 2001 From: Esteban Kรผber Date: Fri, 19 Jul 2019 10:59:02 -0700 Subject: review comments --- src/libsyntax/parse/diagnostics.rs | 14 +++++++------- src/libsyntax/parse/parser.rs | 12 +++--------- 2 files changed, 10 insertions(+), 16 deletions(-) (limited to 'src/libsyntax/parse') diff --git a/src/libsyntax/parse/diagnostics.rs b/src/libsyntax/parse/diagnostics.rs index 190672acfcf..f4fc87506f3 100644 --- a/src/libsyntax/parse/diagnostics.rs +++ b/src/libsyntax/parse/diagnostics.rs @@ -327,8 +327,8 @@ impl<'a> Parser<'a> { self.token.is_keyword(kw::Return) || self.token.is_keyword(kw::While) ); - let cm = self.sess.source_map(); - match (cm.lookup_line(self.token.span.lo()), cm.lookup_line(sp.lo())) { + let sm = self.sess.source_map(); + match (sm.lookup_line(self.token.span.lo()), sm.lookup_line(sp.lo())) { (Ok(ref a), Ok(ref b)) if a.line != b.line && is_semi_suggestable => { // The spans are in different lines, expected `;` and found `let` or `return`. // High likelihood that it is only a missing `;`. 
@@ -376,9 +376,9 @@ impl<'a> Parser<'a> { maybe_expected_semicolon: bool, ) { if let Some((sp, likely_path)) = self.last_type_ascription { - let cm = self.sess.source_map(); - let next_pos = cm.lookup_char_pos(self.token.span.lo()); - let op_pos = cm.lookup_char_pos(sp.hi()); + let sm = self.sess.source_map(); + let next_pos = sm.lookup_char_pos(self.token.span.lo()); + let op_pos = sm.lookup_char_pos(sp.hi()); if likely_path { err.span_suggestion( @@ -814,8 +814,8 @@ impl<'a> Parser<'a> { return Ok(recovered); } } - let cm = self.sess.source_map(); - match (cm.lookup_line(prev_sp.lo()), cm.lookup_line(sp.lo())) { + let sm = self.sess.source_map(); + match (sm.lookup_line(prev_sp.lo()), sm.lookup_line(sp.lo())) { (Ok(ref a), Ok(ref b)) if a.line == b.line => { // When the spans are in the same line, it means that the only content // between them is whitespace, point only at the found token. diff --git a/src/libsyntax/parse/parser.rs b/src/libsyntax/parse/parser.rs index 41cfb5bece3..da388694637 100644 --- a/src/libsyntax/parse/parser.rs +++ b/src/libsyntax/parse/parser.rs @@ -2850,9 +2850,7 @@ impl<'a> Parser<'a> { match (self.expr_is_complete(&lhs), AssocOp::from_token(&self.token)) { (true, None) => { - if last_type_ascription_set { - self.last_type_ascription = None; - } + self.last_type_ascription = None; // Semi-statement forms are odd. 
See https://github.com/rust-lang/rust/issues/29071 return Ok(lhs); } @@ -2867,18 +2865,14 @@ impl<'a> Parser<'a> { // If the next token is a keyword, then the tokens above *are* unambiguously incorrect: // `if x { a } else { b } && if y { c } else { d }` if !self.look_ahead(1, |t| t.is_reserved_ident()) => { - if last_type_ascription_set { - self.last_type_ascription = None; - } + self.last_type_ascription = None; // These cases are ambiguous and can't be identified in the parser alone let sp = self.sess.source_map().start_point(self.token.span); self.sess.ambiguous_block_expr_parse.borrow_mut().insert(sp, lhs.span); return Ok(lhs); } (true, Some(ref op)) if !op.can_continue_expr_unambiguously() => { - if last_type_ascription_set { - self.last_type_ascription = None; - } + self.last_type_ascription = None; return Ok(lhs); } (true, Some(_)) => { -- cgit 1.4.1-3-g733a5 From e63fe150bfbce632dd7ff0a656a4180557128e4f Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Sun, 21 Jul 2019 16:46:11 +0300 Subject: move unescape module to rustc_lexer --- src/librustc_lexer/src/lib.rs | 1 + src/librustc_lexer/src/unescape.rs | 602 ++++++++++++++++++++++++ src/libsyntax/parse/lexer/mod.rs | 2 +- src/libsyntax/parse/literal.rs | 6 +- src/libsyntax/parse/mod.rs | 1 - src/libsyntax/parse/unescape.rs | 602 ------------------------ src/libsyntax/parse/unescape_error_reporting.rs | 3 +- 7 files changed, 608 insertions(+), 609 deletions(-) create mode 100644 src/librustc_lexer/src/unescape.rs delete mode 100644 src/libsyntax/parse/unescape.rs (limited to 'src/libsyntax/parse') diff --git a/src/librustc_lexer/src/lib.rs b/src/librustc_lexer/src/lib.rs index a21190ec332..12e095b8bd5 100644 --- a/src/librustc_lexer/src/lib.rs +++ b/src/librustc_lexer/src/lib.rs @@ -4,6 +4,7 @@ #![cfg_attr(not(feature = "unicode-xid"), feature(unicode_internals))] mod cursor; +pub mod unescape; use crate::cursor::{Cursor, EOF_CHAR}; diff --git a/src/librustc_lexer/src/unescape.rs 
b/src/librustc_lexer/src/unescape.rs new file mode 100644 index 00000000000..70085df9948 --- /dev/null +++ b/src/librustc_lexer/src/unescape.rs @@ -0,0 +1,602 @@ +//! Utilities for validating string and char literals and turning them into +//! values they represent. + +use std::str::Chars; +use std::ops::Range; + +#[derive(Debug, PartialEq, Eq)] +pub enum EscapeError { + ZeroChars, + MoreThanOneChar, + + LoneSlash, + InvalidEscape, + BareCarriageReturn, + BareCarriageReturnInRawString, + EscapeOnlyChar, + + TooShortHexEscape, + InvalidCharInHexEscape, + OutOfRangeHexEscape, + + NoBraceInUnicodeEscape, + InvalidCharInUnicodeEscape, + EmptyUnicodeEscape, + UnclosedUnicodeEscape, + LeadingUnderscoreUnicodeEscape, + OverlongUnicodeEscape, + LoneSurrogateUnicodeEscape, + OutOfRangeUnicodeEscape, + + UnicodeEscapeInByte, + NonAsciiCharInByte, + NonAsciiCharInByteString, +} + +/// Takes a contents of a char literal (without quotes), and returns an +/// unescaped char or an error +pub fn unescape_char(literal_text: &str) -> Result { + let mut chars = literal_text.chars(); + unescape_char_or_byte(&mut chars, Mode::Char) + .map_err(|err| (literal_text.len() - chars.as_str().len(), err)) +} + +/// Takes a contents of a string literal (without quotes) and produces a +/// sequence of escaped characters or errors. +pub fn unescape_str(literal_text: &str, callback: &mut F) +where + F: FnMut(Range, Result), +{ + unescape_str_or_byte_str(literal_text, Mode::Str, callback) +} + +pub fn unescape_byte(literal_text: &str) -> Result { + let mut chars = literal_text.chars(); + unescape_char_or_byte(&mut chars, Mode::Byte) + .map(byte_from_char) + .map_err(|err| (literal_text.len() - chars.as_str().len(), err)) +} + +/// Takes a contents of a string literal (without quotes) and produces a +/// sequence of escaped characters or errors. 
+pub fn unescape_byte_str(literal_text: &str, callback: &mut F) +where + F: FnMut(Range, Result), +{ + unescape_str_or_byte_str(literal_text, Mode::ByteStr, &mut |range, char| { + callback(range, char.map(byte_from_char)) + }) +} + +/// Takes a contents of a string literal (without quotes) and produces a +/// sequence of characters or errors. +/// NOTE: Raw strings do not perform any explicit character escaping, here we +/// only translate CRLF to LF and produce errors on bare CR. +pub fn unescape_raw_str(literal_text: &str, callback: &mut F) +where + F: FnMut(Range, Result), +{ + unescape_raw_str_or_byte_str(literal_text, Mode::Str, callback) +} + +/// Takes a contents of a string literal (without quotes) and produces a +/// sequence of characters or errors. +/// NOTE: Raw strings do not perform any explicit character escaping, here we +/// only translate CRLF to LF and produce errors on bare CR. +pub fn unescape_raw_byte_str(literal_text: &str, callback: &mut F) +where + F: FnMut(Range, Result), +{ + unescape_raw_str_or_byte_str(literal_text, Mode::ByteStr, &mut |range, char| { + callback(range, char.map(byte_from_char)) + }) +} + +#[derive(Debug, Clone, Copy)] +pub enum Mode { + Char, + Str, + Byte, + ByteStr, +} + +impl Mode { + pub fn in_single_quotes(self) -> bool { + match self { + Mode::Char | Mode::Byte => true, + Mode::Str | Mode::ByteStr => false, + } + } + + pub fn in_double_quotes(self) -> bool { + !self.in_single_quotes() + } + + pub fn is_bytes(self) -> bool { + match self { + Mode::Byte | Mode::ByteStr => true, + Mode::Char | Mode::Str => false, + } + } +} + + +fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result { + if first_char != '\\' { + return match first_char { + '\t' | '\n' => Err(EscapeError::EscapeOnlyChar), + '\r' => Err(if chars.clone().next() == Some('\n') { + EscapeError::EscapeOnlyChar + } else { + EscapeError::BareCarriageReturn + }), + '\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar), + 
'"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar), + _ => { + if mode.is_bytes() && !first_char.is_ascii() { + return Err(EscapeError::NonAsciiCharInByte); + } + Ok(first_char) + } + }; + } + + let second_char = chars.next().ok_or(EscapeError::LoneSlash)?; + + let res = match second_char { + '"' => '"', + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + '\\' => '\\', + '\'' => '\'', + '0' => '\0', + + 'x' => { + let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?; + let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; + + let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?; + let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; + + let value = hi * 16 + lo; + + if !mode.is_bytes() && !is_ascii(value) { + return Err(EscapeError::OutOfRangeHexEscape); + } + let value = value as u8; + + value as char + } + + 'u' => { + if chars.next() != Some('{') { + return Err(EscapeError::NoBraceInUnicodeEscape); + } + + let mut n_digits = 1; + let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? 
{ + '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape), + '}' => return Err(EscapeError::EmptyUnicodeEscape), + c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?, + }; + + loop { + match chars.next() { + None => return Err(EscapeError::UnclosedUnicodeEscape), + Some('_') => continue, + Some('}') => { + if n_digits > 6 { + return Err(EscapeError::OverlongUnicodeEscape); + } + if mode.is_bytes() { + return Err(EscapeError::UnicodeEscapeInByte); + } + + break std::char::from_u32(value).ok_or_else(|| { + if value > 0x10FFFF { + EscapeError::OutOfRangeUnicodeEscape + } else { + EscapeError::LoneSurrogateUnicodeEscape + } + })?; + } + Some(c) => { + let digit = c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?; + n_digits += 1; + if n_digits > 6 { + continue; + } + let digit = digit as u32; + value = value * 16 + digit; + } + }; + } + } + _ => return Err(EscapeError::InvalidEscape), + }; + Ok(res) +} + +fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result { + let first_char = chars.next().ok_or(EscapeError::ZeroChars)?; + let res = scan_escape(first_char, chars, mode)?; + if chars.next().is_some() { + return Err(EscapeError::MoreThanOneChar); + } + Ok(res) +} + +/// Takes a contents of a string literal (without quotes) and produces a +/// sequence of escaped characters or errors. 
+fn unescape_str_or_byte_str(src: &str, mode: Mode, callback: &mut F) +where + F: FnMut(Range, Result), +{ + assert!(mode.in_double_quotes()); + let initial_len = src.len(); + let mut chars = src.chars(); + while let Some(first_char) = chars.next() { + let start = initial_len - chars.as_str().len() - first_char.len_utf8(); + + let unescaped_char = match first_char { + '\\' => { + let (second_char, third_char) = { + let mut chars = chars.clone(); + (chars.next(), chars.next()) + }; + match (second_char, third_char) { + (Some('\n'), _) | (Some('\r'), Some('\n')) => { + skip_ascii_whitespace(&mut chars); + continue; + } + _ => scan_escape(first_char, &mut chars, mode), + } + } + '\r' => { + let second_char = chars.clone().next(); + if second_char == Some('\n') { + chars.next(); + Ok('\n') + } else { + scan_escape(first_char, &mut chars, mode) + } + } + '\n' => Ok('\n'), + '\t' => Ok('\t'), + _ => scan_escape(first_char, &mut chars, mode), + }; + let end = initial_len - chars.as_str().len(); + callback(start..end, unescaped_char); + } + + fn skip_ascii_whitespace(chars: &mut Chars<'_>) { + let str = chars.as_str(); + let first_non_space = str + .bytes() + .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r') + .unwrap_or(str.len()); + *chars = str[first_non_space..].chars() + } +} + +/// Takes a contents of a string literal (without quotes) and produces a +/// sequence of characters or errors. +/// NOTE: Raw strings do not perform any explicit character escaping, here we +/// only translate CRLF to LF and produce errors on bare CR. 
+fn unescape_raw_str_or_byte_str(literal_text: &str, mode: Mode, callback: &mut F) +where + F: FnMut(Range, Result), +{ + assert!(mode.in_double_quotes()); + let initial_len = literal_text.len(); + + let mut chars = literal_text.chars(); + while let Some(curr) = chars.next() { + let start = initial_len - chars.as_str().len() - curr.len_utf8(); + + let result = match (curr, chars.clone().next()) { + ('\r', Some('\n')) => { + chars.next(); + Ok('\n') + }, + ('\r', _) => Err(EscapeError::BareCarriageReturnInRawString), + (c, _) if mode.is_bytes() && !c.is_ascii() => + Err(EscapeError::NonAsciiCharInByteString), + (c, _) => Ok(c), + }; + let end = initial_len - chars.as_str().len(); + + callback(start..end, result); + } +} + +fn byte_from_char(c: char) -> u8 { + let res = c as u32; + assert!(res <= u8::max_value() as u32, "guaranteed because of Mode::Byte(Str)"); + res as u8 +} + +fn is_ascii(x: u32) -> bool { + x <= 0x7F +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_unescape_char_bad() { + fn check(literal_text: &str, expected_error: EscapeError) { + let actual_result = unescape_char(literal_text).map_err(|(_offset, err)| err); + assert_eq!(actual_result, Err(expected_error)); + } + + check("", EscapeError::ZeroChars); + check(r"\", EscapeError::LoneSlash); + + check("\n", EscapeError::EscapeOnlyChar); + check("\r\n", EscapeError::EscapeOnlyChar); + check("\t", EscapeError::EscapeOnlyChar); + check("'", EscapeError::EscapeOnlyChar); + check("\r", EscapeError::BareCarriageReturn); + + check("spam", EscapeError::MoreThanOneChar); + check(r"\x0ff", EscapeError::MoreThanOneChar); + check(r#"\"a"#, EscapeError::MoreThanOneChar); + check(r"\na", EscapeError::MoreThanOneChar); + check(r"\ra", EscapeError::MoreThanOneChar); + check(r"\ta", EscapeError::MoreThanOneChar); + check(r"\\a", EscapeError::MoreThanOneChar); + check(r"\'a", EscapeError::MoreThanOneChar); + check(r"\0a", EscapeError::MoreThanOneChar); + check(r"\u{0}x", 
EscapeError::MoreThanOneChar); + check(r"\u{1F63b}}", EscapeError::MoreThanOneChar); + + check(r"\v", EscapeError::InvalidEscape); + check(r"\๐Ÿ’ฉ", EscapeError::InvalidEscape); + check(r"\โ—", EscapeError::InvalidEscape); + + check(r"\x", EscapeError::TooShortHexEscape); + check(r"\x0", EscapeError::TooShortHexEscape); + check(r"\xf", EscapeError::TooShortHexEscape); + check(r"\xa", EscapeError::TooShortHexEscape); + check(r"\xx", EscapeError::InvalidCharInHexEscape); + check(r"\xั‹", EscapeError::InvalidCharInHexEscape); + check(r"\x๐Ÿฆ€", EscapeError::InvalidCharInHexEscape); + check(r"\xtt", EscapeError::InvalidCharInHexEscape); + check(r"\xff", EscapeError::OutOfRangeHexEscape); + check(r"\xFF", EscapeError::OutOfRangeHexEscape); + check(r"\x80", EscapeError::OutOfRangeHexEscape); + + check(r"\u", EscapeError::NoBraceInUnicodeEscape); + check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape); + check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape); + check(r"\u{", EscapeError::UnclosedUnicodeEscape); + check(r"\u{0000", EscapeError::UnclosedUnicodeEscape); + check(r"\u{}", EscapeError::EmptyUnicodeEscape); + check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape); + check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape); + check(r"\u{FFFFFF}", EscapeError::OutOfRangeUnicodeEscape); + check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape); + check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape); + + check(r"\u{DC00}", EscapeError::LoneSurrogateUnicodeEscape); + check(r"\u{DDDD}", EscapeError::LoneSurrogateUnicodeEscape); + check(r"\u{DFFF}", EscapeError::LoneSurrogateUnicodeEscape); + + check(r"\u{D800}", EscapeError::LoneSurrogateUnicodeEscape); + check(r"\u{DAAA}", EscapeError::LoneSurrogateUnicodeEscape); + check(r"\u{DBFF}", EscapeError::LoneSurrogateUnicodeEscape); + } + + #[test] + fn test_unescape_char_good() { + fn check(literal_text: &str, expected_char: char) { + let actual_result = unescape_char(literal_text); + 
assert_eq!(actual_result, Ok(expected_char)); + } + + check("a", 'a'); + check("ั‹", 'ั‹'); + check("๐Ÿฆ€", '๐Ÿฆ€'); + + check(r#"\""#, '"'); + check(r"\n", '\n'); + check(r"\r", '\r'); + check(r"\t", '\t'); + check(r"\\", '\\'); + check(r"\'", '\''); + check(r"\0", '\0'); + + check(r"\x00", '\0'); + check(r"\x5a", 'Z'); + check(r"\x5A", 'Z'); + check(r"\x7f", 127 as char); + + check(r"\u{0}", '\0'); + check(r"\u{000000}", '\0'); + check(r"\u{41}", 'A'); + check(r"\u{0041}", 'A'); + check(r"\u{00_41}", 'A'); + check(r"\u{4__1__}", 'A'); + check(r"\u{1F63b}", '๐Ÿ˜ป'); + } + + #[test] + fn test_unescape_str_good() { + fn check(literal_text: &str, expected: &str) { + let mut buf = Ok(String::with_capacity(literal_text.len())); + unescape_str(literal_text, &mut |range, c| { + if let Ok(b) = &mut buf { + match c { + Ok(c) => b.push(c), + Err(e) => buf = Err((range, e)), + } + } + }); + let buf = buf.as_ref().map(|it| it.as_ref()); + assert_eq!(buf, Ok(expected)) + } + + check("foo", "foo"); + check("", ""); + check(" \t\n\r\n", " \t\n\n"); + + check("hello \\\n world", "hello world"); + check("hello \\\r\n world", "hello world"); + check("thread's", "thread's") + } + + #[test] + fn test_unescape_byte_bad() { + fn check(literal_text: &str, expected_error: EscapeError) { + let actual_result = unescape_byte(literal_text).map_err(|(_offset, err)| err); + assert_eq!(actual_result, Err(expected_error)); + } + + check("", EscapeError::ZeroChars); + check(r"\", EscapeError::LoneSlash); + + check("\n", EscapeError::EscapeOnlyChar); + check("\r\n", EscapeError::EscapeOnlyChar); + check("\t", EscapeError::EscapeOnlyChar); + check("'", EscapeError::EscapeOnlyChar); + check("\r", EscapeError::BareCarriageReturn); + + check("spam", EscapeError::MoreThanOneChar); + check(r"\x0ff", EscapeError::MoreThanOneChar); + check(r#"\"a"#, EscapeError::MoreThanOneChar); + check(r"\na", EscapeError::MoreThanOneChar); + check(r"\ra", EscapeError::MoreThanOneChar); + check(r"\ta", 
EscapeError::MoreThanOneChar); + check(r"\\a", EscapeError::MoreThanOneChar); + check(r"\'a", EscapeError::MoreThanOneChar); + check(r"\0a", EscapeError::MoreThanOneChar); + + check(r"\v", EscapeError::InvalidEscape); + check(r"\๐Ÿ’ฉ", EscapeError::InvalidEscape); + check(r"\โ—", EscapeError::InvalidEscape); + + check(r"\x", EscapeError::TooShortHexEscape); + check(r"\x0", EscapeError::TooShortHexEscape); + check(r"\xa", EscapeError::TooShortHexEscape); + check(r"\xf", EscapeError::TooShortHexEscape); + check(r"\xx", EscapeError::InvalidCharInHexEscape); + check(r"\xั‹", EscapeError::InvalidCharInHexEscape); + check(r"\x๐Ÿฆ€", EscapeError::InvalidCharInHexEscape); + check(r"\xtt", EscapeError::InvalidCharInHexEscape); + + check(r"\u", EscapeError::NoBraceInUnicodeEscape); + check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape); + check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape); + check(r"\u{", EscapeError::UnclosedUnicodeEscape); + check(r"\u{0000", EscapeError::UnclosedUnicodeEscape); + check(r"\u{}", EscapeError::EmptyUnicodeEscape); + check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape); + check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape); + + check("ั‹", EscapeError::NonAsciiCharInByte); + check("๐Ÿฆ€", EscapeError::NonAsciiCharInByte); + + check(r"\u{0}", EscapeError::UnicodeEscapeInByte); + check(r"\u{000000}", EscapeError::UnicodeEscapeInByte); + check(r"\u{41}", EscapeError::UnicodeEscapeInByte); + check(r"\u{0041}", EscapeError::UnicodeEscapeInByte); + check(r"\u{00_41}", EscapeError::UnicodeEscapeInByte); + check(r"\u{4__1__}", EscapeError::UnicodeEscapeInByte); + check(r"\u{1F63b}", EscapeError::UnicodeEscapeInByte); + check(r"\u{0}x", EscapeError::UnicodeEscapeInByte); + check(r"\u{1F63b}}", EscapeError::UnicodeEscapeInByte); + check(r"\u{FFFFFF}", EscapeError::UnicodeEscapeInByte); + check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte); + check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte); + check(r"\u{DC00}", 
EscapeError::UnicodeEscapeInByte); + check(r"\u{DDDD}", EscapeError::UnicodeEscapeInByte); + check(r"\u{DFFF}", EscapeError::UnicodeEscapeInByte); + check(r"\u{D800}", EscapeError::UnicodeEscapeInByte); + check(r"\u{DAAA}", EscapeError::UnicodeEscapeInByte); + check(r"\u{DBFF}", EscapeError::UnicodeEscapeInByte); + } + + #[test] + fn test_unescape_byte_good() { + fn check(literal_text: &str, expected_byte: u8) { + let actual_result = unescape_byte(literal_text); + assert_eq!(actual_result, Ok(expected_byte)); + } + + check("a", b'a'); + + check(r#"\""#, b'"'); + check(r"\n", b'\n'); + check(r"\r", b'\r'); + check(r"\t", b'\t'); + check(r"\\", b'\\'); + check(r"\'", b'\''); + check(r"\0", b'\0'); + + check(r"\x00", b'\0'); + check(r"\x5a", b'Z'); + check(r"\x5A", b'Z'); + check(r"\x7f", 127); + check(r"\x80", 128); + check(r"\xff", 255); + check(r"\xFF", 255); + } + + #[test] + fn test_unescape_byte_str_good() { + fn check(literal_text: &str, expected: &[u8]) { + let mut buf = Ok(Vec::with_capacity(literal_text.len())); + unescape_byte_str(literal_text, &mut |range, c| { + if let Ok(b) = &mut buf { + match c { + Ok(c) => b.push(c), + Err(e) => buf = Err((range, e)), + } + } + }); + let buf = buf.as_ref().map(|it| it.as_ref()); + assert_eq!(buf, Ok(expected)) + } + + check("foo", b"foo"); + check("", b""); + check(" \t\n\r\n", b" \t\n\n"); + + check("hello \\\n world", b"hello world"); + check("hello \\\r\n world", b"hello world"); + check("thread's", b"thread's") + } + + #[test] + fn test_unescape_raw_str() { + fn check(literal: &str, expected: &[(Range, Result)]) { + let mut unescaped = Vec::with_capacity(literal.len()); + unescape_raw_str(literal, &mut |range, res| unescaped.push((range, res))); + assert_eq!(unescaped, expected); + } + + check("\r\n", &[(0..2, Ok('\n'))]); + check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]); + check("\rx", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString)), (1..2, Ok('x'))]); + } + + #[test] + fn 
test_unescape_raw_byte_str() { + fn check(literal: &str, expected: &[(Range, Result)]) { + let mut unescaped = Vec::with_capacity(literal.len()); + unescape_raw_byte_str(literal, &mut |range, res| unescaped.push((range, res))); + assert_eq!(unescaped, expected); + } + + check("\r\n", &[(0..2, Ok(byte_from_char('\n')))]); + check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]); + check("๐Ÿฆ€", &[(0..4, Err(EscapeError::NonAsciiCharInByteString))]); + check( + "๐Ÿฆ€a", + &[(0..4, Err(EscapeError::NonAsciiCharInByteString)), (4..5, Ok(byte_from_char('a')))], + ); + } +} diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index 317c49c7d35..ebb02737822 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -1,12 +1,12 @@ use crate::parse::ParseSess; use crate::parse::token::{self, Token, TokenKind}; use crate::symbol::{sym, Symbol}; -use crate::parse::unescape; use crate::parse::unescape_error_reporting::{emit_unescape_error, push_escaped_char}; use errors::{FatalError, Diagnostic, DiagnosticBuilder}; use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION}; use rustc_lexer::Base; +use rustc_lexer::unescape; use std::borrow::Cow; use std::char; diff --git a/src/libsyntax/parse/literal.rs b/src/libsyntax/parse/literal.rs index 683d1641565..6409acba573 100644 --- a/src/libsyntax/parse/literal.rs +++ b/src/libsyntax/parse/literal.rs @@ -4,9 +4,6 @@ use crate::ast::{self, Lit, LitKind}; use crate::parse::parser::Parser; use crate::parse::PResult; use crate::parse::token::{self, Token, TokenKind}; -use crate::parse::unescape::{unescape_char, unescape_byte}; -use crate::parse::unescape::{unescape_str, unescape_byte_str}; -use crate::parse::unescape::{unescape_raw_str, unescape_raw_byte_str}; use crate::print::pprust; use crate::symbol::{kw, sym, Symbol}; use crate::tokenstream::{TokenStream, TokenTree}; @@ -15,6 +12,9 @@ use errors::{Applicability, Handler}; use log::debug; use 
rustc_data_structures::sync::Lrc; use syntax_pos::Span; +use rustc_lexer::unescape::{unescape_char, unescape_byte}; +use rustc_lexer::unescape::{unescape_str, unescape_byte_str}; +use rustc_lexer::unescape::{unescape_raw_str, unescape_raw_byte_str}; use std::ascii; diff --git a/src/libsyntax/parse/mod.rs b/src/libsyntax/parse/mod.rs index 4c4551b1757..225065c1cf1 100644 --- a/src/libsyntax/parse/mod.rs +++ b/src/libsyntax/parse/mod.rs @@ -32,7 +32,6 @@ pub mod token; crate mod classify; crate mod diagnostics; crate mod literal; -crate mod unescape; crate mod unescape_error_reporting; /// Info about a parsing session. diff --git a/src/libsyntax/parse/unescape.rs b/src/libsyntax/parse/unescape.rs deleted file mode 100644 index 87cc9c1c9e3..00000000000 --- a/src/libsyntax/parse/unescape.rs +++ /dev/null @@ -1,602 +0,0 @@ -//! Utilities for validating string and char literals and turning them into -//! values they represent. - -use std::str::Chars; -use std::ops::Range; - -#[derive(Debug, PartialEq, Eq)] -pub(crate) enum EscapeError { - ZeroChars, - MoreThanOneChar, - - LoneSlash, - InvalidEscape, - BareCarriageReturn, - BareCarriageReturnInRawString, - EscapeOnlyChar, - - TooShortHexEscape, - InvalidCharInHexEscape, - OutOfRangeHexEscape, - - NoBraceInUnicodeEscape, - InvalidCharInUnicodeEscape, - EmptyUnicodeEscape, - UnclosedUnicodeEscape, - LeadingUnderscoreUnicodeEscape, - OverlongUnicodeEscape, - LoneSurrogateUnicodeEscape, - OutOfRangeUnicodeEscape, - - UnicodeEscapeInByte, - NonAsciiCharInByte, - NonAsciiCharInByteString, -} - -/// Takes a contents of a char literal (without quotes), and returns an -/// unescaped char or an error -pub(crate) fn unescape_char(literal_text: &str) -> Result { - let mut chars = literal_text.chars(); - unescape_char_or_byte(&mut chars, Mode::Char) - .map_err(|err| (literal_text.len() - chars.as_str().len(), err)) -} - -/// Takes a contents of a string literal (without quotes) and produces a -/// sequence of escaped characters or 
errors. -pub(crate) fn unescape_str(literal_text: &str, callback: &mut F) -where - F: FnMut(Range, Result), -{ - unescape_str_or_byte_str(literal_text, Mode::Str, callback) -} - -pub(crate) fn unescape_byte(literal_text: &str) -> Result { - let mut chars = literal_text.chars(); - unescape_char_or_byte(&mut chars, Mode::Byte) - .map(byte_from_char) - .map_err(|err| (literal_text.len() - chars.as_str().len(), err)) -} - -/// Takes a contents of a string literal (without quotes) and produces a -/// sequence of escaped characters or errors. -pub(crate) fn unescape_byte_str(literal_text: &str, callback: &mut F) -where - F: FnMut(Range, Result), -{ - unescape_str_or_byte_str(literal_text, Mode::ByteStr, &mut |range, char| { - callback(range, char.map(byte_from_char)) - }) -} - -/// Takes a contents of a string literal (without quotes) and produces a -/// sequence of characters or errors. -/// NOTE: Raw strings do not perform any explicit character escaping, here we -/// only translate CRLF to LF and produce errors on bare CR. -pub(crate) fn unescape_raw_str(literal_text: &str, callback: &mut F) -where - F: FnMut(Range, Result), -{ - unescape_raw_str_or_byte_str(literal_text, Mode::Str, callback) -} - -/// Takes a contents of a string literal (without quotes) and produces a -/// sequence of characters or errors. -/// NOTE: Raw strings do not perform any explicit character escaping, here we -/// only translate CRLF to LF and produce errors on bare CR. 
-pub(crate) fn unescape_raw_byte_str(literal_text: &str, callback: &mut F) -where - F: FnMut(Range, Result), -{ - unescape_raw_str_or_byte_str(literal_text, Mode::ByteStr, &mut |range, char| { - callback(range, char.map(byte_from_char)) - }) -} - -#[derive(Debug, Clone, Copy)] -pub(crate) enum Mode { - Char, - Str, - Byte, - ByteStr, -} - -impl Mode { - fn in_single_quotes(self) -> bool { - match self { - Mode::Char | Mode::Byte => true, - Mode::Str | Mode::ByteStr => false, - } - } - - pub(crate) fn in_double_quotes(self) -> bool { - !self.in_single_quotes() - } - - pub(crate) fn is_bytes(self) -> bool { - match self { - Mode::Byte | Mode::ByteStr => true, - Mode::Char | Mode::Str => false, - } - } -} - - -fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result { - if first_char != '\\' { - return match first_char { - '\t' | '\n' => Err(EscapeError::EscapeOnlyChar), - '\r' => Err(if chars.clone().next() == Some('\n') { - EscapeError::EscapeOnlyChar - } else { - EscapeError::BareCarriageReturn - }), - '\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar), - '"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar), - _ => { - if mode.is_bytes() && !first_char.is_ascii() { - return Err(EscapeError::NonAsciiCharInByte); - } - Ok(first_char) - } - }; - } - - let second_char = chars.next().ok_or(EscapeError::LoneSlash)?; - - let res = match second_char { - '"' => '"', - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - '\\' => '\\', - '\'' => '\'', - '0' => '\0', - - 'x' => { - let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?; - let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; - - let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?; - let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; - - let value = hi * 16 + lo; - - if !mode.is_bytes() && !is_ascii(value) { - return Err(EscapeError::OutOfRangeHexEscape); - } - let value = value as u8; - - value as char - } - - 'u' => 
{ - if chars.next() != Some('{') { - return Err(EscapeError::NoBraceInUnicodeEscape); - } - - let mut n_digits = 1; - let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? { - '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape), - '}' => return Err(EscapeError::EmptyUnicodeEscape), - c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?, - }; - - loop { - match chars.next() { - None => return Err(EscapeError::UnclosedUnicodeEscape), - Some('_') => continue, - Some('}') => { - if n_digits > 6 { - return Err(EscapeError::OverlongUnicodeEscape); - } - if mode.is_bytes() { - return Err(EscapeError::UnicodeEscapeInByte); - } - - break std::char::from_u32(value).ok_or_else(|| { - if value > 0x10FFFF { - EscapeError::OutOfRangeUnicodeEscape - } else { - EscapeError::LoneSurrogateUnicodeEscape - } - })?; - } - Some(c) => { - let digit = c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?; - n_digits += 1; - if n_digits > 6 { - continue; - } - let digit = digit as u32; - value = value * 16 + digit; - } - }; - } - } - _ => return Err(EscapeError::InvalidEscape), - }; - Ok(res) -} - -fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result { - let first_char = chars.next().ok_or(EscapeError::ZeroChars)?; - let res = scan_escape(first_char, chars, mode)?; - if chars.next().is_some() { - return Err(EscapeError::MoreThanOneChar); - } - Ok(res) -} - -/// Takes a contents of a string literal (without quotes) and produces a -/// sequence of escaped characters or errors. 
-fn unescape_str_or_byte_str(src: &str, mode: Mode, callback: &mut F) -where - F: FnMut(Range, Result), -{ - assert!(mode.in_double_quotes()); - let initial_len = src.len(); - let mut chars = src.chars(); - while let Some(first_char) = chars.next() { - let start = initial_len - chars.as_str().len() - first_char.len_utf8(); - - let unescaped_char = match first_char { - '\\' => { - let (second_char, third_char) = { - let mut chars = chars.clone(); - (chars.next(), chars.next()) - }; - match (second_char, third_char) { - (Some('\n'), _) | (Some('\r'), Some('\n')) => { - skip_ascii_whitespace(&mut chars); - continue; - } - _ => scan_escape(first_char, &mut chars, mode), - } - } - '\r' => { - let second_char = chars.clone().next(); - if second_char == Some('\n') { - chars.next(); - Ok('\n') - } else { - scan_escape(first_char, &mut chars, mode) - } - } - '\n' => Ok('\n'), - '\t' => Ok('\t'), - _ => scan_escape(first_char, &mut chars, mode), - }; - let end = initial_len - chars.as_str().len(); - callback(start..end, unescaped_char); - } - - fn skip_ascii_whitespace(chars: &mut Chars<'_>) { - let str = chars.as_str(); - let first_non_space = str - .bytes() - .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r') - .unwrap_or(str.len()); - *chars = str[first_non_space..].chars() - } -} - -/// Takes a contents of a string literal (without quotes) and produces a -/// sequence of characters or errors. -/// NOTE: Raw strings do not perform any explicit character escaping, here we -/// only translate CRLF to LF and produce errors on bare CR. 
-fn unescape_raw_str_or_byte_str(literal_text: &str, mode: Mode, callback: &mut F) -where - F: FnMut(Range, Result), -{ - assert!(mode.in_double_quotes()); - let initial_len = literal_text.len(); - - let mut chars = literal_text.chars(); - while let Some(curr) = chars.next() { - let start = initial_len - chars.as_str().len() - curr.len_utf8(); - - let result = match (curr, chars.clone().next()) { - ('\r', Some('\n')) => { - chars.next(); - Ok('\n') - }, - ('\r', _) => Err(EscapeError::BareCarriageReturnInRawString), - (c, _) if mode.is_bytes() && !c.is_ascii() => - Err(EscapeError::NonAsciiCharInByteString), - (c, _) => Ok(c), - }; - let end = initial_len - chars.as_str().len(); - - callback(start..end, result); - } -} - -fn byte_from_char(c: char) -> u8 { - let res = c as u32; - assert!(res <= u8::max_value() as u32, "guaranteed because of Mode::Byte(Str)"); - res as u8 -} - -fn is_ascii(x: u32) -> bool { - x <= 0x7F -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_unescape_char_bad() { - fn check(literal_text: &str, expected_error: EscapeError) { - let actual_result = unescape_char(literal_text).map_err(|(_offset, err)| err); - assert_eq!(actual_result, Err(expected_error)); - } - - check("", EscapeError::ZeroChars); - check(r"\", EscapeError::LoneSlash); - - check("\n", EscapeError::EscapeOnlyChar); - check("\r\n", EscapeError::EscapeOnlyChar); - check("\t", EscapeError::EscapeOnlyChar); - check("'", EscapeError::EscapeOnlyChar); - check("\r", EscapeError::BareCarriageReturn); - - check("spam", EscapeError::MoreThanOneChar); - check(r"\x0ff", EscapeError::MoreThanOneChar); - check(r#"\"a"#, EscapeError::MoreThanOneChar); - check(r"\na", EscapeError::MoreThanOneChar); - check(r"\ra", EscapeError::MoreThanOneChar); - check(r"\ta", EscapeError::MoreThanOneChar); - check(r"\\a", EscapeError::MoreThanOneChar); - check(r"\'a", EscapeError::MoreThanOneChar); - check(r"\0a", EscapeError::MoreThanOneChar); - check(r"\u{0}x", 
EscapeError::MoreThanOneChar); - check(r"\u{1F63b}}", EscapeError::MoreThanOneChar); - - check(r"\v", EscapeError::InvalidEscape); - check(r"\💩", EscapeError::InvalidEscape); - check(r"\●", EscapeError::InvalidEscape); - - check(r"\x", EscapeError::TooShortHexEscape); - check(r"\x0", EscapeError::TooShortHexEscape); - check(r"\xf", EscapeError::TooShortHexEscape); - check(r"\xa", EscapeError::TooShortHexEscape); - check(r"\xx", EscapeError::InvalidCharInHexEscape); - check(r"\xы", EscapeError::InvalidCharInHexEscape); - check(r"\x🦀", EscapeError::InvalidCharInHexEscape); - check(r"\xtt", EscapeError::InvalidCharInHexEscape); - check(r"\xff", EscapeError::OutOfRangeHexEscape); - check(r"\xFF", EscapeError::OutOfRangeHexEscape); - check(r"\x80", EscapeError::OutOfRangeHexEscape); - - check(r"\u", EscapeError::NoBraceInUnicodeEscape); - check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape); - check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape); - check(r"\u{", EscapeError::UnclosedUnicodeEscape); - check(r"\u{0000", EscapeError::UnclosedUnicodeEscape); - check(r"\u{}", EscapeError::EmptyUnicodeEscape); - check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape); - check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape); - check(r"\u{FFFFFF}", EscapeError::OutOfRangeUnicodeEscape); - check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape); - check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape); - - check(r"\u{DC00}", EscapeError::LoneSurrogateUnicodeEscape); - check(r"\u{DDDD}", EscapeError::LoneSurrogateUnicodeEscape); - check(r"\u{DFFF}", EscapeError::LoneSurrogateUnicodeEscape); - - check(r"\u{D800}", EscapeError::LoneSurrogateUnicodeEscape); - check(r"\u{DAAA}", EscapeError::LoneSurrogateUnicodeEscape); - check(r"\u{DBFF}", EscapeError::LoneSurrogateUnicodeEscape); - } - - #[test] - fn test_unescape_char_good() { - fn check(literal_text: &str, expected_char: char) { - let actual_result = unescape_char(literal_text); -
assert_eq!(actual_result, Ok(expected_char)); - } - - check("a", 'a'); - check("ы", 'ы'); - check("🦀", '🦀'); - - check(r#"\""#, '"'); - check(r"\n", '\n'); - check(r"\r", '\r'); - check(r"\t", '\t'); - check(r"\\", '\\'); - check(r"\'", '\''); - check(r"\0", '\0'); - - check(r"\x00", '\0'); - check(r"\x5a", 'Z'); - check(r"\x5A", 'Z'); - check(r"\x7f", 127 as char); - - check(r"\u{0}", '\0'); - check(r"\u{000000}", '\0'); - check(r"\u{41}", 'A'); - check(r"\u{0041}", 'A'); - check(r"\u{00_41}", 'A'); - check(r"\u{4__1__}", 'A'); - check(r"\u{1F63b}", '😻'); - } - - #[test] - fn test_unescape_str_good() { - fn check(literal_text: &str, expected: &str) { - let mut buf = Ok(String::with_capacity(literal_text.len())); - unescape_str(literal_text, &mut |range, c| { - if let Ok(b) = &mut buf { - match c { - Ok(c) => b.push(c), - Err(e) => buf = Err((range, e)), - } - } - }); - let buf = buf.as_ref().map(|it| it.as_ref()); - assert_eq!(buf, Ok(expected)) - } - - check("foo", "foo"); - check("", ""); - check(" \t\n\r\n", " \t\n\n"); - - check("hello \\\n world", "hello world"); - check("hello \\\r\n world", "hello world"); - check("thread's", "thread's") - } - - #[test] - fn test_unescape_byte_bad() { - fn check(literal_text: &str, expected_error: EscapeError) { - let actual_result = unescape_byte(literal_text).map_err(|(_offset, err)| err); - assert_eq!(actual_result, Err(expected_error)); - } - - check("", EscapeError::ZeroChars); - check(r"\", EscapeError::LoneSlash); - - check("\n", EscapeError::EscapeOnlyChar); - check("\r\n", EscapeError::EscapeOnlyChar); - check("\t", EscapeError::EscapeOnlyChar); - check("'", EscapeError::EscapeOnlyChar); - check("\r", EscapeError::BareCarriageReturn); - - check("spam", EscapeError::MoreThanOneChar); - check(r"\x0ff", EscapeError::MoreThanOneChar); - check(r#"\"a"#, EscapeError::MoreThanOneChar); - check(r"\na", EscapeError::MoreThanOneChar); - check(r"\ra", EscapeError::MoreThanOneChar); - check(r"\ta",
EscapeError::MoreThanOneChar); - check(r"\\a", EscapeError::MoreThanOneChar); - check(r"\'a", EscapeError::MoreThanOneChar); - check(r"\0a", EscapeError::MoreThanOneChar); - - check(r"\v", EscapeError::InvalidEscape); - check(r"\💩", EscapeError::InvalidEscape); - check(r"\●", EscapeError::InvalidEscape); - - check(r"\x", EscapeError::TooShortHexEscape); - check(r"\x0", EscapeError::TooShortHexEscape); - check(r"\xa", EscapeError::TooShortHexEscape); - check(r"\xf", EscapeError::TooShortHexEscape); - check(r"\xx", EscapeError::InvalidCharInHexEscape); - check(r"\xы", EscapeError::InvalidCharInHexEscape); - check(r"\x🦀", EscapeError::InvalidCharInHexEscape); - check(r"\xtt", EscapeError::InvalidCharInHexEscape); - - check(r"\u", EscapeError::NoBraceInUnicodeEscape); - check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape); - check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape); - check(r"\u{", EscapeError::UnclosedUnicodeEscape); - check(r"\u{0000", EscapeError::UnclosedUnicodeEscape); - check(r"\u{}", EscapeError::EmptyUnicodeEscape); - check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape); - check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape); - - check("ы", EscapeError::NonAsciiCharInByte); - check("🦀", EscapeError::NonAsciiCharInByte); - - check(r"\u{0}", EscapeError::UnicodeEscapeInByte); - check(r"\u{000000}", EscapeError::UnicodeEscapeInByte); - check(r"\u{41}", EscapeError::UnicodeEscapeInByte); - check(r"\u{0041}", EscapeError::UnicodeEscapeInByte); - check(r"\u{00_41}", EscapeError::UnicodeEscapeInByte); - check(r"\u{4__1__}", EscapeError::UnicodeEscapeInByte); - check(r"\u{1F63b}", EscapeError::UnicodeEscapeInByte); - check(r"\u{0}x", EscapeError::UnicodeEscapeInByte); - check(r"\u{1F63b}}", EscapeError::UnicodeEscapeInByte); - check(r"\u{FFFFFF}", EscapeError::UnicodeEscapeInByte); - check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte); - check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte); - check(r"\u{DC00}",
EscapeError::UnicodeEscapeInByte); - check(r"\u{DDDD}", EscapeError::UnicodeEscapeInByte); - check(r"\u{DFFF}", EscapeError::UnicodeEscapeInByte); - check(r"\u{D800}", EscapeError::UnicodeEscapeInByte); - check(r"\u{DAAA}", EscapeError::UnicodeEscapeInByte); - check(r"\u{DBFF}", EscapeError::UnicodeEscapeInByte); - } - - #[test] - fn test_unescape_byte_good() { - fn check(literal_text: &str, expected_byte: u8) { - let actual_result = unescape_byte(literal_text); - assert_eq!(actual_result, Ok(expected_byte)); - } - - check("a", b'a'); - - check(r#"\""#, b'"'); - check(r"\n", b'\n'); - check(r"\r", b'\r'); - check(r"\t", b'\t'); - check(r"\\", b'\\'); - check(r"\'", b'\''); - check(r"\0", b'\0'); - - check(r"\x00", b'\0'); - check(r"\x5a", b'Z'); - check(r"\x5A", b'Z'); - check(r"\x7f", 127); - check(r"\x80", 128); - check(r"\xff", 255); - check(r"\xFF", 255); - } - - #[test] - fn test_unescape_byte_str_good() { - fn check(literal_text: &str, expected: &[u8]) { - let mut buf = Ok(Vec::with_capacity(literal_text.len())); - unescape_byte_str(literal_text, &mut |range, c| { - if let Ok(b) = &mut buf { - match c { - Ok(c) => b.push(c), - Err(e) => buf = Err((range, e)), - } - } - }); - let buf = buf.as_ref().map(|it| it.as_ref()); - assert_eq!(buf, Ok(expected)) - } - - check("foo", b"foo"); - check("", b""); - check(" \t\n\r\n", b" \t\n\n"); - - check("hello \\\n world", b"hello world"); - check("hello \\\r\n world", b"hello world"); - check("thread's", b"thread's") - } - - #[test] - fn test_unescape_raw_str() { - fn check(literal: &str, expected: &[(Range, Result)]) { - let mut unescaped = Vec::with_capacity(literal.len()); - unescape_raw_str(literal, &mut |range, res| unescaped.push((range, res))); - assert_eq!(unescaped, expected); - } - - check("\r\n", &[(0..2, Ok('\n'))]); - check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]); - check("\rx", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString)), (1..2, Ok('x'))]); - } - - #[test] - fn 
test_unescape_raw_byte_str() { - fn check(literal: &str, expected: &[(Range, Result)]) { - let mut unescaped = Vec::with_capacity(literal.len()); - unescape_raw_byte_str(literal, &mut |range, res| unescaped.push((range, res))); - assert_eq!(unescaped, expected); - } - - check("\r\n", &[(0..2, Ok(byte_from_char('\n')))]); - check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]); - check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByteString))]); - check( - "🦀a", - &[(0..4, Err(EscapeError::NonAsciiCharInByteString)), (4..5, Ok(byte_from_char('a')))], - ); - } -} diff --git a/src/libsyntax/parse/unescape_error_reporting.rs b/src/libsyntax/parse/unescape_error_reporting.rs index 71b41161ad8..bc3ee8620e0 100644 --- a/src/libsyntax/parse/unescape_error_reporting.rs +++ b/src/libsyntax/parse/unescape_error_reporting.rs @@ -3,12 +3,11 @@ use std::ops::Range; use std::iter::once; +use rustc_lexer::unescape::{EscapeError, Mode}; use syntax_pos::{Span, BytePos}; use crate::errors::{Handler, Applicability}; -use super::unescape::{EscapeError, Mode}; - pub(crate) fn emit_unescape_error( handler: &Handler, // interior part of the literal, without quotes -- cgit 1.4.1-3-g733a5