diff options
| author | Aleksey Kladov <aleksey.kladov@gmail.com> | 2019-04-25 11:48:25 +0300 |
|---|---|---|
| committer | Aleksey Kladov <aleksey.kladov@gmail.com> | 2019-05-02 15:31:57 +0300 |
| commit | bfa5f278472d0bad4e7db4a4259b2f1fa97ca0ab (patch) | |
| tree | bb27691d38cfe85ef181ea8d2cb7609da12cbe91 /src/libsyntax/parse/mod.rs | |
| parent | 9b67bd42b7cbf97f72d039afcba02f5177d0d68c (diff) | |
| download | rust-bfa5f278472d0bad4e7db4a4259b2f1fa97ca0ab.tar.gz rust-bfa5f278472d0bad4e7db4a4259b2f1fa97ca0ab.zip | |
introduce unescape module
Currently, we deal with escape sequences twice: once when we lex a string, and a second time when we unescape literals. This PR aims to remove this duplication, by introducing a new `unescape` mode as a single source of truth for character escaping rules
Diffstat (limited to 'src/libsyntax/parse/mod.rs')
| -rw-r--r-- | src/libsyntax/parse/mod.rs | 262 |
1 files changed, 40 insertions, 222 deletions
diff --git a/src/libsyntax/parse/mod.rs b/src/libsyntax/parse/mod.rs index 1abc7832ffa..d6d4f8e8f04 100644 --- a/src/libsyntax/parse/mod.rs +++ b/src/libsyntax/parse/mod.rs @@ -18,7 +18,6 @@ use log::debug; use rustc_data_structures::fx::FxHashSet; use std::borrow::Cow; -use std::iter; use std::path::{Path, PathBuf}; use std::str; @@ -33,6 +32,11 @@ pub mod attr; pub mod classify; +pub(crate) mod unescape; +use unescape::{unescape_str, unescape_char, unescape_byte_str, unescape_byte, EscapeError}; + +pub(crate) mod unescape_error_reporting; + /// Info about a parsing session. pub struct ParseSess { pub span_diagnostic: Handler, @@ -306,133 +310,6 @@ pub fn stream_to_parser(sess: &ParseSess, stream: TokenStream) -> Parser<'_> { Parser::new(sess, stream, None, true, false) } -/// Parses a string representing a character literal into its final form. -/// Rather than just accepting/rejecting a given literal, unescapes it as -/// well. Can take any slice prefixed by a character escape. Returns the -/// character and the number of characters consumed. -fn char_lit(lit: &str, diag: Option<(Span, &Handler)>) -> (char, isize) { - use std::char; - - // Handle non-escaped chars first. - if lit.as_bytes()[0] != b'\\' { - // If the first byte isn't '\\' it might part of a multi-byte char, so - // get the char with chars(). - let c = lit.chars().next().unwrap(); - return (c, 1); - } - - // Handle escaped chars. - match lit.as_bytes()[1] as char { - '"' => ('"', 2), - 'n' => ('\n', 2), - 'r' => ('\r', 2), - 't' => ('\t', 2), - '\\' => ('\\', 2), - '\'' => ('\'', 2), - '0' => ('\0', 2), - 'x' => { - let v = u32::from_str_radix(&lit[2..4], 16).unwrap(); - let c = char::from_u32(v).unwrap(); - (c, 4) - } - 'u' => { - assert_eq!(lit.as_bytes()[2], b'{'); - let idx = lit.find('}').unwrap(); - - // All digits and '_' are ascii, so treat each byte as a char. - let mut v: u32 = 0; - for c in lit[3..idx].bytes() { - let c = char::from(c); - if c != '_' { - let x = c.to_digit(16).unwrap(); - v = v.checked_mul(16).unwrap().checked_add(x).unwrap(); - } - } - let c = char::from_u32(v).unwrap_or_else(|| { - if let Some((span, diag)) = diag { - let mut diag = diag.struct_span_err(span, "invalid unicode character escape"); - if v > 0x10FFFF { - diag.help("unicode escape must be at most 10FFFF").emit(); - } else { - diag.help("unicode escape must not be a surrogate").emit(); - } - } - '\u{FFFD}' - }); - (c, (idx + 1) as isize) - } - _ => panic!("lexer should have rejected a bad character escape {}", lit) - } -} - -/// Parses a string representing a string literal into its final form. Does unescaping. -fn str_lit(lit: &str, diag: Option<(Span, &Handler)>) -> String { - debug!("str_lit: given {}", lit.escape_default()); - let mut res = String::with_capacity(lit.len()); - - let error = |i| format!("lexer should have rejected {} at {}", lit, i); - - /// Eat everything up to a non-whitespace. - fn eat<'a>(it: &mut iter::Peekable<str::CharIndices<'a>>) { - loop { - match it.peek().map(|x| x.1) { - Some(' ') | Some('\n') | Some('\r') | Some('\t') => { - it.next(); - }, - _ => { break; } - } - } - } - - let mut chars = lit.char_indices().peekable(); - while let Some((i, c)) = chars.next() { - match c { - '\\' => { - let ch = chars.peek().unwrap_or_else(|| { - panic!("{}", error(i)) - }).1; - - if ch == '\n' { - eat(&mut chars); - } else if ch == '\r' { - chars.next(); - let ch = chars.peek().unwrap_or_else(|| { - panic!("{}", error(i)) - }).1; - - if ch != '\n' { - panic!("lexer accepted bare CR"); - } - eat(&mut chars); - } else { - // otherwise, a normal escape - let (c, n) = char_lit(&lit[i..], diag); - for _ in 0..n - 1 { // we don't need to move past the first \ - chars.next(); - } - res.push(c); - } - }, - '\r' => { - let ch = chars.peek().unwrap_or_else(|| { - panic!("{}", error(i)) - }).1; - - if ch != '\n' { - panic!("lexer accepted bare CR"); - } - chars.next(); - res.push('\n'); - } - c => res.push(c), - } - } - - res.shrink_to_fit(); // probably not going to do anything, unless there was an escape. - debug!("parse_str_lit: returning {}", res); - res -} - /// Parses a string representing a raw string literal into its final form. The /// only operation this does is convert embedded CRLF into a single LF. fn raw_str_lit(lit: &str) -> String { @@ -475,9 +352,23 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha use ast::LitKind; match lit { - token::Byte(i) => (true, Some(LitKind::Byte(byte_lit(&i.as_str()).0))), - token::Char(i) => (true, Some(LitKind::Char(char_lit(&i.as_str(), diag).0))), - token::Err(i) => (true, Some(LitKind::Err(i))), + token::Byte(i) => { + let lit_kind = match unescape_byte(&i.as_str()) { + Ok(c) => LitKind::Byte(c), + Err((_, EscapeError::MoreThanOneChar)) => LitKind::Err(i), + Err(_) => LitKind::Byte(0), + }; + (true, Some(lit_kind)) + }, + token::Char(i) => { + let lit_kind = match unescape_char(&i.as_str()) { + Ok(c) => LitKind::Char(c), + Err((_, EscapeError::MoreThanOneChar)) => LitKind::Err(i), + Err(_) => LitKind::Char('\u{FFFD}'), + }; + (true, Some(lit_kind)) + }, + token::Err(i) => (true, Some(LitKind::Err(i))), // There are some valid suffixes for integer and float literals, // so all the handling is done internally. @@ -491,7 +382,14 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha // string in the Token. let s = &sym.as_str(); if s.as_bytes().iter().any(|&c| c == b'\\' || c == b'\r') { - sym = Symbol::intern(&str_lit(s, diag)); + let mut buf = String::with_capacity(s.len()); + unescape_str(s, &mut |_, unescaped_char| { + match unescaped_char { + Ok(c) => buf.push(c), + Err(_) => buf.push('\u{FFFD}'), + } + }); + sym = Symbol::intern(&buf) } (true, Some(LitKind::Str(sym, ast::StrStyle::Cooked))) } @@ -504,7 +402,16 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha (true, Some(LitKind::Str(sym, ast::StrStyle::Raw(n)))) } token::ByteStr(i) => { - (true, Some(LitKind::ByteStr(byte_str_lit(&i.as_str())))) + let s = &i.as_str(); + let mut buf = Vec::with_capacity(s.len()); + unescape_byte_str(s, &mut |_, unescaped_byte| { + match unescaped_byte { + Ok(c) => buf.push(c), + Err(_) => buf.push(0), + } + }); + buf.shrink_to_fit(); + (true, Some(LitKind::ByteStr(Lrc::new(buf)))) } token::ByteStrRaw(i, _) => { (true, Some(LitKind::ByteStr(Lrc::new(i.to_string().into_bytes())))) @@ -559,95 +466,6 @@ fn float_lit(s: &str, suffix: Option<Symbol>, diag: Option<(Span, &Handler)>) filtered_float_lit(Symbol::intern(s), suffix, diag) } -/// Parses a string representing a byte literal into its final form. Similar to `char_lit`. -fn byte_lit(lit: &str) -> (u8, usize) { - let err = |i| format!("lexer accepted invalid byte literal {} step {}", lit, i); - - if lit.len() == 1 { - (lit.as_bytes()[0], 1) - } else { - assert_eq!(lit.as_bytes()[0], b'\\', "{}", err(0)); - let b = match lit.as_bytes()[1] { - b'"' => b'"', - b'n' => b'\n', - b'r' => b'\r', - b't' => b'\t', - b'\\' => b'\\', - b'\'' => b'\'', - b'0' => b'\0', - _ => { - match u64::from_str_radix(&lit[2..4], 16).ok() { - Some(c) => - if c > 0xFF { - panic!(err(2)) - } else { - return (c as u8, 4) - }, - None => panic!(err(3)) - } - } - }; - (b, 2) - } -} - -fn byte_str_lit(lit: &str) -> Lrc<Vec<u8>> { - let mut res = Vec::with_capacity(lit.len()); - - let error = |i| panic!("lexer should have rejected {} at {}", lit, i); - - /// Eat everything up to a non-whitespace. - fn eat<I: Iterator<Item=(usize, u8)>>(it: &mut iter::Peekable<I>) { - loop { - match it.peek().map(|x| x.1) { - Some(b' ') | Some(b'\n') | Some(b'\r') | Some(b'\t') => { - it.next(); - }, - _ => { break; } - } - } - } - - // byte string literals *must* be ASCII, but the escapes don't have to be - let mut chars = lit.bytes().enumerate().peekable(); - loop { - match chars.next() { - Some((i, b'\\')) => { - match chars.peek().unwrap_or_else(|| error(i)).1 { - b'\n' => eat(&mut chars), - b'\r' => { - chars.next(); - if chars.peek().unwrap_or_else(|| error(i)).1 != b'\n' { - panic!("lexer accepted bare CR"); - } - eat(&mut chars); - } - _ => { - // otherwise, a normal escape - let (c, n) = byte_lit(&lit[i..]); - // we don't need to move past the first \ - for _ in 0..n - 1 { - chars.next(); - } - res.push(c); - } - } - }, - Some((i, b'\r')) => { - if chars.peek().unwrap_or_else(|| error(i)).1 != b'\n' { - panic!("lexer accepted bare CR"); - } - chars.next(); - res.push(b'\n'); - } - Some((_, c)) => res.push(c), - None => break, - } - } - - Lrc::new(res) -} - fn integer_lit(s: &str, suffix: Option<Symbol>, diag: Option<(Span, &Handler)>) -> Option<ast::LitKind> { // s can only be ascii, byte indexing is fine |
