Auto merge of #60261 - matklad:one-escape, r=petrochenkov

introduce unescape module A WIP PR to gauge early feedback Currently, we deal with escape sequences twice: once when we [lex](https://github.com/rust-lang/rust/blob/112f7e9ac564e2cfcfc13d599c8376a219fde1bc/src/libsyntax/parse/lexer/mod.rs#L928-L1065) a string, and a second time when we [unescape](https://github.com/rust-lang/rust/blob/112f7e9ac564e2cfcfc13d599c8376a219fde1bc/src/libsyntax/parse/mod.rs#L313-L366) literals. Note that we also produce different sets of diagnostics in these two cases. This PR aims to remove this duplication, by introducing a new `unescape` module as a single source of truth for character escaping rules. I think this would be a useful cleanup by itself, but I also need this for https://github.com/rust-lang/rust/pull/59706. In the current state, the PR has `unescape` module which fully (modulo bugs) deals with string and char literals. I am quite happy about the state of this module What this PR doesn't have yet are: * [x] handling of byte and byte string literals (should be simple to add) * [x] good diagnostics * [x] actual removal of code from lexer (giant `scan_char_or_byte` should go away completely) * [x] performance check * [x] general cleanup of the new code Diagnostics will be the most labor-consuming bit here, but they are mostly a question of just correctly adjusting spans to sub-tokens. The current setup for diagnostics is that `unescape` produces a plain old `enum` with various problems, and they are rendered into `Handler` separately. This bit is not actually required (it is possible to just pass the `Handler` in), but I like the separation between diagnostics and logic this approach imposes, and such separation should again be useful for #59706 cc @eddyb , @petrochenkov
author: bors <bors@rust-lang.org> 2019-05-06 00:16:16 +0000
committer: bors <bors@rust-lang.org> 2019-05-06 00:16:16 +0000
commit: 46d0ca00ad60710cd3c46398b7a6ea080a9447ed (patch)
tree: 669941c7808326bac651d6f42f9eb4cddc90b56c /src/libsyntax/parse/mod.rs
parent: 40bd145cbef28d279aacc903907c3945f83a6296 (diff)
parent: 1835cbeb6574997ec5188cb22b9538c61976d2b4 (diff)
download: rust-46d0ca00ad60710cd3c46398b7a6ea080a9447ed.tar.gz
rust-46d0ca00ad60710cd3c46398b7a6ea080a9447ed.zip
1 files changed, 47 insertions, 222 deletions
diff --git a/src/libsyntax/parse/mod.rs b/src/libsyntax/parse/mod.rs
index 85a74df2357..8c2fab9ada7 100644
--- a/src/libsyntax/parse/mod.rs
+++ b/src/libsyntax/parse/mod.rs
@@ -18,7 +18,6 @@ use log::debug;
 
 use rustc_data_structures::fx::FxHashSet;
 use std::borrow::Cow;
-use std::iter;
 use std::path::{Path, PathBuf};
 use std::str;
 
@@ -34,6 +33,11 @@ pub mod diagnostics;
 
 pub mod classify;
 
+pub(crate) mod unescape;
+use unescape::{unescape_str, unescape_char, unescape_byte_str, unescape_byte};
+
+pub(crate) mod unescape_error_reporting;
+
 /// Info about a parsing session.
 pub struct ParseSess {
     pub span_diagnostic: Handler,
@@ -307,133 +311,6 @@ pub fn stream_to_parser(sess: &ParseSess, stream: TokenStream) -> Parser<'_> {
     Parser::new(sess, stream, None, true, false)
 }
 
-/// Parses a string representing a character literal into its final form.
-/// Rather than just accepting/rejecting a given literal, unescapes it as
-/// well. Can take any slice prefixed by a character escape. Returns the
-/// character and the number of characters consumed.
-fn char_lit(lit: &str, diag: Option<(Span, &Handler)>) -> (char, isize) {
-    use std::char;
-
-    // Handle non-escaped chars first.
-    if lit.as_bytes()[0] != b'\\' {
-        // If the first byte isn't '\\' it might part of a multi-byte char, so
-        // get the char with chars().
-        let c = lit.chars().next().unwrap();
-        return (c, 1);
-    }
-
-    // Handle escaped chars.
-    match lit.as_bytes()[1] as char {
-        '"' => ('"', 2),
-        'n' => ('\n', 2),
-        'r' => ('\r', 2),
-        't' => ('\t', 2),
-        '\\' => ('\\', 2),
-        '\'' => ('\'', 2),
-        '0' => ('\0', 2),
-        'x' => {
-            let v = u32::from_str_radix(&lit[2..4], 16).unwrap();
-            let c = char::from_u32(v).unwrap();
-            (c, 4)
-        }
-        'u' => {
-            assert_eq!(lit.as_bytes()[2], b'{');
-            let idx = lit.find('}').unwrap();
-
-            // All digits and '_' are ascii, so treat each byte as a char.
-            let mut v: u32 = 0;
-            for c in lit[3..idx].bytes() {
-                let c = char::from(c);
-                if c != '_' {
-                    let x = c.to_digit(16).unwrap();
-                    v = v.checked_mul(16).unwrap().checked_add(x).unwrap();
-                }
-            }
-            let c = char::from_u32(v).unwrap_or_else(|| {
-                if let Some((span, diag)) = diag {
-                    let mut diag = diag.struct_span_err(span, "invalid unicode character escape");
-                    if v > 0x10FFFF {
-                        diag.help("unicode escape must be at most 10FFFF").emit();
-                    } else {
-                        diag.help("unicode escape must not be a surrogate").emit();
-                    }
-                }
-                '\u{FFFD}'
-            });
-            (c, (idx + 1) as isize)
-        }
-        _ => panic!("lexer should have rejected a bad character escape {}", lit)
-    }
-}
-
-/// Parses a string representing a string literal into its final form. Does unescaping.
-fn str_lit(lit: &str, diag: Option<(Span, &Handler)>) -> String {
-    debug!("str_lit: given {}", lit.escape_default());
-    let mut res = String::with_capacity(lit.len());
-
-    let error = |i| format!("lexer should have rejected {} at {}", lit, i);
-
-    /// Eat everything up to a non-whitespace.
-    fn eat<'a>(it: &mut iter::Peekable<str::CharIndices<'a>>) {
-        loop {
-            match it.peek().map(|x| x.1) {
-                Some(' ') | Some('\n') | Some('\r') | Some('\t') => {
-                    it.next();
-                },
-                _ => { break; }
-            }
-        }
-    }
-
-    let mut chars = lit.char_indices().peekable();
-    while let Some((i, c)) = chars.next() {
-        match c {
-            '\\' => {
-                let ch = chars.peek().unwrap_or_else(|| {
-                    panic!("{}", error(i))
-                }).1;
-
-                if ch == '\n' {
-                    eat(&mut chars);
-                } else if ch == '\r' {
-                    chars.next();
-                    let ch = chars.peek().unwrap_or_else(|| {
-                        panic!("{}", error(i))
-                    }).1;
-
-                    if ch != '\n' {
-                        panic!("lexer accepted bare CR");
-                    }
-                    eat(&mut chars);
-                } else {
-                    // otherwise, a normal escape
-                    let (c, n) = char_lit(&lit[i..], diag);
-                    for _ in 0..n - 1 { // we don't need to move past the first \
-                        chars.next();
-                    }
-                    res.push(c);
-                }
-            },
-            '\r' => {
-                let ch = chars.peek().unwrap_or_else(|| {
-                    panic!("{}", error(i))
-                }).1;
-
-                if ch != '\n' {
-                    panic!("lexer accepted bare CR");
-                }
-                chars.next();
-                res.push('\n');
-            }
-            c => res.push(c),
-        }
-    }
-
-    res.shrink_to_fit(); // probably not going to do anything, unless there was an escape.
-    debug!("parse_str_lit: returning {}", res);
-    res
-}
-
 /// Parses a string representing a raw string literal into its final form. The
 /// only operation this does is convert embedded CRLF into a single LF.
 fn raw_str_lit(lit: &str) -> String {
@@ -476,9 +353,21 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
     use ast::LitKind;
 
     match lit {
-       token::Byte(i) => (true, Some(LitKind::Byte(byte_lit(&i.as_str()).0))),
-       token::Char(i) => (true, Some(LitKind::Char(char_lit(&i.as_str(), diag).0))),
-       token::Err(i) => (true, Some(LitKind::Err(i))),
+        token::Byte(i) => {
+            let lit_kind = match unescape_byte(&i.as_str()) {
+                Ok(c) => LitKind::Byte(c),
+                Err(_) => LitKind::Err(i),
+            };
+            (true, Some(lit_kind))
+        },
+        token::Char(i) => {
+            let lit_kind = match unescape_char(&i.as_str()) {
+                Ok(c) => LitKind::Char(c),
+                Err(_) => LitKind::Err(i),
+            };
+            (true, Some(lit_kind))
+        },
+        token::Err(i) => (true, Some(LitKind::Err(i))),
 
         // There are some valid suffixes for integer and float literals,
         // so all the handling is done internally.
@@ -490,10 +379,22 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
             // reuse the symbol from the Token. Otherwise, we must generate a
             // new symbol because the string in the LitKind is different to the
             // string in the Token.
+            let mut has_error = false;
             let s = &sym.as_str();
             if s.as_bytes().iter().any(|&c| c == b'\\' || c == b'\r') {
-                sym = Symbol::intern(&str_lit(s, diag));
+                let mut buf = String::with_capacity(s.len());
+                unescape_str(s, &mut |_, unescaped_char| {
+                    match unescaped_char {
+                        Ok(c) => buf.push(c),
+                        Err(_) => has_error = true,
+                    }
+                });
+                if has_error {
+                    return (true, Some(LitKind::Err(sym)));
+                }
+                sym = Symbol::intern(&buf)
             }
+
             (true, Some(LitKind::Str(sym, ast::StrStyle::Cooked)))
         }
         token::StrRaw(mut sym, n) => {
@@ -505,7 +406,20 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
             (true, Some(LitKind::Str(sym, ast::StrStyle::Raw(n))))
         }
         token::ByteStr(i) => {
-            (true, Some(LitKind::ByteStr(byte_str_lit(&i.as_str()))))
+            let s = &i.as_str();
+            let mut buf = Vec::with_capacity(s.len());
+            let mut has_error = false;
+            unescape_byte_str(s, &mut |_, unescaped_byte| {
+                match unescaped_byte {
+                    Ok(c) => buf.push(c),
+                    Err(_) => has_error = true,
+                }
+            });
+            if has_error {
+                return (true, Some(LitKind::Err(i)));
+            }
+            buf.shrink_to_fit();
+            (true, Some(LitKind::ByteStr(Lrc::new(buf))))
         }
         token::ByteStrRaw(i, _) => {
             (true, Some(LitKind::ByteStr(Lrc::new(i.to_string().into_bytes()))))
@@ -560,95 +474,6 @@ fn float_lit(s: &str, suffix: Option<Symbol>, diag: Option<(Span, &Handler)>)
     filtered_float_lit(Symbol::intern(s), suffix, diag)
 }
 
-/// Parses a string representing a byte literal into its final form. Similar to `char_lit`.
-fn byte_lit(lit: &str) -> (u8, usize) {
-    let err = |i| format!("lexer accepted invalid byte literal {} step {}", lit, i);
-
-    if lit.len() == 1 {
-        (lit.as_bytes()[0], 1)
-    } else {
-        assert_eq!(lit.as_bytes()[0], b'\\', "{}", err(0));
-        let b = match lit.as_bytes()[1] {
-            b'"' => b'"',
-            b'n' => b'\n',
-            b'r' => b'\r',
-            b't' => b'\t',
-            b'\\' => b'\\',
-            b'\'' => b'\'',
-            b'0' => b'\0',
-            _ => {
-                match u64::from_str_radix(&lit[2..4], 16).ok() {
-                    Some(c) =>
-                        if c > 0xFF {
-                            panic!(err(2))
-                        } else {
-                            return (c as u8, 4)
-                        },
-                    None => panic!(err(3))
-                }
-            }
-        };
-        (b, 2)
-    }
-}
-
-fn byte_str_lit(lit: &str) -> Lrc<Vec<u8>> {
-    let mut res = Vec::with_capacity(lit.len());
-
-    let error = |i| panic!("lexer should have rejected {} at {}", lit, i);
-
-    /// Eat everything up to a non-whitespace.
-    fn eat<I: Iterator<Item=(usize, u8)>>(it: &mut iter::Peekable<I>) {
-        loop {
-            match it.peek().map(|x| x.1) {
-                Some(b' ') | Some(b'\n') | Some(b'\r') | Some(b'\t') => {
-                    it.next();
-                },
-                _ => { break; }
-            }
-        }
-    }
-
-    // byte string literals *must* be ASCII, but the escapes don't have to be
-    let mut chars = lit.bytes().enumerate().peekable();
-    loop {
-        match chars.next() {
-            Some((i, b'\\')) => {
-                match chars.peek().unwrap_or_else(|| error(i)).1 {
-                    b'\n' => eat(&mut chars),
-                    b'\r' => {
-                        chars.next();
-                        if chars.peek().unwrap_or_else(|| error(i)).1 != b'\n' {
-                            panic!("lexer accepted bare CR");
-                        }
-                        eat(&mut chars);
-                    }
-                    _ => {
-                        // otherwise, a normal escape
-                        let (c, n) = byte_lit(&lit[i..]);
-                        // we don't need to move past the first \
-                        for _ in 0..n - 1 {
-                            chars.next();
-                        }
-                        res.push(c);
-                    }
-                }
-            },
-            Some((i, b'\r')) => {
-                if chars.peek().unwrap_or_else(|| error(i)).1 != b'\n' {
-                    panic!("lexer accepted bare CR");
-                }
-                chars.next();
-                res.push(b'\n');
-            }
-            Some((_, c)) => res.push(c),
-            None => break,
-        }
-    }
-
-    Lrc::new(res)
-}
-
 fn integer_lit(s: &str, suffix: Option<Symbol>, diag: Option<(Span, &Handler)>)
                    -> Option<ast::LitKind> {
     // s can only be ascii, byte indexing is fine
author	bors <bors@rust-lang.org>	2019-05-06 00:16:16 +0000
committer	bors <bors@rust-lang.org>	2019-05-06 00:16:16 +0000
commit	46d0ca00ad60710cd3c46398b7a6ea080a9447ed (patch)
tree	669941c7808326bac651d6f42f9eb4cddc90b56c /src/libsyntax/parse/mod.rs
parent	40bd145cbef28d279aacc903907c3945f83a6296 (diff)
parent	1835cbeb6574997ec5188cb22b9538c61976d2b4 (diff)
download	rust-46d0ca00ad60710cd3c46398b7a6ea080a9447ed.tar.gz rust-46d0ca00ad60710cd3c46398b7a6ea080a9447ed.zip