Prohibit bare CRs in raw byte strings

author: Igor Matuszewski <Xanewok@gmail.com> 2019-05-13 20:21:44 +0200
committer: Igor Matuszewski <Xanewok@gmail.com> 2019-06-08 22:58:53 +0200
commit: 49d62e8d5a9df16e8ed6c703031fb72d264e3469 (patch)
tree: 432febaebf84a451599d8ba3db0ab1cb8447694c
parent: cab7e7fe76c3c881078f068a8da4a863efdd2c77 (diff)
download: rust-49d62e8d5a9df16e8ed6c703031fb72d264e3469.tar.gz
rust-49d62e8d5a9df16e8ed6c703031fb72d264e3469.zip
5 files changed, 66 insertions, 74 deletions
diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs
index e3830b1e3b6..685c17d104b 100644
--- a/src/libsyntax/parse/lexer/mod.rs
+++ b/src/libsyntax/parse/lexer/mod.rs
@@ -292,15 +292,6 @@ impl<'a> StringReader<'a> {
         self.sess.span_diagnostic.struct_span_fatal(self.mk_sp(from_pos, to_pos), &m[..])
     }
 
-    /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an
-    /// escaped character to the error message
-    fn err_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) {
-        let mut m = m.to_string();
-        m.push_str(": ");
-        push_escaped_char(&mut m, c);
-        self.err_span_(from_pos, to_pos, &m[..]);
-    }
-
     /// Advance peek_token to refer to the next token, and
     /// possibly update the interner.
     fn advance_token(&mut self) -> Result<(), ()> {
@@ -1070,7 +1061,13 @@ impl<'a> StringReader<'a> {
                         self.validate_byte_str_escape(start_with_quote);
                         (token::ByteStr, symbol)
                     },
-                    Some('r') => self.scan_raw_byte_string(),
+                    Some('r') => {
+                        let (start, end, hash_count) = self.scan_raw_string();
+                        let symbol = self.name_from_to(start, end);
+                        self.validate_raw_byte_str_escape(start, end);
+
+                        (token::ByteStrRaw(hash_count), symbol)
+                    }
                     _ => unreachable!(),  // Should have been a token::Ident above.
                 };
                 let suffix = self.scan_optional_raw_name();
@@ -1300,66 +1297,6 @@ impl<'a> StringReader<'a> {
         (content_start_bpos, content_end_bpos, hash_count)
     }
 
-    fn scan_raw_byte_string(&mut self) -> (token::LitKind, Symbol) {
-        let start_bpos = self.pos;
-        self.bump();
-        let mut hash_count = 0;
-        while self.ch_is('#') {
-            if hash_count == 65535 {
-                let bpos = self.next_pos;
-                self.fatal_span_(start_bpos,
-                                 bpos,
-                                 "too many `#` symbols: raw strings may be \
-                                 delimited by up to 65535 `#` symbols").raise();
-            }
-            self.bump();
-            hash_count += 1;
-        }
-
-        if self.is_eof() {
-            self.fail_unterminated_raw_string(start_bpos, hash_count);
-        } else if !self.ch_is('"') {
-            let pos = self.pos;
-            let ch = self.ch.unwrap();
-            self.fatal_span_char(start_bpos,
-                                        pos,
-                                        "found invalid character; only `#` is allowed in raw \
-                                         string delimitation",
-                                        ch).raise();
-        }
-        self.bump();
-        let content_start_bpos = self.pos;
-        let mut content_end_bpos;
-        'outer: loop {
-            match self.ch {
-                None => {
-                    self.fail_unterminated_raw_string(start_bpos, hash_count);
-                }
-                Some('"') => {
-                    content_end_bpos = self.pos;
-                    for _ in 0..hash_count {
-                        self.bump();
-                        if !self.ch_is('#') {
-                            continue 'outer;
-                        }
-                    }
-                    break;
-                }
-                Some(c) => {
-                    if c > '\x7F' {
-                        let pos = self.pos;
-                        self.err_span_char(pos, pos, "raw byte string must be ASCII", c);
-                    }
-                }
-            }
-            self.bump();
-        }
-
-        self.bump();
-
-        (token::ByteStrRaw(hash_count), self.name_from_to(content_start_bpos, content_end_bpos))
-    }
-
     fn validate_char_escape(&self, start_with_quote: BytePos) {
         self.with_str_from_to(start_with_quote + BytePos(1), self.pos - BytePos(1), |lit| {
             if let Err((off, err)) = unescape::unescape_char(lit) {
@@ -1424,6 +1361,23 @@ impl<'a> StringReader<'a> {
         });
     }
 
+    fn validate_raw_byte_str_escape(&self, content_start: BytePos, content_end: BytePos) {
+        self.with_str_from_to(content_start, content_end, |lit: &str| {
+            unescape::unescape_raw_byte_str(lit, &mut |range, c| {
+                if let Err(err) = c { // `Ok` results need no action; only errors are reported
+                    emit_unescape_error(
+                        &self.sess.span_diagnostic,
+                        lit,
+                        self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)),
+                        unescape::Mode::ByteStr,
+                        range, // byte offsets into `lit`
+                        err,
+                    )
+                }
+            })
+        });
+    }
+
     fn validate_byte_str_escape(&self, start_with_quote: BytePos) {
         self.with_str_from_to(start_with_quote + BytePos(1), self.pos - BytePos(1), |lit| {
             unescape::unescape_byte_str(lit, &mut |range, c| {
diff --git a/src/libsyntax/parse/unescape.rs b/src/libsyntax/parse/unescape.rs
index d6b7db16305..819463b5472 100644
--- a/src/libsyntax/parse/unescape.rs
+++ b/src/libsyntax/parse/unescape.rs
@@ -29,6 +29,7 @@ pub(crate) enum EscapeError {
 
     UnicodeEscapeInByte,
     NonAsciiCharInByte,
+    NonAsciiCharInByteString,
 }
 
 /// Takes a contents of a char literal (without quotes), and returns an
@@ -88,6 +89,29 @@ where
     }
 }
 
+/// Takes the contents of a raw byte string literal (without quotes) and
+/// invokes `callback` with the byte range of each character and a result.
+/// NOTE: Raw strings perform no escape processing; the only errors produced
+/// are for a bare CR (one not followed by LF) and for non-ASCII characters.
+pub(crate) fn unescape_raw_byte_str<F>(literal_text: &str, callback: &mut F)
+where
+    F: FnMut(Range<usize>, Result<char, EscapeError>),
+{
+    let mut byte_offset: usize = 0;
+
+    let mut chars = literal_text.chars().peekable();
+    while let Some(curr) = chars.next() {
+        let result = match (curr, chars.peek()) {
+            ('\r', Some('\n')) => Ok(curr), // CR of a CRLF pair is accepted and passed through
+            ('\r', _) => Err(EscapeError::BareCarriageReturn),
+            (c, _) if c > '\x7F' => Err(EscapeError::NonAsciiCharInByteString),
+            _ => Ok(curr),
+        };
+        callback(byte_offset..(byte_offset + curr.len_utf8()), result);
+        byte_offset += curr.len_utf8();
+    }
+}
+
 #[derive(Debug, Clone, Copy)]
 pub(crate) enum Mode {
     Char,
diff --git a/src/libsyntax/parse/unescape_error_reporting.rs b/src/libsyntax/parse/unescape_error_reporting.rs
index 22777c0884f..8f152974a6d 100644
--- a/src/libsyntax/parse/unescape_error_reporting.rs
+++ b/src/libsyntax/parse/unescape_error_reporting.rs
@@ -124,6 +124,11 @@ pub(crate) fn emit_unescape_error(
             handler.span_err(span, "byte constant must be ASCII. \
                                     Use a \\xHH escape for a non-ASCII byte")
         }
+        EscapeError::NonAsciiCharInByteString => {
+            assert!(mode.is_bytes());
+            let (_c, span) = last_char();
+            handler.span_err(span, "raw byte string must be ASCII")
+        }
         EscapeError::OutOfRangeHexEscape => {
             handler.span_err(span, "this form of character escape may only be used \
                                     with characters in the range [\\x00-\\x7f]")
diff --git a/src/test/ui/parser/raw-byte-string-literals.rs b/src/test/ui/parser/raw-byte-string-literals.rs
index 3b50fb8036a..87ecfb5c544 100644
--- a/src/test/ui/parser/raw-byte-string-literals.rs
+++ b/src/test/ui/parser/raw-byte-string-literals.rs
@@ -1,4 +1,7 @@
+// ignore-tidy-cr
+// compile-flags: -Z continue-parse-after-error
 pub fn main() {
+    br"a
"; //~ ERROR bare CR not allowed in string
     br"é";  //~ ERROR raw byte string must be ASCII
     br##~"a"~##;  //~ ERROR only `#` is allowed in raw string delimitation
 }
diff --git a/src/test/ui/parser/raw-byte-string-literals.stderr b/src/test/ui/parser/raw-byte-string-literals.stderr
index 671ed97d1b5..03fe79722b8 100644
--- a/src/test/ui/parser/raw-byte-string-literals.stderr
+++ b/src/test/ui/parser/raw-byte-string-literals.stderr
@@ -1,14 +1,20 @@
-error: raw byte string must be ASCII: \u{e9}
-  --> $DIR/raw-byte-string-literals.rs:2:8
+error: bare CR not allowed in string, use \r instead
+  --> $DIR/raw-byte-string-literals.rs:4:9
+   |
+LL |     br"a
";
+   |         ^
+
+error: raw byte string must be ASCII
+  --> $DIR/raw-byte-string-literals.rs:5:8
    |
 LL |     br"é";
    |        ^
 
 error: found invalid character; only `#` is allowed in raw string delimitation: ~
-  --> $DIR/raw-byte-string-literals.rs:3:6
+  --> $DIR/raw-byte-string-literals.rs:6:6
    |
 LL |     br##~"a"~##;
    |      ^^^
 
-error: aborting due to 2 previous errors
+error: aborting due to 3 previous errors
author	Igor Matuszewski <Xanewok@gmail.com>	2019-05-13 20:21:44 +0200
committer	Igor Matuszewski <Xanewok@gmail.com>	2019-06-08 22:58:53 +0200
commit	49d62e8d5a9df16e8ed6c703031fb72d264e3469 (patch)
tree	432febaebf84a451599d8ba3db0ab1cb8447694c
parent	cab7e7fe76c3c881078f068a8da4a863efdd2c77 (diff)
download	rust-49d62e8d5a9df16e8ed6c703031fb72d264e3469.tar.gz rust-49d62e8d5a9df16e8ed6c703031fb72d264e3469.zip