Rollup merge of #66015 - popzxc:refactor-librustc_parser, r=matklad

librustc_lexer: Refactor the module
This PR introduces a refactoring of the `librustc_lexer` in order to improve readability. All the changes performed are only cosmetic and do not introduce any changes to the lexer logic or performance. Newly introduced modules `literal`, `token` and `utils` are just copy-pasted from the `lib.rs` and do not contain even cosmetic changes (I decided to do it this way so that it will be easier to review the changes by looking only at the diff). r? @petrochenkov cc @Centril @matklad
author: Mazdak Farrokhzad <twingoow@gmail.com> 2019-11-06 07:03:05 +0100
committer: GitHub <noreply@github.com> 2019-11-06 07:03:05 +0100
commit: 81550a00d1e8f83bbaef31205e72b27fdb796d35 (patch)
tree: bd7b39fcc7320311a711ebc9400eb1c9d2f0254a
parent: 24af0c94b30271198d6101cdd14c78182f76fd07 (diff)
parent: 31735b02c95510f2e236ebd773b02e84ee6e1a5b (diff)
download: rust-81550a00d1e8f83bbaef31205e72b27fdb796d35.tar.gz
rust-81550a00d1e8f83bbaef31205e72b27fdb796d35.zip
2 files changed, 159 insertions, 122 deletions
diff --git a/src/librustc_lexer/src/cursor.rs b/src/librustc_lexer/src/cursor.rs
index 73d305c6d4f..ed0911379c4 100644
--- a/src/librustc_lexer/src/cursor.rs
+++ b/src/librustc_lexer/src/cursor.rs
@@ -41,10 +41,20 @@ impl<'a> Cursor<'a> {
     /// If requested position doesn't exist, `EOF_CHAR` is returned.
     /// However, getting `EOF_CHAR` doesn't always mean actual end of file,
     /// it should be checked with `is_eof` method.
-    pub(crate) fn nth_char(&self, n: usize) -> char {
+    fn nth_char(&self, n: usize) -> char {
         self.chars().nth(n).unwrap_or(EOF_CHAR)
     }
 
+    /// Peeks the next symbol from the input stream without consuming it.
+    pub(crate) fn first(&self) -> char {
+        self.nth_char(0)
+    }
+
+    /// Peeks the second symbol from the input stream without consuming it.
+    pub(crate) fn second(&self) -> char {
+        self.nth_char(1)
+    }
+
     /// Checks if there is nothing more to consume.
     pub(crate) fn is_eof(&self) -> bool {
         self.chars.as_str().is_empty()
diff --git a/src/librustc_lexer/src/lib.rs b/src/librustc_lexer/src/lib.rs
index d55ef46d750..c50808adec1 100644
--- a/src/librustc_lexer/src/lib.rs
+++ b/src/librustc_lexer/src/lib.rs
@@ -18,6 +18,8 @@ mod cursor;
 pub mod unescape;
 
 use crate::cursor::{Cursor, EOF_CHAR};
+use self::TokenKind::*;
+use self::LiteralKind::*;
 
 /// Parsed token.
 /// It doesn't contain information about data that has been parsed,
@@ -116,7 +118,6 @@ pub enum TokenKind {
     /// Unknown token, not expected by the lexer, e.g. "№"
     Unknown,
 }
-use self::TokenKind::*;
 
 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
 pub enum LiteralKind {
@@ -137,7 +138,6 @@ pub enum LiteralKind {
     /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a"
     RawByteStr { n_hashes: usize, started: bool, terminated: bool },
 }
-use self::LiteralKind::*;
 
 /// Base of numeric literal encoding according to its prefix.
 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
@@ -241,14 +241,13 @@ pub fn is_id_continue(c: char) -> bool {
         || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c))
 }
 
-
 impl Cursor<'_> {
     /// Parses a token from the input string.
     fn advance_token(&mut self) -> Token {
         let first_char = self.bump().unwrap();
         let token_kind = match first_char {
             // Slash, comment or block comment.
-            '/' => match self.nth_char(0) {
+            '/' => match self.first() {
                 '/' => self.line_comment(),
                 '*' => self.block_comment(),
                 _ => Slash,
@@ -257,8 +256,8 @@ impl Cursor<'_> {
             // Whitespace sequence.
             c if is_whitespace(c) => self.whitespace(),
 
-            // Raw string literal or identifier.
-            'r' => match (self.nth_char(0), self.nth_char(1)) {
+            // Raw identifier, raw string literal or identifier.
+            'r' => match (self.first(), self.second()) {
                 ('#', c1) if is_id_start(c1) => self.raw_ident(),
                 ('#', _) | ('"', _) => {
                     let (n_hashes, started, terminated) = self.raw_double_quoted_string();
@@ -273,7 +272,7 @@ impl Cursor<'_> {
             },
 
             // Byte literal, byte string literal, raw byte string literal or identifier.
-            'b' => match (self.nth_char(0), self.nth_char(1)) {
+            'b' => match (self.first(), self.second()) {
                 ('\'', _) => {
                     self.bump();
                     let terminated = self.single_quoted_string();
@@ -366,31 +365,23 @@ impl Cursor<'_> {
     }
 
     fn line_comment(&mut self) -> TokenKind {
-        debug_assert!(self.prev() == '/' && self.nth_char(0) == '/');
+        debug_assert!(self.prev() == '/' && self.first() == '/');
         self.bump();
-        loop {
-            match self.nth_char(0) {
-                '\n' => break,
-                EOF_CHAR if self.is_eof() => break,
-                _ => {
-                    self.bump();
-                }
-            }
-        }
+        self.eat_while(|c| c != '\n');
         LineComment
     }
 
     fn block_comment(&mut self) -> TokenKind {
-        debug_assert!(self.prev() == '/' && self.nth_char(0) == '*');
+        debug_assert!(self.prev() == '/' && self.first() == '*');
         self.bump();
         let mut depth = 1usize;
         while let Some(c) = self.bump() {
             match c {
-                '/' if self.nth_char(0) == '*' => {
+                '/' if self.first() == '*' => {
                     self.bump();
                     depth += 1;
                 }
-                '*' if self.nth_char(0) == '/' => {
+                '*' if self.first() == '/' => {
                     self.bump();
                     depth -= 1;
                     if depth == 0 {
@@ -409,31 +400,27 @@ impl Cursor<'_> {
 
     fn whitespace(&mut self) -> TokenKind {
         debug_assert!(is_whitespace(self.prev()));
-        while is_whitespace(self.nth_char(0)) {
-            self.bump();
-        }
+        self.eat_while(is_whitespace);
         Whitespace
     }
 
     fn raw_ident(&mut self) -> TokenKind {
         debug_assert!(
             self.prev() == 'r'
-                && self.nth_char(0) == '#'
-                && is_id_start(self.nth_char(1))
+                && self.first() == '#'
+                && is_id_start(self.second())
         );
+        // Eat "#" symbol.
         self.bump();
-        self.bump();
-        while is_id_continue(self.nth_char(0)) {
-            self.bump();
-        }
+        // Eat the identifier part of RawIdent.
+        self.eat_identifier();
         RawIdent
     }
 
     fn ident(&mut self) -> TokenKind {
         debug_assert!(is_id_start(self.prev()));
-        while is_id_continue(self.nth_char(0)) {
-            self.bump();
-        }
+        // Start is already eaten, eat the rest of identifier.
+        self.eat_while(is_id_continue);
         Ident
     }
 
@@ -442,7 +429,7 @@ impl Cursor<'_> {
         let mut base = Base::Decimal;
         if first_digit == '0' {
             // Attempt to parse encoding base.
-            let has_digits = match self.nth_char(0) {
+            let has_digits = match self.first() {
                 'b' => {
                     base = Base::Binary;
                     self.bump();
@@ -476,23 +463,23 @@ impl Cursor<'_> {
             self.eat_decimal_digits();
         };
 
-        match self.nth_char(0) {
+        match self.first() {
             // Don't be greedy if this is actually an
             // integer literal followed by field/method access or a range pattern
             // (`0..2` and `12.foo()`)
-            '.' if self.nth_char(1) != '.'
-                && !is_id_start(self.nth_char(1)) =>
+            '.' if self.second() != '.'
+                && !is_id_start(self.second()) =>
             {
                 // might have stuff after the ., and if it does, it needs to start
                 // with a number
                 self.bump();
                 let mut empty_exponent = false;
-                if self.nth_char(0).is_digit(10) {
+                if self.first().is_digit(10) {
                     self.eat_decimal_digits();
-                    match self.nth_char(0) {
+                    match self.first() {
                         'e' | 'E' => {
                             self.bump();
-                            empty_exponent = self.float_exponent().is_err()
+                            empty_exponent = !self.eat_float_exponent();
                         }
                         _ => (),
                     }
@@ -501,7 +488,7 @@ impl Cursor<'_> {
             }
             'e' | 'E' => {
                 self.bump();
-                let empty_exponent = self.float_exponent().is_err();
+                let empty_exponent = !self.eat_float_exponent();
                 Float { base, empty_exponent }
             }
             _ => Int { base, empty_int: false },
@@ -510,65 +497,76 @@ impl Cursor<'_> {
 
     fn lifetime_or_char(&mut self) -> TokenKind {
         debug_assert!(self.prev() == '\'');
-        let mut starts_with_number = false;
-
-        // Check if the first symbol after '\'' is a valid identifier
-        // character or a number (not a digit followed by '\'').
-        if (is_id_start(self.nth_char(0))
-            || self.nth_char(0).is_digit(10) && {
-                starts_with_number = true;
-                true
-            })
-            && self.nth_char(1) != '\''
-        {
-            self.bump();
 
-            // Skip the identifier.
-            while is_id_continue(self.nth_char(0)) {
-                self.bump();
-            }
+        let can_be_a_lifetime = if self.second() == '\'' {
+            // It's surely not a lifetime.
+            false
+        } else {
+            // If the first symbol is valid for identifier, it can be a lifetime.
+            // Also check if it's a number for a better error reporting (so '0 will
+            // be reported as invalid lifetime and not as unterminated char literal).
+            is_id_start(self.first()) || self.first().is_digit(10)
+        };
 
-            return if self.nth_char(0) == '\'' {
-                self.bump();
-                let kind = Char { terminated: true };
-                Literal { kind, suffix_start: self.len_consumed() }
-            } else {
-                Lifetime { starts_with_number }
-            };
+        if !can_be_a_lifetime {
+            let terminated = self.single_quoted_string();
+            let suffix_start = self.len_consumed();
+            if terminated {
+                self.eat_literal_suffix();
+            }
+            let kind = Char { terminated };
+            return Literal { kind, suffix_start };
         }
 
-        // This is not a lifetime (checked above), parse a char literal.
-        let terminated = self.single_quoted_string();
-        let suffix_start = self.len_consumed();
-        if terminated {
-            self.eat_literal_suffix();
+        // Either a lifetime or a character literal with
+        // length greater than 1.
+
+        let starts_with_number = self.first().is_digit(10);
+
+        // Skip the literal contents.
+        // First symbol can be a number (which isn't a valid identifier start),
+        // so skip it without any checks.
+        self.bump();
+        self.eat_while(is_id_continue);
+
+        // Check if after skipping literal contents we've met a closing
+        // single quote (which means that user attempted to create a
+        // string with single quotes).
+        if self.first() == '\'' {
+            self.bump();
+            let kind = Char { terminated: true };
+            return Literal { kind, suffix_start: self.len_consumed() };
         }
-        let kind = Char { terminated };
-        return Literal { kind, suffix_start };
+
+        return Lifetime { starts_with_number };
     }
 
     fn single_quoted_string(&mut self) -> bool {
         debug_assert!(self.prev() == '\'');
-        // Parse `'''` as a single char literal.
-        if self.nth_char(0) == '\'' && self.nth_char(1) == '\'' {
+        // Check if it's a one-symbol literal.
+        if self.second() == '\'' && self.first() != '\\' {
+            self.bump();
             self.bump();
+            return true;
         }
+
+        // Literal has more than one symbol.
+
         // Parse until either quotes are terminated or error is detected.
-        let mut first = true;
         loop {
-            match self.nth_char(0) {
-                // Probably beginning of the comment, which we don't want to include
-                // to the error report.
-                '/' if !first => break,
-                // Newline without following '\'' means unclosed quote, stop parsing.
-                '\n' if self.nth_char(1) != '\'' => break,
-                // End of file, stop parsing.
-                EOF_CHAR if self.is_eof() => break,
+            match self.first() {
                 // Quotes are terminated, finish parsing.
                 '\'' => {
                     self.bump();
                     return true;
                 }
+                // Probably beginning of the comment, which we don't want to include
+                // to the error report.
+                '/' => break,
+                // Newline without following '\'' means unclosed quote, stop parsing.
+                '\n' if self.second() != '\'' => break,
+                // End of file, stop parsing.
+                EOF_CHAR if self.is_eof() => break,
                 // Escaped slash is considered one character, so bump twice.
                 '\\' => {
                     self.bump();
@@ -579,8 +577,8 @@ impl Cursor<'_> {
                     self.bump();
                 }
             }
-            first = false;
         }
+        // String was not terminated.
         false
     }
 
@@ -588,62 +586,71 @@ impl Cursor<'_> {
     /// if string is terminated.
     fn double_quoted_string(&mut self) -> bool {
         debug_assert!(self.prev() == '"');
-        loop {
-            match self.nth_char(0) {
+        while let Some(c) = self.bump() {
+            match c {
                 '"' => {
-                    self.bump();
                     return true;
                 }
-                EOF_CHAR if self.is_eof() => return false,
-                '\\' if self.nth_char(1) == '\\' || self.nth_char(1) == '"' => {
+                '\\' if self.first() == '\\' || self.first() == '"' => {
+                    // Bump again to skip escaped character.
                     self.bump();
                 }
                 _ => (),
             }
-            self.bump();
         }
+        // End of file reached.
+        false
     }
 
     /// Eats the double-quoted string and returns a tuple of
     /// (amount of the '#' symbols, raw string started, raw string terminated)
     fn raw_double_quoted_string(&mut self) -> (usize, bool, bool) {
         debug_assert!(self.prev() == 'r');
+        let mut started: bool = false;
+        let mut finished: bool = false;
+
         // Count opening '#' symbols.
-        let n_hashes = {
-            let mut acc: usize = 0;
-            loop {
-                match self.bump() {
-                    Some('#') => acc += 1,
-                    Some('"') => break acc,
-                    None | Some(_) => return (acc, false, false),
-                }
+        let n_hashes = self.eat_while(|c| c == '#');
+
+        // Check that string is started.
+        match self.bump() {
+            Some('"') => started = true,
+            _ => return (n_hashes, started, finished),
+        }
+
+        // Skip the string contents and on each '#' character met, check if this is
+        // a raw string termination.
+        while !finished {
+            self.eat_while(|c| c != '"');
+
+            if self.is_eof() {
+                return (n_hashes, started, finished);
             }
-        };
 
-        // Skip the string itself and check that amount of closing '#'
-        // symbols is equal to the amount of opening ones.
-        loop {
-            match self.bump() {
-                Some('"') => {
-                    let mut acc = n_hashes;
-                    while self.nth_char(0) == '#' && acc > 0 {
-                        self.bump();
-                        acc -= 1;
-                    }
-                    if acc == 0 {
-                        return (n_hashes, true, true);
-                    }
+            // Eat closing double quote.
+            self.bump();
+
+            // Check that amount of closing '#' symbols
+            // is equal to the amount of opening ones.
+            let mut hashes_left = n_hashes;
+            let is_closing_hash = |c| {
+                if c == '#' && hashes_left != 0 {
+                    hashes_left -= 1;
+                    true
+                } else {
+                    false
                 }
-                Some(_) => (),
-                None => return (n_hashes, true, false),
-            }
+            };
+            finished = self.eat_while(is_closing_hash) == n_hashes;
         }
+
+        (n_hashes, started, finished)
     }
 
     fn eat_decimal_digits(&mut self) -> bool {
         let mut has_digits = false;
         loop {
-            match self.nth_char(0) {
+            match self.first() {
                 '_' => {
                     self.bump();
                 }
@@ -660,7 +667,7 @@ impl Cursor<'_> {
     fn eat_hexadecimal_digits(&mut self) -> bool {
         let mut has_digits = false;
         loop {
-            match self.nth_char(0) {
+            match self.first() {
                 '_' => {
                     self.bump();
                 }
@@ -674,23 +681,43 @@ impl Cursor<'_> {
         has_digits
     }
 
-    fn float_exponent(&mut self) -> Result<(), ()> {
+    /// Eats the float exponent. Returns true if at least one digit was met,
+    /// and returns false otherwise.
+    fn eat_float_exponent(&mut self) -> bool {
         debug_assert!(self.prev() == 'e' || self.prev() == 'E');
-        if self.nth_char(0) == '-' || self.nth_char(0) == '+' {
+        if self.first() == '-' || self.first() == '+' {
             self.bump();
         }
-        if self.eat_decimal_digits() { Ok(()) } else { Err(()) }
+        self.eat_decimal_digits()
     }
 
-    // Eats the suffix if it's an identifier.
+    // Eats the suffix of the literal, e.g. "_u8".
     fn eat_literal_suffix(&mut self) {
-        if !is_id_start(self.nth_char(0)) {
+        self.eat_identifier();
+    }
+
+    // Eats the identifier.
+    fn eat_identifier(&mut self) {
+        if !is_id_start(self.first()) {
             return;
         }
         self.bump();
 
-        while is_id_continue(self.nth_char(0)) {
+        self.eat_while(is_id_continue);
+    }
+
+    /// Eats symbols while predicate returns true or until the end of file is reached.
+    /// Returns amount of eaten symbols.
+    fn eat_while<F>(&mut self, mut predicate: F) -> usize
+    where
+        F: FnMut(char) -> bool
+    {
+        let mut eaten: usize = 0;
+        while predicate(self.first()) && !self.is_eof() {
+            eaten += 1;
             self.bump();
         }
+
+        eaten
     }
 }
author	Mazdak Farrokhzad <twingoow@gmail.com>	2019-11-06 07:03:05 +0100
committer	GitHub <noreply@github.com>	2019-11-06 07:03:05 +0100
commit	81550a00d1e8f83bbaef31205e72b27fdb796d35 (patch)
tree	bd7b39fcc7320311a711ebc9400eb1c9d2f0254a
parent	24af0c94b30271198d6101cdd14c78182f76fd07 (diff)
parent	31735b02c95510f2e236ebd773b02e84ee6e1a5b (diff)
download	rust-81550a00d1e8f83bbaef31205e72b27fdb796d35.tar.gz rust-81550a00d1e8f83bbaef31205e72b27fdb796d35.zip