lexer: lex WS/COMMENT/SHEBANG rather than skipping

Now, the lexer will categorize every byte in its input according to the grammar. The parser skips over these while parsing, thus avoiding their presence in the input to syntax extensions.
author: Corey Richardson <corey@octayn.net> 2014-07-04 22:30:39 -0700
committer: Corey Richardson <corey@octayn.net> 2014-07-09 00:06:29 -0700
commit: f512779554a436d11dd9ffde4c198da6241dfd58 (patch)
tree: 6431e33802c11c3ba27484028fb93d2451e3373e /src/libsyntax
parent: cc4213418e3ab225867d8e3911f592481b1bbffc (diff)
download: rust-f512779554a436d11dd9ffde4c198da6241dfd58.tar.gz
rust-f512779554a436d11dd9ffde4c198da6241dfd58.zip
5 files changed, 118 insertions, 62 deletions
diff --git a/src/libsyntax/parse/attr.rs b/src/libsyntax/parse/attr.rs
index b2297ec770c..c227d8a0fed 100644
--- a/src/libsyntax/parse/attr.rs
+++ b/src/libsyntax/parse/attr.rs
@@ -34,7 +34,7 @@ impl<'a> ParserAttr for Parser<'a> {
     fn parse_outer_attributes(&mut self) -> Vec<ast::Attribute> {
         let mut attrs: Vec<ast::Attribute> = Vec::new();
         loop {
-            debug!("parse_outer_attributes: self.token={:?}",
+            debug!("parse_outer_attributes: self.token={}",
                    self.token);
             match self.token {
               token::POUND => {
diff --git a/src/libsyntax/parse/lexer/comments.rs b/src/libsyntax/parse/lexer/comments.rs
index c5dd10382a9..3f3a8a723f1 100644
--- a/src/libsyntax/parse/lexer/comments.rs
+++ b/src/libsyntax/parse/lexer/comments.rs
@@ -13,7 +13,7 @@ use codemap::{BytePos, CharPos, CodeMap, Pos};
 use diagnostic;
 use parse::lexer::{is_whitespace, Reader};
 use parse::lexer::{StringReader, TokenAndSpan};
-use parse::lexer::{is_line_non_doc_comment, is_block_non_doc_comment};
+use parse::lexer::is_block_doc_comment;
 use parse::lexer;
 use parse::token;
 
@@ -42,9 +42,9 @@ pub struct Comment {
 }
 
 pub fn is_doc_comment(s: &str) -> bool {
-    (s.starts_with("///") && !is_line_non_doc_comment(s)) ||
+    (s.starts_with("///") && super::is_doc_comment(s)) ||
     s.starts_with("//!") ||
-    (s.starts_with("/**") && !is_block_non_doc_comment(s)) ||
+    (s.starts_with("/**") && is_block_doc_comment(s)) ||
     s.starts_with("/*!")
 }
 
@@ -260,7 +260,7 @@ fn read_block_comment(rdr: &mut StringReader,
             rdr.bump();
             rdr.bump();
         }
-        if !is_block_non_doc_comment(curr_line.as_slice()) {
+        if is_block_doc_comment(curr_line.as_slice()) {
             return
         }
         assert!(!curr_line.as_slice().contains_char('\n'));
diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs
index 61a37f77d34..947f3d59b86 100644
--- a/src/libsyntax/parse/lexer/mod.rs
+++ b/src/libsyntax/parse/lexer/mod.rs
@@ -187,7 +187,7 @@ impl<'a> StringReader<'a> {
     /// Advance peek_tok and peek_span to refer to the next token, and
     /// possibly update the interner.
     fn advance_token(&mut self) {
-        match self.consume_whitespace_and_comments() {
+        match self.scan_whitespace_or_comment() {
             Some(comment) => {
                 self.peek_span = comment.sp;
                 self.peek_tok = comment.tok;
@@ -339,8 +339,7 @@ impl<'a> StringReader<'a> {
 
     /// PRECONDITION: self.curr is not whitespace
     /// Eats any kind of comment.
-    /// Returns a Some(sugared-doc-attr) if one exists, None otherwise
-    fn consume_any_line_comment(&mut self) -> Option<TokenAndSpan> {
+    fn scan_comment(&mut self) -> Option<TokenAndSpan> {
         match self.curr {
             Some(c) => {
                 if c.is_whitespace() {
@@ -375,28 +374,32 @@ impl<'a> StringReader<'a> {
                             }
                             self.bump();
                         }
-                        let ret = self.with_str_from(start_bpos, |string| {
+                        return self.with_str_from(start_bpos, |string| {
                             // but comments with only more "/"s are not
-                            if !is_line_non_doc_comment(string) {
-                                Some(TokenAndSpan{
-                                    tok: token::DOC_COMMENT(str_to_ident(string)),
-                                    sp: codemap::mk_sp(start_bpos, self.last_pos)
-                                })
+                            let tok = if is_doc_comment(string) {
+                                token::DOC_COMMENT(str_to_ident(string))
                             } else {
-                                None
-                            }
-                        });
+                                token::COMMENT
+                            };
 
-                        if ret.is_some() {
-                            return ret;
-                        }
+                            return Some(TokenAndSpan{
+                                tok: tok,
+                                sp: codemap::mk_sp(start_bpos, self.last_pos)
+                            });
+                        });
                     } else {
+                        let start_bpos = self.last_pos - BytePos(2);
                         while !self.curr_is('\n') && !self.is_eof() { self.bump(); }
+                        return Some(TokenAndSpan {
+                            tok: token::COMMENT,
+                            sp: codemap::mk_sp(start_bpos, self.last_pos)
+                        });
                     }
-                    // Restart whitespace munch.
-                    self.consume_whitespace_and_comments()
                 }
-                Some('*') => { self.bump(); self.bump(); self.consume_block_comment() }
+                Some('*') => {
+                    self.bump(); self.bump();
+                    self.scan_block_comment()
+                }
                 _ => None
             }
         } else if self.curr_is('#') {
@@ -412,9 +415,15 @@ impl<'a> StringReader<'a> {
                 let cmap = CodeMap::new();
                 cmap.files.borrow_mut().push(self.filemap.clone());
                 let loc = cmap.lookup_char_pos_adj(self.last_pos);
+                debug!("Skipping a shebang");
                 if loc.line == 1u && loc.col == CharPos(0u) {
+                    // FIXME: Add shebang "token", return it
+                    let start = self.last_pos;
                     while !self.curr_is('\n') && !self.is_eof() { self.bump(); }
-                    return self.consume_whitespace_and_comments();
+                    return Some(TokenAndSpan {
+                        tok: token::SHEBANG(self.ident_from(start)),
+                        sp: codemap::mk_sp(start, self.last_pos)
+                    });
                 }
             }
             None
@@ -423,15 +432,33 @@ impl<'a> StringReader<'a> {
         }
     }
 
-    /// EFFECT: eats whitespace and comments.
-    /// Returns a Some(sugared-doc-attr) if one exists, None otherwise.
-    fn consume_whitespace_and_comments(&mut self) -> Option<TokenAndSpan> {
-        while is_whitespace(self.curr) { self.bump(); }
-        return self.consume_any_line_comment();
+    /// If there is whitespace, shebang, or a comment, scan it. Otherwise,
+    /// return None.
+    fn scan_whitespace_or_comment(&mut self) -> Option<TokenAndSpan> {
+        match self.curr.unwrap_or('\0') {
+            // # to handle shebang at start of file -- this is the entry point
+            // for skipping over all "junk"
+            '/' | '#' => {
+                let c = self.scan_comment();
+                debug!("scanning a comment {}", c);
+                c
+            },
+            c if is_whitespace(Some(c)) => {
+                let start_bpos = self.last_pos;
+                while is_whitespace(self.curr) { self.bump(); }
+                let c = Some(TokenAndSpan {
+                    tok: token::WS,
+                    sp: codemap::mk_sp(start_bpos, self.last_pos)
+                });
+                debug!("scanning whitespace: {}", c);
+                c
+            },
+            _ => None
+        }
     }
 
     /// Might return a sugared-doc-attr
-    fn consume_block_comment(&mut self) -> Option<TokenAndSpan> {
+    fn scan_block_comment(&mut self) -> Option<TokenAndSpan> {
         // block comments starting with "/**" or "/*!" are doc-comments
         let is_doc_comment = self.curr_is('*') || self.curr_is('!');
         let start_bpos = self.last_pos - BytePos(2);
@@ -466,28 +493,23 @@ impl<'a> StringReader<'a> {
             self.bump();
         }
 
-        let res = if is_doc_comment {
-            self.with_str_from(start_bpos, |string| {
-                // but comments with only "*"s between two "/"s are not
-                if !is_block_non_doc_comment(string) {
-                    let string = if has_cr {
-                        self.translate_crlf(start_bpos, string,
-                                            "bare CR not allowed in block doc-comment")
-                    } else { string.into_maybe_owned() };
-                    Some(TokenAndSpan{
-                            tok: token::DOC_COMMENT(str_to_ident(string.as_slice())),
-                            sp: codemap::mk_sp(start_bpos, self.last_pos)
-                        })
-                } else {
-                    None
-                }
-            })
-        } else {
-            None
-        };
+        self.with_str_from(start_bpos, |string| {
+            // but comments with only "*"s between two "/"s are not
+            let tok = if is_block_doc_comment(string) {
+                let string = if has_cr {
+                    self.translate_crlf(start_bpos, string,
+                                        "bare CR not allowed in block doc-comment")
+                } else { string.into_maybe_owned() };
+                token::DOC_COMMENT(str_to_ident(string.as_slice()))
+            } else {
+                token::COMMENT
+            };
 
-        // restart whitespace munch.
-        if res.is_some() { res } else { self.consume_whitespace_and_comments() }
+            Some(TokenAndSpan{
+                tok: tok,
+                sp: codemap::mk_sp(start_bpos, self.last_pos)
+            })
+        })
     }
 
     /// Scan through any digits (base `radix`) or underscores, and return how
@@ -1242,12 +1264,18 @@ fn in_range(c: Option<char>, lo: char, hi: char) -> bool {
 
 fn is_dec_digit(c: Option<char>) -> bool { return in_range(c, '0', '9'); }
 
-pub fn is_line_non_doc_comment(s: &str) -> bool {
-    s.starts_with("////")
+pub fn is_doc_comment(s: &str) -> bool {
+    let res = (s.starts_with("///") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'/')
+              || s.starts_with("//!");
+    debug!("is `{}` a doc comment? {}", s, res);
+    res
 }
 
-pub fn is_block_non_doc_comment(s: &str) -> bool {
-    s.starts_with("/***")
+pub fn is_block_doc_comment(s: &str) -> bool {
+    let res = (s.starts_with("/**") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'*')
+              || s.starts_with("/*!");
+    debug!("is `{}` a doc comment? {}", s, res);
+    res
 }
 
 fn ident_start(c: Option<char>) -> bool {
@@ -1383,9 +1411,9 @@ mod test {
     }
 
     #[test] fn line_doc_comments() {
-        assert!(!is_line_non_doc_comment("///"));
-        assert!(!is_line_non_doc_comment("/// blah"));
-        assert!(is_line_non_doc_comment("////"));
+        assert!(is_doc_comment("///"));
+        assert!(is_doc_comment("/// blah"));
+        assert!(!is_doc_comment("////"));
     }
 
     #[test] fn nested_block_comments() {
diff --git a/src/libsyntax/parse/parser.rs b/src/libsyntax/parse/parser.rs
index e0bcb41a753..51f2c74d3ae 100644
--- a/src/libsyntax/parse/parser.rs
+++ b/src/libsyntax/parse/parser.rs
@@ -325,10 +325,24 @@ fn is_plain_ident_or_underscore(t: &token::Token) -> bool {
     is_plain_ident(t) || *t == token::UNDERSCORE
 }
 
+/// Get a token the parser cares about
+fn real_token(rdr: &mut Reader) -> TokenAndSpan {
+    let mut t = rdr.next_token();
+    loop {
+        match t.tok {
+            token::WS | token::COMMENT | token::SHEBANG(_) => {
+                t = rdr.next_token();
+            },
+            _ => break
+        }
+    }
+    t
+}
+
 impl<'a> Parser<'a> {
     pub fn new(sess: &'a ParseSess, cfg: ast::CrateConfig,
                mut rdr: Box<Reader>) -> Parser<'a> {
-        let tok0 = rdr.next_token();
+        let tok0 = real_token(rdr);
         let span = tok0.sp;
         let placeholder = TokenAndSpan {
             tok: token::UNDERSCORE,
@@ -864,7 +878,7 @@ impl<'a> Parser<'a> {
             None
         };
         let next = if self.buffer_start == self.buffer_end {
-            self.reader.next_token()
+            real_token(self.reader)
         } else {
             // Avoid token copies with `replace`.
             let buffer_start = self.buffer_start as uint;
@@ -908,7 +922,7 @@ impl<'a> Parser<'a> {
                       -> R {
         let dist = distance as int;
         while self.buffer_length() < dist {
-            self.buffer[self.buffer_end as uint] = self.reader.next_token();
+            self.buffer[self.buffer_end as uint] = real_token(self.reader);
             self.buffer_end = (self.buffer_end + 1) & 3;
         }
         f(&self.buffer[((self.buffer_start + dist - 1) & 3) as uint].tok)
diff --git a/src/libsyntax/parse/token.rs b/src/libsyntax/parse/token.rs
index 83d373d033b..e65f9f208a3 100644
--- a/src/libsyntax/parse/token.rs
+++ b/src/libsyntax/parse/token.rs
@@ -97,8 +97,18 @@ pub enum Token {
 
     /* For interpolation */
     INTERPOLATED(Nonterminal),
-
     DOC_COMMENT(Ident),
+
+    // Junk. These carry no data because we don't really care about the data
+    // they *would* carry, and don't really want to allocate a new ident for
+    // them. Instead, users could extract that from the associated span.
+
+    /// Whitespace
+    WS,
+    /// Comment
+    COMMENT,
+    SHEBANG(Ident),
+
     EOF,
 }
 
@@ -231,6 +241,10 @@ pub fn to_string(t: &Token) -> String {
       /* Other */
       DOC_COMMENT(s) => get_ident(s).get().to_string(),
       EOF => "<eof>".to_string(),
+      WS => " ".to_string(),
+      COMMENT => "/* */".to_string(),
+      SHEBANG(s) => format!("/* shebang: {}*/", s.as_str()),
+
       INTERPOLATED(ref nt) => {
         match nt {
             &NtExpr(ref e) => ::print::pprust::expr_to_string(&**e),
author	Corey Richardson <corey@octayn.net>	2014-07-04 22:30:39 -0700
committer	Corey Richardson <corey@octayn.net>	2014-07-09 00:06:29 -0700
commit	f512779554a436d11dd9ffde4c198da6241dfd58 (patch)
tree	6431e33802c11c3ba27484028fb93d2451e3373e /src/libsyntax
parent	cc4213418e3ab225867d8e3911f592481b1bbffc (diff)
download	rust-f512779554a436d11dd9ffde4c198da6241dfd58.tar.gz rust-f512779554a436d11dd9ffde4c198da6241dfd58.zip