From aa6bfaf04b258e3e23d3f7063de4f2d37845ddec Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Mon, 26 Sep 2022 09:18:23 +1000 Subject: Make `rustc_lexer::cursor::Cursor` public. `Cursor` is currently hidden, and the main tokenization path uses `rustc_lexer::first_token` which involves constructing a new `Cursor` for every single token, which is weird. Also, `first_token` can't handle empty input, so callers have to check for that first. This commit makes `Cursor` public, so `StringReader` can contain a `Cursor`, which results in a simpler structure. The commit also changes `StringReader::advance_token` so it returns an `Option<Token>`, simplifying the empty input case. --- src/librustdoc/html/highlight.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'src') diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs index 8922bf37785..0870d6f3824 100644 --- a/src/librustdoc/html/highlight.rs +++ b/src/librustdoc/html/highlight.rs @@ -13,6 +13,7 @@ use std::collections::VecDeque; use std::fmt::{Display, Write}; use rustc_data_structures::fx::FxHashMap; +use rustc_lexer::cursor::Cursor; use rustc_lexer::{LiteralKind, TokenKind}; use rustc_span::edition::Edition; use rustc_span::symbol::Symbol; @@ -408,15 +409,13 @@ enum Highlight<'a> { struct TokenIter<'a> { src: &'a str, + cursor: Cursor<'a>, } impl<'a> Iterator for TokenIter<'a> { type Item = (TokenKind, &'a str); fn next(&mut self) -> Option<(TokenKind, &'a str)> { - if self.src.is_empty() { - return None; - } - let token = rustc_lexer::first_token(self.src); + let token = self.cursor.advance_token()?; let (text, rest) = self.src.split_at(token.len as usize); self.src = rest; Some((token.kind, text)) @@ -525,7 +524,7 @@ impl<'a> Classifier<'a> { /// Takes as argument the source code to HTML-ify, the rust edition to use and the source code /// file span which will be used later on by the `span_correspondance_map`. 
fn new(src: &str, file_span: Span, decoration_info: Option<DecorationInfo>) -> Classifier<'_> { - let tokens = PeekIter::new(TokenIter { src }); + let tokens = PeekIter::new(TokenIter { src, cursor: Cursor::new(src) }); let decorations = decoration_info.map(Decorations::new); Classifier { tokens, -- cgit 1.4.1-3-g733a5 From da84f0f4c31914c14dd03628395e9c53f28b88b9 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Mon, 26 Sep 2022 13:06:15 +1000 Subject: Add `rustc_lexer::TokenKind::Eof`. For alignment with `rustc_ast::TokenKind::Eof`. Plus it's a bit faster, due to less `Option` manipulation in `StringReader::next_token`. --- compiler/rustc_lexer/src/lib.rs | 17 +++++++++++++---- compiler/rustc_parse/src/lexer/mod.rs | 10 ++-------- src/librustdoc/html/highlight.rs | 6 +++++- 3 files changed, 20 insertions(+), 13 deletions(-) (limited to 'src') diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index 69e772c6924..18ebed7c70e 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -139,6 +139,9 @@ pub enum TokenKind { /// Unknown token, not expected by the lexer, e.g. "№" Unknown, + + /// End of input. + Eof, } #[derive(Clone, Copy, Debug, PartialEq, Eq)] @@ -235,7 +238,10 @@ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError> /// Creates an iterator that produces tokens from the input string. pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ { let mut cursor = Cursor::new(input); - std::iter::from_fn(move || cursor.advance_token()) + std::iter::from_fn(move || { + let token = cursor.advance_token(); + if token.kind != TokenKind::Eof { Some(token) } else { None } + }) } /// True if `c` is considered a whitespace according to Rust language definition. @@ -297,8 +303,11 @@ pub fn is_ident(string: &str) -> bool { impl Cursor<'_> { /// Parses a token from the input string. 
- pub fn advance_token(&mut self) -> Option<Token> { - let first_char = self.bump()?; + pub fn advance_token(&mut self) -> Token { + let first_char = match self.bump() { + Some(c) => c, + None => return Token::new(TokenKind::Eof, 0), + }; let token_kind = match first_char { // Slash, comment or block comment. '/' => match self.first() { @@ -419,7 +428,7 @@ impl Cursor<'_> { } _ => Unknown, }; - let res = Some(Token::new(token_kind, self.pos_within_token())); + let res = Token::new(token_kind, self.pos_within_token()); self.reset_pos_within_token(); res } diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 0f9d585230e..67fefd19d8b 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -80,14 +80,7 @@ impl<'a> StringReader<'a> { // Skip trivial (whitespace & comments) tokens loop { - let token = match self.cursor.advance_token() { - Some(token) => token, - None => { - let span = self.mk_sp(self.pos, self.pos); - return (Token::new(token::Eof, span), preceded_by_whitespace); - } - }; - + let token = self.cursor.advance_token(); let start = self.pos; self.pos = self.pos + BytePos(token.len); @@ -327,6 +320,7 @@ impl<'a> StringReader<'a> { err.emit(); token? 
} + rustc_lexer::TokenKind::Eof => token::Eof, }) } diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs index 0870d6f3824..ea65a6334c9 100644 --- a/src/librustdoc/html/highlight.rs +++ b/src/librustdoc/html/highlight.rs @@ -415,7 +415,10 @@ struct TokenIter<'a> { impl<'a> Iterator for TokenIter<'a> { type Item = (TokenKind, &'a str); fn next(&mut self) -> Option<(TokenKind, &'a str)> { - let token = self.cursor.advance_token()?; + let token = self.cursor.advance_token(); + if token.kind == TokenKind::Eof { + return None; + } let (text, rest) = self.src.split_at(token.len as usize); self.src = rest; Some((token.kind, text)) @@ -849,6 +852,7 @@ impl<'a> Classifier<'a> { Class::Ident(self.new_span(before, text)) } TokenKind::Lifetime { .. } => Class::Lifetime, + TokenKind::Eof => panic!("Eof in advance"), }; // Anything that didn't return above is the simple case where we the // class just spans a single token, so we can use the `string` method. -- cgit 1.4.1-3-g733a5 From d0a26acb2ae2d000e516eca92ae8feb08d1f6ea0 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 28 Sep 2022 10:28:36 +1000 Subject: Address review comments. --- compiler/rustc_lexer/src/lib.rs | 6 ++++-- compiler/rustc_parse/src/lexer/mod.rs | 2 +- compiler/rustc_parse/src/lexer/tokentrees.rs | 14 +++++++------- src/librustdoc/html/highlight.rs | 2 +- 4 files changed, 13 insertions(+), 11 deletions(-) (limited to 'src') diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index 18ebed7c70e..c71e6ffe34d 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -23,15 +23,17 @@ // We want to be able to build this crate with a stable compiler, so no // `#![feature]` attributes should be added. 
-pub mod cursor; +mod cursor; pub mod unescape; #[cfg(test)] mod tests; +pub use crate::cursor::Cursor; + use self::LiteralKind::*; use self::TokenKind::*; -use crate::cursor::{Cursor, EOF_CHAR}; +use crate::cursor::EOF_CHAR; use std::convert::TryFrom; /// Parsed token. diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 151e80e2b3e..bcd078a8967 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -4,8 +4,8 @@ use rustc_ast::token::{self, CommentKind, Delimiter, Token, TokenKind}; use rustc_ast::tokenstream::TokenStream; use rustc_ast::util::unicode::contains_text_flow_control_chars; use rustc_errors::{error_code, Applicability, DiagnosticBuilder, ErrorGuaranteed, PResult}; -use rustc_lexer::cursor::Cursor; use rustc_lexer::unescape::{self, Mode}; +use rustc_lexer::Cursor; use rustc_lexer::{Base, DocStyle, RawStrError}; use rustc_session::lint::builtin::{ RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, TEXT_DIRECTION_CODEPOINT_IN_COMMENT, diff --git a/compiler/rustc_parse/src/lexer/tokentrees.rs b/compiler/rustc_parse/src/lexer/tokentrees.rs index 6f6ab16cb59..364753154db 100644 --- a/compiler/rustc_parse/src/lexer/tokentrees.rs +++ b/compiler/rustc_parse/src/lexer/tokentrees.rs @@ -53,7 +53,7 @@ impl<'a> TokenTreesReader<'a> { token::OpenDelim(delim) => buf.push(self.parse_token_tree_open_delim(delim)), token::CloseDelim(delim) => return Err(self.close_delim_err(delim)), token::Eof => return Ok(buf.into_token_stream()), - _ => buf.push(self.parse_token_tree_other()), + _ => buf.push(self.parse_token_tree_non_delim_non_eof()), } } } @@ -66,11 +66,10 @@ impl<'a> TokenTreesReader<'a> { token::OpenDelim(delim) => buf.push(self.parse_token_tree_open_delim(delim)), token::CloseDelim(..) 
=> return buf.into_token_stream(), token::Eof => { - let mut err = self.eof_err(); - err.emit(); + self.eof_err().emit(); return buf.into_token_stream(); } - _ => buf.push(self.parse_token_tree_other()), + _ => buf.push(self.parse_token_tree_non_delim_non_eof()), } } } @@ -245,9 +244,10 @@ impl<'a> TokenTreesReader<'a> { } #[inline] - fn parse_token_tree_other(&mut self) -> TokenTree { - // `spacing` for the returned token is determined by the next token: - // its kind and its `preceded_by_whitespace` status. + fn parse_token_tree_non_delim_non_eof(&mut self) -> TokenTree { + // `this_spacing` for the returned token refers to whether the token is + // immediately followed by another op token. It is determined by the + // next token: its kind and its `preceded_by_whitespace` status. let (next_tok, is_next_tok_preceded_by_whitespace) = self.string_reader.next_token(); let this_spacing = if is_next_tok_preceded_by_whitespace || !next_tok.is_op() { Spacing::Alone diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs index ea65a6334c9..78b98431b19 100644 --- a/src/librustdoc/html/highlight.rs +++ b/src/librustdoc/html/highlight.rs @@ -13,7 +13,7 @@ use std::collections::VecDeque; use std::fmt::{Display, Write}; use rustc_data_structures::fx::FxHashMap; -use rustc_lexer::cursor::Cursor; +use rustc_lexer::Cursor; use rustc_lexer::{LiteralKind, TokenKind}; use rustc_span::edition::Edition; use rustc_span::symbol::Symbol; -- cgit 1.4.1-3-g733a5