| author | Nicholas Nethercote <n.nethercote@gmail.com> | 2022-09-26 09:18:23 +1000 |
|---|---|---|
| committer | Nicholas Nethercote <n.nethercote@gmail.com> | 2022-09-26 13:36:35 +1000 |
| commit | aa6bfaf04b258e3e23d3f7063de4f2d37845ddec (patch) | |
| tree | b6735a3ee3d176bdadcb589e37503dd4d22c42a6 | |
| parent | 33516ac09af7038efce6332afdedc758a3943609 (diff) | |
| download | rust-aa6bfaf04b258e3e23d3f7063de4f2d37845ddec.tar.gz rust-aa6bfaf04b258e3e23d3f7063de4f2d37845ddec.zip | |
Make `rustc_lexer::cursor::Cursor` public.
`Cursor` is currently hidden, and the main tokenization path uses `rustc_lexer::first_token`, which constructs a new `Cursor` for every single token, which is weird. `first_token` also can't handle empty input, so callers have to check for that first.

This commit makes `Cursor` public, so `StringReader` can contain a `Cursor`, which results in a simpler structure. It also changes `Cursor::advance_token` so it returns an `Option<Token>`, simplifying the empty-input case.
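To make the new shape concrete, here is a minimal sketch of driving the now-public `Cursor` directly. It assumes only what this commit introduces (`cursor::Cursor::new` and `advance_token` returning `Option<Token>`) plus the `kind` and `len` fields of `Token` that the call sites in this diff already use; the printing is purely illustrative:

```rust
use rustc_lexer::cursor::Cursor;

fn lex_all(src: &str) {
    // One `Cursor` for the entire input, instead of constructing a
    // fresh one per token as the old `first_token` path did.
    let mut cursor = Cursor::new(src);

    // `advance_token` returns `None` once the input is exhausted, so
    // empty input needs no special-casing by the caller.
    while let Some(token) = cursor.advance_token() {
        println!("{:?}: {} bytes", token.kind, token.len);
    }
}
```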
| -rw-r--r-- | compiler/rustc_lexer/src/cursor.rs | 4 |
| -rw-r--r-- | compiler/rustc_lexer/src/lib.rs | 26 |
| -rw-r--r-- | compiler/rustc_parse/src/lexer/mod.rs | 23 |
| -rw-r--r-- | src/librustdoc/html/highlight.rs | 9 |

4 files changed, 26 insertions, 36 deletions
diff --git a/compiler/rustc_lexer/src/cursor.rs b/compiler/rustc_lexer/src/cursor.rs
index 21557a9c854..df9b6afdf56 100644
--- a/compiler/rustc_lexer/src/cursor.rs
+++ b/compiler/rustc_lexer/src/cursor.rs
@@ -4,7 +4,7 @@ use std::str::Chars;
 ///
 /// Next characters can be peeked via `first` method,
 /// and position can be shifted forward via `bump` method.
-pub(crate) struct Cursor<'a> {
+pub struct Cursor<'a> {
     initial_len: usize,
     /// Iterator over chars. Slightly faster than a &str.
     chars: Chars<'a>,
@@ -15,7 +15,7 @@ pub(crate) struct Cursor<'a> {
 pub(crate) const EOF_CHAR: char = '\0';
 
 impl<'a> Cursor<'a> {
-    pub(crate) fn new(input: &'a str) -> Cursor<'a> {
+    pub fn new(input: &'a str) -> Cursor<'a> {
         Cursor {
             initial_len: input.len(),
             chars: input.chars(),
diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs
index a79c982649a..9182b649bf3 100644
--- a/compiler/rustc_lexer/src/lib.rs
+++ b/compiler/rustc_lexer/src/lib.rs
@@ -23,7 +23,7 @@
 // We want to be able to build this crate with a stable compiler, so no
 // `#![feature]` attributes should be added.
 
-mod cursor;
+pub mod cursor;
 pub mod unescape;
 
 #[cfg(test)]
@@ -219,13 +219,6 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
     None
 }
 
-/// Parses the first token from the provided input string.
-#[inline]
-pub fn first_token(input: &str) -> Token {
-    debug_assert!(!input.is_empty());
-    Cursor::new(input).advance_token()
-}
-
 /// Validates a raw string literal. Used for getting more information about a
 /// problem with a `RawStr`/`RawByteStr` with a `None` field.
 #[inline]
@@ -242,14 +235,7 @@ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError>
 /// Creates an iterator that produces tokens from the input string.
 pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
     let mut cursor = Cursor::new(input);
-    std::iter::from_fn(move || {
-        if cursor.is_eof() {
-            None
-        } else {
-            cursor.reset_len_consumed();
-            Some(cursor.advance_token())
-        }
-    })
+    std::iter::from_fn(move || cursor.advance_token())
 }
 
 /// True if `c` is considered a whitespace according to Rust language definition.
@@ -311,8 +297,8 @@ pub fn is_ident(string: &str) -> bool {
 
 impl Cursor<'_> {
     /// Parses a token from the input string.
-    fn advance_token(&mut self) -> Token {
-        let first_char = self.bump().unwrap();
+    pub fn advance_token(&mut self) -> Option<Token> {
+        let first_char = self.bump()?;
         let token_kind = match first_char {
             // Slash, comment or block comment.
             '/' => match self.first() {
@@ -433,7 +419,9 @@ impl Cursor<'_> {
             }
             _ => Unknown,
         };
-        Token::new(token_kind, self.len_consumed())
+        let res = Some(Token::new(token_kind, self.len_consumed()));
+        self.reset_len_consumed();
+        res
     }
 
     fn line_comment(&mut self) -> TokenKind {
diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
index bdc8e96b889..c182e86332a 100644
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@@ -4,6 +4,7 @@ use rustc_ast::token::{self, CommentKind, Delimiter, Token, TokenKind};
 use rustc_ast::tokenstream::TokenStream;
 use rustc_ast::util::unicode::contains_text_flow_control_chars;
 use rustc_errors::{error_code, Applicability, DiagnosticBuilder, ErrorGuaranteed, PResult};
+use rustc_lexer::cursor::Cursor;
 use rustc_lexer::unescape::{self, Mode};
 use rustc_lexer::{Base, DocStyle, RawStrError};
 use rustc_session::lint::builtin::{
@@ -48,7 +49,9 @@ pub(crate) fn parse_token_trees<'a>(
         start_pos = start_pos + BytePos::from_usize(shebang_len);
     }
 
-    let string_reader = StringReader { sess, start_pos, pos: start_pos, src, override_span };
+    let cursor = Cursor::new(src);
+    let string_reader =
+        StringReader { sess, start_pos, pos: start_pos, src, cursor, override_span };
     tokentrees::TokenTreesReader::parse_token_trees(string_reader)
 }
 
@@ -60,6 +63,8 @@ struct StringReader<'a> {
     pos: BytePos,
     /// Source text to tokenize.
     src: &'a str,
+    /// Cursor for getting lexer tokens.
+    cursor: Cursor<'a>,
     override_span: Option<Span>,
 }
 
@@ -75,15 +80,13 @@ impl<'a> StringReader<'a> {
 
         // Skip trivial (whitespace & comments) tokens
         loop {
-            let start_src_index = self.src_index(self.pos);
-            let text: &str = &self.src[start_src_index..];
-
-            if text.is_empty() {
-                let span = self.mk_sp(self.pos, self.pos);
-                return (Token::new(token::Eof, span), preceded_by_whitespace);
-            }
-
-            let token = rustc_lexer::first_token(text);
+            let token = match self.cursor.advance_token() {
+                Some(token) => token,
+                None => {
+                    let span = self.mk_sp(self.pos, self.pos);
+                    return (Token::new(token::Eof, span), preceded_by_whitespace);
+                }
+            };
             let start = self.pos;
             self.pos = self.pos + BytePos(token.len);
diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs
index 8922bf37785..0870d6f3824 100644
--- a/src/librustdoc/html/highlight.rs
+++ b/src/librustdoc/html/highlight.rs
@@ -13,6 +13,7 @@ use std::collections::VecDeque;
 use std::fmt::{Display, Write};
 
 use rustc_data_structures::fx::FxHashMap;
+use rustc_lexer::cursor::Cursor;
 use rustc_lexer::{LiteralKind, TokenKind};
 use rustc_span::edition::Edition;
 use rustc_span::symbol::Symbol;
@@ -408,15 +409,13 @@ enum Highlight<'a> {
 
 struct TokenIter<'a> {
     src: &'a str,
+    cursor: Cursor<'a>,
 }
 
 impl<'a> Iterator for TokenIter<'a> {
     type Item = (TokenKind, &'a str);
     fn next(&mut self) -> Option<(TokenKind, &'a str)> {
-        if self.src.is_empty() {
-            return None;
-        }
-        let token = rustc_lexer::first_token(self.src);
+        let token = self.cursor.advance_token()?;
         let (text, rest) = self.src.split_at(token.len as usize);
         self.src = rest;
         Some((token.kind, text))
@@ -525,7 +524,7 @@ impl<'a> Classifier<'a> {
     /// Takes as argument the source code to HTML-ify, the rust edition to use and the source code
     /// file span which will be used later on by the `span_correspondance_map`.
     fn new(src: &str, file_span: Span, decoration_info: Option<DecorationInfo>) -> Classifier<'_> {
-        let tokens = PeekIter::new(TokenIter { src });
+        let tokens = PeekIter::new(TokenIter { src, cursor: Cursor::new(src) });
         let decorations = decoration_info.map(Decorations::new);
         Classifier {
             tokens,
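After these changes, `tokenize` is just a `std::iter::from_fn` over a single `Cursor`, which keeps call sites simple. A usage sketch, assuming a crate that depends on `rustc_lexer` (the output format is illustrative):

```rust
use rustc_lexer::tokenize;

fn main() {
    // `tokenize` drains one internal `Cursor`; on empty input the
    // iterator immediately yields nothing, so no up-front emptiness
    // check is needed.
    for token in tokenize("let x = 5;") {
        println!("{:?} ({})", token.kind, token.len);
    }
}
```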
