From aa6bfaf04b258e3e23d3f7063de4f2d37845ddec Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Mon, 26 Sep 2022 09:18:23 +1000 Subject: Make `rustc_lexer::cursor::Cursor` public. `Cursor` is currently hidden, and the main tokenization path uses `rustc_lexer::first_token` which involves constructing a new `Cursor` for every single token, which is weird. Also, `first_token` can't handle empty input, so callers have to check for that first. This commit makes `Cursor` public, so `StringReader` can contain a `Cursor`, which results in a simpler structure. The commit also changes `StringReader::advance_token` so it returns an `Option<Token>`, simplifying the empty input case. --- src/librustdoc/html/highlight.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'src') diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs index 8922bf37785..0870d6f3824 100644 --- a/src/librustdoc/html/highlight.rs +++ b/src/librustdoc/html/highlight.rs @@ -13,6 +13,7 @@ use std::collections::VecDeque; use std::fmt::{Display, Write}; use rustc_data_structures::fx::FxHashMap; +use rustc_lexer::cursor::Cursor; use rustc_lexer::{LiteralKind, TokenKind}; use rustc_span::edition::Edition; use rustc_span::symbol::Symbol; @@ -408,15 +409,13 @@ enum Highlight<'a> { struct TokenIter<'a> { src: &'a str, + cursor: Cursor<'a>, } impl<'a> Iterator for TokenIter<'a> { type Item = (TokenKind, &'a str); fn next(&mut self) -> Option<(TokenKind, &'a str)> { - if self.src.is_empty() { - return None; - } - let token = rustc_lexer::first_token(self.src); + let token = self.cursor.advance_token()?; let (text, rest) = self.src.split_at(token.len as usize); self.src = rest; Some((token.kind, text)) @@ -525,7 +524,7 @@ impl<'a> Classifier<'a> { /// Takes as argument the source code to HTML-ify, the rust edition to use and the source code /// file span which will be used later on by the `span_correspondance_map`. 
fn new(src: &str, file_span: Span, decoration_info: Option<DecorationInfo>) -> Classifier<'_> { - let tokens = PeekIter::new(TokenIter { src }); + let tokens = PeekIter::new(TokenIter { src, cursor: Cursor::new(src) }); let decorations = decoration_info.map(Decorations::new); Classifier { tokens, -- cgit 1.4.1-3-g733a5 From da84f0f4c31914c14dd03628395e9c53f28b88b9 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Mon, 26 Sep 2022 13:06:15 +1000 Subject: Add `rustc_lexer::TokenKind::Eof`. For alignment with `rustc_ast::TokenKind::Eof`. Plus it's a bit faster, due to less `Option` manipulation in `StringReader::next_token`. --- compiler/rustc_lexer/src/lib.rs | 17 +++++++++++++---- compiler/rustc_parse/src/lexer/mod.rs | 10 ++-------- src/librustdoc/html/highlight.rs | 6 +++++- 3 files changed, 20 insertions(+), 13 deletions(-) (limited to 'src') diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index 69e772c6924..18ebed7c70e 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -139,6 +139,9 @@ pub enum TokenKind { /// Unknown token, not expected by the lexer, e.g. "№" Unknown, + + /// End of input. + Eof, } #[derive(Clone, Copy, Debug, PartialEq, Eq)] @@ -235,7 +238,10 @@ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError> /// Creates an iterator that produces tokens from the input string. pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ { let mut cursor = Cursor::new(input); - std::iter::from_fn(move || cursor.advance_token()) + std::iter::from_fn(move || { + let token = cursor.advance_token(); + if token.kind != TokenKind::Eof { Some(token) } else { None } + }) } /// True if `c` is considered a whitespace according to Rust language definition. @@ -297,8 +303,11 @@ pub fn is_ident(string: &str) -> bool { impl Cursor<'_> { /// Parses a token from the input string. 
- pub fn advance_token(&mut self) -> Option<Token> { - let first_char = self.bump()?; + pub fn advance_token(&mut self) -> Token { + let first_char = match self.bump() { + Some(c) => c, + None => return Token::new(TokenKind::Eof, 0), + }; let token_kind = match first_char { // Slash, comment or block comment. '/' => match self.first() { @@ -419,7 +428,7 @@ impl Cursor<'_> { } _ => Unknown, }; - let res = Some(Token::new(token_kind, self.pos_within_token())); + let res = Token::new(token_kind, self.pos_within_token()); self.reset_pos_within_token(); res } diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 0f9d585230e..67fefd19d8b 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -80,14 +80,7 @@ impl<'a> StringReader<'a> { // Skip trivial (whitespace & comments) tokens loop { - let token = match self.cursor.advance_token() { - Some(token) => token, - None => { - let span = self.mk_sp(self.pos, self.pos); - return (Token::new(token::Eof, span), preceded_by_whitespace); - } - }; - + let token = self.cursor.advance_token(); let start = self.pos; self.pos = self.pos + BytePos(token.len); @@ -327,6 +320,7 @@ impl<'a> StringReader<'a> { err.emit(); token? 
} + rustc_lexer::TokenKind::Eof => token::Eof, }) } diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs index 0870d6f3824..ea65a6334c9 100644 --- a/src/librustdoc/html/highlight.rs +++ b/src/librustdoc/html/highlight.rs @@ -415,7 +415,10 @@ struct TokenIter<'a> { impl<'a> Iterator for TokenIter<'a> { type Item = (TokenKind, &'a str); fn next(&mut self) -> Option<(TokenKind, &'a str)> { - let token = self.cursor.advance_token()?; + let token = self.cursor.advance_token(); + if token.kind == TokenKind::Eof { + return None; + } let (text, rest) = self.src.split_at(token.len as usize); self.src = rest; Some((token.kind, text)) @@ -849,6 +852,7 @@ impl<'a> Classifier<'a> { Class::Ident(self.new_span(before, text)) } TokenKind::Lifetime { .. } => Class::Lifetime, + TokenKind::Eof => panic!("Eof in advance"), }; // Anything that didn't return above is the simple case where we the // class just spans a single token, so we can use the `string` method. -- cgit 1.4.1-3-g733a5 From d0a26acb2ae2d000e516eca92ae8feb08d1f6ea0 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 28 Sep 2022 10:28:36 +1000 Subject: Address review comments. --- compiler/rustc_lexer/src/lib.rs | 6 ++++-- compiler/rustc_parse/src/lexer/mod.rs | 2 +- compiler/rustc_parse/src/lexer/tokentrees.rs | 14 +++++++------- src/librustdoc/html/highlight.rs | 2 +- 4 files changed, 13 insertions(+), 11 deletions(-) (limited to 'src') diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index 18ebed7c70e..c71e6ffe34d 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -23,15 +23,17 @@ // We want to be able to build this crate with a stable compiler, so no // `#![feature]` attributes should be added. 
-pub mod cursor; +mod cursor; pub mod unescape; #[cfg(test)] mod tests; +pub use crate::cursor::Cursor; + use self::LiteralKind::*; use self::TokenKind::*; -use crate::cursor::{Cursor, EOF_CHAR}; +use crate::cursor::EOF_CHAR; use std::convert::TryFrom; /// Parsed token. diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 151e80e2b3e..bcd078a8967 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -4,8 +4,8 @@ use rustc_ast::token::{self, CommentKind, Delimiter, Token, TokenKind}; use rustc_ast::tokenstream::TokenStream; use rustc_ast::util::unicode::contains_text_flow_control_chars; use rustc_errors::{error_code, Applicability, DiagnosticBuilder, ErrorGuaranteed, PResult}; -use rustc_lexer::cursor::Cursor; use rustc_lexer::unescape::{self, Mode}; +use rustc_lexer::Cursor; use rustc_lexer::{Base, DocStyle, RawStrError}; use rustc_session::lint::builtin::{ RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, TEXT_DIRECTION_CODEPOINT_IN_COMMENT, diff --git a/compiler/rustc_parse/src/lexer/tokentrees.rs b/compiler/rustc_parse/src/lexer/tokentrees.rs index 6f6ab16cb59..364753154db 100644 --- a/compiler/rustc_parse/src/lexer/tokentrees.rs +++ b/compiler/rustc_parse/src/lexer/tokentrees.rs @@ -53,7 +53,7 @@ impl<'a> TokenTreesReader<'a> { token::OpenDelim(delim) => buf.push(self.parse_token_tree_open_delim(delim)), token::CloseDelim(delim) => return Err(self.close_delim_err(delim)), token::Eof => return Ok(buf.into_token_stream()), - _ => buf.push(self.parse_token_tree_other()), + _ => buf.push(self.parse_token_tree_non_delim_non_eof()), } } } @@ -66,11 +66,10 @@ impl<'a> TokenTreesReader<'a> { token::OpenDelim(delim) => buf.push(self.parse_token_tree_open_delim(delim)), token::CloseDelim(..) 
=> return buf.into_token_stream(), token::Eof => { - let mut err = self.eof_err(); - err.emit(); + self.eof_err().emit(); return buf.into_token_stream(); } - _ => buf.push(self.parse_token_tree_other()), + _ => buf.push(self.parse_token_tree_non_delim_non_eof()), } } } @@ -245,9 +244,10 @@ impl<'a> TokenTreesReader<'a> { } #[inline] - fn parse_token_tree_other(&mut self) -> TokenTree { - // `spacing` for the returned token is determined by the next token: - // its kind and its `preceded_by_whitespace` status. + fn parse_token_tree_non_delim_non_eof(&mut self) -> TokenTree { + // `this_spacing` for the returned token refers to whether the token is + // immediately followed by another op token. It is determined by the + // next token: its kind and its `preceded_by_whitespace` status. let (next_tok, is_next_tok_preceded_by_whitespace) = self.string_reader.next_token(); let this_spacing = if is_next_tok_preceded_by_whitespace || !next_tok.is_op() { Spacing::Alone diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs index ea65a6334c9..78b98431b19 100644 --- a/src/librustdoc/html/highlight.rs +++ b/src/librustdoc/html/highlight.rs @@ -13,7 +13,7 @@ use std::collections::VecDeque; use std::fmt::{Display, Write}; use rustc_data_structures::fx::FxHashMap; -use rustc_lexer::cursor::Cursor; +use rustc_lexer::Cursor; use rustc_lexer::{LiteralKind, TokenKind}; use rustc_span::edition::Edition; use rustc_span::symbol::Symbol; -- cgit 1.4.1-3-g733a5