soa all the things

author: Aleksey Kladov <aleksey.kladov@gmail.com> 2021-12-18 15:31:50 +0300
committer: Aleksey Kladov <aleksey.kladov@gmail.com> 2021-12-18 15:31:50 +0300
commit: 8b9d145dea17dc28d83fae23b5be63233483ec6d (patch)
tree: 74ffd24cab3a46f9289cb546a1f5191fe5aed6bf
parent: 799941e05ee3da6c4f302adb8962a0f15152949b (diff)
download: rust-8b9d145dea17dc28d83fae23b5be63233483ec6d.tar.gz
rust-8b9d145dea17dc28d83fae23b5be63233483ec6d.zip
4 files changed, 75 insertions, 34 deletions
diff --git a/crates/parser/src/lexer_token.rs b/crates/parser/src/lexed_str.rs
index a9134639d27..595b6072293 100644
--- a/crates/parser/src/lexer_token.rs
+++ b/crates/parser/src/lexed_str.rs
@@ -4,48 +4,55 @@
 //! on tokens which originated from text. Macros, eg, can synthesize tokes out
 //! of thin air. So, ideally, lexer should be an orthogonal crate. It is however
 //! convenient to include a text-based lexer here!
+//!
+//! Note that these tokens, unlike the tokens we feed into the parser, do
+//! include info about comments and whitespace.
 
 use crate::{
     SyntaxKind::{self, *},
     T,
 };
 
-#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub struct LexerToken {
-    pub kind: SyntaxKind,
-    pub len: usize,
-    pub error: Option<String>,
+pub struct LexedStr<'a> {
+    text: &'a str,
+    kind: Vec<SyntaxKind>,
+    start: Vec<u32>,
+    error: Vec<LexError>,
 }
 
-impl LexerToken {
-    pub fn new(kind: SyntaxKind, len: usize) -> Self {
-        Self { kind, len, error: None }
-    }
+struct LexError {
+    msg: String,
+    token: u32,
+}
 
-    /// Lexes text as a sequence of tokens.
-    pub fn tokenize(text: &str) -> Vec<LexerToken> {
-        let mut res = Vec::new();
-        let mut offset = 0;
+impl<'a> LexedStr<'a> {
+    pub fn new(text: &'a str) -> LexedStr<'a> {
+        let mut res = LexedStr { text, kind: Vec::new(), start: Vec::new(), error: Vec::new() };
 
+        let mut offset = 0;
         if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
-            res.push(LexerToken::new(SHEBANG, shebang_len));
+            res.push(SHEBANG, offset);
             offset = shebang_len
         };
-
         for token in rustc_lexer::tokenize(&text[offset..]) {
             let token_text = &text[offset..][..token.len];
-            offset += token.len;
 
             let (kind, err) = from_rustc(&token.kind, token_text);
-            let mut token = LexerToken::new(kind, token.len);
-            token.error = err.map(|it| it.to_string());
-            res.push(token);
+            res.push(kind, offset);
+            offset += token.len;
+
+            if let Some(err) = err {
+                let token = res.len() as u32;
+                let msg = err.to_string();
+                res.error.push(LexError { msg, token });
+            }
         }
+        res.push(EOF, offset);
 
         res
     }
-    /// Lexes text as a single token. Returns `None` if there's leftover text.
-    pub fn from_str(text: &str) -> Option<LexerToken> {
+
+    pub fn single_token(text: &'a str) -> Option<SyntaxKind> {
         if text.is_empty() {
             return None;
         }
@@ -56,10 +63,40 @@ impl LexerToken {
         }
 
         let (kind, err) = from_rustc(&token.kind, text);
+        if err.is_some() {
+            return None;
+        }
+
+        Some(kind)
+    }
+
+    pub fn as_str(&self) -> &str {
+        self.text
+    }
+
+    pub fn len(&self) -> usize {
+        self.kind.len() - 1
+    }
+
+    pub fn kind(&self, i: usize) -> SyntaxKind {
+        assert!(i < self.len());
+        self.kind[i]
+    }
+    pub fn text(&self, i: usize) -> &str {
+        assert!(i < self.len());
+        let lo = self.start[i] as usize;
+        let hi = self.start[i + 1] as usize;
+        &self.text[lo..hi]
+    }
+    pub fn error(&self, i: usize) -> Option<&str> {
+        assert!(i < self.len());
+        let err = self.error.binary_search_by_key(&(i as u32), |i| i.token).ok()?;
+        Some(self.error[err].msg.as_str())
+    }
 
-        let mut token = LexerToken::new(kind, token.len);
-        token.error = err.map(|it| it.to_string());
-        Some(token)
+    fn push(&mut self, kind: SyntaxKind, offset: usize) {
+        self.kind.push(kind);
+        self.start.push(offset as u32);
     }
 }
 
diff --git a/crates/parser/src/lib.rs b/crates/parser/src/lib.rs
index 448f22185d0..dc02ae6e83f 100644
--- a/crates/parser/src/lib.rs
+++ b/crates/parser/src/lib.rs
@@ -18,7 +18,7 @@
 //! [`Parser`]: crate::parser::Parser
 #![allow(rustdoc::private_intra_doc_links)]
 
-mod lexer_token;
+mod lexed_str;
 mod token_set;
 mod syntax_kind;
 mod event;
@@ -31,7 +31,7 @@ mod tests;
 
 pub(crate) use token_set::TokenSet;
 
-pub use crate::{lexer_token::LexerToken, syntax_kind::SyntaxKind, tokens::Tokens};
+pub use crate::{lexed_str::LexedStr, syntax_kind::SyntaxKind, tokens::Tokens};
 
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct ParseError(pub Box<String>);
diff --git a/crates/parser/src/tests.rs b/crates/parser/src/tests.rs
index f323eba5e43..ebba9925618 100644
--- a/crates/parser/src/tests.rs
+++ b/crates/parser/src/tests.rs
@@ -6,7 +6,7 @@ use std::{
 
 use expect_test::expect_file;
 
-use crate::LexerToken;
+use crate::LexedStr;
 
 #[test]
 fn valid_lexes_input() {
@@ -25,13 +25,16 @@ fn invalid_lexes_input() {
 }
 
 fn lex(text: &str) -> String {
+    let lexed = LexedStr::new(text);
+
     let mut res = String::new();
-    let mut offset = 0;
-    for token in LexerToken::tokenize(text) {
-        let token_text = &text[offset..][..token.len];
-        offset += token.len;
-        let err = token.error.map(|err| format!(" error: {}", err)).unwrap_or_default();
-        writeln!(res, "{:?} {:?}{}", token.kind, token_text, err).unwrap();
+    for i in 0..lexed.len() {
+        let kind = lexed.kind(i);
+        let text = lexed.text(i);
+        let error = lexed.error(i);
+
+        let error = error.map(|err| format!(" error: {}", err)).unwrap_or_default();
+        writeln!(res, "{:?} {:?}{}", kind, text, error).unwrap();
     }
     res
 }
diff --git a/crates/parser/src/tokens.rs b/crates/parser/src/tokens.rs
index 1c0672492da..4fc2361add2 100644
--- a/crates/parser/src/tokens.rs
+++ b/crates/parser/src/tokens.rs
@@ -1,7 +1,8 @@
 //! Input for the parser -- a sequence of tokens.
 //!
 //! As of now, parser doesn't have access to the *text* of the tokens, and makes
-//! decisions based solely on their classification.
+//! decisions based solely on their classification. Unlike `LexedStr`,
+//! `Tokens` doesn't include whitespace and comments.
 
 use crate::SyntaxKind;
author	Aleksey Kladov <aleksey.kladov@gmail.com>	2021-12-18 15:31:50 +0300
committer	Aleksey Kladov <aleksey.kladov@gmail.com>	2021-12-18 15:31:50 +0300
commit	8b9d145dea17dc28d83fae23b5be63233483ec6d (patch)
tree	74ffd24cab3a46f9289cb546a1f5191fe5aed6bf
parent	799941e05ee3da6c4f302adb8962a0f15152949b (diff)
download	rust-8b9d145dea17dc28d83fae23b5be63233483ec6d.tar.gz rust-8b9d145dea17dc28d83fae23b5be63233483ec6d.zip