diff options
| author | Stefan Lankes <slankes@eonerc.rwth-aachen.de> | 2019-11-13 00:24:37 +0100 |
|---|---|---|
| committer | Stefan Lankes <slankes@eonerc.rwth-aachen.de> | 2019-11-13 00:24:37 +0100 |
| commit | 88717319142e162ae2f48124d94f33d2c21bc2ce (patch) | |
| tree | e50a968f836c55007e9bc8f305d4f0f80aca042c /src/libsyntax/util | |
| parent | 969b74144641bf1c8ae5aba0581f4b52a4c15bac (diff) | |
| parent | 4f03f4a989d1c8346c19dfb417a77c09b34408b8 (diff) | |
| download | rust-88717319142e162ae2f48124d94f33d2c21bc2ce.tar.gz rust-88717319142e162ae2f48124d94f33d2c21bc2ce.zip | |
Merge remote-tracking branch 'rust-lang/master' into hermit
Diffstat (limited to 'src/libsyntax/util')
| -rw-r--r-- | src/libsyntax/util/classify.rs | 25 | ||||
| -rw-r--r-- | src/libsyntax/util/comments.rs | 271 | ||||
| -rw-r--r-- | src/libsyntax/util/comments/tests.rs | 47 | ||||
| -rw-r--r-- | src/libsyntax/util/literal.rs | 305 | ||||
| -rw-r--r-- | src/libsyntax/util/parser.rs | 6 |
5 files changed, 651 insertions, 3 deletions
diff --git a/src/libsyntax/util/classify.rs b/src/libsyntax/util/classify.rs new file mode 100644 index 00000000000..44560688750 --- /dev/null +++ b/src/libsyntax/util/classify.rs @@ -0,0 +1,25 @@ +//! Routines the parser uses to classify AST nodes + +// Predicates on exprs and stmts that the pretty-printer and parser use + +use crate::ast; + +/// Does this expression require a semicolon to be treated +/// as a statement? The negation of this: 'can this expression +/// be used as a statement without a semicolon' -- is used +/// as an early-bail-out in the parser so that, for instance, +/// if true {...} else {...} +/// |x| 5 +/// isn't parsed as (if true {...} else {...} | x) | 5 +pub fn expr_requires_semi_to_be_stmt(e: &ast::Expr) -> bool { + match e.kind { + ast::ExprKind::If(..) | + ast::ExprKind::Match(..) | + ast::ExprKind::Block(..) | + ast::ExprKind::While(..) | + ast::ExprKind::Loop(..) | + ast::ExprKind::ForLoop(..) | + ast::ExprKind::TryBlock(..) => false, + _ => true, + } +} diff --git a/src/libsyntax/util/comments.rs b/src/libsyntax/util/comments.rs new file mode 100644 index 00000000000..5e9b7bf8322 --- /dev/null +++ b/src/libsyntax/util/comments.rs @@ -0,0 +1,271 @@ +pub use CommentStyle::*; + +use crate::ast; +use crate::source_map::SourceMap; +use crate::sess::ParseSess; + +use syntax_pos::{BytePos, CharPos, Pos, FileName}; + +use std::usize; + +use log::debug; + +#[cfg(test)] +mod tests; + +#[derive(Clone, Copy, PartialEq, Debug)] +pub enum CommentStyle { + /// No code on either side of each line of the comment + Isolated, + /// Code exists to the left of the comment + Trailing, + /// Code before /* foo */ and after the comment + Mixed, + /// Just a manual blank line "\n\n", for layout + BlankLine, +} + +#[derive(Clone)] +pub struct Comment { + pub style: CommentStyle, + pub lines: Vec<String>, + pub pos: BytePos, +} + +pub fn is_line_doc_comment(s: &str) -> bool { + let res = (s.starts_with("///") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'/') || + s.starts_with("//!"); + debug!("is {:?} a doc comment? {}", s, res); + res +} + +pub fn is_block_doc_comment(s: &str) -> bool { + // Prevent `/**/` from being parsed as a doc comment + let res = ((s.starts_with("/**") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'*') || + s.starts_with("/*!")) && s.len() >= 5; + debug!("is {:?} a doc comment? {}", s, res); + res +} + +// FIXME(#64197): Try to privatize this again. +pub fn is_doc_comment(s: &str) -> bool { + (s.starts_with("///") && is_line_doc_comment(s)) || s.starts_with("//!") || + (s.starts_with("/**") && is_block_doc_comment(s)) || s.starts_with("/*!") +} + +pub fn doc_comment_style(comment: &str) -> ast::AttrStyle { + assert!(is_doc_comment(comment)); + if comment.starts_with("//!") || comment.starts_with("/*!") { + ast::AttrStyle::Inner + } else { + ast::AttrStyle::Outer + } +} + +pub fn strip_doc_comment_decoration(comment: &str) -> String { + /// remove whitespace-only lines from the start/end of lines + fn vertical_trim(lines: Vec<String>) -> Vec<String> { + let mut i = 0; + let mut j = lines.len(); + // first line of all-stars should be omitted + if !lines.is_empty() && lines[0].chars().all(|c| c == '*') { + i += 1; + } + + while i < j && lines[i].trim().is_empty() { + i += 1; + } + // like the first, a last line of all stars should be omitted + if j > i && + lines[j - 1] + .chars() + .skip(1) + .all(|c| c == '*') { + j -= 1; + } + + while j > i && lines[j - 1].trim().is_empty() { + j -= 1; + } + + lines[i..j].to_vec() + } + + /// remove a "[ \t]*\*" block from each line, if possible + fn horizontal_trim(lines: Vec<String>) -> Vec<String> { + let mut i = usize::MAX; + let mut can_trim = true; + let mut first = true; + + for line in &lines { + for (j, c) in line.chars().enumerate() { + if j > i || !"* \t".contains(c) { + can_trim = false; + break; + } + if c == '*' { + if first { + i = j; + first = false; + } else if i != j { + can_trim = false; + } + break; + } + } + if i >= line.len() { + can_trim = false; + } + if !can_trim { + break; + } + } + + if can_trim { + lines.iter() + .map(|line| (&line[i + 1..line.len()]).to_string()) + .collect() + } else { + lines + } + } + + // one-line comments lose their prefix + const ONELINERS: &[&str] = &["///!", "///", "//!", "//"]; + + for prefix in ONELINERS { + if comment.starts_with(*prefix) { + return (&comment[prefix.len()..]).to_string(); + } + } + + if comment.starts_with("/*") { + let lines = comment[3..comment.len() - 2] + .lines() + .map(|s| s.to_string()) + .collect::<Vec<String>>(); + + let lines = vertical_trim(lines); + let lines = horizontal_trim(lines); + + return lines.join("\n"); + } + + panic!("not a doc-comment: {}", comment); +} + +/// Returns `None` if the first `col` chars of `s` contain a non-whitespace char. +/// Otherwise returns `Some(k)` where `k` is first char offset after that leading +/// whitespace. Note that `k` may be outside bounds of `s`. +fn all_whitespace(s: &str, col: CharPos) -> Option<usize> { + let mut idx = 0; + for (i, ch) in s.char_indices().take(col.to_usize()) { + if !ch.is_whitespace() { + return None; + } + idx = i + ch.len_utf8(); + } + Some(idx) +} + +fn trim_whitespace_prefix(s: &str, col: CharPos) -> &str { + let len = s.len(); + match all_whitespace(&s, col) { + Some(col) => if col < len { &s[col..] } else { "" }, + None => s, + } +} + +fn split_block_comment_into_lines( + text: &str, + col: CharPos, +) -> Vec<String> { + let mut res: Vec<String> = vec![]; + let mut lines = text.lines(); + // just push the first line + res.extend(lines.next().map(|it| it.to_string())); + // for other lines, strip common whitespace prefix + for line in lines { + res.push(trim_whitespace_prefix(line, col).to_string()) + } + res +} + +// it appears this function is called only from pprust... that's +// probably not a good thing. +crate fn gather_comments(sess: &ParseSess, path: FileName, src: String) -> Vec<Comment> { + let cm = SourceMap::new(sess.source_map().path_mapping().clone()); + let source_file = cm.new_source_file(path, src); + let text = (*source_file.src.as_ref().unwrap()).clone(); + + let text: &str = text.as_str(); + let start_bpos = source_file.start_pos; + let mut pos = 0; + let mut comments: Vec<Comment> = Vec::new(); + let mut code_to_the_left = false; + + if let Some(shebang_len) = rustc_lexer::strip_shebang(text) { + comments.push(Comment { + style: Isolated, + lines: vec![text[..shebang_len].to_string()], + pos: start_bpos, + }); + pos += shebang_len; + } + + for token in rustc_lexer::tokenize(&text[pos..]) { + let token_text = &text[pos..pos + token.len]; + match token.kind { + rustc_lexer::TokenKind::Whitespace => { + if let Some(mut idx) = token_text.find('\n') { + code_to_the_left = false; + while let Some(next_newline) = &token_text[idx + 1..].find('\n') { + idx = idx + 1 + next_newline; + comments.push(Comment { + style: BlankLine, + lines: vec![], + pos: start_bpos + BytePos((pos + idx) as u32), + }); + } + } + } + rustc_lexer::TokenKind::BlockComment { terminated: _ } => { + if !is_block_doc_comment(token_text) { + let code_to_the_right = match text[pos + token.len..].chars().next() { + Some('\r') | Some('\n') => false, + _ => true, + }; + let style = match (code_to_the_left, code_to_the_right) { + (true, true) | (false, true) => Mixed, + (false, false) => Isolated, + (true, false) => Trailing, + }; + + // Count the number of chars since the start of the line by rescanning. + let pos_in_file = start_bpos + BytePos(pos as u32); + let line_begin_in_file = source_file.line_begin_pos(pos_in_file); + let line_begin_pos = (line_begin_in_file - start_bpos).to_usize(); + let col = CharPos(text[line_begin_pos..pos].chars().count()); + + let lines = split_block_comment_into_lines(token_text, col); + comments.push(Comment { style, lines, pos: pos_in_file }) + } + } + rustc_lexer::TokenKind::LineComment => { + if !is_doc_comment(token_text) { + comments.push(Comment { + style: if code_to_the_left { Trailing } else { Isolated }, + lines: vec![token_text.to_string()], + pos: start_bpos + BytePos(pos as u32), + }) + } + } + _ => { + code_to_the_left = true; + } + } + pos += token.len; + } + + comments +} diff --git a/src/libsyntax/util/comments/tests.rs b/src/libsyntax/util/comments/tests.rs new file mode 100644 index 00000000000..f9cd69fb50d --- /dev/null +++ b/src/libsyntax/util/comments/tests.rs @@ -0,0 +1,47 @@ +use super::*; + +#[test] +fn test_block_doc_comment_1() { + let comment = "/**\n * Test \n ** Test\n * Test\n*/"; + let stripped = strip_doc_comment_decoration(comment); + assert_eq!(stripped, " Test \n* Test\n Test"); +} + +#[test] +fn test_block_doc_comment_2() { + let comment = "/**\n * Test\n * Test\n*/"; + let stripped = strip_doc_comment_decoration(comment); + assert_eq!(stripped, " Test\n Test"); +} + +#[test] +fn test_block_doc_comment_3() { + let comment = "/**\n let a: *i32;\n *a = 5;\n*/"; + let stripped = strip_doc_comment_decoration(comment); + assert_eq!(stripped, " let a: *i32;\n *a = 5;"); +} + +#[test] +fn test_block_doc_comment_4() { + let comment = "/*******************\n test\n *********************/"; + let stripped = strip_doc_comment_decoration(comment); + assert_eq!(stripped, " test"); +} + +#[test] +fn test_line_doc_comment() { + let stripped = strip_doc_comment_decoration("/// test"); + assert_eq!(stripped, " test"); + let stripped = strip_doc_comment_decoration("///! test"); + assert_eq!(stripped, " test"); + let stripped = strip_doc_comment_decoration("// test"); + assert_eq!(stripped, " test"); + let stripped = strip_doc_comment_decoration("// test"); + assert_eq!(stripped, " test"); + let stripped = strip_doc_comment_decoration("///test"); + assert_eq!(stripped, "test"); + let stripped = strip_doc_comment_decoration("///!test"); + assert_eq!(stripped, "test"); + let stripped = strip_doc_comment_decoration("//test"); + assert_eq!(stripped, "test"); +} diff --git a/src/libsyntax/util/literal.rs b/src/libsyntax/util/literal.rs new file mode 100644 index 00000000000..af7afab6b9b --- /dev/null +++ b/src/libsyntax/util/literal.rs @@ -0,0 +1,305 @@ +//! Code related to parsing literals. + +use crate::ast::{self, Lit, LitKind}; +use crate::symbol::{kw, sym, Symbol}; +use crate::token::{self, Token}; +use crate::tokenstream::TokenTree; + +use log::debug; +use rustc_data_structures::sync::Lrc; +use syntax_pos::Span; +use rustc_lexer::unescape::{unescape_char, unescape_byte}; +use rustc_lexer::unescape::{unescape_str, unescape_byte_str}; +use rustc_lexer::unescape::{unescape_raw_str, unescape_raw_byte_str}; + +use std::ascii; + +pub enum LitError { + NotLiteral, + LexerError, + InvalidSuffix, + InvalidIntSuffix, + InvalidFloatSuffix, + NonDecimalFloat(u32), + IntTooLarge, +} + +impl LitKind { + /// Converts literal token into a semantic literal. + fn from_lit_token(lit: token::Lit) -> Result<LitKind, LitError> { + let token::Lit { kind, symbol, suffix } = lit; + if suffix.is_some() && !kind.may_have_suffix() { + return Err(LitError::InvalidSuffix); + } + + Ok(match kind { + token::Bool => { + assert!(symbol.is_bool_lit()); + LitKind::Bool(symbol == kw::True) + } + token::Byte => return unescape_byte(&symbol.as_str()) + .map(LitKind::Byte).map_err(|_| LitError::LexerError), + token::Char => return unescape_char(&symbol.as_str()) + .map(LitKind::Char).map_err(|_| LitError::LexerError), + + // There are some valid suffixes for integer and float literals, + // so all the handling is done internally. + token::Integer => return integer_lit(symbol, suffix), + token::Float => return float_lit(symbol, suffix), + + token::Str => { + // If there are no characters requiring special treatment we can + // reuse the symbol from the token. Otherwise, we must generate a + // new symbol because the string in the LitKind is different to the + // string in the token. + let s = symbol.as_str(); + let symbol = if s.contains(&['\\', '\r'][..]) { + let mut buf = String::with_capacity(s.len()); + let mut error = Ok(()); + unescape_str(&s, &mut |_, unescaped_char| { + match unescaped_char { + Ok(c) => buf.push(c), + Err(_) => error = Err(LitError::LexerError), + } + }); + error?; + Symbol::intern(&buf) + } else { + symbol + }; + LitKind::Str(symbol, ast::StrStyle::Cooked) + } + token::StrRaw(n) => { + // Ditto. + let s = symbol.as_str(); + let symbol = if s.contains('\r') { + let mut buf = String::with_capacity(s.len()); + let mut error = Ok(()); + unescape_raw_str(&s, &mut |_, unescaped_char| { + match unescaped_char { + Ok(c) => buf.push(c), + Err(_) => error = Err(LitError::LexerError), + } + }); + error?; + buf.shrink_to_fit(); + Symbol::intern(&buf) + } else { + symbol + }; + LitKind::Str(symbol, ast::StrStyle::Raw(n)) + } + token::ByteStr => { + let s = symbol.as_str(); + let mut buf = Vec::with_capacity(s.len()); + let mut error = Ok(()); + unescape_byte_str(&s, &mut |_, unescaped_byte| { + match unescaped_byte { + Ok(c) => buf.push(c), + Err(_) => error = Err(LitError::LexerError), + } + }); + error?; + buf.shrink_to_fit(); + LitKind::ByteStr(Lrc::new(buf)) + } + token::ByteStrRaw(_) => { + let s = symbol.as_str(); + let bytes = if s.contains('\r') { + let mut buf = Vec::with_capacity(s.len()); + let mut error = Ok(()); + unescape_raw_byte_str(&s, &mut |_, unescaped_byte| { + match unescaped_byte { + Ok(c) => buf.push(c), + Err(_) => error = Err(LitError::LexerError), + } + }); + error?; + buf.shrink_to_fit(); + buf + } else { + symbol.to_string().into_bytes() + }; + + LitKind::ByteStr(Lrc::new(bytes)) + }, + token::Err => LitKind::Err(symbol), + }) + } + + /// Attempts to recover a token from semantic literal. + /// This function is used when the original token doesn't exist (e.g. the literal is created + /// by an AST-based macro) or unavailable (e.g. from HIR pretty-printing). + pub fn to_lit_token(&self) -> token::Lit { + let (kind, symbol, suffix) = match *self { + LitKind::Str(symbol, ast::StrStyle::Cooked) => { + // Don't re-intern unless the escaped string is different. + let s = symbol.as_str(); + let escaped = s.escape_default().to_string(); + let symbol = if s == escaped { symbol } else { Symbol::intern(&escaped) }; + (token::Str, symbol, None) + } + LitKind::Str(symbol, ast::StrStyle::Raw(n)) => { + (token::StrRaw(n), symbol, None) + } + LitKind::ByteStr(ref bytes) => { + let string = bytes.iter().cloned().flat_map(ascii::escape_default) + .map(Into::<char>::into).collect::<String>(); + (token::ByteStr, Symbol::intern(&string), None) + } + LitKind::Byte(byte) => { + let string: String = ascii::escape_default(byte).map(Into::<char>::into).collect(); + (token::Byte, Symbol::intern(&string), None) + } + LitKind::Char(ch) => { + let string: String = ch.escape_default().map(Into::<char>::into).collect(); + (token::Char, Symbol::intern(&string), None) + } + LitKind::Int(n, ty) => { + let suffix = match ty { + ast::LitIntType::Unsigned(ty) => Some(ty.name()), + ast::LitIntType::Signed(ty) => Some(ty.name()), + ast::LitIntType::Unsuffixed => None, + }; + (token::Integer, sym::integer(n), suffix) + } + LitKind::Float(symbol, ty) => { + let suffix = match ty { + ast::LitFloatType::Suffixed(ty) => Some(ty.name()), + ast::LitFloatType::Unsuffixed => None, + }; + (token::Float, symbol, suffix) + } + LitKind::Bool(value) => { + let symbol = if value { kw::True } else { kw::False }; + (token::Bool, symbol, None) + } + LitKind::Err(symbol) => { + (token::Err, symbol, None) + } + }; + + token::Lit::new(kind, symbol, suffix) + } +} + +impl Lit { + /// Converts literal token into an AST literal. + pub fn from_lit_token(token: token::Lit, span: Span) -> Result<Lit, LitError> { + Ok(Lit { token, kind: LitKind::from_lit_token(token)?, span }) + } + + /// Converts arbitrary token into an AST literal. + pub fn from_token(token: &Token) -> Result<Lit, LitError> { + let lit = match token.kind { + token::Ident(name, false) if name.is_bool_lit() => + token::Lit::new(token::Bool, name, None), + token::Literal(lit) => + lit, + token::Interpolated(ref nt) => { + if let token::NtExpr(expr) | token::NtLiteral(expr) = &**nt { + if let ast::ExprKind::Lit(lit) = &expr.kind { + return Ok(lit.clone()); + } + } + return Err(LitError::NotLiteral); + } + _ => return Err(LitError::NotLiteral) + }; + + Lit::from_lit_token(lit, token.span) + } + + /// Attempts to recover an AST literal from semantic literal. + /// This function is used when the original token doesn't exist (e.g. the literal is created + /// by an AST-based macro) or unavailable (e.g. from HIR pretty-printing). + pub fn from_lit_kind(kind: LitKind, span: Span) -> Lit { + Lit { token: kind.to_lit_token(), kind, span } + } + + /// Losslessly convert an AST literal into a token stream. + pub fn token_tree(&self) -> TokenTree { + let token = match self.token.kind { + token::Bool => token::Ident(self.token.symbol, false), + _ => token::Literal(self.token), + }; + TokenTree::token(token, self.span) + } +} + +fn strip_underscores(symbol: Symbol) -> Symbol { + // Do not allocate a new string unless necessary. + let s = symbol.as_str(); + if s.contains('_') { + let mut s = s.to_string(); + s.retain(|c| c != '_'); + return Symbol::intern(&s); + } + symbol +} + +fn filtered_float_lit(symbol: Symbol, suffix: Option<Symbol>, base: u32) + -> Result<LitKind, LitError> { + debug!("filtered_float_lit: {:?}, {:?}, {:?}", symbol, suffix, base); + if base != 10 { + return Err(LitError::NonDecimalFloat(base)); + } + Ok(match suffix { + Some(suf) => LitKind::Float(symbol, ast::LitFloatType::Suffixed(match suf { + sym::f32 => ast::FloatTy::F32, + sym::f64 => ast::FloatTy::F64, + _ => return Err(LitError::InvalidFloatSuffix), + })), + None => LitKind::Float(symbol, ast::LitFloatType::Unsuffixed) + }) +} + +fn float_lit(symbol: Symbol, suffix: Option<Symbol>) -> Result<LitKind, LitError> { + debug!("float_lit: {:?}, {:?}", symbol, suffix); + filtered_float_lit(strip_underscores(symbol), suffix, 10) +} + +fn integer_lit(symbol: Symbol, suffix: Option<Symbol>) -> Result<LitKind, LitError> { + debug!("integer_lit: {:?}, {:?}", symbol, suffix); + let symbol = strip_underscores(symbol); + let s = symbol.as_str(); + + let base = match s.as_bytes() { + [b'0', b'x', ..] => 16, + [b'0', b'o', ..] => 8, + [b'0', b'b', ..] => 2, + _ => 10, + }; + + let ty = match suffix { + Some(suf) => match suf { + sym::isize => ast::LitIntType::Signed(ast::IntTy::Isize), + sym::i8 => ast::LitIntType::Signed(ast::IntTy::I8), + sym::i16 => ast::LitIntType::Signed(ast::IntTy::I16), + sym::i32 => ast::LitIntType::Signed(ast::IntTy::I32), + sym::i64 => ast::LitIntType::Signed(ast::IntTy::I64), + sym::i128 => ast::LitIntType::Signed(ast::IntTy::I128), + sym::usize => ast::LitIntType::Unsigned(ast::UintTy::Usize), + sym::u8 => ast::LitIntType::Unsigned(ast::UintTy::U8), + sym::u16 => ast::LitIntType::Unsigned(ast::UintTy::U16), + sym::u32 => ast::LitIntType::Unsigned(ast::UintTy::U32), + sym::u64 => ast::LitIntType::Unsigned(ast::UintTy::U64), + sym::u128 => ast::LitIntType::Unsigned(ast::UintTy::U128), + // `1f64` and `2f32` etc. are valid float literals, and + // `fxxx` looks more like an invalid float literal than invalid integer literal. + _ if suf.as_str().starts_with('f') => return filtered_float_lit(symbol, suffix, base), + _ => return Err(LitError::InvalidIntSuffix), + } + _ => ast::LitIntType::Unsuffixed + }; + + let s = &s[if base != 10 { 2 } else { 0 } ..]; + u128::from_str_radix(s, base).map(|i| LitKind::Int(i, ty)).map_err(|_| { + // Small bases are lexed as if they were base 10, e.g, the string + // might be `0b10201`. This will cause the conversion above to fail, + // but these kinds of errors are already reported by the lexer. + let from_lexer = + base < 10 && s.chars().any(|c| c.to_digit(10).map_or(false, |d| d >= base)); + if from_lexer { LitError::LexerError } else { LitError::IntTooLarge } + }) +} diff --git a/src/libsyntax/util/parser.rs b/src/libsyntax/util/parser.rs index 982755e8680..df72fdc8014 100644 --- a/src/libsyntax/util/parser.rs +++ b/src/libsyntax/util/parser.rs @@ -1,4 +1,4 @@ -use crate::parse::token::{self, Token, BinOpToken}; +use crate::token::{self, Token, BinOpToken}; use crate::symbol::kw; use crate::ast::{self, BinOpKind}; @@ -69,7 +69,7 @@ pub enum Fixity { impl AssocOp { /// Creates a new AssocOP from a token - crate fn from_token(t: &Token) -> Option<AssocOp> { + pub fn from_token(t: &Token) -> Option<AssocOp> { use AssocOp::*; match t.kind { token::BinOpEq(k) => Some(AssignOp(k)), @@ -358,7 +358,7 @@ impl ExprPrecedence { } /// In `let p = e`, operators with precedence `<=` this one requires parenthesis in `e`. -crate fn prec_let_scrutinee_needs_par() -> usize { +pub fn prec_let_scrutinee_needs_par() -> usize { AssocOp::LAnd.precedence() } |
