From 6a1c10bd85442f52f1d55d51005481cfa5cb40b5 Mon Sep 17 00:00:00 2001 From: Trevor Gross Date: Mon, 19 Dec 2022 12:09:40 -0600 Subject: Add a simple markdown parser for formatting `rustc --explain` Currently, the output of `rustc --explain foo` displays the raw markdown in a pager. This is acceptable, but using actual formatting makes it easier to understand. This patch consists of three major components: 1. A markdown parser. This is an extremely simple non-backtracking recursive implementation that requires normalization of the final token stream 2. A utility to write the token stream to an output buffer 3. Configuration within rustc_driver_impl to invoke this combination for `--explain`. Like the current implementation, it first attempts to print to a pager with a fallback colorized terminal, and standard print as a last resort. If color is disabled, or if the output does not support it, or if printing with color fails, it will write the raw markdown (which matches current behavior). Pagers known to support color are: `less` (with `-r`), `bat` (aka `catbat`), and `delta`. The markdown parser does not support the entire markdown specification, but should support the following with reasonable accuracy: - Headings, including formatting - Comments - Code, inline and fenced block (no indented block) - Strong, emphasis, and strikethrough formatted text - Links, anchor, inline, and reference-style - Horizontal rules - Unordered and ordered list items, including formatting This parser and writer should be reusable by other systems if ever needed. --- compiler/rustc_errors/src/emitter.rs | 2 +- compiler/rustc_errors/src/lib.rs | 1 + compiler/rustc_errors/src/markdown/mod.rs | 76 +++ compiler/rustc_errors/src/markdown/parse.rs | 588 +++++++++++++++++++++ compiler/rustc_errors/src/markdown/term.rs | 189 +++++++ compiler/rustc_errors/src/markdown/tests/input.md | 50 ++ .../rustc_errors/src/markdown/tests/output.stdout | 35 ++ compiler/rustc_errors/src/markdown/tests/parse.rs | 312 +++++++++++ compiler/rustc_errors/src/markdown/tests/term.rs | 90 ++++ 9 files changed, 1342 insertions(+), 1 deletion(-) create mode 100644 compiler/rustc_errors/src/markdown/mod.rs create mode 100644 compiler/rustc_errors/src/markdown/parse.rs create mode 100644 compiler/rustc_errors/src/markdown/term.rs create mode 100644 compiler/rustc_errors/src/markdown/tests/input.md create mode 100644 compiler/rustc_errors/src/markdown/tests/output.stdout create mode 100644 compiler/rustc_errors/src/markdown/tests/parse.rs create mode 100644 compiler/rustc_errors/src/markdown/tests/term.rs (limited to 'compiler/rustc_errors/src') diff --git a/compiler/rustc_errors/src/emitter.rs b/compiler/rustc_errors/src/emitter.rs index d8c997b49a1..9d4d159fd96 100644 --- a/compiler/rustc_errors/src/emitter.rs +++ b/compiler/rustc_errors/src/emitter.rs @@ -616,7 +616,7 @@ pub enum ColorConfig { } impl ColorConfig { - fn to_color_choice(self) -> ColorChoice { + pub fn to_color_choice(self) -> ColorChoice { match self { ColorConfig::Always => { if io::stderr().is_terminal() { diff --git a/compiler/rustc_errors/src/lib.rs b/compiler/rustc_errors/src/lib.rs index 24d1cc8af82..b9db25103a3 100644 --- a/compiler/rustc_errors/src/lib.rs +++ b/compiler/rustc_errors/src/lib.rs @@ -61,6 +61,7 @@ pub mod emitter; pub mod error; pub mod json; mod lock; +pub mod markdown; pub mod registry; mod snippet; mod styled_buffer; diff --git a/compiler/rustc_errors/src/markdown/mod.rs b/compiler/rustc_errors/src/markdown/mod.rs new file mode 100644 index 00000000000..53b766dfcce --- /dev/null +++ b/compiler/rustc_errors/src/markdown/mod.rs @@ -0,0 +1,76 @@ +//! A simple markdown parser that can write formatted text to the terminal +//! +//! Entrypoint is `MdStream::parse_str(...)` +use std::io; + +use termcolor::{Buffer, BufferWriter, ColorChoice}; +mod parse; +mod term; + +/// An AST representation of a Markdown document +#[derive(Clone, Debug, Default, PartialEq)] +pub struct MdStream<'a>(Vec>); + +impl<'a> MdStream<'a> { + /// Parse a markdown string to a tokenstream + #[must_use] + pub fn parse_str(s: &str) -> MdStream<'_> { + parse::entrypoint(s) + } + + /// Write formatted output to a termcolor buffer + pub fn write_termcolor_buf(&self, buf: &mut Buffer) -> io::Result<()> { + term::entrypoint(self, buf) + } +} + +/// Create a termcolor buffer with the `Always` color choice +pub fn create_stdout_bufwtr() -> BufferWriter { + BufferWriter::stdout(ColorChoice::Always) +} + +/// A single tokentree within a Markdown document +#[derive(Clone, Debug, PartialEq)] +pub enum MdTree<'a> { + /// Leaf types + Comment(&'a str), + CodeBlock { + txt: &'a str, + lang: Option<&'a str>, + }, + CodeInline(&'a str), + Strong(&'a str), + Emphasis(&'a str), + Strikethrough(&'a str), + PlainText(&'a str), + /// [Foo](www.foo.com) or simple anchor + Link { + disp: &'a str, + link: &'a str, + }, + /// `[Foo link][ref]` + RefLink { + disp: &'a str, + id: Option<&'a str>, + }, + /// [ref]: www.foo.com + LinkDef { + id: &'a str, + link: &'a str, + }, + /// Break bewtween two paragraphs (double `\n`), not directly parsed but + /// added later + ParagraphBreak, + /// Break bewtween two lines (single `\n`) + LineBreak, + HorizontalRule, + Heading(u8, MdStream<'a>), + OrderedListItem(u16, MdStream<'a>), + UnorderedListItem(MdStream<'a>), +} + +impl<'a> From>> for MdStream<'a> { + fn from(value: Vec>) -> Self { + Self(value) + } +} diff --git a/compiler/rustc_errors/src/markdown/parse.rs b/compiler/rustc_errors/src/markdown/parse.rs new file mode 100644 index 00000000000..362a451fde6 --- /dev/null +++ b/compiler/rustc_errors/src/markdown/parse.rs @@ -0,0 +1,588 @@ +use crate::markdown::{MdStream, MdTree}; +use std::{iter, mem, str}; + +/// Short aliases that we can use in match patterns. If an end pattern is not +/// included, this type may be variable +const ANC_E: &[u8] = b">"; +const ANC_S: &[u8] = b"<"; +const BRK: &[u8] = b"---"; +const CBK: &[u8] = b"```"; +const CIL: &[u8] = b"`"; +const CMT_E: &[u8] = b"-->"; +const CMT_S: &[u8] = b"rest"; + let (t, r) = parse_simple_pat(buf.as_bytes(), CMT_S, CMT_E, opt, MdTree::Comment).unwrap(); + assert_eq!(t, MdTree::Comment("foobar!")); + assert_eq!(r, b"rest"); + + let buf = r"rest"; + let (t, r) = parse_simple_pat(buf.as_bytes(), CMT_S, CMT_E, opt, MdTree::Comment).unwrap(); + assert_eq!(t, MdTree::Comment(r"foobar! \")); + assert_eq!(r, b"rest"); +} + +#[test] +fn test_parse_heading() { + let buf1 = "# Top level\nrest"; + let (t, r) = parse_heading(buf1.as_bytes()).unwrap(); + assert_eq!(t, MdTree::Heading(1, vec![MdTree::PlainText("Top level")].into())); + assert_eq!(r, b"\nrest"); + + let buf1 = "# Empty"; + let (t, r) = parse_heading(buf1.as_bytes()).unwrap(); + assert_eq!(t, MdTree::Heading(1, vec![MdTree::PlainText("Empty")].into())); + assert_eq!(r, b""); + + // Combo + let buf2 = "### Top `level` _woo_\nrest"; + let (t, r) = parse_heading(buf2.as_bytes()).unwrap(); + assert_eq!( + t, + MdTree::Heading( + 3, + vec![ + MdTree::PlainText("Top "), + MdTree::CodeInline("level"), + MdTree::PlainText(" "), + MdTree::Emphasis("woo"), + ] + .into() + ) + ); + assert_eq!(r, b"\nrest"); +} + +#[test] +fn test_parse_code_inline() { + let buf1 = "`abcd` rest"; + let (t, r) = parse_codeinline(buf1.as_bytes()).unwrap(); + assert_eq!(t, MdTree::CodeInline("abcd")); + assert_eq!(r, b" rest"); + + // extra backticks, newline + let buf2 = "```ab\ncd``` rest"; + let (t, r) = parse_codeinline(buf2.as_bytes()).unwrap(); + assert_eq!(t, MdTree::CodeInline("ab\ncd")); + assert_eq!(r, b" rest"); + + // test no escaping + let buf3 = r"`abcd\` rest"; + let (t, r) = parse_codeinline(buf3.as_bytes()).unwrap(); + assert_eq!(t, MdTree::CodeInline(r"abcd\")); + assert_eq!(r, b" rest"); +} + +#[test] +fn test_parse_code_block() { + let buf1 = "```rust\ncode\ncode\n```\nleftovers"; + let (t, r) = parse_codeblock(buf1.as_bytes()); + assert_eq!(t, MdTree::CodeBlock { txt: "code\ncode", lang: Some("rust") }); + assert_eq!(r, b"\nleftovers"); + + let buf2 = "`````\ncode\ncode````\n`````\nleftovers"; + let (t, r) = parse_codeblock(buf2.as_bytes()); + assert_eq!(t, MdTree::CodeBlock { txt: "code\ncode````", lang: None }); + assert_eq!(r, b"\nleftovers"); +} + +#[test] +fn test_parse_link() { + let simple = "[see here](docs.rs) other"; + let (t, r) = parse_any_link(simple.as_bytes(), false).unwrap(); + assert_eq!(t, MdTree::Link { disp: "see here", link: "docs.rs" }); + assert_eq!(r, b" other"); + + let simple_toplevel = "[see here](docs.rs) other"; + let (t, r) = parse_any_link(simple_toplevel.as_bytes(), true).unwrap(); + assert_eq!(t, MdTree::Link { disp: "see here", link: "docs.rs" }); + assert_eq!(r, b" other"); + + let reference = "[see here] other"; + let (t, r) = parse_any_link(reference.as_bytes(), true).unwrap(); + assert_eq!(t, MdTree::RefLink { disp: "see here", id: None }); + assert_eq!(r, b" other"); + + let reference_full = "[see here][docs-rs] other"; + let (t, r) = parse_any_link(reference_full.as_bytes(), false).unwrap(); + assert_eq!(t, MdTree::RefLink { disp: "see here", id: Some("docs-rs") }); + assert_eq!(r, b" other"); + + let reference_def = "[see here]: docs.rs\nother"; + let (t, r) = parse_any_link(reference_def.as_bytes(), true).unwrap(); + assert_eq!(t, MdTree::LinkDef { id: "see here", link: "docs.rs" }); + assert_eq!(r, b"\nother"); +} + +const IND1: &str = r"test standard + ind + ind2 +not ind"; +const IND2: &str = r"test end of stream + 1 + 2 +"; +const IND3: &str = r"test empty lines + 1 + 2 + +not ind"; + +#[test] +fn test_indented_section() { + let (t, r) = get_indented_section(IND1.as_bytes()); + assert_eq!(str::from_utf8(t).unwrap(), "test standard\n ind\n ind2"); + assert_eq!(str::from_utf8(r).unwrap(), "\nnot ind"); + + let (txt, rest) = get_indented_section(IND2.as_bytes()); + assert_eq!(str::from_utf8(txt).unwrap(), "test end of stream\n 1\n 2"); + assert_eq!(str::from_utf8(rest).unwrap(), "\n"); + + let (txt, rest) = get_indented_section(IND3.as_bytes()); + assert_eq!(str::from_utf8(txt).unwrap(), "test empty lines\n 1\n 2"); + assert_eq!(str::from_utf8(rest).unwrap(), "\n\nnot ind"); +} + +const HBT: &str = r"# Heading + +content"; + +#[test] +fn test_heading_breaks() { + let expected = vec![ + MdTree::Heading(1, vec![MdTree::PlainText("Heading")].into()), + MdTree::PlainText("content"), + ] + .into(); + let res = entrypoint(HBT); + assert_eq!(res, expected); +} + +const NL1: &str = r"start + +end"; +const NL2: &str = r"start + + +end"; +const NL3: &str = r"start + + + +end"; + +#[test] +fn test_newline_breaks() { + let expected = + vec![MdTree::PlainText("start"), MdTree::ParagraphBreak, MdTree::PlainText("end")].into(); + for (idx, check) in [NL1, NL2, NL3].iter().enumerate() { + let res = entrypoint(check); + assert_eq!(res, expected, "failed {idx}"); + } +} + +const WRAP: &str = "plain _italics +italics_"; + +#[test] +fn test_wrap_pattern() { + let expected = vec![ + MdTree::PlainText("plain "), + MdTree::Emphasis("italics"), + MdTree::Emphasis(" "), + MdTree::Emphasis("italics"), + ] + .into(); + let res = entrypoint(WRAP); + assert_eq!(res, expected); +} + +const WRAP_NOTXT: &str = r"_italics_ +**bold**"; + +#[test] +fn test_wrap_notxt() { + let expected = + vec![MdTree::Emphasis("italics"), MdTree::PlainText(" "), MdTree::Strong("bold")].into(); + let res = entrypoint(WRAP_NOTXT); + assert_eq!(res, expected); +} + +const MIXED_LIST: &str = r"start +- _italics item_ + +- **bold item** + second line [link1](foobar1) + third line [link2][link-foo] +- :crab: + extra indent +end +[link-foo]: foobar2 +"; + +#[test] +fn test_list() { + let expected = vec![ + MdTree::PlainText("start"), + MdTree::ParagraphBreak, + MdTree::UnorderedListItem(vec![MdTree::Emphasis("italics item")].into()), + MdTree::LineBreak, + MdTree::UnorderedListItem( + vec![ + MdTree::Strong("bold item"), + MdTree::PlainText(" second line "), + MdTree::Link { disp: "link1", link: "foobar1" }, + MdTree::PlainText(" third line "), + MdTree::Link { disp: "link2", link: "foobar2" }, + ] + .into(), + ), + MdTree::LineBreak, + MdTree::UnorderedListItem( + vec![MdTree::PlainText("🦀"), MdTree::PlainText(" extra indent")].into(), + ), + MdTree::ParagraphBreak, + MdTree::PlainText("end"), + ] + .into(); + let res = entrypoint(MIXED_LIST); + assert_eq!(res, expected); +} + +const SMOOSHED: &str = r#" +start +### heading +1. ordered item +```rust +println!("Hello, world!"); +``` +`inline` +``end`` +"#; + +#[test] +fn test_without_breaks() { + let expected = vec![ + MdTree::PlainText("start"), + MdTree::ParagraphBreak, + MdTree::Heading(3, vec![MdTree::PlainText("heading")].into()), + MdTree::OrderedListItem(1, vec![MdTree::PlainText("ordered item")].into()), + MdTree::ParagraphBreak, + MdTree::CodeBlock { txt: r#"println!("Hello, world!");"#, lang: Some("rust") }, + MdTree::ParagraphBreak, + MdTree::CodeInline("inline"), + MdTree::PlainText(" "), + MdTree::CodeInline("end"), + ] + .into(); + let res = entrypoint(SMOOSHED); + assert_eq!(res, expected); +} + +const CODE_STARTLINE: &str = r#" +start +`code` +middle +`more code` +end +"#; + +#[test] +fn test_code_at_start() { + let expected = vec![ + MdTree::PlainText("start"), + MdTree::PlainText(" "), + MdTree::CodeInline("code"), + MdTree::PlainText(" "), + MdTree::PlainText("middle"), + MdTree::PlainText(" "), + MdTree::CodeInline("more code"), + MdTree::PlainText(" "), + MdTree::PlainText("end"), + ] + .into(); + let res = entrypoint(CODE_STARTLINE); + assert_eq!(res, expected); +} diff --git a/compiler/rustc_errors/src/markdown/tests/term.rs b/compiler/rustc_errors/src/markdown/tests/term.rs new file mode 100644 index 00000000000..3b31c6d6295 --- /dev/null +++ b/compiler/rustc_errors/src/markdown/tests/term.rs @@ -0,0 +1,90 @@ +use std::io::BufWriter; +use std::path::PathBuf; +use termcolor::{BufferWriter, ColorChoice}; + +use super::*; +use crate::markdown::MdStream; + +const INPUT: &str = include_str!("input.md"); +const OUTPUT_PATH: &[&str] = &[env!("CARGO_MANIFEST_DIR"), "src","markdown","tests","output.stdout"]; + +const TEST_WIDTH: usize = 80; + +// We try to make some words long to create corner cases +const TXT: &str = r"Lorem ipsum dolor sit amet, consecteturadipiscingelit. +Fusce-id-urna-sollicitudin, pharetra nisl nec, lobortis tellus. In at +metus hendrerit, tincidunteratvel, ultrices turpis. Curabitur_risus_sapien, +porta-sed-nunc-sed, ultricesposuerelacus. Sed porttitor quis +dolor non venenatis. Aliquam ut. "; + +const WRAPPED: &str = r"Lorem ipsum dolor sit amet, consecteturadipiscingelit. Fusce-id-urna- +sollicitudin, pharetra nisl nec, lobortis tellus. In at metus hendrerit, +tincidunteratvel, ultrices turpis. Curabitur_risus_sapien, porta-sed-nunc-sed, +ultricesposuerelacus. Sed porttitor quis dolor non venenatis. Aliquam ut. Lorem + ipsum dolor sit amet, consecteturadipiscingelit. Fusce-id-urna- + sollicitudin, pharetra nisl nec, lobortis tellus. In at metus hendrerit, + tincidunteratvel, ultrices turpis. Curabitur_risus_sapien, porta-sed-nunc- + sed, ultricesposuerelacus. Sed porttitor quis dolor non venenatis. Aliquam + ut. Sample link lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, +consecteturadipiscingelit. Fusce-id-urna-sollicitudin, pharetra nisl nec, +lobortis tellus. In at metus hendrerit, tincidunteratvel, ultrices turpis. +Curabitur_risus_sapien, porta-sed-nunc-sed, ultricesposuerelacus. Sed porttitor +quis dolor non venenatis. Aliquam ut. "; + +#[test] +fn test_wrapping_write() { + WIDTH.with(|w| w.set(TEST_WIDTH)); + let mut buf = BufWriter::new(Vec::new()); + let txt = TXT.replace("-\n","-").replace("_\n","_").replace('\n', " ").replace(" ", ""); + write_wrapping(&mut buf, &txt, 0, None).unwrap(); + write_wrapping(&mut buf, &txt, 4, None).unwrap(); + write_wrapping( + &mut buf, + "Sample link lorem ipsum dolor sit amet. ", + 4, + Some("link-address-placeholder"), + ) + .unwrap(); + write_wrapping(&mut buf, &txt, 0, None).unwrap(); + let out = String::from_utf8(buf.into_inner().unwrap()).unwrap(); + let out = out + .replace("\x1b\\", "") + .replace('\x1b', "") + .replace("]8;;", "") + .replace("link-address-placeholder", ""); + + for line in out.lines() { + assert!(line.len() <= TEST_WIDTH, "line length\n'{line}'") + } + + assert_eq!(out, WRAPPED); +} + +#[test] +fn test_output() { + // Capture `--bless` when run via ./x + let bless = std::env::var("RUSTC_BLESS").unwrap_or_default() == "1"; + let ast = MdStream::parse_str(INPUT); + let bufwtr = BufferWriter::stderr(ColorChoice::Always); + let mut buffer = bufwtr.buffer(); + ast.write_termcolor_buf(&mut buffer).unwrap(); + + let mut blessed = PathBuf::new(); + blessed.extend(OUTPUT_PATH); + + if bless { + std::fs::write(&blessed, buffer.into_inner()).unwrap(); + eprintln!("blessed output at {}", blessed.display()); + } else { + let output = buffer.into_inner(); + if std::fs::read(blessed).unwrap() != output { + // hack: I don't know any way to write bytes to the captured stdout + // that cargo test uses + let mut out = std::io::stdout(); + out.write_all(b"\n\nMarkdown output did not match. Expected:\n").unwrap(); + out.write_all(&output).unwrap(); + out.write_all(b"\n\n").unwrap(); + panic!("markdown output mismatch"); + } + } +} -- cgit 1.4.1-3-g733a5