Auto merge of #114843 - Zalathar:test-coverage-map, r=oli-obk

coverage: Explicitly test the coverage maps produced by codegen/LLVM. Our existing coverage tests verify the output of end-to-end coverage reports, but we don't have any way to test the specific mapping information (code regions and their associated counters) that is emitted by `rustc_codegen_llvm` and LLVM. That makes it harder to be confident in changes that would modify those mappings (whether deliberately or accidentally). This PR addresses that by adding a new `coverage-map` test suite that does the following: - Compiles test files to LLVM IR assembly (`.ll`) - Feeds those IR files to a custom tool (`src/tools/coverage-dump`) that extracts and decodes coverage mappings, and prints them in a more human-readable format - Checks the output of that tool against known-good snapshots --- I recommend excluding the last commit while reviewing the main changes, because that last commit is just ~40 test files copied over from `tests/run-coverage`, plus their blessed coverage-map snapshots and a readme file. Those snapshots aren't really intended to be checked by hand; they're mostly there to increase the chances that an unintended change to coverage maps will be observable (even if it requires relatively specific circumstances to manifest).
author: bors <bors@rust-lang.org> 2023-09-05 15:30:59 +0000
committer: bors <bors@rust-lang.org> 2023-09-05 15:30:59 +0000
commit: ab45885dec2a6552cb060a5b7183653baaecd580 (patch)
tree: cbc89120fae2ede8bdc74e158fb7724cbe567852 /src
parent: f222a2dd8f6391e6433f57a7c5f1514166edbec1 (diff)
parent: 3141177995f52fc3cbb64d66cf0a98ea0f754fca (diff)
download: rust-ab45885dec2a6552cb060a5b7183653baaecd580.tar.gz
rust-ab45885dec2a6552cb060a5b7183653baaecd580.zip
13 files changed, 632 insertions, 6 deletions
diff --git a/src/bootstrap/builder.rs b/src/bootstrap/builder.rs
index b3666192853..a24a6a4636d 100644
--- a/src/bootstrap/builder.rs
+++ b/src/bootstrap/builder.rs
@@ -703,7 +703,8 @@ impl<'a> Builder<'a> {
                 llvm::Lld,
                 llvm::CrtBeginEnd,
                 tool::RustdocGUITest,
-                tool::OptimizedDist
+                tool::OptimizedDist,
+                tool::CoverageDump,
             ),
             Kind::Check | Kind::Clippy | Kind::Fix => describe!(
                 check::Std,
@@ -725,6 +726,7 @@ impl<'a> Builder<'a> {
                 test::Tidy,
                 test::Ui,
                 test::RunPassValgrind,
+                test::CoverageMap,
                 test::RunCoverage,
                 test::MirOpt,
                 test::Codegen,
diff --git a/src/bootstrap/test.rs b/src/bootstrap/test.rs
index d1018978f78..d78e0deda69 100644
--- a/src/bootstrap/test.rs
+++ b/src/bootstrap/test.rs
@@ -1340,6 +1340,12 @@ host_test!(RunMakeFullDeps {
 
 default_test!(Assembly { path: "tests/assembly", mode: "assembly", suite: "assembly" });
 
+default_test!(CoverageMap {
+    path: "tests/coverage-map",
+    mode: "coverage-map",
+    suite: "coverage-map"
+});
+
 host_test!(RunCoverage { path: "tests/run-coverage", mode: "run-coverage", suite: "run-coverage" });
 host_test!(RunCoverageRustdoc {
     path: "tests/run-coverage-rustdoc",
@@ -1545,6 +1551,14 @@ note: if you're sure you want to do this, please open an issue as to why. In the
                 .arg(builder.ensure(tool::JsonDocLint { compiler: json_compiler, target }));
         }
 
+        if mode == "coverage-map" {
+            let coverage_dump = builder.ensure(tool::CoverageDump {
+                compiler: compiler.with_stage(0),
+                target: compiler.host,
+            });
+            cmd.arg("--coverage-dump-path").arg(coverage_dump);
+        }
+
         if mode == "run-make" || mode == "run-coverage" {
             let rust_demangler = builder
                 .ensure(tool::RustDemangler {
diff --git a/src/bootstrap/tool.rs b/src/bootstrap/tool.rs
index 07ff3da6b4a..f094dd9d7c9 100644
--- a/src/bootstrap/tool.rs
+++ b/src/bootstrap/tool.rs
@@ -306,6 +306,7 @@ bootstrap_tool!(
     GenerateWindowsSys, "src/tools/generate-windows-sys", "generate-windows-sys";
     RustdocGUITest, "src/tools/rustdoc-gui-test", "rustdoc-gui-test", is_unstable_tool = true, allow_features = "test";
     OptimizedDist, "src/tools/opt-dist", "opt-dist";
+    CoverageDump, "src/tools/coverage-dump", "coverage-dump";
 );
 
 #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq, Ord, PartialOrd)]
diff --git a/src/tools/compiletest/src/common.rs b/src/tools/compiletest/src/common.rs
index 7c17e92d0df..b91d5a958bb 100644
--- a/src/tools/compiletest/src/common.rs
+++ b/src/tools/compiletest/src/common.rs
@@ -66,6 +66,7 @@ string_enum! {
         JsDocTest => "js-doc-test",
         MirOpt => "mir-opt",
         Assembly => "assembly",
+        CoverageMap => "coverage-map",
         RunCoverage => "run-coverage",
     }
 }
@@ -161,6 +162,9 @@ pub struct Config {
     /// The rust-demangler executable.
     pub rust_demangler_path: Option<PathBuf>,
 
+    /// The coverage-dump executable.
+    pub coverage_dump_path: Option<PathBuf>,
+
     /// The Python executable to use for LLDB and htmldocck.
     pub python: String,
 
@@ -639,6 +643,7 @@ pub const UI_EXTENSIONS: &[&str] = &[
     UI_STDERR_32,
     UI_STDERR_16,
     UI_COVERAGE,
+    UI_COVERAGE_MAP,
 ];
 pub const UI_STDERR: &str = "stderr";
 pub const UI_STDOUT: &str = "stdout";
@@ -649,6 +654,7 @@ pub const UI_STDERR_64: &str = "64bit.stderr";
 pub const UI_STDERR_32: &str = "32bit.stderr";
 pub const UI_STDERR_16: &str = "16bit.stderr";
 pub const UI_COVERAGE: &str = "coverage";
+pub const UI_COVERAGE_MAP: &str = "cov-map";
 
 /// Absolute path to the directory where all output for all tests in the given
 /// `relative_dir` group should reside. Example:
diff --git a/src/tools/compiletest/src/lib.rs b/src/tools/compiletest/src/lib.rs
index 1a765477fe5..619ff9b3221 100644
--- a/src/tools/compiletest/src/lib.rs
+++ b/src/tools/compiletest/src/lib.rs
@@ -48,6 +48,7 @@ pub fn parse_config(args: Vec<String>) -> Config {
         .reqopt("", "rustc-path", "path to rustc to use for compiling", "PATH")
         .optopt("", "rustdoc-path", "path to rustdoc to use for compiling", "PATH")
         .optopt("", "rust-demangler-path", "path to rust-demangler to use in tests", "PATH")
+        .optopt("", "coverage-dump-path", "path to coverage-dump to use in tests", "PATH")
         .reqopt("", "python", "path to python to use for doc tests", "PATH")
         .optopt("", "jsondocck-path", "path to jsondocck to use for doc tests", "PATH")
         .optopt("", "jsondoclint-path", "path to jsondoclint to use for doc tests", "PATH")
@@ -218,6 +219,7 @@ pub fn parse_config(args: Vec<String>) -> Config {
         rustc_path: opt_path(matches, "rustc-path"),
         rustdoc_path: matches.opt_str("rustdoc-path").map(PathBuf::from),
         rust_demangler_path: matches.opt_str("rust-demangler-path").map(PathBuf::from),
+        coverage_dump_path: matches.opt_str("coverage-dump-path").map(PathBuf::from),
         python: matches.opt_str("python").unwrap(),
         jsondocck_path: matches.opt_str("jsondocck-path"),
         jsondoclint_path: matches.opt_str("jsondoclint-path"),
diff --git a/src/tools/compiletest/src/runtest.rs b/src/tools/compiletest/src/runtest.rs
index dd4c59fdff5..670441aacbd 100644
--- a/src/tools/compiletest/src/runtest.rs
+++ b/src/tools/compiletest/src/runtest.rs
@@ -6,8 +6,8 @@ use crate::common::{Assembly, Incremental, JsDocTest, MirOpt, RunMake, RustdocJs
 use crate::common::{Codegen, CodegenUnits, DebugInfo, Debugger, Rustdoc};
 use crate::common::{CompareMode, FailMode, PassMode};
 use crate::common::{Config, TestPaths};
-use crate::common::{Pretty, RunCoverage, RunPassValgrind};
-use crate::common::{UI_COVERAGE, UI_RUN_STDERR, UI_RUN_STDOUT};
+use crate::common::{CoverageMap, Pretty, RunCoverage, RunPassValgrind};
+use crate::common::{UI_COVERAGE, UI_COVERAGE_MAP, UI_RUN_STDERR, UI_RUN_STDOUT};
 use crate::compute_diff::{write_diff, write_filtered_diff};
 use crate::errors::{self, Error, ErrorKind};
 use crate::header::TestProps;
@@ -254,6 +254,7 @@ impl<'test> TestCx<'test> {
             MirOpt => self.run_mir_opt_test(),
             Assembly => self.run_assembly_test(),
             JsDocTest => self.run_js_doc_test(),
+            CoverageMap => self.run_coverage_map_test(),
             RunCoverage => self.run_coverage_test(),
         }
     }
@@ -467,6 +468,46 @@ impl<'test> TestCx<'test> {
         }
     }
 
+    /// Runs a single `coverage-map` test: compiles the test to LLVM IR, runs
+    /// the `coverage-dump` tool on the resulting `.ll` file, and compares the
+    /// tool's stdout against the expected `cov-map` output for this test.
+    fn run_coverage_map_test(&self) {
+        let Some(coverage_dump_path) = &self.config.coverage_dump_path else {
+            // Name the option exactly as compiletest declares it
+            // (`--coverage-dump-path`), so the message is actionable.
+            self.fatal("missing --coverage-dump-path");
+        };
+
+        let proc_res = self.compile_test_and_save_ir();
+        if !proc_res.status.success() {
+            self.fatal_proc_rec("compilation failed!", &proc_res);
+        }
+        drop(proc_res);
+
+        // The IR file sits next to the other outputs of this test, with a
+        // `.ll` extension.
+        let llvm_ir_path = self.output_base_name().with_extension("ll");
+
+        let mut dump_command = Command::new(coverage_dump_path);
+        dump_command.arg(llvm_ir_path);
+        let proc_res = self.run_command_to_procres(&mut dump_command);
+        if !proc_res.status.success() {
+            self.fatal_proc_rec("coverage-dump failed!", &proc_res);
+        }
+
+        let kind = UI_COVERAGE_MAP;
+
+        let expected_coverage_dump = self.load_expected_output(kind);
+        let actual_coverage_dump = self.normalize_output(&proc_res.stdout, &[]);
+
+        let coverage_dump_errors = self.compare_output(
+            kind,
+            &actual_coverage_dump,
+            &expected_coverage_dump,
+            self.props.compare_output_lines_by_subset,
+        );
+
+        if coverage_dump_errors > 0 {
+            self.fatal_proc_rec(
+                &format!("{coverage_dump_errors} errors occurred comparing coverage output."),
+                &proc_res,
+            );
+        }
+    }
+
     fn run_coverage_test(&self) {
         let should_run = self.run_if_enabled();
         let proc_res = self.compile_test(should_run, Emit::None);
@@ -650,6 +691,10 @@ impl<'test> TestCx<'test> {
         let mut cmd = Command::new(tool_path);
         configure_cmd_fn(&mut cmd);
 
+        self.run_command_to_procres(&mut cmd)
+    }
+
+    fn run_command_to_procres(&self, cmd: &mut Command) -> ProcRes {
         let output = cmd.output().unwrap_or_else(|_| panic!("failed to exec `{cmd:?}`"));
 
         let proc_res = ProcRes {
@@ -2321,9 +2366,11 @@ impl<'test> TestCx<'test> {
                     }
                 }
                 DebugInfo => { /* debuginfo tests must be unoptimized */ }
-                RunCoverage => {
-                    // Coverage reports are affected by optimization level, and
-                    // the current snapshots assume no optimization by default.
+                CoverageMap | RunCoverage => {
+                    // Coverage mappings and coverage reports are affected by
+                    // optimization level, so they ignore the optimize-tests
+                    // setting and set an optimization level in their mode's
+                    // compile flags (below) or in per-test `compile-flags`.
                 }
                 _ => {
                     rustc.arg("-O");
@@ -2392,8 +2439,22 @@ impl<'test> TestCx<'test> {
 
                 rustc.arg(dir_opt);
             }
+            CoverageMap => {
+                rustc.arg("-Cinstrument-coverage");
+                // These tests only compile to LLVM IR, so they don't need the
+                // profiler runtime to be present.
+                rustc.arg("-Zno-profiler-runtime");
+                // Coverage mappings are sensitive to MIR optimizations, and
+                // the current snapshots assume `opt-level=2` unless overridden
+                // by `compile-flags`.
+                rustc.arg("-Copt-level=2");
+            }
             RunCoverage => {
                 rustc.arg("-Cinstrument-coverage");
+                // Coverage reports are sometimes sensitive to optimizations,
+                // and the current snapshots assume no optimization unless
+                // overridden by `compile-flags`.
+                rustc.arg("-Copt-level=0");
             }
             RunPassValgrind | Pretty | DebugInfo | Codegen | Rustdoc | RustdocJson | RunMake
             | CodegenUnits | JsDocTest | Assembly => {
diff --git a/src/tools/coverage-dump/Cargo.toml b/src/tools/coverage-dump/Cargo.toml
new file mode 100644
index 00000000000..7f14286b5d0
--- /dev/null
+++ b/src/tools/coverage-dump/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "coverage-dump"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+anyhow = "1.0.71"
+leb128 = "0.2.5"
+md5 = { package = "md-5" , version = "0.10.5" }
+miniz_oxide = "0.7.1"
+regex = "1.8.4"
+rustc-demangle = "0.1.23"
diff --git a/src/tools/coverage-dump/README.md b/src/tools/coverage-dump/README.md
new file mode 100644
index 00000000000..e2625d5adf2
--- /dev/null
+++ b/src/tools/coverage-dump/README.md
@@ -0,0 +1,8 @@
+This tool extracts coverage mapping information from an LLVM IR assembly file
+(`.ll`), and prints it in a more human-readable form that can be used for
+snapshot tests.
+
+The output format is mostly arbitrary, so it's OK to change the output as long
+as any affected tests are also re-blessed. However, the output should be
+consistent across different executions on different platforms, so avoid
+printing any information that is platform-specific or non-deterministic.
diff --git a/src/tools/coverage-dump/src/covfun.rs b/src/tools/coverage-dump/src/covfun.rs
new file mode 100644
index 00000000000..3a5866dea3e
--- /dev/null
+++ b/src/tools/coverage-dump/src/covfun.rs
@@ -0,0 +1,296 @@
+use crate::parser::{unescape_llvm_string_contents, Parser};
+use anyhow::{anyhow, Context};
+use regex::Regex;
+use std::collections::HashMap;
+use std::fmt::{self, Debug, Write as _};
+use std::sync::OnceLock;
+
+/// Prints a human-readable dump of every function coverage record
+/// (`@__covrec_*` line) found in `llvm_ir`.
+///
+/// `function_names` maps symbol-name hashes to (demangled) function names;
+/// records whose hash is missing from the table are printed as `(unknown)`.
+/// Returns an error if a record's payload can't be parsed, or if bytes are
+/// left over after parsing it.
+pub(crate) fn dump_covfun_mappings(
+    llvm_ir: &str,
+    function_names: &HashMap<u64, String>,
+) -> anyhow::Result<()> {
+    // Pair each coverage record found in the IR with its function name, if the
+    // name table has one for the record's hash.
+    let mut covfun_entries = llvm_ir
+        .lines()
+        .filter_map(|line| {
+            let line_data = covfun_line_data(line)?;
+            let name = function_names.get(&line_data.name_hash).map(String::as_str);
+            Some((name, line_data))
+        })
+        .collect::<Vec<_>>();
+
+    // Order by name first, then by used-ness, then by raw payload, to help make
+    // the output consistent across platforms and relatively insensitive to
+    // changes. (`sort_by_key` isn't usable because the key would borrow.)
+    covfun_entries.sort_by(|(name_a, data_a), (name_b, data_b)| {
+        name_a
+            .cmp(name_b)
+            .then_with(|| data_a.is_used.cmp(&data_b.is_used))
+            .then_with(|| data_a.payload.cmp(&data_b.payload))
+    });
+
+    for (name, line_data) in &covfun_entries {
+        let name = name.unwrap_or("(unknown)");
+        let unused = match line_data.is_used {
+            true => "",
+            false => " (unused)",
+        };
+        println!("Function name: {name}{unused}");
+
+        let payload: &[u8] = &line_data.payload;
+        println!("Raw bytes ({len}): 0x{payload:02x?}", len = payload.len());
+
+        let mut parser = Parser::new(payload);
+
+        // File table: one global file ID per local file index.
+        let num_files = parser.read_uleb128_u32()?;
+        println!("Number of files: {num_files}");
+        for i in 0..num_files {
+            let global_file_id = parser.read_uleb128_u32()?;
+            println!("- file {i} => global file {global_file_id}");
+        }
+
+        // Expression table: remember each expression's operands so that
+        // mappings referring to expressions can be printed in resolved form.
+        let num_expressions = parser.read_uleb128_u32()?;
+        println!("Number of expressions: {num_expressions}");
+        let mut expression_resolver = ExpressionResolver::new();
+        for i in 0..num_expressions {
+            let lhs = parser.read_simple_term()?;
+            let rhs = parser.read_simple_term()?;
+            println!("- expression {i} operands: lhs = {lhs:?}, rhs = {rhs:?}");
+            expression_resolver.push_operands(lhs, rhs);
+        }
+
+        // One list of mappings per file.
+        for i in 0..num_files {
+            let num_mappings = parser.read_uleb128_u32()?;
+            println!("Number of file {i} mappings: {num_mappings}");
+
+            for _ in 0..num_mappings {
+                let (kind, region) = parser.read_mapping_kind_and_region()?;
+                println!("- {kind:?} at {region:?}");
+
+                match kind {
+                    // If the mapping is a branch region, print both of its arms
+                    // in resolved form (even if they aren't expressions).
+                    MappingKind::Branch { r#true, r#false } => {
+                        println!("    true  = {}", expression_resolver.format_term(r#true));
+                        println!("    false = {}", expression_resolver.format_term(r#false));
+                    }
+                    // Also print expression mappings in resolved form.
+                    MappingKind::Code(term @ CovTerm::Expression { .. })
+                    | MappingKind::Gap(term @ CovTerm::Expression { .. }) => {
+                        println!("    = {}", expression_resolver.format_term(term));
+                    }
+                    _ => (),
+                }
+            }
+        }
+
+        parser.ensure_empty()?;
+        println!();
+    }
+
+    Ok(())
+}
+
+struct CovfunLineData {
+    name_hash: u64,
+    is_used: bool,
+    payload: Vec<u8>,
+}
+
+/// Checks a line of LLVM IR assembly to see if it contains an `__llvm_covfun`
+/// entry, and if so extracts relevant data in a `CovfunLineData`.
+///
+/// Returns `None` for lines that don't match. Panics if the matched hash
+/// can't be parsed as a hexadecimal `u64`.
+fn covfun_line_data(line: &str) -> Option<CovfunLineData> {
+    // We cheat a little bit and match variable names `@__covrec_[HASH]u`
+    // rather than the section name, because the section name is harder to
+    // extract and differs across Linux/Windows/macOS. We also extract the
+    // symbol name hash from the variable name rather than the data, since
+    // it's easier and both should match.
+    static RE: OnceLock<Regex> = OnceLock::new();
+    let re = RE.get_or_init(|| {
+        Regex::new(
+            r#"^@__covrec_(?<name_hash>[0-9A-Z]+)(?<is_used>u)? = .*\[[0-9]+ x i8\] c"(?<payload>[^"]*)".*$"#,
+        )
+        .unwrap()
+    });
+
+    re.captures(line).map(|captures| CovfunLineData {
+        name_hash: u64::from_str_radix(&captures["name_hash"], 16).unwrap(),
+        // The optional `u` suffix on the variable name marks a used function.
+        is_used: captures.name("is_used").is_some(),
+        payload: unescape_llvm_string_contents(&captures["payload"]),
+    })
+}
+
+// Extra parser methods only needed when parsing `covfun` payloads.
+impl<'a> Parser<'a> {
+    /// Reads one ULEB128 value and decodes it as a `CovTerm`, failing if the
+    /// value is not a valid term encoding.
+    fn read_simple_term(&mut self) -> anyhow::Result<CovTerm> {
+        CovTerm::decode(self.read_uleb128_u32()?).context("decoding term")
+    }
+
+    /// Reads a mapping kind followed by its region, turning a code mapping
+    /// into a gap mapping when the region's end column has its high bit set.
+    fn read_mapping_kind_and_region(&mut self) -> anyhow::Result<(MappingKind, MappingRegion)> {
+        const HIGH_BIT: u32 = 1u32 << 31;
+
+        let kind = self.read_raw_mapping_kind()?;
+        let mut region = self.read_raw_mapping_region()?;
+
+        if region.end_column & HIGH_BIT == 0 {
+            return Ok((kind, region));
+        }
+
+        // The high bit is only a flag; strip it to recover the real column.
+        region.end_column &= !HIGH_BIT;
+        match kind {
+            MappingKind::Code(term) => Ok((MappingKind::Gap(term), region)),
+            // LLVM's coverage mapping reader will actually handle this
+            // case without complaint, but the result is almost certainly
+            // a meaningless implementation artifact.
+            _ => Err(anyhow!("unexpected base kind for gap region: {kind:?}")),
+        }
+    }
+
+    /// Reads the kind header of a mapping, plus the two extra terms that
+    /// follow it when the mapping is a branch.
+    fn read_raw_mapping_kind(&mut self) -> anyhow::Result<MappingKind> {
+        let raw_mapping_kind = self.read_uleb128_u32()?;
+        if let Some(term) = CovTerm::decode(raw_mapping_kind) {
+            return Ok(MappingKind::Code(term));
+        }
+
+        // `CovTerm::decode` only rejects values with a zero tag and non-zero
+        // high bits, so both of these must hold here.
+        assert_eq!(raw_mapping_kind & 0b11, 0);
+        assert_ne!(raw_mapping_kind, 0);
+
+        let is_expansion = raw_mapping_kind & 0b100 != 0;
+        let high = raw_mapping_kind >> 3;
+        match (is_expansion, high) {
+            (true, id) => Ok(MappingKind::Expansion(id)),
+            (false, 0) => {
+                unreachable!("zero kind should have already been handled as a code mapping")
+            }
+            (false, 2) => Ok(MappingKind::Skip),
+            (false, 4) => {
+                let r#true = self.read_simple_term()?;
+                let r#false = self.read_simple_term()?;
+                Ok(MappingKind::Branch { r#true, r#false })
+            }
+            (false, _) => Err(anyhow!("unknown mapping kind: {raw_mapping_kind:#x}")),
+        }
+    }
+
+    /// Reads the four ULEB128 fields of a region, in encoding order, without
+    /// interpreting the gap flag in the end column.
+    fn read_raw_mapping_region(&mut self) -> anyhow::Result<MappingRegion> {
+        Ok(MappingRegion {
+            start_line_offset: self.read_uleb128_u32()?,
+            start_column: self.read_uleb128_u32()?,
+            end_line_offset: self.read_uleb128_u32()?,
+            end_column: self.read_uleb128_u32()?,
+        })
+    }
+}
+
+/// Enum that can hold a constant zero value, the ID of a physical coverage
+/// counter, or the ID (and operation) of a coverage-counter expression.
+///
+/// Terms are used as the operands of coverage-counter expressions, as the arms
+/// of branch mappings, and as the value of code/gap mappings.
+#[derive(Clone, Copy, Debug)]
+pub(crate) enum CovTerm {
+    Zero,
+    Counter(u32),
+    Expression(u32, Op),
+}
+
+/// Operator (addition or subtraction) used by an expression.
+#[derive(Clone, Copy, Debug)]
+pub(crate) enum Op {
+    Sub,
+    Add,
+}
+
+impl CovTerm {
+    /// Decodes a raw term value: the low two bits are a tag, and the
+    /// remaining bits hold the counter/expression ID.
+    ///
+    /// Returns `None` when the tag is `0b00` but the high bits are non-zero.
+    pub(crate) fn decode(input: u32) -> Option<Self> {
+        let tag = input & 0b11;
+        let high = input >> 2;
+        match (tag, high) {
+            (0b00, 0) => Some(Self::Zero),
+            (0b01, id) => Some(Self::Counter(id)),
+            (0b10, id) => Some(Self::Expression(id, Op::Sub)),
+            (0b11, id) => Some(Self::Expression(id, Op::Add)),
+            // When reading expression operands or branch arms, the LLVM coverage
+            // mapping reader will always interpret a `0b00` tag as a zero
+            // term, even when the high bits are non-zero.
+            // We treat that case as failure instead, so that this code can be
+            // shared by the full mapping-kind reader as well.
+            _ => None,
+        }
+    }
+}
+
+#[derive(Debug)]
+enum MappingKind {
+    Code(CovTerm),
+    Gap(CovTerm),
+    Expansion(u32),
+    Skip,
+    // Using raw identifiers here makes the dump output a little bit nicer
+    // (via the derived Debug), at the expense of making this tool's source
+    // code a little bit uglier.
+    Branch { r#true: CovTerm, r#false: CovTerm },
+}
+
+struct MappingRegion {
+    /// Offset of this region's start line, relative to the *start line* of
+    /// the *previous mapping* (or 0). Line numbers are 1-based.
+    start_line_offset: u32,
+    /// This region's start column, absolute and 1-based.
+    start_column: u32,
+    /// Offset of this region's end line, relative to *this mapping's*
+    /// start line. Line numbers are 1-based.
+    end_line_offset: u32,
+    /// This region's end column, absolute, 1-based, and exclusive.
+    ///
+    /// If the highest bit is set, that bit is cleared and the associated
+    /// mapping becomes a gap region mapping.
+    end_column: u32,
+}
+
+impl Debug for MappingRegion {
+    /// Formats the region using its relative line offsets, so the dump shows
+    /// the values as they are encoded rather than absolute line numbers.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let Self { start_line_offset, start_column, end_line_offset, end_column } = self;
+        write!(
+            f,
+            "(prev + {}, {}) to (start + {}, {})",
+            start_line_offset, start_column, end_line_offset, end_column
+        )
+    }
+}
+
+/// Helper type that prints expressions in a "resolved" form, so that
+/// developers reading the dump don't need to resolve expressions by hand.
+struct ExpressionResolver {
+    operands: Vec<(CovTerm, CovTerm)>,
+}
+
+impl ExpressionResolver {
+    /// Creates a resolver with no known expressions.
+    fn new() -> Self {
+        Self { operands: vec![] }
+    }
+
+    /// Records the operands of the next expression; expression IDs are the
+    /// indices at which operands were pushed.
+    fn push_operands(&mut self, lhs: CovTerm, rhs: CovTerm) {
+        self.operands.push((lhs, rhs));
+    }
+
+    /// Returns `term` rendered in resolved form as a new string.
+    fn format_term(&self, term: CovTerm) -> String {
+        let mut resolved = String::new();
+        self.write_term(&mut resolved, term);
+        resolved
+    }
+
+    /// Appends `term` to `output`, recursively expanding expressions into
+    /// their operands. Panics if an expression ID has no recorded operands.
+    fn write_term(&self, output: &mut String, term: CovTerm) {
+        // Leaf terms are written directly; only expressions need expanding.
+        let (id, op) = match term {
+            CovTerm::Zero => return output.push_str("Zero"),
+            CovTerm::Counter(id) => return write!(output, "c{id}").unwrap(),
+            CovTerm::Expression(id, op) => (id, op),
+        };
+
+        let (lhs, rhs) = self.operands[id as usize];
+        let op = match op {
+            Op::Add => "+",
+            Op::Sub => "-",
+        };
+
+        output.push('(');
+        self.write_term(output, lhs);
+        write!(output, " {op} ").unwrap();
+        self.write_term(output, rhs);
+        output.push(')');
+    }
+}
diff --git a/src/tools/coverage-dump/src/main.rs b/src/tools/coverage-dump/src/main.rs
new file mode 100644
index 00000000000..93fed1799e0
--- /dev/null
+++ b/src/tools/coverage-dump/src/main.rs
@@ -0,0 +1,17 @@
+mod covfun;
+mod parser;
+mod prf_names;
+
+/// Entry point: reads the LLVM IR file named by the first command-line
+/// argument, builds the function-name table from it, and prints the decoded
+/// coverage mappings to stdout.
+fn main() -> anyhow::Result<()> {
+    use anyhow::Context as _;
+
+    let args: Vec<String> = std::env::args().collect();
+
+    // Index 0 is the program itself; the IR path is the first real argument.
+    let llvm_ir_path = args.get(1).context("LLVM IR file not specified")?;
+    let llvm_ir = std::fs::read_to_string(llvm_ir_path).context("couldn't read LLVM IR file")?;
+
+    let function_names = crate::prf_names::make_function_names_table(&llvm_ir)?;
+    crate::covfun::dump_covfun_mappings(&llvm_ir, &function_names)
+}
diff --git a/src/tools/coverage-dump/src/parser.rs b/src/tools/coverage-dump/src/parser.rs
new file mode 100644
index 00000000000..eefac1a4f94
--- /dev/null
+++ b/src/tools/coverage-dump/src/parser.rs
@@ -0,0 +1,80 @@
+#[cfg(test)]
+mod tests;
+
+use anyhow::ensure;
+use regex::bytes;
+use std::sync::OnceLock;
+
+/// Given the raw contents of a string literal in LLVM IR assembly, decodes any
+/// backslash escapes and returns a vector containing the resulting byte string.
+///
+/// Two escapes are recognized: `\\` (a literal backslash) and `\xx`, where
+/// `xx` is exactly two hexadecimal digits. Anything else, including a
+/// backslash that starts neither form, is copied through unchanged.
+pub(crate) fn unescape_llvm_string_contents(contents: &str) -> Vec<u8> {
+    let escape_re = {
+        static RE: OnceLock<bytes::Regex> = OnceLock::new();
+        // LLVM IR supports two string escapes: `\\` and `\xx`.
+        // Only hex digits are accepted after the backslash; matching any
+        // alphanumeric pair (e.g. `\zz`) would make the hex parse below panic.
+        RE.get_or_init(|| bytes::Regex::new(r"\\\\|\\([0-9A-Fa-f]{2})").unwrap())
+    };
+
+    fn u8_from_hex_digits(digits: &[u8]) -> u8 {
+        // The regex only captures exactly 2 ASCII hex digits, so these calls
+        // should never fail.
+        assert_eq!(digits.len(), 2);
+        let digits = std::str::from_utf8(digits).unwrap();
+        u8::from_str_radix(digits, 16).unwrap()
+    }
+
+    escape_re
+        .replace_all(contents.as_bytes(), |captures: &bytes::Captures<'_>| {
+            // Group 1 is only present for the `\xx` form; its absence means
+            // the match was an escaped backslash.
+            let byte = match captures.get(1) {
+                None => b'\\',
+                Some(hex_digits) => u8_from_hex_digits(hex_digits.as_bytes()),
+            };
+            [byte]
+        })
+        .into_owned()
+}
+
+pub(crate) struct Parser<'a> {
+    rest: &'a [u8],
+}
+
+impl<'a> Parser<'a> {
+    /// Creates a parser positioned at the start of `input`.
+    pub(crate) fn new(input: &'a [u8]) -> Self {
+        Self { rest: input }
+    }
+
+    /// Consumes the parser, returning an error if any input is left unread.
+    pub(crate) fn ensure_empty(self) -> anyhow::Result<()> {
+        let Self { rest } = self;
+        ensure!(rest.is_empty(), "unparsed bytes: 0x{:02x?}", rest);
+        Ok(())
+    }
+
+    /// Takes the next `n` bytes off the front of the input, or fails (without
+    /// consuming anything) if fewer than `n` bytes remain.
+    pub(crate) fn read_n_bytes(&mut self, n: usize) -> anyhow::Result<&'a [u8]> {
+        ensure!(n <= self.rest.len());
+
+        let (head, tail) = self.rest.split_at(n);
+        self.rest = tail;
+        Ok(head)
+    }
+
+    /// Reads one ULEB128 value, failing if it doesn't fit in a `u32`.
+    pub(crate) fn read_uleb128_u32(&mut self) -> anyhow::Result<u32> {
+        self.read_uleb128_u64_and_convert()
+    }
+
+    /// Reads one ULEB128 value, failing if it doesn't fit in a `usize`.
+    pub(crate) fn read_uleb128_usize(&mut self) -> anyhow::Result<usize> {
+        self.read_uleb128_u64_and_convert()
+    }
+
+    /// Reads one ULEB128 value as a `u64` and narrows it to `T`.
+    fn read_uleb128_u64_and_convert<T>(&mut self) -> anyhow::Result<T>
+    where
+        T: TryFrom<u64> + 'static,
+        T::Error: std::error::Error + Send + Sync,
+    {
+        // Decode from a copy of the cursor and only commit it once both the
+        // read and the conversion have succeeded, so that the parser position
+        // can be used for error reporting if desired.
+        let mut cursor = self.rest;
+        let raw_value: u64 = leb128::read::unsigned(&mut cursor)?;
+        let converted_value = T::try_from(raw_value)?;
+
+        self.rest = cursor;
+        Ok(converted_value)
+    }
+}
diff --git a/src/tools/coverage-dump/src/parser/tests.rs b/src/tools/coverage-dump/src/parser/tests.rs
new file mode 100644
index 00000000000..a673606b9c4
--- /dev/null
+++ b/src/tools/coverage-dump/src/parser/tests.rs
@@ -0,0 +1,38 @@
+use super::unescape_llvm_string_contents;
+
+// WARNING: These tests don't necessarily run in CI, and were mainly used to
+// help track down problems when originally developing this tool.
+// (The tool is still tested indirectly by snapshot tests that rely on it.)
+
+// Tests for `unescape_llvm_string_contents`:
+
+#[test]
+fn unescape_empty() {
+    // Empty input decodes to an empty byte string.
+    assert!(unescape_llvm_string_contents("").is_empty());
+}
+
+#[test]
+fn unescape_noop() {
+    // Text without any backslashes must come back byte-for-byte.
+    let text = "The quick brown fox jumps over the lazy dog.";
+    let decoded = unescape_llvm_string_contents(text);
+    assert_eq!(decoded, text.as_bytes());
+}
+
+#[test]
+fn unescape_backslash() {
+    // Each doubled backslash collapses to a single one.
+    let decoded = unescape_llvm_string_contents(r"\\Hello\\world\\");
+    let expected = r"\Hello\world\".as_bytes();
+    assert_eq!(decoded, expected);
+}
+
+#[test]
+fn unescape_hex() {
+    // Hex escapes are accepted in lower, upper, and mixed case.
+    let decoded = unescape_llvm_string_contents(r"\01\02\03\04\0a\0b\0C\0D\fd\fE\FF");
+    let expected: &[u8] = &[0x01, 0x02, 0x03, 0x04, 0x0a, 0x0b, 0x0c, 0x0d, 0xfd, 0xfe, 0xff];
+    assert_eq!(decoded, expected);
+}
+
+#[test]
+fn unescape_mixed() {
+    // An escaped backslash followed by digits must not be re-read as a hex
+    // escape, and `\5c` decodes to a backslash.
+    let decoded = unescape_llvm_string_contents(r"\\01.\5c\5c");
+    let expected: &[u8] = &[b'\\', b'0', b'1', b'.', b'\\', b'\\'];
+    assert_eq!(decoded, expected);
+}
diff --git a/src/tools/coverage-dump/src/prf_names.rs b/src/tools/coverage-dump/src/prf_names.rs
new file mode 100644
index 00000000000..d3f7b819e48
--- /dev/null
+++ b/src/tools/coverage-dump/src/prf_names.rs
@@ -0,0 +1,87 @@
+use crate::parser::{unescape_llvm_string_contents, Parser};
+use anyhow::{anyhow, ensure};
+use regex::Regex;
+use std::collections::HashMap;
+use std::sync::OnceLock;
+
+/// Scans through the contents of an LLVM IR assembly file to find `__llvm_prf_names`
+/// entries, decodes them, and creates a table that maps name hash values to
+/// (demangled) function names.
+///
+/// Returns an error if a payload can't be parsed or decompressed, if bytes are
+/// left over after a payload's names, or if a symbol name isn't valid UTF-8.
+pub(crate) fn make_function_names_table(llvm_ir: &str) -> anyhow::Result<HashMap<u64, String>> {
+    /// Returns the still-escaped string payload of an `@__llvm_prf_nm` line,
+    /// or `None` if `line` isn't one.
+    fn prf_names_payload(line: &str) -> Option<&str> {
+        // We cheat a little bit and match the variable name `@__llvm_prf_nm`
+        // rather than the section name, because the section name is harder
+        // to extract and differs across Linux/Windows/macOS.
+        static RE: OnceLock<Regex> = OnceLock::new();
+        let re = RE.get_or_init(|| {
+            Regex::new(r#"^@__llvm_prf_nm =.*\[[0-9]+ x i8\] c"([^"]*)".*$"#).unwrap()
+        });
+
+        re.captures(line).map(|captures| captures.get(1).unwrap().as_str())
+    }
+
+    /// LLVM's profiler/coverage metadata often uses an MD5 hash truncated to
+    /// 64 bits as a way to associate data stored in different tables/sections.
+    fn truncated_md5(bytes: &[u8]) -> u64 {
+        use md5::{Digest, Md5};
+        let digest = Md5::digest(bytes);
+        let mut low_bytes = [0u8; 8];
+        low_bytes.copy_from_slice(&digest.as_slice()[..8]);
+        // The truncated hash is explicitly little-endian, regardless of host
+        // or target platform. (See `MD5Result::low` in LLVM's `MD5.h`.)
+        u64::from_le_bytes(low_bytes)
+    }
+
+    /// Demangles a raw symbol name, falling back to a marked copy of the raw
+    /// name when it isn't a mangled Rust symbol.
+    fn demangle_if_able(symbol_name_bytes: &[u8]) -> anyhow::Result<String> {
+        // In practice, raw symbol names should always be ASCII.
+        let symbol_name_str = std::str::from_utf8(symbol_name_bytes)?;
+        let name = match rustc_demangle::try_demangle(symbol_name_str) {
+            Ok(d) => format!("{d:#}"),
+            // If demangling failed, don't treat it as an error. This lets us
+            // run the dump tool against non-Rust coverage maps produced by
+            // `clang`, for testing purposes.
+            Err(_) => format!("(couldn't demangle) {symbol_name_str}"),
+        };
+        Ok(name)
+    }
+
+    let mut map = HashMap::new();
+
+    let payloads =
+        llvm_ir.lines().filter_map(prf_names_payload).map(unescape_llvm_string_contents);
+    for payload in payloads {
+        let mut parser = Parser::new(&payload);
+        let uncompressed_len = parser.read_uleb128_usize()?;
+        let compressed_len = parser.read_uleb128_usize()?;
+
+        // Declared outside the `match` so the decompressed buffer outlives the
+        // slice borrowed from it.
+        let uncompressed_bytes_vec;
+        let uncompressed_bytes: &[u8] = match compressed_len {
+            // The symbol name bytes are uncompressed, so read them directly.
+            0 => parser.read_n_bytes(uncompressed_len)?,
+            // The symbol name bytes are compressed, so read and decompress them.
+            _ => {
+                let compressed_bytes = parser.read_n_bytes(compressed_len)?;
+
+                uncompressed_bytes_vec = miniz_oxide::inflate::decompress_to_vec_zlib_with_limit(
+                    compressed_bytes,
+                    uncompressed_len,
+                )
+                .map_err(|e| anyhow!("{e:?}"))?;
+                ensure!(uncompressed_bytes_vec.len() == uncompressed_len);
+
+                &uncompressed_bytes_vec
+            }
+        };
+
+        // Symbol names in the payload are separated by `0x01` bytes.
+        for raw_name in uncompressed_bytes.split(|&b| b == 0x01) {
+            map.insert(truncated_md5(raw_name), demangle_if_able(raw_name)?);
+        }
+
+        parser.ensure_empty()?;
+    }
+
+    Ok(map)
+}
author	bors <bors@rust-lang.org>	2023-09-05 15:30:59 +0000
committer	bors <bors@rust-lang.org>	2023-09-05 15:30:59 +0000
commit	ab45885dec2a6552cb060a5b7183653baaecd580 (patch)
tree	cbc89120fae2ede8bdc74e158fb7724cbe567852 /src
parent	f222a2dd8f6391e6433f57a7c5f1514166edbec1 (diff)
parent	3141177995f52fc3cbb64d66cf0a98ea0f754fca (diff)
download	rust-ab45885dec2a6552cb060a5b7183653baaecd580.tar.gz rust-ab45885dec2a6552cb060a5b7183653baaecd580.zip