diff options
| author | Zalathar <Zalathar@users.noreply.github.com> | 2025-04-23 17:36:01 +1000 |
|---|---|---|
| committer | Zalathar <Zalathar@users.noreply.github.com> | 2025-05-06 11:58:50 +1000 |
| commit | 89319f2e128379a96943de2be347ec09a80abca4 (patch) | |
| tree | a0b14b15a489c8dab444119211b9a9d753c9dc3a /src | |
| parent | a3d5562fcf7f67491d55588ac3279bfade9dadcd (diff) | |
| download | rust-89319f2e128379a96943de2be347ec09a80abca4.tar.gz rust-89319f2e128379a96943de2be347ec09a80abca4.zip | |
coverage-dump: Extract some common code to an `llvm_utils` submodule
Diffstat (limited to 'src')
| -rw-r--r-- | src/tools/coverage-dump/src/covfun.rs | 3 | ||||
| -rw-r--r-- | src/tools/coverage-dump/src/llvm_utils.rs | 46 | ||||
| -rw-r--r-- | src/tools/coverage-dump/src/llvm_utils/tests.rs (renamed from src/tools/coverage-dump/src/parser/tests.rs) | 0 | ||||
| -rw-r--r-- | src/tools/coverage-dump/src/main.rs | 1 | ||||
| -rw-r--r-- | src/tools/coverage-dump/src/parser.rs | 34 | ||||
| -rw-r--r-- | src/tools/coverage-dump/src/prf_names.rs | 15 |
6 files changed, 51 insertions, 48 deletions
diff --git a/src/tools/coverage-dump/src/covfun.rs b/src/tools/coverage-dump/src/covfun.rs index 82ebd33d0d1..458fd680429 100644 --- a/src/tools/coverage-dump/src/covfun.rs +++ b/src/tools/coverage-dump/src/covfun.rs @@ -5,7 +5,8 @@ use std::sync::OnceLock; use anyhow::{Context, anyhow}; use regex::Regex; -use crate::parser::{Parser, unescape_llvm_string_contents}; +use crate::llvm_utils::unescape_llvm_string_contents; +use crate::parser::Parser; pub(crate) fn dump_covfun_mappings( llvm_ir: &str, diff --git a/src/tools/coverage-dump/src/llvm_utils.rs b/src/tools/coverage-dump/src/llvm_utils.rs new file mode 100644 index 00000000000..017fdbec0fc --- /dev/null +++ b/src/tools/coverage-dump/src/llvm_utils.rs @@ -0,0 +1,46 @@ +use std::sync::OnceLock; + +use regex::bytes; + +#[cfg(test)] +mod tests; + +/// Given the raw contents of a string literal in LLVM IR assembly, decodes any +/// backslash escapes and returns a vector containing the resulting byte string. +pub(crate) fn unescape_llvm_string_contents(contents: &str) -> Vec<u8> { + let escape_re = { + static RE: OnceLock<bytes::Regex> = OnceLock::new(); + // LLVM IR supports two string escapes: `\\` and `\xx`. + RE.get_or_init(|| bytes::Regex::new(r"\\\\|\\([0-9A-Za-z]{2})").unwrap()) + }; + + fn u8_from_hex_digits(digits: &[u8]) -> u8 { + // We know that the input contains exactly 2 hex digits, so these calls + // should never fail. + assert_eq!(digits.len(), 2); + let digits = std::str::from_utf8(digits).unwrap(); + u8::from_str_radix(digits, 16).unwrap() + } + + escape_re + .replace_all(contents.as_bytes(), |captures: &bytes::Captures<'_>| { + let byte = match captures.get(1) { + None => b'\\', + Some(hex_digits) => u8_from_hex_digits(hex_digits.as_bytes()), + }; + [byte] + }) + .into_owned() +} + +/// LLVM's profiler/coverage metadata often uses an MD5 hash truncated to +/// 64 bits as a way to associate data stored in different tables/sections. +pub(crate) fn truncated_md5(bytes: &[u8]) -> u64 { + use md5::{Digest, Md5}; + let mut hasher = Md5::new(); + hasher.update(bytes); + let hash: [u8; 8] = hasher.finalize().as_slice()[..8].try_into().unwrap(); + // The truncated hash is explicitly little-endian, regardless of host + // or target platform. (See `MD5Result::low` in LLVM's `MD5.h`.) + u64::from_le_bytes(hash) +} diff --git a/src/tools/coverage-dump/src/parser/tests.rs b/src/tools/coverage-dump/src/llvm_utils/tests.rs index 506b0a6200b..506b0a6200b 100644 --- a/src/tools/coverage-dump/src/parser/tests.rs +++ b/src/tools/coverage-dump/src/llvm_utils/tests.rs diff --git a/src/tools/coverage-dump/src/main.rs b/src/tools/coverage-dump/src/main.rs index b21e3e292f2..6408b97a06f 100644 --- a/src/tools/coverage-dump/src/main.rs +++ b/src/tools/coverage-dump/src/main.rs @@ -1,4 +1,5 @@ mod covfun; +mod llvm_utils; mod parser; mod prf_names; diff --git a/src/tools/coverage-dump/src/parser.rs b/src/tools/coverage-dump/src/parser.rs index 0bd4abdae3e..f26a57b43b3 100644 --- a/src/tools/coverage-dump/src/parser.rs +++ b/src/tools/coverage-dump/src/parser.rs @@ -1,38 +1,4 @@ -#[cfg(test)] -mod tests; - -use std::sync::OnceLock; - use anyhow::ensure; -use regex::bytes; - -/// Given the raw contents of a string literal in LLVM IR assembly, decodes any -/// backslash escapes and returns a vector containing the resulting byte string. -pub(crate) fn unescape_llvm_string_contents(contents: &str) -> Vec<u8> { - let escape_re = { - static RE: OnceLock<bytes::Regex> = OnceLock::new(); - // LLVM IR supports two string escapes: `\\` and `\xx`. - RE.get_or_init(|| bytes::Regex::new(r"\\\\|\\([0-9A-Za-z]{2})").unwrap()) - }; - - fn u8_from_hex_digits(digits: &[u8]) -> u8 { - // We know that the input contains exactly 2 hex digits, so these calls - // should never fail. - assert_eq!(digits.len(), 2); - let digits = std::str::from_utf8(digits).unwrap(); - u8::from_str_radix(digits, 16).unwrap() - } - - escape_re - .replace_all(contents.as_bytes(), |captures: &bytes::Captures<'_>| { - let byte = match captures.get(1) { - None => b'\\', - Some(hex_digits) => u8_from_hex_digits(hex_digits.as_bytes()), - }; - [byte] - }) - .into_owned() -} pub(crate) struct Parser<'a> { rest: &'a [u8], diff --git a/src/tools/coverage-dump/src/prf_names.rs b/src/tools/coverage-dump/src/prf_names.rs index 96d097c79a3..fe193efd8e5 100644 --- a/src/tools/coverage-dump/src/prf_names.rs +++ b/src/tools/coverage-dump/src/prf_names.rs @@ -4,7 +4,8 @@ use std::sync::OnceLock; use anyhow::{anyhow, ensure}; use regex::Regex; -use crate::parser::{Parser, unescape_llvm_string_contents}; +use crate::llvm_utils::{truncated_md5, unescape_llvm_string_contents}; +use crate::parser::Parser; /// Scans through the contents of an LLVM IR assembly file to find `__llvm_prf_names` /// entries, decodes them, and creates a table that maps name hash values to @@ -25,18 +26,6 @@ pub(crate) fn make_function_names_table(llvm_ir: &str) -> anyhow::Result<HashMap Some(payload) } - /// LLVM's profiler/coverage metadata often uses an MD5 hash truncated to - /// 64 bits as a way to associate data stored in different tables/sections. - fn truncated_md5(bytes: &[u8]) -> u64 { - use md5::{Digest, Md5}; - let mut hasher = Md5::new(); - hasher.update(bytes); - let hash: [u8; 8] = hasher.finalize().as_slice()[..8].try_into().unwrap(); - // The truncated hash is explicitly little-endian, regardless of host - // or target platform. (See `MD5Result::low` in LLVM's `MD5.h`.) - u64::from_le_bytes(hash) - } - fn demangle_if_able(symbol_name_bytes: &[u8]) -> anyhow::Result<String> { // In practice, raw symbol names should always be ASCII. let symbol_name_str = std::str::from_utf8(symbol_name_bytes)?; |
