1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
|
use std::borrow::Cow;
use std::sync::OnceLock;
use anyhow::{anyhow, ensure};
use regex::bytes;
use crate::parser::Parser;
#[cfg(test)]
mod tests;
/// Given the raw contents of a string literal in LLVM IR assembly, decodes any
/// backslash escapes and returns a vector containing the resulting byte string.
///
/// Two escape forms are recognized:
/// - `\\`, which decodes to a single backslash byte;
/// - `\xx`, where `xx` is exactly two hexadecimal digits (either case), which
///   decodes to the byte with that value.
///
/// Every other input byte, including a backslash that does not begin one of
/// the forms above, is copied to the output unchanged.
pub(crate) fn unescape_llvm_string_contents(contents: &str) -> Vec<u8> {
    let escape_re = {
        static RE: OnceLock<bytes::Regex> = OnceLock::new();
        // LLVM IR supports two string escapes: `\\` and `\xx`.
        //
        // The capture group must only accept hexadecimal digits. Accepting
        // arbitrary alphanumerics (e.g. `\zz`) would hand non-hex text to
        // `u8::from_str_radix` below and panic.
        RE.get_or_init(|| bytes::Regex::new(r"\\\\|\\([0-9A-Fa-f]{2})").unwrap())
    };

    /// Converts the two ASCII hex digits captured by the regex into a byte.
    fn u8_from_hex_digits(digits: &[u8]) -> u8 {
        // The regex guarantees exactly two ASCII hex digits, so neither the
        // UTF-8 check nor the radix-16 parse can fail.
        assert_eq!(digits.len(), 2);
        let digits = std::str::from_utf8(digits).expect("hex digits are ASCII");
        u8::from_str_radix(digits, 16).expect("regex only captures hex digits")
    }

    escape_re
        .replace_all(contents.as_bytes(), |captures: &bytes::Captures<'_>| {
            let byte = match captures.get(1) {
                // No capture group means the `\\` alternative matched.
                None => b'\\',
                Some(hex_digits) => u8_from_hex_digits(hex_digits.as_bytes()),
            };
            [byte]
        })
        .into_owned()
}
/// Computes the MD5 digest of `bytes` and returns its first 8 bytes as a
/// little-endian `u64`.
///
/// LLVM's profiler/coverage metadata often uses an MD5 hash truncated to
/// 64 bits as a way to associate data stored in different tables/sections.
pub(crate) fn truncated_md5(bytes: &[u8]) -> u64 {
    use md5::{Digest, Md5};

    let digest = Md5::digest(bytes);

    // Keep only the leading 8 bytes of the 16-byte digest.
    let mut low_bytes = [0u8; 8];
    low_bytes.copy_from_slice(&digest.as_slice()[..8]);

    // The truncated hash is explicitly little-endian, regardless of host
    // or target platform. (See `MD5Result::low` in LLVM's `MD5.h`.)
    u64::from_le_bytes(low_bytes)
}
impl<'a> Parser<'a> {
    /// Reads one length-prefixed chunk and returns its uncompressed contents.
    ///
    /// The chunk is laid out as:
    /// - Length of uncompressed data in bytes, as ULEB128
    /// - Length of compressed data in bytes (or 0), as ULEB128
    /// - The indicated number of compressed or uncompressed bytes
    ///
    /// A compressed length of 0 means the payload is stored uncompressed and
    /// is returned as a borrowed slice. Otherwise the payload is
    /// zlib-decompressed into an owned buffer.
    ///
    /// # Errors
    ///
    /// Returns an error if either length or the payload cannot be read, if
    /// decompression fails, or if the decompressed size differs from the
    /// declared uncompressed length.
    pub(crate) fn read_chunk_to_uncompressed_bytes(&mut self) -> anyhow::Result<Cow<'a, [u8]>> {
        let uncompressed_len = self.read_uleb128_usize()?;
        let compressed_len = self.read_uleb128_usize()?;

        // Zero compressed length: the payload is stored as-is, so hand back
        // a borrowed slice without copying.
        if compressed_len == 0 {
            return Ok(Cow::Borrowed(self.read_n_bytes(uncompressed_len)?));
        }

        // Otherwise the payload is a zlib stream; the declared uncompressed
        // length is passed to the decompressor as its output size limit.
        let compressed_bytes = self.read_n_bytes(compressed_len)?;
        let uncompressed_bytes = match miniz_oxide::inflate::decompress_to_vec_zlib_with_limit(
            compressed_bytes,
            uncompressed_len,
        ) {
            Ok(bytes) => bytes,
            Err(e) => return Err(anyhow!("{e:?}")),
        };

        // The output must be exactly as long as the header claimed.
        ensure!(uncompressed_bytes.len() == uncompressed_len);
        Ok(Cow::Owned(uncompressed_bytes))
    }
}
|