3 files changed, 481 insertions, 0 deletions
diff --git a/library/stdarch/crates/stdarch-test/src/disassembly.rs b/library/stdarch/crates/stdarch-test/src/disassembly.rs
new file mode 100644
index 00000000000..f5167ea8d8e
--- /dev/null
+++ b/library/stdarch/crates/stdarch-test/src/disassembly.rs
@@ -0,0 +1,208 @@
+//! Disassembly calling function for most targets.
+
+use crate::Function;
+use std::{collections::HashSet, env, str};
+
+// Extracts the "shim" name from the `symbol`.
+fn normalize(mut symbol: &str) -> String {
+    // Remove trailing colon:
+    if symbol.ends_with(':') {
+        symbol = &symbol[..symbol.len() - 1];
+    }
+    if symbol.ends_with('>') {
+        symbol = &symbol[..symbol.len() - 1];
+    }
+    if let Some(idx) = symbol.find('<') {
+        symbol = &symbol[idx + 1..];
+    }
+
+    let mut symbol = rustc_demangle::demangle(symbol).to_string();
+    symbol = match symbol.rfind("::h") {
+        Some(i) => symbol[..i].to_string(),
+        None => symbol.to_string(),
+    };
+
+    // Remove Rust paths
+    if let Some(last_colon) = symbol.rfind(':') {
+        symbol = symbol[last_colon + 1..].to_string();
+    }
+
+    // Normalize to no leading underscore to handle platforms that may
+    // inject extra ones in symbol names.
+    while symbol.starts_with('_') || symbol.starts_with('.') {
+        symbol.remove(0);
+    }
+    // Windows/x86 has a suffix such as @@4.
+    if let Some(idx) = symbol.find("@@") {
+        symbol = symbol[..idx].to_string();
+    }
+    symbol
+}
+
+#[cfg(target_env = "msvc")]
+pub(crate) fn disassemble_myself() -> HashSet<Function> {
+    let me = env::current_exe().expect("failed to get current exe");
+
+    let target = if cfg!(target_arch = "x86_64") {
+        "x86_64-pc-windows-msvc"
+    } else if cfg!(target_arch = "x86") {
+        "i686-pc-windows-msvc"
+    } else if cfg!(target_arch = "aarch64") {
+        "aarch64-pc-windows-msvc"
+    } else {
+        panic!("disassembly unimplemented")
+    };
+    let mut cmd =
+        cc::windows_registry::find(target, "dumpbin.exe").expect("failed to find `dumpbin` tool");
+    let output = cmd
+        .arg("/DISASM:NOBYTES")
+        .arg(&me)
+        .output()
+        .expect("failed to execute dumpbin");
+    println!(
+        "{}\n{}",
+        output.status,
+        String::from_utf8_lossy(&output.stderr)
+    );
+    assert!(output.status.success());
+    // Windows does not return valid UTF-8 output:
+    parse(&String::from_utf8_lossy(Vec::leak(output.stdout)))
+}
+
+#[cfg(not(target_env = "msvc"))]
+pub(crate) fn disassemble_myself() -> HashSet<Function> {
+    let me = env::current_exe().expect("failed to get current exe");
+
+    let objdump = env::var("OBJDUMP").unwrap_or_else(|_| "objdump".to_string());
+    let add_args = if cfg!(target_vendor = "apple") && cfg!(target_arch = "aarch64") {
+        // Target features need to be enabled for LLVM objdump on Darwin ARM64
+        vec!["--mattr=+v8.6a,+crypto,+tme"]
+    } else if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) {
+        vec!["--mattr=+zk,+zks,+zbc,+zbb"]
+    } else {
+        vec![]
+    };
+    let output = std::process::Command::new(objdump.clone())
+        .arg("--disassemble")
+        .arg("--no-show-raw-insn")
+        .args(add_args)
+        .arg(&me)
+        .output()
+        .unwrap_or_else(|_| panic!("failed to execute objdump. OBJDUMP={objdump}"));
+    println!(
+        "{}\n{}",
+        output.status,
+        String::from_utf8_lossy(&output.stderr)
+    );
+    assert!(output.status.success());
+
+    let disassembly = String::from_utf8_lossy(Vec::leak(output.stdout));
+
+    parse(&disassembly)
+}
+
+fn parse(output: &str) -> HashSet<Function> {
+    let mut lines = output.lines();
+
+    println!(
+        "First 100 lines of the disassembly input containing {} lines:",
+        lines.clone().count()
+    );
+    for line in output.lines().take(100) {
+        println!("{line}");
+    }
+
+    let mut functions = HashSet::new();
+    let mut cached_header = None;
+    while let Some(header) = cached_header.take().or_else(|| lines.next()) {
+        if !header.ends_with(':') || !header.contains("stdarch_test_shim") {
+            continue;
+        }
+        eprintln!("header: {header}");
+        let symbol = normalize(header);
+        eprintln!("normalized symbol: {symbol}");
+        let mut instructions = Vec::new();
+        for instruction in lines.by_ref() {
+            if instruction.ends_with(':') {
+                cached_header = Some(instruction);
+                break;
+            }
+            if instruction.is_empty() {
+                cached_header = None;
+                break;
+            }
+            let mut parts = if cfg!(target_env = "msvc") {
+                // Each line looks like:
+                //
+                // >  $addr: $instr..
+                instruction
+                    .split(&[' ', ','])
+                    .filter(|&x| !x.is_empty())
+                    .skip(1)
+                    .map(str::to_lowercase)
+                    .skip_while(|s| matches!(&**s, "lock" | "vex")) // skip x86-specific prefix
+                    .collect::<Vec<String>>()
+            } else {
+                // objdump with --no-show-raw-insn
+                // Each line of instructions should look like:
+                //
+                //      $rel_offset:       $instruction...
+                instruction
+                    .split_whitespace()
+                    .skip(1)
+                    .skip_while(|s| matches!(*s, "lock" | "{evex}" | "{vex}")) // skip x86-specific prefix
+                    .map(ToString::to_string)
+                    .collect::<Vec<String>>()
+            };
+
+            if cfg!(any(target_arch = "aarch64", target_arch = "arm64ec")) {
+                // Normalize [us]shll.* ..., #0 instructions to the preferred form: [us]xtl.* ...
+                // as neither LLVM objdump nor dumpbin does that.
+                // See https://developer.arm.com/documentation/ddi0602/latest/SIMD-FP-Instructions/UXTL--UXTL2--Unsigned-extend-Long--an-alias-of-USHLL--USHLL2-
+                // and https://developer.arm.com/documentation/ddi0602/latest/SIMD-FP-Instructions/SXTL--SXTL2--Signed-extend-Long--an-alias-of-SSHLL--SSHLL2-
+                // for details.
+                fn is_shll(instr: &str) -> bool {
+                    if cfg!(target_env = "msvc") {
+                        instr.starts_with("ushll") || instr.starts_with("sshll")
+                    } else {
+                        instr.starts_with("ushll.") || instr.starts_with("sshll.")
+                    }
+                }
+                match (parts.first(), parts.last()) {
+                    (Some(instr), Some(last_arg)) if is_shll(instr) && last_arg == "#0" => {
+                        assert_eq!(parts.len(), 4);
+                        let mut new_parts = Vec::with_capacity(3);
+                        let new_instr = format!("{}{}{}", &instr[..1], "xtl", &instr[5..]);
+                        new_parts.push(new_instr);
+                        new_parts.push(parts[1].clone());
+                        new_parts.push(parts[2][0..parts[2].len() - 1].to_owned()); // strip trailing comma
+                        parts = new_parts;
+                    }
+                    // dumpbin uses "ins" instead of "mov"
+                    (Some(instr), _) if cfg!(target_env = "msvc") && instr == "ins" => {
+                        parts[0] = "mov".to_string()
+                    }
+                    _ => {}
+                };
+            }
+
+            instructions.push(parts.join(" "));
+            if matches!(&**instructions.last().unwrap(), "ret" | "retq") {
+                cached_header = None;
+                break;
+            }
+        }
+        let function = Function {
+            name: symbol,
+            instrs: instructions,
+        };
+        assert!(functions.insert(function));
+    }
+
+    eprintln!("all found functions dump:");
+    for k in &functions {
+        eprintln!("  f: {}", k.name);
+    }
+
+    functions
+}
diff --git a/library/stdarch/crates/stdarch-test/src/lib.rs b/library/stdarch/crates/stdarch-test/src/lib.rs
new file mode 100644
index 00000000000..f6614f6d51c
--- /dev/null
+++ b/library/stdarch/crates/stdarch-test/src/lib.rs
@@ -0,0 +1,218 @@
+//! Runtime support needed for testing the stdarch crate.
+//!
+//! This basically just disassembles the current executable and then parses the
+//! output once globally and then provides the `assert` function which makes
+//! assertions about the disassembly of a function.
+#![deny(rust_2018_idioms)]
+#![allow(clippy::missing_docs_in_private_items, clippy::print_stdout)]
+
+#[macro_use]
+extern crate lazy_static;
+#[macro_use]
+extern crate cfg_if;
+
+pub use assert_instr_macro::*;
+pub use simd_test_macro::*;
+use std::{cmp, collections::HashSet, env, hash, hint::black_box, str};
+
+cfg_if! {
+    if #[cfg(target_arch = "wasm32")] {
+        pub mod wasm;
+        use wasm::disassemble_myself;
+    } else {
+        mod disassembly;
+        use crate::disassembly::disassemble_myself;
+    }
+}
+
+lazy_static! {
+    static ref DISASSEMBLY: HashSet<Function> = disassemble_myself();
+}
+
+#[derive(Debug)]
+struct Function {
+    name: String,
+    instrs: Vec<String>,
+}
+impl Function {
+    fn new(n: &str) -> Self {
+        Self {
+            name: n.to_string(),
+            instrs: Vec::new(),
+        }
+    }
+}
+
+impl cmp::PartialEq for Function {
+    fn eq(&self, other: &Self) -> bool {
+        self.name == other.name
+    }
+}
+impl cmp::Eq for Function {}
+
+impl hash::Hash for Function {
+    fn hash<H: hash::Hasher>(&self, state: &mut H) {
+        self.name.hash(state)
+    }
+}
+
+/// Main entry point for this crate, called by the `#[assert_instr]` macro.
+///
+/// This asserts that the function at `fnptr` contains the instruction
+/// `expected` provided.
+pub fn assert(shim_addr: usize, fnname: &str, expected: &str) {
+    // Make sure that the shim is not removed
+    black_box(shim_addr);
+
+    //eprintln!("shim name: {fnname}");
+    let function = &DISASSEMBLY
+        .get(&Function::new(fnname))
+        .unwrap_or_else(|| panic!("function \"{fnname}\" not found in the disassembly"));
+    //eprintln!("  function: {:?}", function);
+
+    let mut instrs = &function.instrs[..];
+    while instrs.last().is_some_and(|s| s == "nop" || s == "int3") {
+        instrs = &instrs[..instrs.len() - 1];
+    }
+
+    // Look for `expected` as the first part of any instruction in this
+    // function, e.g., tzcntl in tzcntl %rax,%rax.
+    //
+    // There are two cases when the expected instruction is nop:
+    // 1. The expected intrinsic is compiled away so we can't
+    // check for it - aka the intrinsic is not generating any code.
+    // 2. It is a mark, indicating that the instruction will be
+    // compiled into other instructions - mainly because of llvm
+    // optimization.
+    let expected = if expected == "unknown" {
+        "<unknown>" // Workaround for rust-lang/stdarch#1674, todo: remove when the issue is fixed
+    } else {
+        expected
+    };
+    let found = expected == "nop" || instrs.iter().any(|s| s.starts_with(expected));
+
+    // Look for subroutine call instructions in the disassembly to detect whether
+    // inlining failed: all intrinsics are `#[inline(always)]`, so calling one
+    // intrinsic from another should not generate subroutine call instructions.
+    let inlining_failed = if cfg!(target_arch = "x86_64") || cfg!(target_arch = "wasm32") {
+        instrs.iter().any(|s| s.starts_with("call "))
+    } else if cfg!(target_arch = "x86") {
+        instrs.windows(2).any(|s| {
+            // On 32-bit x86 position independent code will call itself and be
+            // immediately followed by a `pop` to learn about the current address.
+            // Let's not take that into account when considering whether a function
+            // failed inlining something.
+            s[0].starts_with("call ") && s[1].starts_with("pop") // FIXME: original logic but does not match comment
+        })
+    } else if cfg!(any(
+        target_arch = "aarch64",
+        target_arch = "arm64ec",
+        target_arch = "powerpc",
+        target_arch = "powerpc64"
+    )) {
+        instrs.iter().any(|s| s.starts_with("bl "))
+    } else {
+        // FIXME: Add detection for other archs
+        false
+    };
+
+    let instruction_limit = std::env::var("STDARCH_ASSERT_INSTR_LIMIT")
+        .ok()
+        .map_or_else(
+            || match expected {
+                // `cpuid` returns a pretty big aggregate structure, so exempt
+                // it from the slightly more restrictive 22 instructions below.
+                "cpuid" => 30,
+
+                // These require 8 loads and stores, so it _just_ overflows the limit
+                "aesencwide128kl" | "aesencwide256kl" | "aesdecwide128kl" | "aesdecwide256kl" => 24,
+
+                // Apparently, on Windows, LLVM generates a bunch of
+                // saves/restores of xmm registers around these instructions,
+                // which exceeds the limit of 20 below. As it seems dictated by
+                // Windows's ABI (I believe?), we probably can't do much
+                // about it.
+                "vzeroall" | "vzeroupper" if cfg!(windows) => 30,
+
+                // Intrinsics using `cvtpi2ps` are typically "composites" and
+                // in some cases exceed the limit.
+                "cvtpi2ps" => 25,
+                // core_arch/src/arm_shared/simd32
+                // vfmaq_n_f32_vfma : #instructions = 26 >= 22 (limit)
+                "usad8" | "vfma" | "vfms" => 27,
+                "qadd8" | "qsub8" | "sadd8" | "sel" | "shadd8" | "shsub8" | "usub8" | "ssub8" => 29,
+                // core_arch/src/arm_shared/simd32
+                // vst1q_s64_x4_vst1 : #instructions = 27 >= 22 (limit)
+                "vld3" => 28,
+                // core_arch/src/arm_shared/simd32
+                // vld4q_lane_u32_vld4 : #instructions = 36 >= 22 (limit)
+                "vld4" => 37,
+                // core_arch/src/arm_shared/simd32
+                // vst1q_s64_x4_vst1 : #instructions = 40 >= 22 (limit)
+                "vst1" => 41,
+                // core_arch/src/arm_shared/simd32
+                // vst3q_u32_vst3 : #instructions = 25 >= 22 (limit)
+                "vst3" => 26,
+                // core_arch/src/arm_shared/simd32
+                // vst4q_u32_vst4 : #instructions = 33 >= 22 (limit)
+                "vst4" => 34,
+
+                // core_arch/src/arm_shared/simd32
+                // vst1q_p64_x4_nop : #instructions = 33 >= 22 (limit)
+                "nop" if fnname.contains("vst1q_p64") => 34,
+
+                // Original limit was 20 instructions, but ARM DSP Intrinsics
+                // are exactly 20 instructions long. So, bump the limit to 22
+                // instead of adding here a long list of exceptions.
+                _ => {
+                    // aarch64_be may add reverse instructions which increases
+                    // the number of instructions generated.
+                    if cfg!(all(target_endian = "big", target_arch = "aarch64")) {
+                        32
+                    } else {
+                        22
+                    }
+                }
+            },
+            |v| v.parse().unwrap(),
+        );
+    let probably_only_one_instruction = instrs.len() < instruction_limit;
+
+    if found && probably_only_one_instruction && !inlining_failed {
+        return;
+    }
+
+    // Help debug by printing out the found disassembly, and then panic as we
+    // didn't find the instruction.
+    println!("disassembly for {fnname}: ",);
+    for (i, instr) in instrs.iter().enumerate() {
+        println!("\t{i:2}: {instr}");
+    }
+
+    if !found {
+        panic!("failed to find instruction `{expected}` in the disassembly");
+    } else if !probably_only_one_instruction {
+        panic!(
+            "instruction found, but the disassembly contains too many \
+             instructions: #instructions = {} >= {} (limit)",
+            instrs.len(),
+            instruction_limit
+        );
+    } else if inlining_failed {
+        panic!(
+            "instruction found, but the disassembly contains subroutine \
+             call instructions, which hint that inlining failed"
+        );
+    }
+}
+
+pub fn assert_skip_test_ok(name: &str, missing_features: &[&str]) {
+    println!("Skipping test `{name}` due to missing target features:");
+    for feature in missing_features {
+        println!("  - {feature}");
+    }
+    match env::var("STDARCH_TEST_EVERYTHING") {
+        Ok(_) => panic!("skipped test `{name}` when it shouldn't be skipped"),
+        Err(_) => println!("Set STDARCH_TEST_EVERYTHING to make this an error."),
+    }
+}
diff --git a/library/stdarch/crates/stdarch-test/src/wasm.rs b/library/stdarch/crates/stdarch-test/src/wasm.rs
new file mode 100644
index 00000000000..bf411c12148
--- /dev/null
+++ b/library/stdarch/crates/stdarch-test/src/wasm.rs
@@ -0,0 +1,55 @@
+//! Disassembly calling function for `wasm32` targets.
+
+use crate::Function;
+use std::collections::HashSet;
+
+pub(crate) fn disassemble_myself() -> HashSet<Function> {
+    // Use `std::env::args` to find the path to our executable. Assume the
+    // environment is configured such that we can read that file. Read it and
+    // use the `wasmprinter` crate to transform the binary to text, then search
+    // the text for appropriately named functions.
+    let me = std::env::args()
+        .next()
+        .expect("failed to find current wasm file");
+    let output = wasmprinter::print_file(&me).unwrap();
+
+    let mut ret: HashSet<Function> = HashSet::new();
+    let mut lines = output.lines().map(|s| s.trim());
+    while let Some(line) = lines.next() {
+        // If this isn't a function, we don't care about it.
+        if !line.starts_with("(func ") {
+            continue;
+        }
+
+        let mut function = Function {
+            name: String::new(),
+            instrs: Vec::new(),
+        };
+
+        // Empty functions will end in `))` so there's nothing to do, otherwise
+        // we'll have a bunch of following lines which are instructions.
+        //
+        // Lines that have an imbalanced `)` mark the end of a function.
+        if !line.ends_with("))") {
+            while let Some(line) = lines.next() {
+                function.instrs.push(line.to_string());
+                if !line.starts_with("(") && line.ends_with(")") {
+                    break;
+                }
+            }
+        }
+        // The second element here split on whitespace should be the name of
+        // the function, skipping the type/params/results
+        function.name = line.split_whitespace().nth(1).unwrap().to_string();
+        if function.name.starts_with("$") {
+            function.name = function.name[1..].to_string()
+        }
+
+        if !function.name.contains("stdarch_test_shim") {
+            continue;
+        }
+
+        assert!(ret.insert(function));
+    }
+    return ret;
+}