about summary refs log tree commit diff
path: root/library/compiler-builtins/builtins-test
diff options
context:
space:
mode:
author	Trevor Gross <tmgross@umich.edu>	2025-03-18 09:52:53 +0000
committer	Trevor Gross <t.gross35@gmail.com>	2025-04-18 21:14:41 -0400
commit	92b1e8454d7c4ea424ff970e6be283144584f812 (patch)
tree	3064bf6a40146c3104364b87eb418e7c7d1c150a /library/compiler-builtins/builtins-test
parent	3ea9f849d54b6fee2bdacea04cad4d15d68b3fb5 (diff)
download	rust-92b1e8454d7c4ea424ff970e6be283144584f812.tar.gz
download	rust-92b1e8454d7c4ea424ff970e6be283144584f812.zip
Rename `testcrate` to `builtins-test`
The repo will soon have `libm` as a top-level crate, so make it clear
that this is only the test crate for `compiler-builtins`.
Diffstat (limited to 'library/compiler-builtins/builtins-test')
-rw-r--r--	library/compiler-builtins/builtins-test/Cargo.toml	102
-rw-r--r--	library/compiler-builtins/builtins-test/benches/float_add.rs	93
-rw-r--r--	library/compiler-builtins/builtins-test/benches/float_cmp.rs	208
-rw-r--r--	library/compiler-builtins/builtins-test/benches/float_conv.rs	688
-rw-r--r--	library/compiler-builtins/builtins-test/benches/float_div.rs	93
-rw-r--r--	library/compiler-builtins/builtins-test/benches/float_extend.rs	133
-rw-r--r--	library/compiler-builtins/builtins-test/benches/float_mul.rs	93
-rw-r--r--	library/compiler-builtins/builtins-test/benches/float_pow.rs	49
-rw-r--r--	library/compiler-builtins/builtins-test/benches/float_sub.rs	93
-rw-r--r--	library/compiler-builtins/builtins-test/benches/float_trunc.rs	146
-rw-r--r--	library/compiler-builtins/builtins-test/benches/mem.rs	364
-rw-r--r--	library/compiler-builtins/builtins-test/benches/mem_icount.rs	499
-rw-r--r--	library/compiler-builtins/builtins-test/build.rs	120
-rw-r--r--	library/compiler-builtins/builtins-test/src/bench.rs	366
-rw-r--r--	library/compiler-builtins/builtins-test/src/lib.rs	338
-rw-r--r--	library/compiler-builtins/builtins-test/tests/addsub.rs	143
-rw-r--r--	library/compiler-builtins/builtins-test/tests/aeabi_memclr.rs	60
-rw-r--r--	library/compiler-builtins/builtins-test/tests/aeabi_memcpy.rs	71
-rw-r--r--	library/compiler-builtins/builtins-test/tests/aeabi_memset.rs	240
-rw-r--r--	library/compiler-builtins/builtins-test/tests/big.rs	134
-rw-r--r--	library/compiler-builtins/builtins-test/tests/cmp.rs	185
-rw-r--r--	library/compiler-builtins/builtins-test/tests/conv.rs	364
-rw-r--r--	library/compiler-builtins/builtins-test/tests/div_rem.rs	165
-rw-r--r--	library/compiler-builtins/builtins-test/tests/float_pow.rs	72
-rw-r--r--	library/compiler-builtins/builtins-test/tests/lse.rs	97
-rw-r--r--	library/compiler-builtins/builtins-test/tests/mem.rs	286
-rw-r--r--	library/compiler-builtins/builtins-test/tests/misc.rs	208
-rw-r--r--	library/compiler-builtins/builtins-test/tests/mul.rs	156
-rw-r--r--	library/compiler-builtins/builtins-test/tests/shift.rs	35
29 files changed, 5601 insertions, 0 deletions
diff --git a/library/compiler-builtins/builtins-test/Cargo.toml b/library/compiler-builtins/builtins-test/Cargo.toml
new file mode 100644
index 00000000000..526e9b18af0
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/Cargo.toml
@@ -0,0 +1,102 @@
+[package]
+name = "builtins-test"
+version = "0.1.0"
+authors = ["Alex Crichton <alex@alexcrichton.com>"]
+edition = "2024"
+publish = false
+
+[lib]
+test = false
+doctest = false
+
+[dependencies]
+# For fuzzing tests we want a deterministic seedable RNG. We also eliminate potential
+# problems with system RNGs on the variety of platforms this crate is tested on.
+# `xoshiro128**` is used for its quality, size, and speed at generating `u32` shift amounts.
+rand_xoshiro = "0.6"
+# To compare float builtins against
+rustc_apfloat = "0.2.1"
+# Really a dev dependency, but dev dependencies can't be optional
+iai-callgrind = { version = "0.14.0", optional = true }
+
+[dependencies.compiler_builtins]
+path = "../compiler-builtins"
+default-features = false
+features = ["public-test-deps"]
+
+[dev-dependencies]
+criterion = { version = "0.5.1", default-features = false, features = ["cargo_bench_support"] }
+paste = "1.0.15"
+
+[target.'cfg(all(target_arch = "arm", not(any(target_env = "gnu", target_env = "musl")), target_os = "linux"))'.dev-dependencies]
+test = { git = "https://github.com/japaric/utest" }
+utest-cortex-m-qemu = { default-features = false, git = "https://github.com/japaric/utest" }
+utest-macros = { git = "https://github.com/japaric/utest" }
+
+[features]
+default = ["mangled-names"]
+c = ["compiler_builtins/c"]
+no-asm = ["compiler_builtins/no-asm"]
+no-f16-f128 = ["compiler_builtins/no-f16-f128"]
+mem = ["compiler_builtins/mem"]
+mangled-names = ["compiler_builtins/mangled-names"]
+# Skip tests that rely on f128 symbols being available on the system
+no-sys-f128 = ["no-sys-f128-int-convert", "no-sys-f16-f128-convert"]
+# Some platforms have some f128 functions but everything except integer conversions
+no-sys-f128-int-convert = []
+no-sys-f16-f128-convert = []
+no-sys-f16-f64-convert = []
+# Skip tests that rely on f16 symbols being available on the system
+no-sys-f16 = ["no-sys-f16-f64-convert"]
+
+# Enable icount benchmarks (requires iai-callgrind and valgrind)
+icount = ["dep:iai-callgrind"]
+
+# Enable report generation without bringing in more dependencies by default
+benchmarking-reports = ["criterion/plotters", "criterion/html_reports"]
+
+# NOTE: benchmarks must be run with `--no-default-features` or with
+# `-p builtins-test`, otherwise the default `compiler-builtins` feature
+# of the `compiler_builtins` crate gets activated, resulting in linker
+# errors.
+
+[[bench]]
+name = "float_add"
+harness = false
+
+[[bench]]
+name = "float_sub"
+harness = false
+
+[[bench]]
+name = "float_mul"
+harness = false
+
+[[bench]]
+name = "float_div"
+harness = false
+
+[[bench]]
+name = "float_cmp"
+harness = false
+
+[[bench]]
+name = "float_conv"
+harness = false
+
+[[bench]]
+name = "float_extend"
+harness = false
+
+[[bench]]
+name = "float_trunc"
+harness = false
+
+[[bench]]
+name = "float_pow"
+harness = false
+
+[[bench]]
+name = "mem_icount"
+harness = false
+required-features = ["icount"]
diff --git a/library/compiler-builtins/builtins-test/benches/float_add.rs b/library/compiler-builtins/builtins-test/benches/float_add.rs
new file mode 100644
index 00000000000..197f90b319d
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/benches/float_add.rs
@@ -0,0 +1,93 @@
+#![cfg_attr(f128_enabled, feature(f128))]
+
+use builtins_test::float_bench;
+use compiler_builtins::float::add;
+use criterion::{Criterion, criterion_main};
+
+float_bench! {
+    name: add_f32,
+    sig: (a: f32, b: f32) -> f32,
+    crate_fn: add::__addsf3,
+    sys_fn: __addsf3,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            asm!(
+                "addss {a}, {b}",
+                a = inout(xmm_reg) a,
+                b = in(xmm_reg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            asm!(
+                "fadd {a:s}, {a:s}, {b:s}",
+                a = inout(vreg) a,
+                b = in(vreg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+    ],
+}
+
+float_bench! {
+    name: add_f64,
+    sig: (a: f64, b: f64) -> f64,
+    crate_fn: add::__adddf3,
+    sys_fn: __adddf3,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            asm!(
+                "addsd {a}, {b}",
+                a = inout(xmm_reg) a,
+                b = in(xmm_reg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            asm!(
+                "fadd {a:d}, {a:d}, {b:d}",
+                a = inout(vreg) a,
+                b = in(vreg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+    ],
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: add_f128,
+    sig: (a: f128, b: f128) -> f128,
+    crate_fn: add::__addtf3,
+    crate_fn_ppc: add::__addkf3,
+    sys_fn: __addtf3,
+    sys_fn_ppc: __addkf3,
+    sys_available: not(feature = "no-sys-f128"),
+    asm: []
+}
+
+// Criterion entry point: runs every `__add*` benchmark registered by the
+// `float_bench!` invocations above (hooked up via `criterion_main!` below).
+pub fn float_add() {
+    let mut criterion = Criterion::default().configure_from_args();
+
+    add_f32(&mut criterion);
+    add_f64(&mut criterion);
+
+    // The f128 benchmark only exists when `f128` support is enabled.
+    #[cfg(f128_enabled)]
+    {
+        add_f128(&mut criterion);
+    }
+}
+
+criterion_main!(float_add);
diff --git a/library/compiler-builtins/builtins-test/benches/float_cmp.rs b/library/compiler-builtins/builtins-test/benches/float_cmp.rs
new file mode 100644
index 00000000000..4493765ec1b
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/benches/float_cmp.rs
@@ -0,0 +1,208 @@
+#![cfg_attr(f128_enabled, feature(f128))]
+
+use builtins_test::float_bench;
+use criterion::{Criterion, criterion_main};
+
+use compiler_builtins::float::cmp;
+
+/// `gt` symbols are allowed to return differing results, they just get compared
+/// to 0.
+fn gt_res_eq(a: i32, b: i32) -> bool {
+    // NOTE: despite the `_lt_0` names, `<=` is used, so zero is grouped with
+    // the negatives. Two `gt` results are considered equivalent iff both are
+    // <= 0 or both are > 0 — only the sign-vs-zero relationship is compared.
+    let a_lt_0 = a <= 0;
+    let b_lt_0 = b <= 0;
+    (a_lt_0 && b_lt_0) || (!a_lt_0 && !b_lt_0)
+}
+
+float_bench! {
+    name: cmp_f32_gt,
+    sig: (a: f32, b: f32) -> i32,
+    crate_fn: cmp::__gtsf2,
+    sys_fn: __gtsf2,
+    sys_available: all(),
+    output_eq: gt_res_eq,
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            let ret: i32;
+            asm!(
+                "xor     {ret:e}, {ret:e}",
+                "ucomiss {a}, {b}",
+                "seta    {ret:l}",
+                a = in(xmm_reg) a,
+                b = in(xmm_reg) b,
+                ret = out(reg) ret,
+                options(nomem, nostack, pure)
+            );
+
+            ret
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            let ret: i32;
+            asm!(
+                "fcmp    {a:s}, {b:s}",
+                "cset    {ret:w}, gt",
+                a = in(vreg) a,
+                b = in(vreg) b,
+                ret = out(reg) ret,
+                options(nomem,nostack),
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: cmp_f32_unord,
+    sig: (a: f32, b: f32) -> i32,
+    crate_fn: cmp::__unordsf2,
+    sys_fn: __unordsf2,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            let ret: i32;
+            asm!(
+                "xor     {ret:e}, {ret:e}",
+                "ucomiss {a}, {b}",
+                "setp    {ret:l}",
+                a = in(xmm_reg) a,
+                b = in(xmm_reg) b,
+                ret = out(reg) ret,
+                options(nomem, nostack, pure)
+            );
+
+            ret
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            let ret: i32;
+            asm!(
+                "fcmp    {a:s}, {b:s}",
+                "cset    {ret:w}, vs",
+                a = in(vreg) a,
+                b = in(vreg) b,
+                ret = out(reg) ret,
+                options(nomem, nostack, pure)
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: cmp_f64_gt,
+    sig: (a: f64, b: f64) -> i32,
+    crate_fn: cmp::__gtdf2,
+    sys_fn: __gtdf2,
+    sys_available: all(),
+    output_eq: gt_res_eq,
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            let ret: i32;
+            asm!(
+                "xor     {ret:e}, {ret:e}",
+                "ucomisd {a}, {b}",
+                "seta    {ret:l}",
+                a = in(xmm_reg) a,
+                b = in(xmm_reg) b,
+                ret = out(reg) ret,
+                options(nomem, nostack, pure)
+            );
+
+            ret
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            let ret: i32;
+            asm!(
+                "fcmp    {a:d}, {b:d}",
+                "cset {ret:w}, gt",
+                a = in(vreg) a,
+                b = in(vreg) b,
+                ret = out(reg) ret,
+                options(nomem, nostack, pure)
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: cmp_f64_unord,
+    sig: (a: f64, b: f64) -> i32,
+    crate_fn: cmp::__unorddf2,
+    sys_fn: __unorddf2,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            let ret: i32;
+            asm!(
+                "xor     {ret:e}, {ret:e}",
+                "ucomisd {a}, {b}",
+                "setp    {ret:l}",
+                a = in(xmm_reg) a,
+                b = in(xmm_reg) b,
+                ret = out(reg) ret,
+                options(nomem, nostack, pure)
+            );
+
+            ret
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            let ret: i32;
+            asm!(
+                "fcmp    {a:d}, {b:d}",
+                "cset    {ret:w}, vs",
+                a = in(vreg) a,
+                b = in(vreg) b,
+                ret = out(reg) ret,
+                options(nomem, nostack, pure)
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: cmp_f128_gt,
+    sig: (a: f128, b: f128) -> i32,
+    crate_fn: cmp::__gttf2,
+    crate_fn_ppc: cmp::__gtkf2,
+    sys_fn: __gttf2,
+    sys_fn_ppc: __gtkf2,
+    sys_available: not(feature = "no-sys-f128"),
+    output_eq: gt_res_eq,
+    asm: []
+}
+
+float_bench! {
+    name: cmp_f128_unord,
+    sig: (a: f128, b: f128) -> i32,
+    crate_fn: cmp::__unordtf2,
+    crate_fn_ppc: cmp::__unordkf2,
+    sys_fn: __unordtf2,
+    sys_fn_ppc: __unordkf2,
+    sys_available: not(feature = "no-sys-f128"),
+    asm: []
+}
+
+// Criterion entry point: runs the `gt` and `unord` comparison benchmarks for
+// each float width defined by the `float_bench!` invocations above.
+pub fn float_cmp() {
+    let mut criterion = Criterion::default().configure_from_args();
+
+    cmp_f32_gt(&mut criterion);
+    cmp_f32_unord(&mut criterion);
+    cmp_f64_gt(&mut criterion);
+    cmp_f64_unord(&mut criterion);
+
+    // f128 benchmarks only exist when `f128` support is enabled.
+    #[cfg(f128_enabled)]
+    {
+        cmp_f128_gt(&mut criterion);
+        cmp_f128_unord(&mut criterion);
+    }
+}
+
+criterion_main!(float_cmp);
diff --git a/library/compiler-builtins/builtins-test/benches/float_conv.rs b/library/compiler-builtins/builtins-test/benches/float_conv.rs
new file mode 100644
index 00000000000..d4a7346d1d5
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/benches/float_conv.rs
@@ -0,0 +1,688 @@
+#![allow(improper_ctypes)]
+#![cfg_attr(f128_enabled, feature(f128))]
+
+use builtins_test::float_bench;
+use compiler_builtins::float::conv;
+use criterion::{Criterion, criterion_main};
+
+/* unsigned int -> float */
+
+float_bench! {
+    name: conv_u32_f32,
+    sig: (a: u32) -> f32,
+    crate_fn: conv::__floatunsisf,
+    sys_fn: __floatunsisf,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            let ret: f32;
+            asm!(
+                "mov {tmp:e}, {a:e}",
+                "cvtsi2ss {ret}, {tmp}",
+                a = in(reg) a,
+                tmp = out(reg) _,
+                ret = lateout(xmm_reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f32;
+            asm!(
+                "ucvtf {ret:s}, {a:w}",
+                a = in(reg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: conv_u32_f64,
+    sig: (a: u32) -> f64,
+    crate_fn: conv::__floatunsidf,
+    sys_fn: __floatunsidf,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            let ret: f64;
+            asm!(
+                "mov {tmp:e}, {a:e}",
+                "cvtsi2sd {ret}, {tmp}",
+                a = in(reg) a,
+                tmp = out(reg) _,
+                ret = lateout(xmm_reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f64;
+            asm!(
+                "ucvtf {ret:d}, {a:w}",
+                a = in(reg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: conv_u32_f128,
+    sig: (a: u32) -> f128,
+    crate_fn: conv::__floatunsitf,
+    crate_fn_ppc: conv::__floatunsikf,
+    sys_fn: __floatunsitf,
+    sys_fn_ppc: __floatunsikf,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: []
+}
+
+float_bench! {
+    name: conv_u64_f32,
+    sig: (a: u64) -> f32,
+    crate_fn: conv::__floatundisf,
+    sys_fn: __floatundisf,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f32;
+            asm!(
+                "ucvtf {ret:s}, {a:x}",
+                a = in(reg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: conv_u64_f64,
+    sig: (a: u64) -> f64,
+    crate_fn: conv::__floatundidf,
+    sys_fn: __floatundidf,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f64;
+            asm!(
+                "ucvtf {ret:d}, {a:x}",
+                a = in(reg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: conv_u64_f128,
+    sig: (a: u64) -> f128,
+    crate_fn: conv::__floatunditf,
+    crate_fn_ppc: conv::__floatundikf,
+    sys_fn: __floatunditf,
+    sys_fn_ppc: __floatundikf,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: []
+}
+
+float_bench! {
+    name: conv_u128_f32,
+    sig: (a: u128) -> f32,
+    crate_fn: conv::__floatuntisf,
+    sys_fn: __floatuntisf,
+    sys_available: all(),
+    asm: []
+}
+
+float_bench! {
+    name: conv_u128_f64,
+    sig: (a: u128) -> f64,
+    crate_fn: conv::__floatuntidf,
+    sys_fn: __floatuntidf,
+    sys_available: all(),
+    asm: []
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: conv_u128_f128,
+    sig: (a: u128) -> f128,
+    crate_fn: conv::__floatuntitf,
+    crate_fn_ppc: conv::__floatuntikf,
+    sys_fn: __floatuntitf,
+    sys_fn_ppc: __floatuntikf,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: []
+}
+
+/* signed int -> float */
+
+float_bench! {
+    name: conv_i32_f32,
+    sig: (a: i32) -> f32,
+    crate_fn: conv::__floatsisf,
+    sys_fn: __floatsisf,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            let ret: f32;
+            asm!(
+                "cvtsi2ss    {ret}, {a:e}",
+                a = in(reg) a,
+                ret = lateout(xmm_reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f32;
+            asm!(
+                "scvtf {ret:s}, {a:w}",
+                a = in(reg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: conv_i32_f64,
+    sig: (a: i32) -> f64,
+    crate_fn: conv::__floatsidf,
+    sys_fn: __floatsidf,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            let ret: f64;
+            asm!(
+                "cvtsi2sd    {ret}, {a:e}",
+                a = in(reg) a,
+                ret = lateout(xmm_reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+
+
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f64;
+            asm!(
+                "scvtf {ret:d}, {a:w}",
+                a = in(reg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: conv_i32_f128,
+    sig: (a: i32) -> f128,
+    crate_fn: conv::__floatsitf,
+    crate_fn_ppc: conv::__floatsikf,
+    sys_fn: __floatsitf,
+    sys_fn_ppc: __floatsikf,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: []
+}
+
+float_bench! {
+    name: conv_i64_f32,
+    sig: (a: i64) -> f32,
+    crate_fn: conv::__floatdisf,
+    sys_fn: __floatdisf,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            let ret: f32;
+            asm!(
+                "cvtsi2ss    {ret}, {a:r}",
+                a = in(reg) a,
+                ret = lateout(xmm_reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f32;
+            asm!(
+                "scvtf {ret:s}, {a:x}",
+                a = in(reg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: conv_i64_f64,
+    sig: (a: i64) -> f64,
+    crate_fn: conv::__floatdidf,
+    sys_fn: __floatdidf,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            let ret: f64;
+            asm!(
+                "cvtsi2sd    {ret}, {a:r}",
+                a = in(reg) a,
+                ret = lateout(xmm_reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+
+
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f64;
+            asm!(
+                "scvtf {ret:d}, {a:x}",
+                a = in(reg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: conv_i64_f128,
+    sig: (a: i64) -> f128,
+    crate_fn: conv::__floatditf,
+    crate_fn_ppc: conv::__floatdikf,
+    sys_fn: __floatditf,
+    sys_fn_ppc: __floatdikf,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: []
+}
+
+float_bench! {
+    name: conv_i128_f32,
+    sig: (a: i128) -> f32,
+    crate_fn: conv::__floattisf,
+    sys_fn: __floattisf,
+    sys_available: all(),
+    asm: []
+}
+
+float_bench! {
+    name: conv_i128_f64,
+    sig: (a: i128) -> f64,
+    crate_fn: conv::__floattidf,
+    sys_fn: __floattidf,
+    sys_available: all(),
+    asm: []
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: conv_i128_f128,
+    sig: (a: i128) -> f128,
+    crate_fn: conv::__floattitf,
+    crate_fn_ppc: conv::__floattikf,
+    sys_fn: __floattitf,
+    sys_fn_ppc: __floattikf,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: []
+}
+
+/* float -> unsigned int */
+
+#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
+float_bench! {
+    name: conv_f32_u32,
+    sig: (a: f32) -> u32,
+    crate_fn: conv::__fixunssfsi,
+    sys_fn: __fixunssfsi,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: u32;
+            asm!(
+                "fcvtzu {ret:w}, {a:s}",
+                a = in(vreg) a,
+                ret = lateout(reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
+float_bench! {
+    name: conv_f32_u64,
+    sig: (a: f32) -> u64,
+    crate_fn: conv::__fixunssfdi,
+    sys_fn: __fixunssfdi,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: u64;
+            asm!(
+                "fcvtzu {ret:x}, {a:s}",
+                a = in(vreg) a,
+                ret = lateout(reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
+float_bench! {
+    name: conv_f32_u128,
+    sig: (a: f32) -> u128,
+    crate_fn: conv::__fixunssfti,
+    sys_fn: __fixunssfti,
+    sys_available: all(),
+    asm: []
+}
+
+float_bench! {
+    name: conv_f64_u32,
+    sig: (a: f64) -> u32,
+    crate_fn: conv::__fixunsdfsi,
+    sys_fn: __fixunsdfsi,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: u32;
+            asm!(
+                "fcvtzu {ret:w}, {a:d}",
+                a = in(vreg) a,
+                ret = lateout(reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: conv_f64_u64,
+    sig: (a: f64) -> u64,
+    crate_fn: conv::__fixunsdfdi,
+    sys_fn: __fixunsdfdi,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: u64;
+            asm!(
+                "fcvtzu {ret:x}, {a:d}",
+                a = in(vreg) a,
+                ret = lateout(reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: conv_f64_u128,
+    sig: (a: f64) -> u128,
+    crate_fn: conv::__fixunsdfti,
+    sys_fn: __fixunsdfti,
+    sys_available: all(),
+    asm: []
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: conv_f128_u32,
+    sig: (a: f128) -> u32,
+    crate_fn: conv::__fixunstfsi,
+    crate_fn_ppc: conv::__fixunskfsi,
+    sys_fn: __fixunstfsi,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: []
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: conv_f128_u64,
+    sig: (a: f128) -> u64,
+    crate_fn: conv::__fixunstfdi,
+    crate_fn_ppc: conv::__fixunskfdi,
+    sys_fn: __fixunstfdi,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: []
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: conv_f128_u128,
+    sig: (a: f128) -> u128,
+    crate_fn: conv::__fixunstfti,
+    crate_fn_ppc: conv::__fixunskfti,
+    sys_fn: __fixunstfti,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: []
+}
+
+/* float -> signed int */
+
+#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
+float_bench! {
+    name: conv_f32_i32,
+    sig: (a: f32) -> i32,
+    crate_fn: conv::__fixsfsi,
+    sys_fn: __fixsfsi,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: i32;
+            asm!(
+                "fcvtzs {ret:w}, {a:s}",
+                a = in(vreg) a,
+                ret = lateout(reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
+float_bench! {
+    name: conv_f32_i64,
+    sig: (a: f32) -> i64,
+    crate_fn: conv::__fixsfdi,
+    sys_fn: __fixsfdi,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: i64;
+            asm!(
+                "fcvtzs {ret:x}, {a:s}",
+                a = in(vreg) a,
+                ret = lateout(reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
+float_bench! {
+    name: conv_f32_i128,
+    sig: (a: f32) -> i128,
+    crate_fn: conv::__fixsfti,
+    sys_fn: __fixsfti,
+    sys_available: all(),
+    asm: []
+}
+
+float_bench! {
+    name: conv_f64_i32,
+    sig: (a: f64) -> i32,
+    crate_fn: conv::__fixdfsi,
+    sys_fn: __fixdfsi,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: i32;
+            asm!(
+                "fcvtzs {ret:w}, {a:d}",
+                a = in(vreg) a,
+                ret = lateout(reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: conv_f64_i64,
+    sig: (a: f64) -> i64,
+    crate_fn: conv::__fixdfdi,
+    sys_fn: __fixdfdi,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: i64;
+            asm!(
+                "fcvtzs {ret:x}, {a:d}",
+                a = in(vreg) a,
+                ret = lateout(reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: conv_f64_i128,
+    sig: (a: f64) -> i128,
+    crate_fn: conv::__fixdfti,
+    sys_fn: __fixdfti,
+    sys_available: all(),
+    asm: []
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: conv_f128_i32,
+    sig: (a: f128) -> i32,
+    crate_fn: conv::__fixtfsi,
+    crate_fn_ppc: conv::__fixkfsi,
+    sys_fn: __fixtfsi,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: []
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: conv_f128_i64,
+    sig: (a: f128) -> i64,
+    crate_fn: conv::__fixtfdi,
+    crate_fn_ppc: conv::__fixkfdi,
+    sys_fn: __fixtfdi,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: []
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: conv_f128_i128,
+    sig: (a: f128) -> i128,
+    crate_fn: conv::__fixtfti,
+    crate_fn_ppc: conv::__fixkfti,
+    sys_fn: __fixtfti,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: []
+}
+
+// Criterion entry point: runs the int<->float conversion benchmarks defined
+// by the `float_bench!` invocations above. The f32 source-operand benchmarks
+// (`conv_f32_*`) are gated out on ppc64le at their definitions and are not
+// invoked here.
+pub fn float_conv() {
+    let mut criterion = Criterion::default().configure_from_args();
+
+    conv_u32_f32(&mut criterion);
+    conv_u32_f64(&mut criterion);
+    conv_u64_f32(&mut criterion);
+    conv_u64_f64(&mut criterion);
+    conv_u128_f32(&mut criterion);
+    conv_u128_f64(&mut criterion);
+    conv_i32_f32(&mut criterion);
+    conv_i32_f64(&mut criterion);
+    conv_i64_f32(&mut criterion);
+    conv_i64_f64(&mut criterion);
+    conv_i128_f32(&mut criterion);
+    conv_i128_f64(&mut criterion);
+    conv_f64_u32(&mut criterion);
+    conv_f64_u64(&mut criterion);
+    conv_f64_u128(&mut criterion);
+    conv_f64_i32(&mut criterion);
+    conv_f64_i64(&mut criterion);
+    conv_f64_i128(&mut criterion);
+
+    #[cfg(f128_enabled)]
+    // FIXME: ppc64le has a sporadic overflow panic in the crate functions
+    // <https://github.com/rust-lang/compiler-builtins/issues/617#issuecomment-2125914639>
+    #[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
+    {
+        conv_u32_f128(&mut criterion);
+        conv_u64_f128(&mut criterion);
+        conv_u128_f128(&mut criterion);
+        conv_i32_f128(&mut criterion);
+        conv_i64_f128(&mut criterion);
+        conv_i128_f128(&mut criterion);
+        conv_f128_u32(&mut criterion);
+        conv_f128_u64(&mut criterion);
+        conv_f128_u128(&mut criterion);
+        conv_f128_i32(&mut criterion);
+        conv_f128_i64(&mut criterion);
+        conv_f128_i128(&mut criterion);
+    }
+}
+
+criterion_main!(float_conv);
diff --git a/library/compiler-builtins/builtins-test/benches/float_div.rs b/library/compiler-builtins/builtins-test/benches/float_div.rs
new file mode 100644
index 00000000000..d5b0ad0fd40
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/benches/float_div.rs
@@ -0,0 +1,93 @@
+#![cfg_attr(f128_enabled, feature(f128))]
+
+use builtins_test::float_bench;
+use compiler_builtins::float::div;
+use criterion::{Criterion, criterion_main};
+
+float_bench! {
+    name: div_f32,
+    sig: (a: f32, b: f32) -> f32,
+    crate_fn: div::__divsf3,
+    sys_fn: __divsf3,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            asm!(
+                "divss {a}, {b}",
+                a = inout(xmm_reg) a,
+                b = in(xmm_reg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            asm!(
+                "fdiv {a:s}, {a:s}, {b:s}",
+                a = inout(vreg) a,
+                b = in(vreg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+    ],
+}
+
+float_bench! {
+    name: div_f64,
+    sig: (a: f64, b: f64) -> f64,
+    crate_fn: div::__divdf3,
+    sys_fn: __divdf3,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            asm!(
+                "divsd {a}, {b}",
+                a = inout(xmm_reg) a,
+                b = in(xmm_reg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            asm!(
+                "fdiv {a:d}, {a:d}, {b:d}",
+                a = inout(vreg) a,
+                b = in(vreg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+    ],
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: div_f128,
+    sig: (a: f128, b: f128) -> f128,
+    crate_fn: div::__divtf3,
+    crate_fn_ppc: div::__divkf3,
+    sys_fn: __divtf3,
+    sys_fn_ppc: __divkf3,
+    sys_available: not(feature = "no-sys-f128"),
+    asm: []
+}
+
+pub fn float_div() {
+    let mut criterion = Criterion::default().configure_from_args();
+
+    div_f32(&mut criterion);
+    div_f64(&mut criterion);
+
+    #[cfg(f128_enabled)]
+    {
+        div_f128(&mut criterion);
+    }
+}
+
+criterion_main!(float_div);
diff --git a/library/compiler-builtins/builtins-test/benches/float_extend.rs b/library/compiler-builtins/builtins-test/benches/float_extend.rs
new file mode 100644
index 00000000000..fc44e80c9e1
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/benches/float_extend.rs
@@ -0,0 +1,133 @@
+#![allow(unused_variables)] // "unused" f16 registers
+#![cfg_attr(f128_enabled, feature(f128))]
+#![cfg_attr(f16_enabled, feature(f16))]
+
+use builtins_test::float_bench;
+use compiler_builtins::float::extend;
+use criterion::{Criterion, criterion_main};
+
+#[cfg(f16_enabled)]
+float_bench! {
+    name: extend_f16_f32,
+    sig: (a: f16) -> f32,
+    crate_fn: extend::__extendhfsf2,
+    sys_fn: __extendhfsf2,
+    sys_available: not(feature = "no-sys-f16"),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f32;
+            asm!(
+                "fcvt    {ret:s}, {a:h}",
+                a = in(vreg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+#[cfg(f16_enabled)]
+float_bench! {
+    name: extend_f16_f64,
+    sig: (a: f16) -> f64,
+    crate_fn: extend::__extendhfdf2,
+    sys_fn: __extendhfdf2,
+    sys_available: not(feature = "no-sys-f16-f64-convert"),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f64;
+            asm!(
+                "fcvt    {ret:d}, {a:h}",
+                a = in(vreg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+#[cfg(all(f16_enabled, f128_enabled))]
+float_bench! {
+    name: extend_f16_f128,
+    sig: (a: f16) -> f128,
+    crate_fn: extend::__extendhftf2,
+    crate_fn_ppc: extend::__extendhfkf2,
+    sys_fn: __extendhftf2,
+    sys_fn_ppc: __extendhfkf2,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: [],
+}
+
+float_bench! {
+    name: extend_f32_f64,
+    sig: (a: f32) -> f64,
+    crate_fn: extend::__extendsfdf2,
+    sys_fn: __extendsfdf2,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f64;
+            asm!(
+                "fcvt    {ret:d}, {a:s}",
+                a = in(vreg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: extend_f32_f128,
+    sig: (a: f32) -> f128,
+    crate_fn: extend::__extendsftf2,
+    crate_fn_ppc: extend::__extendsfkf2,
+    sys_fn: __extendsftf2,
+    sys_fn_ppc: __extendsfkf2,
+    sys_available: not(feature = "no-sys-f128"),
+    asm: [],
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: extend_f64_f128,
+    sig: (a: f64) -> f128,
+    crate_fn: extend::__extenddftf2,
+    crate_fn_ppc: extend::__extenddfkf2,
+    sys_fn: __extenddftf2,
+    sys_fn_ppc: __extenddfkf2,
+    sys_available: not(feature = "no-sys-f128"),
+    asm: [],
+}
+
+pub fn float_extend() {
+    let mut criterion = Criterion::default().configure_from_args();
+
+    // FIXME(#655): `f16` tests disabled until we can bootstrap symbols
+    #[cfg(f16_enabled)]
+    #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+    {
+        extend_f16_f32(&mut criterion);
+        extend_f16_f64(&mut criterion);
+
+        #[cfg(f128_enabled)]
+        extend_f16_f128(&mut criterion);
+    }
+
+    extend_f32_f64(&mut criterion);
+
+    #[cfg(f128_enabled)]
+    {
+        extend_f32_f128(&mut criterion);
+        extend_f64_f128(&mut criterion);
+    }
+}
+
+criterion_main!(float_extend);
diff --git a/library/compiler-builtins/builtins-test/benches/float_mul.rs b/library/compiler-builtins/builtins-test/benches/float_mul.rs
new file mode 100644
index 00000000000..a7a2d34aa04
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/benches/float_mul.rs
@@ -0,0 +1,93 @@
+#![cfg_attr(f128_enabled, feature(f128))]
+
+use builtins_test::float_bench;
+use compiler_builtins::float::mul;
+use criterion::{Criterion, criterion_main};
+
+float_bench! {
+    name: mul_f32,
+    sig: (a: f32, b: f32) -> f32,
+    crate_fn: mul::__mulsf3,
+    sys_fn: __mulsf3,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            asm!(
+                "mulss {a}, {b}",
+                a = inout(xmm_reg) a,
+                b = in(xmm_reg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            asm!(
+                "fmul {a:s}, {a:s}, {b:s}",
+                a = inout(vreg) a,
+                b = in(vreg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+    ],
+}
+
+float_bench! {
+    name: mul_f64,
+    sig: (a: f64, b: f64) -> f64,
+    crate_fn: mul::__muldf3,
+    sys_fn: __muldf3,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            asm!(
+                "mulsd {a}, {b}",
+                a = inout(xmm_reg) a,
+                b = in(xmm_reg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            asm!(
+                "fmul {a:d}, {a:d}, {b:d}",
+                a = inout(vreg) a,
+                b = in(vreg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+    ],
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: mul_f128,
+    sig: (a: f128, b: f128) -> f128,
+    crate_fn: mul::__multf3,
+    crate_fn_ppc: mul::__mulkf3,
+    sys_fn: __multf3,
+    sys_fn_ppc: __mulkf3,
+    sys_available: not(feature = "no-sys-f128"),
+    asm: []
+}
+
+pub fn float_mul() {
+    let mut criterion = Criterion::default().configure_from_args();
+
+    mul_f32(&mut criterion);
+    mul_f64(&mut criterion);
+
+    #[cfg(f128_enabled)]
+    {
+        mul_f128(&mut criterion);
+    }
+}
+
+criterion_main!(float_mul);
diff --git a/library/compiler-builtins/builtins-test/benches/float_pow.rs b/library/compiler-builtins/builtins-test/benches/float_pow.rs
new file mode 100644
index 00000000000..64e37dd3241
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/benches/float_pow.rs
@@ -0,0 +1,49 @@
+#![cfg_attr(f128_enabled, feature(f128))]
+
+use builtins_test::float_bench;
+use compiler_builtins::float::pow;
+use criterion::{Criterion, criterion_main};
+
+float_bench! {
+    name: powi_f32,
+    sig: (a: f32, b: i32) -> f32,
+    crate_fn: pow::__powisf2,
+    sys_fn: __powisf2,
+    sys_available: all(),
+    asm: [],
+}
+
+float_bench! {
+    name: powi_f64,
+    sig: (a: f64, b: i32) -> f64,
+    crate_fn: pow::__powidf2,
+    sys_fn: __powidf2,
+    sys_available: all(),
+    asm: [],
+}
+
+// FIXME(f16_f128): can be changed to only `f128_enabled` once `__multf3` and `__divtf3` are
+// distributed by nightly.
+#[cfg(all(f128_enabled, not(feature = "no-sys-f128")))]
+float_bench! {
+    name: powi_f128,
+    sig: (a: f128, b: i32) -> f128,
+    crate_fn: pow::__powitf2,
+    crate_fn_ppc: pow::__powikf2,
+    sys_fn: __powitf2,
+    sys_fn_ppc: __powikf2,
+    sys_available: not(feature = "no-sys-f128"),
+    asm: []
+}
+
+pub fn float_pow() {
+    let mut criterion = Criterion::default().configure_from_args();
+
+    powi_f32(&mut criterion);
+    powi_f64(&mut criterion);
+
+    #[cfg(all(f128_enabled, not(feature = "no-sys-f128")))]
+    powi_f128(&mut criterion);
+}
+
+criterion_main!(float_pow);
diff --git a/library/compiler-builtins/builtins-test/benches/float_sub.rs b/library/compiler-builtins/builtins-test/benches/float_sub.rs
new file mode 100644
index 00000000000..8bae294cd56
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/benches/float_sub.rs
@@ -0,0 +1,93 @@
+#![cfg_attr(f128_enabled, feature(f128))]
+
+use builtins_test::float_bench;
+use compiler_builtins::float::sub;
+use criterion::{Criterion, criterion_main};
+
+float_bench! {
+    name: sub_f32,
+    sig: (a: f32, b: f32) -> f32,
+    crate_fn: sub::__subsf3,
+    sys_fn: __subsf3,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            asm!(
+                "subss {a}, {b}",
+                a = inout(xmm_reg) a,
+                b = in(xmm_reg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            asm!(
+                "fsub {a:s}, {a:s}, {b:s}",
+                a = inout(vreg) a,
+                b = in(vreg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+    ],
+}
+
+float_bench! {
+    name: sub_f64,
+    sig: (a: f64, b: f64) -> f64,
+    crate_fn: sub::__subdf3,
+    sys_fn: __subdf3,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            asm!(
+                "subsd {a}, {b}",
+                a = inout(xmm_reg) a,
+                b = in(xmm_reg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            asm!(
+                "fsub {a:d}, {a:d}, {b:d}",
+                a = inout(vreg) a,
+                b = in(vreg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+    ],
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: sub_f128,
+    sig: (a: f128, b: f128) -> f128,
+    crate_fn: sub::__subtf3,
+    crate_fn_ppc: sub::__subkf3,
+    sys_fn: __subtf3,
+    sys_fn_ppc: __subkf3,
+    sys_available: not(feature = "no-sys-f128"),
+    asm: []
+}
+
+pub fn float_sub() {
+    let mut criterion = Criterion::default().configure_from_args();
+
+    sub_f32(&mut criterion);
+    sub_f64(&mut criterion);
+
+    #[cfg(f128_enabled)]
+    {
+        sub_f128(&mut criterion);
+    }
+}
+
+criterion_main!(float_sub);
diff --git a/library/compiler-builtins/builtins-test/benches/float_trunc.rs b/library/compiler-builtins/builtins-test/benches/float_trunc.rs
new file mode 100644
index 00000000000..43310c7cfc8
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/benches/float_trunc.rs
@@ -0,0 +1,146 @@
+#![cfg_attr(f128_enabled, feature(f128))]
+#![cfg_attr(f16_enabled, feature(f16))]
+
+use builtins_test::float_bench;
+use compiler_builtins::float::trunc;
+use criterion::{Criterion, criterion_main};
+
+#[cfg(f16_enabled)]
+float_bench! {
+    name: trunc_f32_f16,
+    sig: (a: f32) -> f16,
+    crate_fn: trunc::__truncsfhf2,
+    sys_fn: __truncsfhf2,
+    sys_available: not(feature = "no-sys-f16"),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f16;
+            asm!(
+                "fcvt    {ret:h}, {a:s}",
+                a = in(vreg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+#[cfg(f16_enabled)]
+float_bench! {
+    name: trunc_f64_f16,
+    sig: (a: f64) -> f16,
+    crate_fn: trunc::__truncdfhf2,
+    sys_fn: __truncdfhf2,
+    sys_available: not(feature = "no-sys-f16-f64-convert"),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f16;
+            asm!(
+                "fcvt    {ret:h}, {a:d}",
+                a = in(vreg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: trunc_f64_f32,
+    sig: (a: f64) -> f32,
+    crate_fn: trunc::__truncdfsf2,
+    sys_fn: __truncdfsf2,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            let ret: f32;
+            asm!(
+                "cvtsd2ss {ret}, {a}",
+                a = in(xmm_reg) a,
+                ret = lateout(xmm_reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f32;
+            asm!(
+                "fcvt    {ret:s}, {a:d}",
+                a = in(vreg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+#[cfg(all(f16_enabled, f128_enabled))]
+float_bench! {
+    name: trunc_f128_f16,
+    sig: (a: f128) -> f16,
+    crate_fn: trunc::__trunctfhf2,
+    crate_fn_ppc: trunc::__trunckfhf2,
+    sys_fn: __trunctfhf2,
+    sys_fn_ppc: __trunckfhf2,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: [],
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: trunc_f128_f32,
+    sig: (a: f128) -> f32,
+    crate_fn: trunc::__trunctfsf2,
+    crate_fn_ppc: trunc::__trunckfsf2,
+    sys_fn: __trunctfsf2,
+    sys_fn_ppc: __trunckfsf2,
+    sys_available: not(feature = "no-sys-f128"),
+    asm: [],
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: trunc_f128_f64,
+    sig: (a: f128) -> f64,
+    crate_fn: trunc::__trunctfdf2,
+    crate_fn_ppc: trunc::__trunckfdf2,
+    sys_fn: __trunctfdf2,
+    sys_fn_ppc: __trunckfdf2,
+    sys_available: not(feature = "no-sys-f128"),
+    asm: [],
+}
+
+pub fn float_trunc() {
+    let mut criterion = Criterion::default().configure_from_args();
+
+    // FIXME(#655): `f16` tests disabled until we can bootstrap symbols
+    #[cfg(f16_enabled)]
+    #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+    {
+        trunc_f32_f16(&mut criterion);
+        trunc_f64_f16(&mut criterion);
+    }
+
+    trunc_f64_f32(&mut criterion);
+
+    #[cfg(f128_enabled)]
+    {
+        // FIXME(#655): `f16` tests disabled until we can bootstrap symbols
+        #[cfg(f16_enabled)]
+        #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+        trunc_f128_f16(&mut criterion);
+
+        trunc_f128_f32(&mut criterion);
+        trunc_f128_f64(&mut criterion);
+    }
+}
+
+criterion_main!(float_trunc);
diff --git a/library/compiler-builtins/builtins-test/benches/mem.rs b/library/compiler-builtins/builtins-test/benches/mem.rs
new file mode 100644
index 00000000000..3f83926b6c5
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/benches/mem.rs
@@ -0,0 +1,364 @@
+#![feature(test)]
+
+extern crate test;
+use test::{Bencher, black_box};
+
+extern crate compiler_builtins;
+use compiler_builtins::mem::{memcmp, memcpy, memmove, memset};
+
+const WORD_SIZE: usize = core::mem::size_of::<usize>();
+
+struct AlignedVec {
+    vec: Vec<usize>,
+    size: usize,
+}
+
+impl AlignedVec {
+    fn new(fill: u8, size: usize) -> Self {
+        let mut broadcast = fill as usize;
+        let mut bits = 8;
+        while bits < WORD_SIZE * 8 {
+            broadcast |= broadcast << bits;
+            bits *= 2;
+        }
+
+        let vec = vec![broadcast; (size + WORD_SIZE - 1) & !WORD_SIZE];
+        AlignedVec { vec, size }
+    }
+}
+
+impl core::ops::Deref for AlignedVec {
+    type Target = [u8];
+    fn deref(&self) -> &[u8] {
+        unsafe { core::slice::from_raw_parts(self.vec.as_ptr() as *const u8, self.size) }
+    }
+}
+
+impl core::ops::DerefMut for AlignedVec {
+    fn deref_mut(&mut self) -> &mut [u8] {
+        unsafe { core::slice::from_raw_parts_mut(self.vec.as_mut_ptr() as *mut u8, self.size) }
+    }
+}
+
+fn memcpy_builtin(b: &mut Bencher, n: usize, offset1: usize, offset2: usize) {
+    let v1 = AlignedVec::new(1, n + offset1);
+    let mut v2 = AlignedVec::new(0, n + offset2);
+    b.bytes = n as u64;
+    b.iter(|| {
+        let src: &[u8] = black_box(&v1[offset1..]);
+        let dst: &mut [u8] = black_box(&mut v2[offset2..]);
+        dst.copy_from_slice(src);
+    })
+}
+
+fn memcpy_rust(b: &mut Bencher, n: usize, offset1: usize, offset2: usize) {
+    let v1 = AlignedVec::new(1, n + offset1);
+    let mut v2 = AlignedVec::new(0, n + offset2);
+    b.bytes = n as u64;
+    b.iter(|| {
+        let src: &[u8] = black_box(&v1[offset1..]);
+        let dst: &mut [u8] = black_box(&mut v2[offset2..]);
+        unsafe { memcpy(dst.as_mut_ptr(), src.as_ptr(), n) }
+    })
+}
+
+fn memset_builtin(b: &mut Bencher, n: usize, offset: usize) {
+    let mut v1 = AlignedVec::new(0, n + offset);
+    b.bytes = n as u64;
+    b.iter(|| {
+        let dst: &mut [u8] = black_box(&mut v1[offset..]);
+        let val: u8 = black_box(27);
+        for b in dst {
+            *b = val;
+        }
+    })
+}
+
+fn memset_rust(b: &mut Bencher, n: usize, offset: usize) {
+    let mut v1 = AlignedVec::new(0, n + offset);
+    b.bytes = n as u64;
+    b.iter(|| {
+        let dst: &mut [u8] = black_box(&mut v1[offset..]);
+        let val = black_box(27);
+        unsafe { memset(dst.as_mut_ptr(), val, n) }
+    })
+}
+
+fn memcmp_builtin(b: &mut Bencher, n: usize) {
+    let v1 = AlignedVec::new(0, n);
+    let mut v2 = AlignedVec::new(0, n);
+    v2[n - 1] = 1;
+    b.bytes = n as u64;
+    b.iter(|| {
+        let s1: &[u8] = black_box(&v1);
+        let s2: &[u8] = black_box(&v2);
+        s1.cmp(s2)
+    })
+}
+
+fn memcmp_builtin_unaligned(b: &mut Bencher, n: usize) {
+    let v1 = AlignedVec::new(0, n);
+    let mut v2 = AlignedVec::new(0, n);
+    v2[n - 1] = 1;
+    b.bytes = n as u64;
+    b.iter(|| {
+        let s1: &[u8] = black_box(&v1[0..]);
+        let s2: &[u8] = black_box(&v2[1..]);
+        s1.cmp(s2)
+    })
+}
+
+fn memcmp_rust(b: &mut Bencher, n: usize) {
+    let v1 = AlignedVec::new(0, n);
+    let mut v2 = AlignedVec::new(0, n);
+    v2[n - 1] = 1;
+    b.bytes = n as u64;
+    b.iter(|| {
+        let s1: &[u8] = black_box(&v1);
+        let s2: &[u8] = black_box(&v2);
+        unsafe { memcmp(s1.as_ptr(), s2.as_ptr(), n) }
+    })
+}
+
+fn memcmp_rust_unaligned(b: &mut Bencher, n: usize) {
+    let v1 = AlignedVec::new(0, n);
+    let mut v2 = AlignedVec::new(0, n);
+    v2[n - 1] = 1;
+    b.bytes = n as u64;
+    b.iter(|| {
+        let s1: &[u8] = black_box(&v1[0..]);
+        let s2: &[u8] = black_box(&v2[1..]);
+        unsafe { memcmp(s1.as_ptr(), s2.as_ptr(), n - 1) }
+    })
+}
+
+fn memmove_builtin(b: &mut Bencher, n: usize, offset: usize) {
+    let mut v = AlignedVec::new(0, n + n / 2 + offset);
+    b.bytes = n as u64;
+    b.iter(|| {
+        let s: &mut [u8] = black_box(&mut v);
+        s.copy_within(0..n, n / 2 + offset);
+    })
+}
+
+fn memmove_rust(b: &mut Bencher, n: usize, offset: usize) {
+    let mut v = AlignedVec::new(0, n + n / 2 + offset);
+    b.bytes = n as u64;
+    b.iter(|| {
+        let dst: *mut u8 = black_box(&mut v[n / 2 + offset..]).as_mut_ptr();
+        let src: *const u8 = black_box(&v).as_ptr();
+        unsafe { memmove(dst, src, n) };
+    })
+}
+
+#[bench]
+fn memcpy_builtin_4096(b: &mut Bencher) {
+    memcpy_builtin(b, 4096, 0, 0)
+}
+#[bench]
+fn memcpy_rust_4096(b: &mut Bencher) {
+    memcpy_rust(b, 4096, 0, 0)
+}
+#[bench]
+fn memcpy_builtin_1048576(b: &mut Bencher) {
+    memcpy_builtin(b, 1048576, 0, 0)
+}
+#[bench]
+fn memcpy_rust_1048576(b: &mut Bencher) {
+    memcpy_rust(b, 1048576, 0, 0)
+}
+#[bench]
+fn memcpy_builtin_4096_offset(b: &mut Bencher) {
+    memcpy_builtin(b, 4096, 65, 65)
+}
+#[bench]
+fn memcpy_rust_4096_offset(b: &mut Bencher) {
+    memcpy_rust(b, 4096, 65, 65)
+}
+#[bench]
+fn memcpy_builtin_1048576_offset(b: &mut Bencher) {
+    memcpy_builtin(b, 1048576, 65, 65)
+}
+#[bench]
+fn memcpy_rust_1048576_offset(b: &mut Bencher) {
+    memcpy_rust(b, 1048576, 65, 65)
+}
+#[bench]
+fn memcpy_builtin_4096_misalign(b: &mut Bencher) {
+    memcpy_builtin(b, 4096, 65, 66)
+}
+#[bench]
+fn memcpy_rust_4096_misalign(b: &mut Bencher) {
+    memcpy_rust(b, 4096, 65, 66)
+}
+#[bench]
+fn memcpy_builtin_1048576_misalign(b: &mut Bencher) {
+    memcpy_builtin(b, 1048576, 65, 66)
+}
+#[bench]
+fn memcpy_rust_1048576_misalign(b: &mut Bencher) {
+    memcpy_rust(b, 1048576, 65, 66)
+}
+
+#[bench]
+fn memset_builtin_4096(b: &mut Bencher) {
+    memset_builtin(b, 4096, 0)
+}
+#[bench]
+fn memset_rust_4096(b: &mut Bencher) {
+    memset_rust(b, 4096, 0)
+}
+#[bench]
+fn memset_builtin_1048576(b: &mut Bencher) {
+    memset_builtin(b, 1048576, 0)
+}
+#[bench]
+fn memset_rust_1048576(b: &mut Bencher) {
+    memset_rust(b, 1048576, 0)
+}
+#[bench]
+fn memset_builtin_4096_offset(b: &mut Bencher) {
+    memset_builtin(b, 4096, 65)
+}
+#[bench]
+fn memset_rust_4096_offset(b: &mut Bencher) {
+    memset_rust(b, 4096, 65)
+}
+#[bench]
+fn memset_builtin_1048576_offset(b: &mut Bencher) {
+    memset_builtin(b, 1048576, 65)
+}
+#[bench]
+fn memset_rust_1048576_offset(b: &mut Bencher) {
+    memset_rust(b, 1048576, 65)
+}
+
+#[bench]
+fn memcmp_builtin_8(b: &mut Bencher) {
+    memcmp_builtin(b, 8)
+}
+#[bench]
+fn memcmp_rust_8(b: &mut Bencher) {
+    memcmp_rust(b, 8)
+}
+#[bench]
+fn memcmp_builtin_16(b: &mut Bencher) {
+    memcmp_builtin(b, 16)
+}
+#[bench]
+fn memcmp_rust_16(b: &mut Bencher) {
+    memcmp_rust(b, 16)
+}
+#[bench]
+fn memcmp_builtin_32(b: &mut Bencher) {
+    memcmp_builtin(b, 32)
+}
+#[bench]
+fn memcmp_rust_32(b: &mut Bencher) {
+    memcmp_rust(b, 32)
+}
+#[bench]
+fn memcmp_builtin_64(b: &mut Bencher) {
+    memcmp_builtin(b, 64)
+}
+#[bench]
+fn memcmp_rust_64(b: &mut Bencher) {
+    memcmp_rust(b, 64)
+}
+#[bench]
+fn memcmp_builtin_4096(b: &mut Bencher) {
+    memcmp_builtin(b, 4096)
+}
+#[bench]
+fn memcmp_rust_4096(b: &mut Bencher) {
+    memcmp_rust(b, 4096)
+}
+#[bench]
+fn memcmp_builtin_1048576(b: &mut Bencher) {
+    memcmp_builtin(b, 1048576)
+}
+#[bench]
+fn memcmp_rust_1048576(b: &mut Bencher) {
+    memcmp_rust(b, 1048576)
+}
+#[bench]
+fn memcmp_builtin_unaligned_7(b: &mut Bencher) {
+    memcmp_builtin_unaligned(b, 8)
+}
+#[bench]
+fn memcmp_rust_unaligned_7(b: &mut Bencher) {
+    memcmp_rust_unaligned(b, 8)
+}
+#[bench]
+fn memcmp_builtin_unaligned_15(b: &mut Bencher) {
+    memcmp_builtin_unaligned(b, 16)
+}
+#[bench]
+fn memcmp_rust_unaligned_15(b: &mut Bencher) {
+    memcmp_rust_unaligned(b, 16)
+}
+#[bench]
+fn memcmp_builtin_unaligned_31(b: &mut Bencher) {
+    memcmp_builtin_unaligned(b, 32)
+}
+#[bench]
+fn memcmp_rust_unaligned_31(b: &mut Bencher) {
+    memcmp_rust_unaligned(b, 32)
+}
+#[bench]
+fn memcmp_builtin_unaligned_63(b: &mut Bencher) {
+    memcmp_builtin_unaligned(b, 64)
+}
+#[bench]
+fn memcmp_rust_unaligned_63(b: &mut Bencher) {
+    memcmp_rust_unaligned(b, 64)
+}
+#[bench]
+fn memcmp_builtin_unaligned_4095(b: &mut Bencher) {
+    memcmp_builtin_unaligned(b, 4096)
+}
+#[bench]
+fn memcmp_rust_unaligned_4095(b: &mut Bencher) {
+    memcmp_rust_unaligned(b, 4096)
+}
+#[bench]
+fn memcmp_builtin_unaligned_1048575(b: &mut Bencher) {
+    memcmp_builtin_unaligned(b, 1048576)
+}
+#[bench]
+fn memcmp_rust_unaligned_1048575(b: &mut Bencher) {
+    memcmp_rust_unaligned(b, 1048576)
+}
+
+#[bench]
+fn memmove_builtin_4096(b: &mut Bencher) {
+    memmove_builtin(b, 4096, 0)
+}
+#[bench]
+fn memmove_rust_4096(b: &mut Bencher) {
+    memmove_rust(b, 4096, 0)
+}
+#[bench]
+fn memmove_builtin_1048576(b: &mut Bencher) {
+    memmove_builtin(b, 1048576, 0)
+}
+#[bench]
+fn memmove_rust_1048576(b: &mut Bencher) {
+    memmove_rust(b, 1048576, 0)
+}
+#[bench]
+fn memmove_builtin_4096_misalign(b: &mut Bencher) {
+    memmove_builtin(b, 4096, 1)
+}
+#[bench]
+fn memmove_rust_4096_misalign(b: &mut Bencher) {
+    memmove_rust(b, 4096, 1)
+}
+#[bench]
+fn memmove_builtin_1048576_misalign(b: &mut Bencher) {
+    memmove_builtin(b, 1048576, 1)
+}
+#[bench]
+fn memmove_rust_1048576_misalign(b: &mut Bencher) {
+    memmove_rust(b, 1048576, 1)
+}
diff --git a/library/compiler-builtins/builtins-test/benches/mem_icount.rs b/library/compiler-builtins/builtins-test/benches/mem_icount.rs
new file mode 100644
index 00000000000..63045f6e1ec
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/benches/mem_icount.rs
@@ -0,0 +1,499 @@
+//! Benchmarks that use Callgrind (via `iai_callgrind`) to report instruction count metrics. This
+//! is stable enough to be tested in CI.
+
+use std::hint::black_box;
+use std::{ops, slice};
+
+use compiler_builtins::mem::{memcmp, memcpy, memmove, memset};
+use iai_callgrind::{library_benchmark, library_benchmark_group, main};
+
+const PAGE_SIZE: usize = 0x1000; // 4 kiB
+const MAX_ALIGN: usize = 512; // assume we may use avx512 operations one day
+const MEG1: usize = 1 << 20; // 1 MiB
+
+#[derive(Clone)]
+#[repr(C, align(0x1000))]
+struct Page([u8; PAGE_SIZE]);
+
+/// A buffer that is page-aligned by default, with an optional offset to create a
+/// misalignment.
+struct AlignedSlice {
+    buf: Box<[Page]>,
+    len: usize,
+    offset: usize,
+}
+
+impl AlignedSlice {
+    /// Allocate a slice aligned to ALIGN with at least `len` items, with `offset` from
+    /// page alignment.
+    fn new_zeroed(len: usize, offset: usize) -> Self {
+        assert!(offset < PAGE_SIZE);
+        let total_len = len + offset;
+        let items = (total_len / PAGE_SIZE) + if total_len % PAGE_SIZE > 0 { 1 } else { 0 };
+        let buf = vec![Page([0u8; PAGE_SIZE]); items].into_boxed_slice();
+        AlignedSlice { buf, len, offset }
+    }
+}
+
+impl ops::Deref for AlignedSlice {
+    type Target = [u8];
+    fn deref(&self) -> &Self::Target {
+        unsafe { slice::from_raw_parts(self.buf.as_ptr().cast::<u8>().add(self.offset), self.len) }
+    }
+}
+
+impl ops::DerefMut for AlignedSlice {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        unsafe {
+            slice::from_raw_parts_mut(
+                self.buf.as_mut_ptr().cast::<u8>().add(self.offset),
+                self.len,
+            )
+        }
+    }
+}
+
+mod mcpy {
+    use super::*;
+
+    struct Cfg {
+        len: usize,
+        s_off: usize,
+        d_off: usize,
+    }
+
+    fn setup(cfg: Cfg) -> (usize, AlignedSlice, AlignedSlice) {
+        let Cfg { len, s_off, d_off } = cfg;
+        println!("bytes: {len} bytes, src offset: {s_off}, dst offset: {d_off}");
+        let mut src = AlignedSlice::new_zeroed(len, s_off);
+        let dst = AlignedSlice::new_zeroed(len, d_off);
+        src.fill(1);
+        (len, src, dst)
+    }
+
+    #[library_benchmark]
+    #[benches::aligned(
+        // Both aligned
+        args = [
+            Cfg { len: 16, s_off: 0, d_off: 0 },
+            Cfg { len: 32, s_off: 0, d_off: 0 },
+            Cfg { len: 64, s_off: 0, d_off: 0 },
+            Cfg { len: 512, s_off: 0, d_off: 0 },
+            Cfg { len: 4096, s_off: 0, d_off: 0 },
+            Cfg { len: MEG1, s_off: 0, d_off: 0 },
+        ],
+        setup = setup,
+    )]
+    #[benches::offset(
+        // Both at the same offset
+        args = [
+            Cfg { len: 16, s_off: 65, d_off: 65 },
+            Cfg { len: 32, s_off: 65, d_off: 65 },
+            Cfg { len: 64, s_off: 65, d_off: 65 },
+            Cfg { len: 512, s_off: 65, d_off: 65 },
+            Cfg { len: 4096, s_off: 65, d_off: 65 },
+            Cfg { len: MEG1, s_off: 65, d_off: 65 },
+        ],
+        setup = setup,
+    )]
+    #[benches::misaligned(
+        // `src` and `dst` both misaligned by different amounts
+        args = [
+            Cfg { len: 16, s_off: 65, d_off: 66 },
+            Cfg { len: 32, s_off: 65, d_off: 66 },
+            Cfg { len: 64, s_off: 65, d_off: 66 },
+            Cfg { len: 512, s_off: 65, d_off: 66 },
+            Cfg { len: 4096, s_off: 65, d_off: 66 },
+            Cfg { len: MEG1, s_off: 65, d_off: 66 },
+        ],
+        setup = setup,
+    )]
+    fn bench((len, mut dst, src): (usize, AlignedSlice, AlignedSlice)) {
+        unsafe {
+            black_box(memcpy(
+                black_box(dst.as_mut_ptr()),
+                black_box(src.as_ptr()),
+                black_box(len),
+            ));
+        }
+    }
+
+    library_benchmark_group!(name = memcpy; benchmarks = bench);
+}
+
+mod mset {
+    use super::*;
+
+    struct Cfg {
+        len: usize,
+        offset: usize,
+    }
+
+    fn setup(Cfg { len, offset }: Cfg) -> (usize, AlignedSlice) {
+        println!("bytes: {len}, offset: {offset}");
+        (len, AlignedSlice::new_zeroed(len, offset))
+    }
+
+    #[library_benchmark]
+    #[benches::aligned(
+        args = [
+            Cfg { len: 16, offset: 0 },
+            Cfg { len: 32, offset: 0 },
+            Cfg { len: 64, offset: 0 },
+            Cfg { len: 512, offset: 0 },
+            Cfg { len: 4096, offset: 0 },
+            Cfg { len: MEG1, offset: 0 },
+        ],
+        setup = setup,
+    )]
+    #[benches::offset(
+        args = [
+            Cfg { len: 16, offset: 65 },
+            Cfg { len: 32, offset: 65 },
+            Cfg { len: 64, offset: 65 },
+            Cfg { len: 512, offset: 65 },
+            Cfg { len: 4096, offset: 65 },
+            Cfg { len: MEG1, offset: 65 },
+        ],
+        setup = setup,
+    )]
+    fn bench((len, mut dst): (usize, AlignedSlice)) {
+        unsafe {
+            black_box(memset(
+                black_box(dst.as_mut_ptr()),
+                black_box(27),
+                black_box(len),
+            ));
+        }
+    }
+
+    library_benchmark_group!(name = memset; benchmarks = bench);
+}
+
+mod mcmp {
+    use super::*;
+
+    struct Cfg {
+        len: usize,
+        s_off: usize,
+        d_off: usize,
+    }
+
+    fn setup(cfg: Cfg) -> (usize, AlignedSlice, AlignedSlice) {
+        let Cfg { len, s_off, d_off } = cfg;
+        println!("bytes: {len}, src offset: {s_off}, dst offset: {d_off}");
+        let b1 = AlignedSlice::new_zeroed(len, s_off);
+        let mut b2 = AlignedSlice::new_zeroed(len, d_off);
+        b2[len - 1] = 1;
+        (len, b1, b2)
+    }
+
+    #[library_benchmark]
+    #[benches::aligned(
+        // Both aligned
+        args = [
+            Cfg { len: 16, s_off: 0, d_off: 0 },
+            Cfg { len: 32, s_off: 0, d_off: 0 },
+            Cfg { len: 64, s_off: 0, d_off: 0 },
+            Cfg { len: 512, s_off: 0, d_off: 0 },
+            Cfg { len: 4096, s_off: 0, d_off: 0 },
+            Cfg { len: MEG1, s_off: 0, d_off: 0 },
+        ],
+        setup = setup
+    )]
+    #[benches::offset(
+        // Both at the same offset
+        args = [
+            Cfg { len: 16, s_off: 65, d_off: 65 },
+            Cfg { len: 32, s_off: 65, d_off: 65 },
+            Cfg { len: 64, s_off: 65, d_off: 65 },
+            Cfg { len: 512, s_off: 65, d_off: 65 },
+            Cfg { len: 4096, s_off: 65, d_off: 65 },
+            Cfg { len: MEG1, s_off: 65, d_off: 65 },
+        ],
+        setup = setup
+    )]
+    #[benches::misaligned(
+        // `src` and `dst` both misaligned by different amounts
+        args = [
+            Cfg { len: 16, s_off: 65, d_off: 66 },
+            Cfg { len: 32, s_off: 65, d_off: 66 },
+            Cfg { len: 64, s_off: 65, d_off: 66 },
+            Cfg { len: 512, s_off: 65, d_off: 66 },
+            Cfg { len: 4096, s_off: 65, d_off: 66 },
+            Cfg { len: MEG1, s_off: 65, d_off: 66 },
+        ],
+        setup = setup
+    )]
+    fn bench((len, mut dst, src): (usize, AlignedSlice, AlignedSlice)) {
+        unsafe {
+            black_box(memcmp(
+                black_box(dst.as_mut_ptr()),
+                black_box(src.as_ptr()),
+                black_box(len),
+            ));
+        }
+    }
+
+    library_benchmark_group!(name = memcmp; benchmarks = bench);
+}
+
+mod mmove {
+    use super::*;
+    use Spread::{Aligned, Large, Medium, Small};
+
+    struct Cfg {
+        len: usize,
+        spread: Spread,
+        off: usize,
+    }
+
+    enum Spread {
+        /// `src` and `dst` are close and have the same alignment (or offset).
+        Aligned,
+        /// `src` and `dst` are close.
+        Small,
+        /// `src` and `dst` are halfway offset in the buffer.
+        Medium,
+        /// `src` and `dst` only overlap by a single byte.
+        Large,
+    }
+
+    // Maps a `Spread` kind to the concrete byte offset between `src` and `dst`:
+    fn calculate_spread(len: usize, spread: Spread) -> usize {
+        match spread {
+            // Note that this test doesn't make sense for lengths less than len=128
+            Aligned => {
+                assert!(len > MAX_ALIGN, "aligned memset would have no overlap");
+                MAX_ALIGN
+            }
+            Small => 1,
+            Medium => (len / 2) + 1, // add 1 so all are misaligned
+            Large => len - 1,
+        }
+    }
+
+    fn setup_forward(cfg: Cfg) -> (usize, usize, AlignedSlice) {
+        let Cfg { len, spread, off } = cfg;
+        let spread = calculate_spread(len, spread);
+        println!("bytes: {len}, spread: {spread}, offset: {off}, forward");
+        assert!(spread < len, "memmove tests should have some overlap");
+        let mut buf = AlignedSlice::new_zeroed(len + spread, off);
+        let mut fill: usize = 0;
+        buf[..len].fill_with(|| {
+            fill += 1;
+            fill as u8
+        });
+        (len, spread, buf)
+    }
+
+    fn setup_backward(cfg: Cfg) -> (usize, usize, AlignedSlice) {
+        let Cfg { len, spread, off } = cfg;
+        let spread = calculate_spread(len, spread);
+        println!("bytes: {len}, spread: {spread}, offset: {off}, backward");
+        assert!(spread < len, "memmove tests should have some overlap");
+        let mut buf = AlignedSlice::new_zeroed(len + spread, off);
+        let mut fill: usize = 0;
+        buf[spread..].fill_with(|| {
+            fill += 1;
+            fill as u8
+        });
+        (len, spread, buf)
+    }
+
+    #[library_benchmark]
+    #[benches::aligned(
+        args = [
+            // Don't test small spreads since there is no overlap
+            Cfg { len: 4096, spread: Aligned, off: 0 },
+            Cfg { len: MEG1, spread: Aligned, off: 0 },
+        ],
+        setup = setup_forward
+    )]
+    #[benches::small_spread(
+        args = [
+            Cfg { len: 16, spread: Small, off: 0 },
+            Cfg { len: 32, spread: Small, off: 0 },
+            Cfg { len: 64, spread: Small, off: 0 },
+            Cfg { len: 512, spread: Small, off: 0 },
+            Cfg { len: 4096, spread: Small, off: 0 },
+            Cfg { len: MEG1, spread: Small, off: 0 },
+        ],
+        setup = setup_forward
+    )]
+    #[benches::medium_spread(
+        args = [
+            Cfg { len: 16, spread: Medium, off: 0 },
+            Cfg { len: 32, spread: Medium, off: 0 },
+            Cfg { len: 64, spread: Medium, off: 0 },
+            Cfg { len: 512, spread: Medium, off: 0 },
+            Cfg { len: 4096, spread: Medium, off: 0 },
+            Cfg { len: MEG1, spread: Medium, off: 0 },
+        ],
+        setup = setup_forward
+    )]
+    #[benches::large_spread(
+        args = [
+            Cfg { len: 16, spread: Large, off: 0 },
+            Cfg { len: 32, spread: Large, off: 0 },
+            Cfg { len: 64, spread: Large, off: 0 },
+            Cfg { len: 512, spread: Large, off: 0 },
+            Cfg { len: 4096, spread: Large, off: 0 },
+            Cfg { len: MEG1, spread: Large, off: 0 },
+        ],
+        setup = setup_forward
+    )]
+    #[benches::aligned_off(
+        args = [
+            Cfg { len: 4096, spread: Aligned, off: 65 },
+            Cfg { len: MEG1, spread: Aligned, off: 65 },
+        ],
+        setup = setup_forward
+    )]
+    #[benches::small_spread_off(
+        args = [
+            Cfg { len: 16, spread: Small, off: 65 },
+            Cfg { len: 32, spread: Small, off: 65 },
+            Cfg { len: 64, spread: Small, off: 65 },
+            Cfg { len: 512, spread: Small, off: 65 },
+            Cfg { len: 4096, spread: Small, off: 65 },
+            Cfg { len: MEG1, spread: Small, off: 65 },
+        ],
+        setup = setup_forward
+    )]
+    #[benches::medium_spread_off(
+        args = [
+            Cfg { len: 16, spread: Medium, off: 65 },
+            Cfg { len: 32, spread: Medium, off: 65 },
+            Cfg { len: 64, spread: Medium, off: 65 },
+            Cfg { len: 512, spread: Medium, off: 65 },
+            Cfg { len: 4096, spread: Medium, off: 65 },
+            Cfg { len: MEG1, spread: Medium, off: 65 },
+        ],
+        setup = setup_forward
+    )]
+    #[benches::large_spread_off(
+        args = [
+            Cfg { len: 16, spread: Large, off: 65 },
+            Cfg { len: 32, spread: Large, off: 65 },
+            Cfg { len: 64, spread: Large, off: 65 },
+            Cfg { len: 512, spread: Large, off: 65 },
+            Cfg { len: 4096, spread: Large, off: 65 },
+            Cfg { len: MEG1, spread: Large, off: 65 },
+        ],
+        setup = setup_forward
+    )]
+    fn forward((len, spread, mut buf): (usize, usize, AlignedSlice)) {
+        // Test moving from the start of the buffer toward the end
+        unsafe {
+            black_box(memmove(
+                black_box(buf[spread..].as_mut_ptr()),
+                black_box(buf.as_ptr()),
+                black_box(len),
+            ));
+        }
+    }
+
+    #[library_benchmark]
+    #[benches::aligned(
+        args = [
+            // Don't test small spreads since there is no overlap
+            Cfg { len: 4096, spread: Aligned, off: 0 },
+            Cfg { len: MEG1, spread: Aligned, off: 0 },
+        ],
+        setup = setup_backward
+    )]
+    #[benches::small_spread(
+        args = [
+            Cfg { len: 16, spread: Small, off: 0 },
+            Cfg { len: 32, spread: Small, off: 0 },
+            Cfg { len: 64, spread: Small, off: 0 },
+            Cfg { len: 512, spread: Small, off: 0 },
+            Cfg { len: 4096, spread: Small, off: 0 },
+            Cfg { len: MEG1, spread: Small, off: 0 },
+        ],
+        setup = setup_backward
+    )]
+    #[benches::medium_spread(
+        args = [
+            Cfg { len: 16, spread: Medium, off: 0 },
+            Cfg { len: 32, spread: Medium, off: 0 },
+            Cfg { len: 64, spread: Medium, off: 0 },
+            Cfg { len: 512, spread: Medium, off: 0 },
+            Cfg { len: 4096, spread: Medium, off: 0 },
+            Cfg { len: MEG1, spread: Medium, off: 0 },
+        ],
+        setup = setup_backward
+    )]
+    #[benches::large_spread(
+        args = [
+            Cfg { len: 16, spread: Large, off: 0 },
+            Cfg { len: 32, spread: Large, off: 0 },
+            Cfg { len: 64, spread: Large, off: 0 },
+            Cfg { len: 512, spread: Large, off: 0 },
+            Cfg { len: 4096, spread: Large, off: 0 },
+            Cfg { len: MEG1, spread: Large, off: 0 },
+        ],
+        setup = setup_backward
+    )]
+    #[benches::aligned_off(
+        args = [
+            // Don't test small spreads since there is no overlap
+            Cfg { len: 4096, spread: Aligned, off: 65 },
+            Cfg { len: MEG1, spread: Aligned, off: 65 },
+        ],
+        setup = setup_backward
+    )]
+    #[benches::small_spread_off(
+        args = [
+            Cfg { len: 16, spread: Small, off: 65 },
+            Cfg { len: 32, spread: Small, off: 65 },
+            Cfg { len: 64, spread: Small, off: 65 },
+            Cfg { len: 512, spread: Small, off: 65 },
+            Cfg { len: 4096, spread: Small, off: 65 },
+            Cfg { len: MEG1, spread: Small, off: 65 },
+        ],
+        setup = setup_backward
+    )]
+    #[benches::medium_spread_off(
+        args = [
+            Cfg { len: 16, spread: Medium, off: 65 },
+            Cfg { len: 32, spread: Medium, off: 65 },
+            Cfg { len: 64, spread: Medium, off: 65 },
+            Cfg { len: 512, spread: Medium, off: 65 },
+            Cfg { len: 4096, spread: Medium, off: 65 },
+            Cfg { len: MEG1, spread: Medium, off: 65 },
+        ],
+        setup = setup_backward
+    )]
+    #[benches::large_spread_off(
+        args = [
+            Cfg { len: 16, spread: Large, off: 65 },
+            Cfg { len: 32, spread: Large, off: 65 },
+            Cfg { len: 64, spread: Large, off: 65 },
+            Cfg { len: 512, spread: Large, off: 65 },
+            Cfg { len: 4096, spread: Large, off: 65 },
+            Cfg { len: MEG1, spread: Large, off: 65 },
+        ],
+        setup = setup_backward
+    )]
+    fn backward((len, spread, mut buf): (usize, usize, AlignedSlice)) {
+        // Test moving from the end of the buffer toward the start
+        unsafe {
+            black_box(memmove(
+                black_box(buf.as_mut_ptr()),
+                black_box(buf[spread..].as_ptr()),
+                black_box(len),
+            ));
+        }
+    }
+
+    library_benchmark_group!(name = memmove; benchmarks = forward, backward);
+}
+
+use mcmp::memcmp;
+use mcpy::memcpy;
+use mmove::memmove;
+use mset::memset;
+
+main!(library_benchmark_groups = memcpy, memset, memcmp, memmove);
diff --git a/library/compiler-builtins/builtins-test/build.rs b/library/compiler-builtins/builtins-test/build.rs
new file mode 100644
index 00000000000..e8f4eb4dd22
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/build.rs
@@ -0,0 +1,120 @@
+use std::collections::HashSet;
+
+mod builtins_configure {
+    include!("../compiler-builtins/configure.rs");
+}
+
+/// Features to enable
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+enum Feature {
+    NoSysF128,
+    NoSysF128IntConvert,
+    NoSysF16,
+    NoSysF16F64Convert,
+    NoSysF16F128Convert,
+}
+
+impl Feature {
+    fn implies(self) -> &'static [Self] {
+        match self {
+            Self::NoSysF128 => [Self::NoSysF128IntConvert, Self::NoSysF16F128Convert].as_slice(),
+            Self::NoSysF128IntConvert => [].as_slice(),
+            Self::NoSysF16 => [Self::NoSysF16F64Convert, Self::NoSysF16F128Convert].as_slice(),
+            Self::NoSysF16F64Convert => [].as_slice(),
+            Self::NoSysF16F128Convert => [].as_slice(),
+        }
+    }
+}
+
+fn main() {
+    println!("cargo::rerun-if-changed=../configure.rs");
+
+    let target = builtins_configure::Target::from_env();
+    let mut features = HashSet::new();
+
+    // These platforms do not have f128 symbols available in their system libraries, so
+    // skip related tests.
+    if target.arch == "arm"
+        || target.vendor == "apple"
+        || target.env == "msvc"
+        // GCC and LLVM disagree on the ABI of `f16` and `f128` with MinGW. See
+        // <https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115054>.
+        || (target.os == "windows" && target.env == "gnu")
+        // FIXME(llvm): There is an ABI incompatibility between GCC and Clang on 32-bit x86.
+        // See <https://github.com/llvm/llvm-project/issues/77401>.
+        || target.arch == "x86"
+        // 32-bit PowerPC and 64-bit LE gets code generated that Qemu cannot handle. See
+        // <https://github.com/rust-lang/compiler-builtins/pull/606#issuecomment-2105635926>.
+        || target.arch == "powerpc"
+        || target.arch == "powerpc64le"
+        // FIXME: We get different results from the builtin functions. See
+        // <https://github.com/rust-lang/compiler-builtins/pull/606#issuecomment-2105657287>.
+        || target.arch == "powerpc64"
+    {
+        features.insert(Feature::NoSysF128);
+    }
+
+    if target.arch == "x86" {
+        // 32-bit x86 does not have `__fixunstfti`/`__fixtfti` but does have everything else
+        features.insert(Feature::NoSysF128IntConvert);
+        // FIXME: 32-bit x86 has a bug in `f128 -> f16` system libraries
+        features.insert(Feature::NoSysF16F128Convert);
+    }
+
+    // These platforms do not have f16 symbols available in their system libraries, so
+    // skip related tests. Most of these are missing `f16 <-> f32` conversion routines.
+    if (target.arch == "aarch64" && target.os == "linux")
+        || target.arch.starts_with("arm")
+        || target.arch == "powerpc"
+        || target.arch == "powerpc64"
+        || target.arch == "powerpc64le"
+        || target.arch == "loongarch64"
+        || (target.arch == "x86" && !target.has_feature("sse"))
+        || target.os == "windows"
+        // Linking says "error: function signature mismatch: __extendhfsf2" and seems to
+        // think the signature is either `(i32) -> f32` or `(f32) -> f32`. See
+        // <https://github.com/llvm/llvm-project/issues/96438>.
+        || target.arch == "wasm32"
+        || target.arch == "wasm64"
+    {
+        features.insert(Feature::NoSysF16);
+    }
+
+    // These platforms are missing either `__extendhfdf2` or `__truncdfhf2`.
+    if target.vendor == "apple" || target.os == "windows" {
+        features.insert(Feature::NoSysF16F64Convert);
+    }
+
+    // Add implied features. Collection is required for borrows.
+    features.extend(
+        features
+            .iter()
+            .flat_map(|x| x.implies())
+            .copied()
+            .collect::<Vec<_>>(),
+    );
+
+    for feature in features {
+        let (name, warning) = match feature {
+            Feature::NoSysF128 => ("no-sys-f128", "using apfloat fallback for f128"),
+            Feature::NoSysF128IntConvert => (
+                "no-sys-f128-int-convert",
+                "using apfloat fallback for f128 <-> int conversions",
+            ),
+            Feature::NoSysF16F64Convert => (
+                "no-sys-f16-f64-convert",
+                "using apfloat fallback for f16 <-> f64 conversions",
+            ),
+            Feature::NoSysF16F128Convert => (
+                "no-sys-f16-f128-convert",
+                "using apfloat fallback for f16 <-> f128 conversions",
+            ),
+            Feature::NoSysF16 => ("no-sys-f16", "using apfloat fallback for f16"),
+        };
+        println!("cargo:warning={warning}");
+        println!("cargo:rustc-cfg=feature=\"{name}\"");
+    }
+
+    builtins_configure::configure_aliases(&target);
+    builtins_configure::configure_f16_f128(&target);
+}
diff --git a/library/compiler-builtins/builtins-test/src/bench.rs b/library/compiler-builtins/builtins-test/src/bench.rs
new file mode 100644
index 00000000000..45a3a1ad467
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/src/bench.rs
@@ -0,0 +1,366 @@
+use core::cell::RefCell;
+
+use alloc::vec::Vec;
+use compiler_builtins::float::Float;
+
+/// Fuzz with these many items to ensure equal functions
+pub const CHECK_ITER_ITEMS: u32 = 10_000;
+/// Benchmark with this many items to get a variety
+pub const BENCH_ITER_ITEMS: u32 = 500;
+
+/// Still run benchmarks/tests but don't check correctness between compiler-builtins and
+/// builtin system functions
+pub fn skip_sys_checks(test_name: &str) -> bool {
+    const ALWAYS_SKIPPED: &[&str] = &[
+        // FIXME(f16_f128): system symbols have incorrect results
+        // <https://github.com/rust-lang/compiler-builtins/issues/617>
+        "extend_f16_f32",
+        "trunc_f32_f16",
+        "trunc_f64_f16",
+        // FIXME(#616): re-enable once fix is in nightly
+        // <https://github.com/rust-lang/compiler-builtins/issues/616>
+        "mul_f32",
+        "mul_f64",
+    ];
+
+    // FIXME(f16_f128): error on LE ppc64. There are more tests that are cfg-ed out completely
+    // in their benchmark modules due to runtime panics.
+    // <https://github.com/rust-lang/compiler-builtins/issues/617#issuecomment-2125914639>
+    const PPC64LE_SKIPPED: &[&str] = &["extend_f32_f128"];
+
+    // FIXME(f16_f128): system symbols have incorrect results
+    // <https://github.com/rust-lang/compiler-builtins/issues/617#issuecomment-2125914639>
+    const X86_NO_SSE_SKIPPED: &[&str] = &[
+        "add_f128", "sub_f128", "mul_f128", "div_f128", "powi_f32", "powi_f64",
+    ];
+
+    // FIXME(f16_f128): Wide multiply carry bug in `compiler-rt`, re-enable when nightly no longer
+    // uses `compiler-rt` version.
+    // <https://github.com/llvm/llvm-project/issues/91840>
+    const AARCH64_SKIPPED: &[&str] = &["mul_f128", "div_f128"];
+
+    // FIXME(llvm): system symbols have incorrect results on Windows
+    // <https://github.com/rust-lang/compiler-builtins/issues/617#issuecomment-2121359807>
+    const WINDOWS_SKIPPED: &[&str] = &[
+        "conv_f32_u128",
+        "conv_f32_i128",
+        "conv_f64_u128",
+        "conv_f64_i128",
+    ];
+
+    if cfg!(target_arch = "arm") {
+        // The Arm symbols need a different ABI that our macro doesn't handle, just skip it
+        return true;
+    }
+
+    if ALWAYS_SKIPPED.contains(&test_name) {
+        return true;
+    }
+
+    if cfg!(all(target_arch = "powerpc64", target_endian = "little"))
+        && PPC64LE_SKIPPED.contains(&test_name)
+    {
+        return true;
+    }
+
+    if cfg!(all(target_arch = "x86", not(target_feature = "sse")))
+        && X86_NO_SSE_SKIPPED.contains(&test_name)
+    {
+        return true;
+    }
+
+    if cfg!(target_arch = "aarch64") && AARCH64_SKIPPED.contains(&test_name) {
+        return true;
+    }
+
+    if cfg!(target_family = "windows") && WINDOWS_SKIPPED.contains(&test_name) {
+        return true;
+    }
+
+    false
+}
+
+/// Still run benchmarks/tests but don't check correctness between compiler-builtins and
+/// assembly functions
+pub fn skip_asm_checks(_test_name: &str) -> bool {
+    // Nothing to skip at this time
+    false
+}
+
+/// Create a comparison of the system symbol, compiler_builtins, and optionally handwritten
+/// assembly.
+///
+/// # Safety
+///
+/// The signature must be correct and any assembly must be sound.
+#[macro_export]
+macro_rules! float_bench {
+    (
+        // Name of this benchmark
+        name: $name:ident,
+        // The function signature to be tested
+        sig: ($($arg:ident: $arg_ty:ty),*) -> $ret_ty:ty,
+        // Path to the crate in compiler_builtins
+        crate_fn: $crate_fn:path,
+        // Optional alias on ppc
+        $( crate_fn_ppc: $crate_fn_ppc:path, )?
+        // Name of the system symbol
+        sys_fn: $sys_fn:ident,
+        // Optional alias on ppc
+        $( sys_fn_ppc: $sys_fn_ppc:path, )?
+        // Meta saying whether the system symbol is available
+        sys_available: $sys_available:meta,
+        // An optional function to validate the results of two functions are equal, if not
+        // just `$ret_ty::check_eq`
+        $( output_eq: $output_eq:expr, )?
+        // Assembly implementations, if any.
+        asm: [
+            $(
+                #[cfg($asm_meta:meta)] {
+                    $($asm_tt:tt)*
+                }
+            );*
+            $(;)?
+        ]
+        $(,)?
+    ) => {paste::paste! {
+        // SAFETY: macro invocation must use the correct signature
+        #[cfg($sys_available)]
+        unsafe extern "C" {
+            /// Binding for the system function
+            #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+            fn $sys_fn($($arg: $arg_ty),*) -> $ret_ty;
+
+
+            #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+            float_bench! { @coalesce_fn $($sys_fn_ppc)? =>
+                fn $sys_fn($($arg: $arg_ty),*) -> $ret_ty;
+            }
+        }
+
+        fn $name(c: &mut Criterion) {
+            use core::hint::black_box;
+            use compiler_builtins::float::Float;
+            use $crate::bench::TestIO;
+
+            #[inline(never)] // equalize with external calls
+            fn crate_fn($($arg: $arg_ty),*) -> $ret_ty {
+                #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+                let target_crate_fn = $crate_fn;
+
+                // On PPC, use an alias if specified
+                #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+                let target_crate_fn = float_bench!(@coalesce $($crate_fn_ppc)?, $crate_fn);
+
+                target_crate_fn( $($arg),* )
+            }
+
+            #[inline(always)] // already a branch
+            #[cfg($sys_available)]
+            fn sys_fn($($arg: $arg_ty),*) -> $ret_ty {
+                #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+                let target_sys_fn = $sys_fn;
+
+                // On PPC, use an alias if specified
+                #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+                let target_sys_fn = float_bench!(@coalesce $($sys_fn_ppc)?, $sys_fn);
+
+                unsafe { target_sys_fn( $($arg),* ) }
+            }
+
+            #[inline(never)] // equalize with external calls
+            #[cfg(any( $($asm_meta),* ))]
+            fn asm_fn($(mut $arg: $arg_ty),*) -> $ret_ty {
+                use core::arch::asm;
+                $(
+                    #[cfg($asm_meta)]
+                    unsafe { $($asm_tt)* }
+                )*
+            }
+
+            let testvec = <($($arg_ty),*)>::make_testvec($crate::bench::CHECK_ITER_ITEMS);
+            let benchvec = <($($arg_ty),*)>::make_testvec($crate::bench::BENCH_ITER_ITEMS);
+            let test_name = stringify!($name);
+            let check_eq = float_bench!(@coalesce $($output_eq)?, $ret_ty::check_eq);
+
+            // Verify math lines up. We run the crate functions even if we don't validate the
+            // output here to make sure there are no panics or crashes.
+
+            #[cfg($sys_available)]
+            for ($($arg),*) in testvec.iter().copied() {
+                let crate_res = crate_fn($($arg),*);
+                let sys_res = sys_fn($($arg),*);
+
+                if $crate::bench::skip_sys_checks(test_name) {
+                    continue;
+                }
+
+                assert!(
+                    check_eq(crate_res, sys_res),
+                    "{test_name}{:?}: crate: {crate_res:?}, sys: {sys_res:?}",
+                    ($($arg),* ,)
+                );
+            }
+
+            #[cfg(any( $($asm_meta),* ))]
+            {
+                for ($($arg),*) in testvec.iter().copied() {
+                    let crate_res = crate_fn($($arg),*);
+                    let asm_res = asm_fn($($arg),*);
+
+                    if $crate::bench::skip_asm_checks(test_name) {
+                        continue;
+                    }
+
+                    assert!(
+                        check_eq(crate_res, asm_res),
+                        "{test_name}{:?}: crate: {crate_res:?}, asm: {asm_res:?}",
+                        ($($arg),* ,)
+                    );
+                }
+            }
+
+            let mut group = c.benchmark_group(test_name);
+            group.bench_function("compiler-builtins", |b| b.iter(|| {
+                for ($($arg),*) in benchvec.iter().copied() {
+                    black_box(crate_fn( $(black_box($arg)),* ));
+                }
+            }));
+
+            #[cfg($sys_available)]
+            group.bench_function("system", |b| b.iter(|| {
+                for ($($arg),*) in benchvec.iter().copied() {
+                    black_box(sys_fn( $(black_box($arg)),* ));
+                }
+            }));
+
+            #[cfg(any( $($asm_meta),* ))]
+            group.bench_function(&format!(
+                "assembly ({} {})", std::env::consts::ARCH, std::env::consts::FAMILY
+            ), |b| b.iter(|| {
+                for ($($arg),*) in benchvec.iter().copied() {
+                    black_box(asm_fn( $(black_box($arg)),* ));
+                }
+            }));
+
+            group.finish();
+        }
+    }};
+
+    // Allow overriding a default
+    (@coalesce $specified:expr, $default:expr) => { $specified };
+    (@coalesce, $default:expr) => { $default };
+
+    // Allow overriding a function name
+    (@coalesce_fn $specified:ident => fn $default_name:ident $($tt:tt)+) => {
+        fn $specified $($tt)+
+    };
+    (@coalesce_fn => fn $default_name:ident $($tt:tt)+) => {
+        fn $default_name $($tt)+
+    };
+}
+
+/// A type used as either an input or output to/from a benchmark function.
+pub trait TestIO: Sized {
+    fn make_testvec(len: u32) -> Vec<Self>;
+    fn check_eq(a: Self, b: Self) -> bool;
+}
+
+macro_rules! impl_testio {
+    (float $($f_ty:ty),+) => {$(
+        impl TestIO for $f_ty {
+            fn make_testvec(len: u32) -> Vec<Self> {
+                // refcell because fuzz_* takes a `Fn`
+                let ret = RefCell::new(Vec::new());
+                crate::fuzz_float(len, |a| ret.borrow_mut().push(a));
+                ret.into_inner()
+            }
+
+            fn check_eq(a: Self, b: Self) -> bool {
+                Float::eq_repr(a, b)
+            }
+        }
+
+        impl TestIO for ($f_ty, $f_ty) {
+            fn make_testvec(len: u32) -> Vec<Self> {
+                // refcell because fuzz_* takes a `Fn`
+                let ret = RefCell::new(Vec::new());
+                crate::fuzz_float_2(len, |a, b| ret.borrow_mut().push((a, b)));
+                ret.into_inner()
+            }
+
+            fn check_eq(_a: Self, _b: Self) -> bool {
+                unimplemented!()
+            }
+        }
+    )*};
+
+    (int $($i_ty:ty),+) => {$(
+        impl TestIO for $i_ty {
+            fn make_testvec(len: u32) -> Vec<Self> {
+                // refcell because fuzz_* takes a `Fn`
+                let ret = RefCell::new(Vec::new());
+                crate::fuzz(len, |a| ret.borrow_mut().push(a));
+                ret.into_inner()
+            }
+
+            fn check_eq(a: Self, b: Self) -> bool {
+                a == b
+            }
+        }
+
+        impl TestIO for ($i_ty, $i_ty) {
+            fn make_testvec(len: u32) -> Vec<Self> {
+                // refcell because fuzz_* takes a `Fn`
+                let ret = RefCell::new(Vec::new());
+                crate::fuzz_2(len, |a, b| ret.borrow_mut().push((a, b)));
+                ret.into_inner()
+            }
+
+            fn check_eq(_a: Self, _b: Self) -> bool {
+                unimplemented!()
+            }
+        }
+    )*};
+
+    ((float, int) ($f_ty:ty, $i_ty:ty)) => {
+        impl TestIO for ($f_ty, $i_ty) {
+            fn make_testvec(len: u32) -> Vec<Self> {
+                // refcell because fuzz_* takes a `Fn`
+                let ivec = RefCell::new(Vec::new());
+                let fvec = RefCell::new(Vec::new());
+
+                crate::fuzz(len.isqrt(), |a| ivec.borrow_mut().push(a));
+                crate::fuzz_float(len.isqrt(), |a| fvec.borrow_mut().push(a));
+
+                let mut ret = Vec::new();
+                let ivec = ivec.into_inner();
+                let fvec = fvec.into_inner();
+
+                for f in fvec {
+                    for i in &ivec {
+                        ret.push((f, *i));
+                    }
+                }
+
+                ret
+            }
+
+            fn check_eq(_a: Self, _b: Self) -> bool {
+                unimplemented!()
+            }
+        }
+    }
+}
+
+#[cfg(f16_enabled)]
+impl_testio!(float f16);
+impl_testio!(float f32, f64);
+#[cfg(f128_enabled)]
+impl_testio!(float f128);
+impl_testio!(int i16, i32, i64, i128);
+impl_testio!(int u16, u32, u64, u128);
+impl_testio!((float, int)(f32, i32));
+impl_testio!((float, int)(f64, i32));
+#[cfg(f128_enabled)]
+impl_testio!((float, int)(f128, i32));
diff --git a/library/compiler-builtins/builtins-test/src/lib.rs b/library/compiler-builtins/builtins-test/src/lib.rs
new file mode 100644
index 00000000000..a83aea56206
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/src/lib.rs
@@ -0,0 +1,338 @@
+//! This crate is for integration testing and fuzz testing of functions in `compiler-builtins`. This
+//! includes publicly documented intrinsics and some internal alternative implementation functions
+//! such as `usize_leading_zeros_riscv` (which are tested because they are configured for
+//! architectures not tested by the CI).
+//!
+//! The general idea is to use a combination of edge case testing and randomized fuzz testing. The
+//! edge case testing is crucial for checking cases like where both inputs are equal or equal to
+//! special values such as `i128::MIN`, which is unlikely for the random fuzzer by itself to
+//! encounter. The randomized fuzz testing is specially designed to cover wide swaths of search
+//! space in as few iterations as possible. See `fuzz_values` in `builtins-test/tests/misc.rs` for
+//! an example.
+//!
+//! Some floating point tests are disabled for specific architectures, because they do not have
+//! correct rounding.
+#![no_std]
+#![cfg_attr(f128_enabled, feature(f128))]
+#![cfg_attr(f16_enabled, feature(f16))]
+
+pub mod bench;
+extern crate alloc;
+
+use compiler_builtins::float::Float;
+use compiler_builtins::int::{Int, MinInt};
+
+use rand_xoshiro::Xoshiro128StarStar;
+use rand_xoshiro::rand_core::{RngCore, SeedableRng};
+
+/// Sets the number of fuzz iterations run for most tests. In practice, the vast majority of bugs
+/// are caught by the edge case testers. Most of the remaining bugs triggered by more complex
+/// sequences are caught well within 10_000 fuzz iterations. For classes of algorithms like division
+/// that are vulnerable to rare edge cases, we want 1_000_000 iterations to be more confident. In
+/// practical CI, however, we only want to run the more strenuous test once to catch algorithmic
+/// level bugs, and run the 10_000 iteration test on most targets. Target-dependent bugs are likely
+/// to involve miscompilation and misconfiguration that is likely to break algorithms in quickly
+/// caught ways. We choose to configure `N = 1_000_000` iterations for `x86_64` targets (and only
+/// when debug assertions are disabled, since tests without `--release` would take too long), which
+/// are likely to have fast hardware, and run `N = 10_000` for all other targets.
+pub const N: u32 = if cfg!(target_arch = "x86_64") && !cfg!(debug_assertions) {
+    1_000_000
+} else {
+    10_000
+};
+
+/// Random fuzzing step. When run several times, it results in excellent fuzzing entropy such as:
+/// 11110101010101011110111110011111
+/// 10110101010100001011101011001010
+/// 1000000000000000
+/// 10000000000000110111110000001010
+/// 1111011111111101010101111110101
+/// 101111111110100000000101000000
+/// 10000000110100000000100010101
+/// 1010101010101000
+fn fuzz_step<I: Int>(rng: &mut Xoshiro128StarStar, x: &mut I) {
+    let ones = !I::ZERO;
+    let bit_indexing_mask: u32 = I::BITS - 1;
+    // It happens that all the RNG we need can come from one call. 7 bits are needed to index a
+    // worst case 128 bit integer, and there are 4 indexes that need to be made plus 4 bits for
+    // selecting operations
+    let rng32 = rng.next_u32();
+
+    // Randomly OR, AND, and XOR randomly sized and shifted continuous strings of
+    // ones with `lhs` and `rhs`.
+    let r0 = bit_indexing_mask & rng32;
+    let r1 = bit_indexing_mask & (rng32 >> 7);
+    let mask = ones.wrapping_shl(r0).rotate_left(r1);
+    match (rng32 >> 14) % 4 {
+        0 => *x |= mask,
+        1 => *x &= mask,
+        // both 2 and 3 to make XORs as common as ORs and ANDs combined
+        _ => *x ^= mask,
+    }
+
+    // Alternating ones and zeros (e.g. 0b1010101010101010). This catches second-order
+    // problems that might occur for algorithms with two modes of operation (potentially
+    // there is some invariant that can be broken and maintained via alternating between modes,
+    // breaking the algorithm when it reaches the end).
+    let mut alt_ones = I::ONE;
+    for _ in 0..(I::BITS / 2) {
+        alt_ones <<= 2;
+        alt_ones |= I::ONE;
+    }
+    let r0 = bit_indexing_mask & (rng32 >> 16);
+    let r1 = bit_indexing_mask & (rng32 >> 23);
+    let mask = alt_ones.wrapping_shl(r0).rotate_left(r1);
+    match rng32 >> 30 {
+        0 => *x |= mask,
+        1 => *x &= mask,
+        _ => *x ^= mask,
+    }
+}
+
+// We need macros like this, because `#![no_std]` prevents us from using iterators
+macro_rules! edge_cases {
+    ($I:ident, $case:ident, $inner:block) => {
+        for i0 in 0..$I::FUZZ_NUM {
+            let mask_lo = (!$I::UnsignedInt::ZERO).wrapping_shr($I::FUZZ_LENGTHS[i0] as u32);
+            for i1 in i0..I::FUZZ_NUM {
+                let mask_hi =
+                    (!$I::UnsignedInt::ZERO).wrapping_shl($I::FUZZ_LENGTHS[i1 - i0] as u32);
+                let $case = I::from_unsigned(mask_lo & mask_hi);
+                $inner
+            }
+        }
+    };
+}
+
+/// Feeds a series of fuzzing inputs to `f`. The fuzzer first uses an algorithm designed to find
+/// edge cases, followed by a more random fuzzer that runs `n` times.
+pub fn fuzz<I: Int, F: FnMut(I)>(n: u32, mut f: F)
+where
+    <I as MinInt>::UnsignedInt: Int,
+{
+    // edge case tester. Calls `f` 210 times for u128.
+    // zero gets skipped by the loop
+    f(I::ZERO);
+    edge_cases!(I, case, {
+        f(case);
+    });
+
+    // random fuzzer
+    let mut rng = Xoshiro128StarStar::seed_from_u64(0);
+    let mut x: I = MinInt::ZERO;
+    for _ in 0..n {
+        fuzz_step(&mut rng, &mut x);
+        f(x)
+    }
+}
+
+/// The same as `fuzz`, except `f` has two inputs.
+pub fn fuzz_2<I: Int, F: Fn(I, I)>(n: u32, f: F)
+where
+    <I as MinInt>::UnsignedInt: Int,
+{
+    // Check cases where the first and second inputs are zero. Both call `f` 210 times for `u128`.
+    edge_cases!(I, case, {
+        f(I::ZERO, case);
+    });
+    edge_cases!(I, case, {
+        f(case, I::ZERO);
+    });
+    // Nested edge tester. Calls `f` 44100 times for `u128`.
+    edge_cases!(I, case0, {
+        edge_cases!(I, case1, {
+            f(case0, case1);
+        })
+    });
+
+    // random fuzzer
+    let mut rng = Xoshiro128StarStar::seed_from_u64(0);
+    let mut x: I = I::ZERO;
+    let mut y: I = I::ZERO;
+    for _ in 0..n {
+        fuzz_step(&mut rng, &mut x);
+        fuzz_step(&mut rng, &mut y);
+        f(x, y)
+    }
+}
+
+/// Tester for shift functions
+pub fn fuzz_shift<I: Int, F: Fn(I, u32)>(f: F) {
+    // Shift functions are very simple and do not need anything other than shifting a small
+    // set of random patterns for every fuzz length.
+    let mut rng = Xoshiro128StarStar::seed_from_u64(0);
+    let mut x: I = MinInt::ZERO;
+    for i in 0..I::FUZZ_NUM {
+        fuzz_step(&mut rng, &mut x);
+        f(x, MinInt::ZERO);
+        f(x, I::FUZZ_LENGTHS[i] as u32);
+    }
+}
+
+fn fuzz_float_step<F: Float>(rng: &mut Xoshiro128StarStar, f: &mut F) {
+    let rng32 = rng.next_u32();
+    // we need to fuzz the different parts of the float separately, because the masking on larger
+    // significands will tend to set the exponent to all ones or all zeros frequently
+
+    // sign bit fuzzing
+    let sign = (rng32 & 1) != 0;
+
+    // exponent fuzzing. Only 4 bits for the selector needed.
+    let ones = (F::Int::ONE << F::EXP_BITS) - F::Int::ONE;
+    let r0 = (rng32 >> 1) % F::EXP_BITS;
+    let r1 = (rng32 >> 5) % F::EXP_BITS;
+    // custom rotate shift. Note that `F::Int` is unsigned, so we can shift right without smearing
+    // the sign bit.
+    let mask = if r1 == 0 {
+        ones.wrapping_shr(r0)
+    } else {
+        let tmp = ones.wrapping_shr(r0);
+        (tmp.wrapping_shl(r1) | tmp.wrapping_shr(F::EXP_BITS - r1)) & ones
+    };
+    let mut exp = (f.to_bits() & F::EXP_MASK) >> F::SIG_BITS;
+    match (rng32 >> 9) % 4 {
+        0 => exp |= mask,
+        1 => exp &= mask,
+        _ => exp ^= mask,
+    }
+
+    // significand fuzzing
+    let mut sig = f.to_bits() & F::SIG_MASK;
+    fuzz_step(rng, &mut sig);
+    sig &= F::SIG_MASK;
+
+    *f = F::from_parts(sign, exp, sig);
+}
+
+macro_rules! float_edge_cases {
+    ($F:ident, $case:ident, $inner:block) => {
+        for exponent in [
+            F::Int::ZERO,
+            F::Int::ONE,
+            F::Int::ONE << (F::EXP_BITS / 2),
+            (F::Int::ONE << (F::EXP_BITS - 1)) - F::Int::ONE,
+            F::Int::ONE << (F::EXP_BITS - 1),
+            (F::Int::ONE << (F::EXP_BITS - 1)) + F::Int::ONE,
+            (F::Int::ONE << F::EXP_BITS) - F::Int::ONE,
+        ]
+        .iter()
+        {
+            for significand in [
+                F::Int::ZERO,
+                F::Int::ONE,
+                F::Int::ONE << (F::SIG_BITS / 2),
+                (F::Int::ONE << (F::SIG_BITS - 1)) - F::Int::ONE,
+                F::Int::ONE << (F::SIG_BITS - 1),
+                (F::Int::ONE << (F::SIG_BITS - 1)) + F::Int::ONE,
+                (F::Int::ONE << F::SIG_BITS) - F::Int::ONE,
+            ]
+            .iter()
+            {
+                for sign in [false, true].iter() {
+                    let $case = F::from_parts(*sign, *exponent, *significand);
+                    $inner
+                }
+            }
+        }
+    };
+}
+
+pub fn fuzz_float<F: Float, E: Fn(F)>(n: u32, f: E) {
+    float_edge_cases!(F, case, {
+        f(case);
+    });
+
+    // random fuzzer
+    let mut rng = Xoshiro128StarStar::seed_from_u64(0);
+    let mut x = F::ZERO;
+    for _ in 0..n {
+        fuzz_float_step(&mut rng, &mut x);
+        f(x);
+    }
+}
+
+pub fn fuzz_float_2<F: Float, E: Fn(F, F)>(n: u32, f: E) {
+    float_edge_cases!(F, case0, {
+        float_edge_cases!(F, case1, {
+            f(case0, case1);
+        });
+    });
+
+    // random fuzzer
+    let mut rng = Xoshiro128StarStar::seed_from_u64(0);
+    let mut x = F::ZERO;
+    let mut y = F::ZERO;
+    for _ in 0..n {
+        fuzz_float_step(&mut rng, &mut x);
+        fuzz_float_step(&mut rng, &mut y);
+        f(x, y)
+    }
+}
+
+/// Perform an operation using builtin types if available, falling back to apfloat if not.
+#[macro_export]
+macro_rules! apfloat_fallback {
+    (
+        $float_ty:ty,
+        // Type name in `rustc_apfloat::ieee`. Not a full path, it automatically gets the prefix.
+        $apfloat_ty:ident,
+        // Cfg expression for when builtin system operations should be used
+        $sys_available:meta,
+        // The expression to run. This expression may use `FloatTy` for its signature.
+        // Optionally, the final conversion back to a float can be suppressed using
+        // `=> no_convert` (for e.g. operations that return a bool).
+        //
+        // If the apfloat needs a different operation, it can be provided here.
+        $op:expr $(=> $convert:ident)? $(; $apfloat_op:expr)?,
+        // Arguments that get passed to `$op` after converting to a float
+        $($arg:expr),+
+        $(,)?
+    ) => {{
+        #[cfg($sys_available)]
+        let ret = {
+            type FloatTy = $float_ty;
+            $op( $($arg),+ )
+        };
+
+        #[cfg(not($sys_available))]
+        let ret = {
+            use rustc_apfloat::Float;
+            type FloatTy = rustc_apfloat::ieee::$apfloat_ty;
+
+            apfloat_fallback!(@inner
+                fty: $float_ty,
+                // Apply a conversion to `FloatTy` to each arg, then pass all args to `$op`
+                op_res: $op( $(FloatTy::from_bits($arg.to_bits().into())),+ ),
+                $(apfloat_op: $apfloat_op, )?
+                $(conv_opts: $convert,)?
+                args: $($arg),+
+            )
+        };
+
+        ret
+    }};
+
+    // Operations that do not need converting back to a float
+    (@inner fty: $float_ty:ty, op_res: $val:expr, conv_opts: no_convert, args: $($_arg:expr),+) => {
+        $val
+    };
+
+    // Some apfloat operations return a `StatusAnd` that we need to extract the value from. This
+    // is the default.
+    (@inner fty: $float_ty:ty, op_res: $val:expr, args: $($_arg:expr),+) => {{
+        // ignore the status, just get the value
+        let unwrapped = $val.value;
+
+        <$float_ty>::from_bits(FloatTy::to_bits(unwrapped).try_into().unwrap())
+    }};
+
+    // This is the case where we can't use the same expression for the default builtin and
+    // nonstandard apfloat fallback (e.g. `as` casts in std are normal functions in apfloat, so
+    // two separate expressions must be specified).
+    (@inner
+        fty: $float_ty:ty, op_res: $_val:expr,
+        apfloat_op: $apfloat_op:expr, args: $($arg:expr),+
+    ) => {{
+        $apfloat_op($($arg),+)
+    }};
+}
diff --git a/library/compiler-builtins/builtins-test/tests/addsub.rs b/library/compiler-builtins/builtins-test/tests/addsub.rs
new file mode 100644
index 00000000000..865b9e472ab
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/addsub.rs
@@ -0,0 +1,143 @@
+#![allow(unused_macros)]
+#![cfg_attr(f128_enabled, feature(f128))]
+
+use builtins_test::*;
+
+mod int_addsub {
+    use super::*;
+
+    macro_rules! sum {
+        ($($i:ty, $fn_add:ident, $fn_sub:ident);*;) => {
+            $(
+                #[test]
+                fn $fn_add() {
+                    use compiler_builtins::int::addsub::{$fn_add, $fn_sub};
+
+                    fuzz_2(N, |x: $i, y: $i| {
+                        let add0 = x.wrapping_add(y);
+                        let sub0 = x.wrapping_sub(y);
+                        let add1: $i = $fn_add(x, y);
+                        let sub1: $i = $fn_sub(x, y);
+                        if add0 != add1 {
+                            panic!(
+                                "{}({}, {}): std: {}, builtins: {}",
+                                stringify!($fn_add), x, y, add0, add1
+                            );
+                        }
+                        if sub0 != sub1 {
+                            panic!(
+                                "{}({}, {}): std: {}, builtins: {}",
+                                stringify!($fn_sub), x, y, sub0, sub1
+                            );
+                        }
+                    });
+                }
+            )*
+        };
+    }
+
+    macro_rules! overflowing_sum {
+        ($($i:ty, $fn_add:ident, $fn_sub:ident);*;) => {
+            $(
+                #[test]
+                fn $fn_add() {
+                    use compiler_builtins::int::addsub::{$fn_add, $fn_sub};
+
+                    fuzz_2(N, |x: $i, y: $i| {
+                        let (add0, add_o0)= x.overflowing_add(y);
+                        let (sub0, sub_o0)= x.overflowing_sub(y);
+                        let mut add_o1 = 0;
+                        let mut sub_o1 = 0;
+                        let add1: $i = $fn_add(x, y, &mut add_o1);
+                        let sub1: $i = $fn_sub(x, y, &mut sub_o1);
+                        if add0 != add1 || i32::from(add_o0) != add_o1 {
+                            panic!(
+                                "{}({}, {}): std: {:?}, builtins: {:?}",
+                                stringify!($fn_add), x, y, (add0, add_o0) , (add1, add_o1)
+                            );
+                        }
+                        if sub0 != sub1 || i32::from(sub_o0) != sub_o1 {
+                            panic!(
+                                "{}({}, {}): std: {:?}, builtins: {:?}",
+                                stringify!($fn_sub), x, y, (sub0, sub_o0) , (sub1, sub_o1)
+                            );
+                        }
+                    });
+                }
+            )*
+        };
+    }
+
+    // Integer addition and subtraction is very simple, so a modest number of fuzzing passes is plenty.
+    sum! {
+        u128, __rust_u128_add, __rust_u128_sub;
+        i128, __rust_i128_add, __rust_i128_sub;
+    }
+
+    overflowing_sum! {
+        u128, __rust_u128_addo, __rust_u128_subo;
+        i128, __rust_i128_addo, __rust_i128_subo;
+    }
+}
+
+macro_rules! float_sum {
+    ($($f:ty, $fn_add:ident, $fn_sub:ident, $apfloat_ty:ident, $sys_available:meta);*;) => {
+        $(
+            #[test]
+            fn $fn_add() {
+                use core::ops::{Add, Sub};
+                use compiler_builtins::float::{{add::$fn_add, sub::$fn_sub}, Float};
+
+                fuzz_float_2(N, |x: $f, y: $f| {
+                    let add0 = apfloat_fallback!($f, $apfloat_ty, $sys_available, Add::add, x, y);
+                    let sub0 = apfloat_fallback!($f, $apfloat_ty, $sys_available, Sub::sub, x, y);
+                    let add1: $f = $fn_add(x, y);
+                    let sub1: $f = $fn_sub(x, y);
+                    if !Float::eq_repr(add0, add1) {
+                        panic!(
+                            "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
+                            stringify!($fn_add), x, y, add0, add1
+                        );
+                    }
+                    if !Float::eq_repr(sub0, sub1) {
+                        panic!(
+                            "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
+                            stringify!($fn_sub), x, y, sub0, sub1
+                        );
+                    }
+                });
+            }
+        )*
+    }
+}
+
+#[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
+mod float_addsub {
+    use super::*;
+
+    float_sum! {
+        f32, __addsf3, __subsf3, Single, all();
+        f64, __adddf3, __subdf3, Double, all();
+    }
+}
+
+#[cfg(f128_enabled)]
+#[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
+#[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+mod float_addsub_f128 {
+    use super::*;
+
+    float_sum! {
+        f128, __addtf3, __subtf3, Quad, not(feature = "no-sys-f128");
+    }
+}
+
+#[cfg(f128_enabled)]
+#[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+mod float_addsub_f128_ppc {
+    use super::*;
+
+    float_sum! {
+        f128, __addkf3, __subkf3, Quad, not(feature = "no-sys-f128");
+    }
+}
diff --git a/library/compiler-builtins/builtins-test/tests/aeabi_memclr.rs b/library/compiler-builtins/builtins-test/tests/aeabi_memclr.rs
new file mode 100644
index 00000000000..bfd15a391aa
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/aeabi_memclr.rs
@@ -0,0 +1,60 @@
+#![cfg(all(
+    target_arch = "arm",
+    not(any(target_env = "gnu", target_env = "musl")),
+    target_os = "linux",
+    feature = "mem"
+))]
+#![feature(compiler_builtins_lib)]
+#![no_std]
+
+extern crate compiler_builtins;
+
+// test runner
+extern crate utest_cortex_m_qemu;
+
+// overrides `panic!`
+#[macro_use]
+extern crate utest_macros;
+
+use core::mem;
+
+macro_rules! panic {
+    ($($tt:tt)*) => {
+        upanic!($($tt)*);
+    };
+}
+
+extern "C" {
+    fn __aeabi_memclr4(dest: *mut u8, n: usize);
+    fn __aeabi_memset4(dest: *mut u8, n: usize, c: u32);
+}
+
+struct Aligned {
+    array: [u8; 8],
+    _alignment: [u32; 0],
+}
+
+impl Aligned {
+    fn new() -> Self {
+        Aligned {
+            array: [0; 8],
+            _alignment: [],
+        }
+    }
+}
+
+#[test]
+fn memclr4() {
+    let mut aligned = Aligned::new();
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+
+    for n in 0..9 {
+        unsafe {
+            __aeabi_memset4(xs.as_mut_ptr(), n, 0xff);
+            __aeabi_memclr4(xs.as_mut_ptr(), n);
+        }
+
+        assert!(xs[0..n].iter().all(|x| *x == 0));
+    }
+}
diff --git a/library/compiler-builtins/builtins-test/tests/aeabi_memcpy.rs b/library/compiler-builtins/builtins-test/tests/aeabi_memcpy.rs
new file mode 100644
index 00000000000..c892c5aba0f
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/aeabi_memcpy.rs
@@ -0,0 +1,71 @@
+#![cfg(all(
+    target_arch = "arm",
+    not(any(target_env = "gnu", target_env = "musl")),
+    target_os = "linux",
+    feature = "mem"
+))]
+#![feature(compiler_builtins_lib)]
+#![no_std]
+
+extern crate compiler_builtins;
+
+// test runner
+extern crate utest_cortex_m_qemu;
+
+// overrides `panic!`
+#[macro_use]
+extern crate utest_macros;
+
+macro_rules! panic {
+    ($($tt:tt)*) => {
+        upanic!($($tt)*);
+    };
+}
+
+extern "C" {
+    fn __aeabi_memcpy(dest: *mut u8, src: *const u8, n: usize);
+    fn __aeabi_memcpy4(dest: *mut u8, src: *const u8, n: usize);
+}
+
+struct Aligned {
+    array: [u8; 8],
+    _alignment: [u32; 0],
+}
+
+impl Aligned {
+    fn new(array: [u8; 8]) -> Self {
+        Aligned {
+            array: array,
+            _alignment: [],
+        }
+    }
+}
+
+#[test]
+fn memcpy() {
+    let mut dest = [0; 4];
+    let src = [0xde, 0xad, 0xbe, 0xef];
+
+    for n in 0..dest.len() {
+        dest.copy_from_slice(&[0; 4]);
+
+        unsafe { __aeabi_memcpy(dest.as_mut_ptr(), src.as_ptr(), n) }
+
+        assert_eq!(&dest[0..n], &src[0..n])
+    }
+}
+
+#[test]
+fn memcpy4() {
+    let mut aligned = Aligned::new([0; 8]);
+    let dest = &mut aligned.array;
+    let src = [0xde, 0xad, 0xbe, 0xef, 0xba, 0xad, 0xf0, 0x0d];
+
+    for n in 0..dest.len() {
+        dest.copy_from_slice(&[0; 8]);
+
+        unsafe { __aeabi_memcpy4(dest.as_mut_ptr(), src.as_ptr(), n) }
+
+        assert_eq!(&dest[0..n], &src[0..n])
+    }
+}
diff --git a/library/compiler-builtins/builtins-test/tests/aeabi_memset.rs b/library/compiler-builtins/builtins-test/tests/aeabi_memset.rs
new file mode 100644
index 00000000000..34ab3acc78c
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/aeabi_memset.rs
@@ -0,0 +1,240 @@
+#![cfg(all(
+    target_arch = "arm",
+    not(any(target_env = "gnu", target_env = "musl")),
+    target_os = "linux",
+    feature = "mem"
+))]
+#![feature(compiler_builtins_lib)]
+#![no_std]
+
+extern crate compiler_builtins;
+
+// test runner
+extern crate utest_cortex_m_qemu;
+
+// overrides `panic!`
+#[macro_use]
+extern crate utest_macros;
+
+use core::mem;
+
+macro_rules! panic {
+    ($($tt:tt)*) => {
+        upanic!($($tt)*);
+    };
+}
+
+extern "C" {
+    fn __aeabi_memset4(dest: *mut u8, n: usize, c: u32);
+}
+
+struct Aligned {
+    array: [u8; 8],
+    _alignment: [u32; 0],
+}
+
+impl Aligned {
+    fn new(array: [u8; 8]) -> Self {
+        Aligned {
+            array: array,
+            _alignment: [],
+        }
+    }
+}
+
+#[test]
+fn zero() {
+    let mut aligned = Aligned::new([0u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), 0, c) }
+
+    assert_eq!(*xs, [0; 8]);
+
+    let mut aligned = Aligned::new([1u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), 0, c) }
+
+    assert_eq!(*xs, [1; 8]);
+}
+
+#[test]
+fn one() {
+    let mut aligned = Aligned::new([0u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let n = 1;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0, 0, 0, 0, 0, 0, 0]);
+
+    let mut aligned = Aligned::new([1u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 1, 1, 1, 1, 1, 1, 1]);
+}
+
+#[test]
+fn two() {
+    let mut aligned = Aligned::new([0u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let n = 2;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 0, 0, 0, 0, 0, 0]);
+
+    let mut aligned = Aligned::new([1u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 1, 1, 1, 1, 1, 1]);
+}
+
+#[test]
+fn three() {
+    let mut aligned = Aligned::new([0u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let n = 3;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 0xef, 0, 0, 0, 0, 0]);
+
+    let mut aligned = Aligned::new([1u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 0xef, 1, 1, 1, 1, 1]);
+}
+
+#[test]
+fn four() {
+    let mut aligned = Aligned::new([0u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let n = 4;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 0, 0, 0, 0]);
+
+    let mut aligned = Aligned::new([1u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 1, 1, 1, 1]);
+}
+
+#[test]
+fn five() {
+    let mut aligned = Aligned::new([0u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let n = 5;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 0xef, 0, 0, 0]);
+
+    let mut aligned = Aligned::new([1u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 0xef, 1, 1, 1]);
+}
+
+#[test]
+fn six() {
+    let mut aligned = Aligned::new([0u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let n = 6;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 0, 0]);
+
+    let mut aligned = Aligned::new([1u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 1, 1]);
+}
+
+#[test]
+fn seven() {
+    let mut aligned = Aligned::new([0u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let n = 7;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 0]);
+
+    let mut aligned = Aligned::new([1u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 1]);
+}
+
+#[test]
+fn eight() {
+    let mut aligned = Aligned::new([0u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let n = 8;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 0xef]);
+
+    let mut aligned = Aligned::new([1u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 0xef]);
+}
diff --git a/library/compiler-builtins/builtins-test/tests/big.rs b/library/compiler-builtins/builtins-test/tests/big.rs
new file mode 100644
index 00000000000..d1ae88bd164
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/big.rs
@@ -0,0 +1,134 @@
+use compiler_builtins::int::{HInt, MinInt, i256, u256};
+
+const LOHI_SPLIT: u128 = 0xaaaaaaaaaaaaaaaaffffffffffffffff;
+
+/// Print a `u256` as hex since we can't add format implementations
+fn hexu(v: u256) -> String {
+    format!(
+        "0x{:016x}{:016x}{:016x}{:016x}",
+        v.0[3], v.0[2], v.0[1], v.0[0]
+    )
+}
+
+#[test]
+fn widen_u128() {
+    assert_eq!(u128::MAX.widen(), u256([u64::MAX, u64::MAX, 0, 0]));
+    assert_eq!(
+        LOHI_SPLIT.widen(),
+        u256([u64::MAX, 0xaaaaaaaaaaaaaaaa, 0, 0])
+    );
+}
+
+#[test]
+fn widen_i128() {
+    assert_eq!((-1i128).widen(), u256::MAX.signed());
+    assert_eq!(
+        (LOHI_SPLIT as i128).widen(),
+        i256([u64::MAX, 0xaaaaaaaaaaaaaaaa, u64::MAX, u64::MAX])
+    );
+    assert_eq!((-1i128).zero_widen().unsigned(), (u128::MAX).widen());
+}
+
+#[test]
+fn widen_mul_u128() {
+    let tests = [
+        (u128::MAX / 2, 2_u128, u256([u64::MAX - 1, u64::MAX, 0, 0])),
+        (u128::MAX, 2_u128, u256([u64::MAX - 1, u64::MAX, 1, 0])),
+        (u128::MAX, u128::MAX, u256([1, 0, u64::MAX - 1, u64::MAX])),
+        (u128::MIN, u128::MIN, u256::ZERO),
+        (1234, 0, u256::ZERO),
+        (0, 1234, u256::ZERO),
+    ];
+
+    let mut errors = Vec::new();
+    for (i, (a, b, exp)) in tests.iter().copied().enumerate() {
+        let res = a.widen_mul(b);
+        let res_z = a.zero_widen_mul(b);
+        assert_eq!(res, res_z);
+        if res != exp {
+            errors.push((i, a, b, exp, res));
+        }
+    }
+
+    for (i, a, b, exp, res) in &errors {
+        eprintln!(
+            "FAILURE ({i}): {a:#034x} * {b:#034x} = {} got {}",
+            hexu(*exp),
+            hexu(*res)
+        );
+    }
+    assert!(errors.is_empty());
+}
+
+#[test]
+fn not_u128() {
+    assert_eq!(!u256::ZERO, u256::MAX);
+}
+
+#[test]
+fn shr_u128() {
+    let only_low = [
+        1,
+        u16::MAX.into(),
+        u32::MAX.into(),
+        u64::MAX.into(),
+        u128::MAX,
+    ];
+
+    let mut errors = Vec::new();
+
+    for a in only_low {
+        for perturb in 0..10 {
+            let a = a.saturating_add(perturb);
+            for shift in 0..128 {
+                let res = a.widen() >> shift;
+                let expected = (a >> shift).widen();
+                if res != expected {
+                    errors.push((a.widen(), shift, res, expected));
+                }
+            }
+        }
+    }
+
+    let check = [
+        (
+            u256::MAX,
+            1,
+            u256([u64::MAX, u64::MAX, u64::MAX, u64::MAX >> 1]),
+        ),
+        (
+            u256::MAX,
+            5,
+            u256([u64::MAX, u64::MAX, u64::MAX, u64::MAX >> 5]),
+        ),
+        (u256::MAX, 63, u256([u64::MAX, u64::MAX, u64::MAX, 1])),
+        (u256::MAX, 64, u256([u64::MAX, u64::MAX, u64::MAX, 0])),
+        (u256::MAX, 65, u256([u64::MAX, u64::MAX, u64::MAX >> 1, 0])),
+        (u256::MAX, 127, u256([u64::MAX, u64::MAX, 1, 0])),
+        (u256::MAX, 128, u256([u64::MAX, u64::MAX, 0, 0])),
+        (u256::MAX, 129, u256([u64::MAX, u64::MAX >> 1, 0, 0])),
+        (u256::MAX, 191, u256([u64::MAX, 1, 0, 0])),
+        (u256::MAX, 192, u256([u64::MAX, 0, 0, 0])),
+        (u256::MAX, 193, u256([u64::MAX >> 1, 0, 0, 0])),
+        (u256::MAX, 191, u256([u64::MAX, 1, 0, 0])),
+        (u256::MAX, 254, u256([0b11, 0, 0, 0])),
+        (u256::MAX, 255, u256([1, 0, 0, 0])),
+    ];
+
+    for (input, shift, expected) in check {
+        let res = input >> shift;
+        if res != expected {
+            errors.push((input, shift, res, expected));
+        }
+    }
+
+    for (a, b, res, expected) in &errors {
+        eprintln!(
+            "FAILURE: {} >> {b} = {} got {}",
+            hexu(*a),
+            hexu(*expected),
+            hexu(*res),
+        );
+    }
+    assert!(errors.is_empty());
+}
diff --git a/library/compiler-builtins/builtins-test/tests/cmp.rs b/library/compiler-builtins/builtins-test/tests/cmp.rs
new file mode 100644
index 00000000000..dbedd213e90
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/cmp.rs
@@ -0,0 +1,185 @@
+#![allow(unused_macros)]
+#![allow(unreachable_code)]
+#![cfg_attr(f128_enabled, feature(f128))]
+
+use builtins_test::*;
+
+mod float_comparisons {
+    use super::*;
+
+    macro_rules! cmp {
+        (
+            $f:ty, $x:ident, $y:ident, $apfloat_ty:ident, $sys_available:meta,
+            $($unordered_val:expr, $fn:ident);*;
+        ) => {
+            $(
+                let cmp0 = if apfloat_fallback!(
+                        $f, $apfloat_ty, $sys_available,
+                        |x: FloatTy| x.is_nan() => no_convert,
+                        $x
+                    ) || apfloat_fallback!(
+                        $f, $apfloat_ty, $sys_available,
+                        |y: FloatTy| y.is_nan() => no_convert,
+                        $y
+                    )
+                {
+                    $unordered_val
+                } else if apfloat_fallback!(
+                    $f, $apfloat_ty, $sys_available,
+                    |x, y| x < y => no_convert,
+                    $x, $y
+                ) {
+                    -1
+                } else if apfloat_fallback!(
+                    $f, $apfloat_ty, $sys_available,
+                    |x, y| x == y => no_convert,
+                    $x, $y
+                ) {
+                    0
+                } else {
+                    1
+                };
+
+                let cmp1 = $fn($x, $y);
+                if cmp0 != cmp1 {
+                    panic!(
+                        "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
+                        stringify!($fn), $x, $y, cmp0, cmp1
+                    );
+                }
+            )*
+        };
+    }
+
+    #[test]
+    fn cmp_f32() {
+        use compiler_builtins::float::cmp::{
+            __eqsf2, __gesf2, __gtsf2, __lesf2, __ltsf2, __nesf2, __unordsf2,
+        };
+
+        fuzz_float_2(N, |x: f32, y: f32| {
+            assert_eq!(__unordsf2(x, y) != 0, x.is_nan() || y.is_nan());
+            cmp!(f32, x, y, Single, all(),
+                1, __ltsf2;
+                1, __lesf2;
+                1, __eqsf2;
+                -1, __gesf2;
+                -1, __gtsf2;
+                1, __nesf2;
+            );
+        });
+    }
+
+    #[test]
+    fn cmp_f64() {
+        use compiler_builtins::float::cmp::{
+            __eqdf2, __gedf2, __gtdf2, __ledf2, __ltdf2, __nedf2, __unorddf2,
+        };
+
+        fuzz_float_2(N, |x: f64, y: f64| {
+            assert_eq!(__unorddf2(x, y) != 0, x.is_nan() || y.is_nan());
+            cmp!(f64, x, y, Double, all(),
+                1, __ltdf2;
+                1, __ledf2;
+                1, __eqdf2;
+                -1, __gedf2;
+                -1, __gtdf2;
+                1, __nedf2;
+            );
+        });
+    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn cmp_f128() {
+        #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+        use compiler_builtins::float::cmp::{
+            __eqkf2 as __eqtf2, __gekf2 as __getf2, __gtkf2 as __gttf2, __lekf2 as __letf2,
+            __ltkf2 as __lttf2, __nekf2 as __netf2, __unordkf2 as __unordtf2,
+        };
+
+        #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+        use compiler_builtins::float::cmp::{
+            __eqtf2, __getf2, __gttf2, __letf2, __lttf2, __netf2, __unordtf2,
+        };
+
+        fuzz_float_2(N, |x: f128, y: f128| {
+            let x_is_nan = apfloat_fallback!(
+                f128, Quad, not(feature = "no-sys-f128"),
+                |x: FloatTy| x.is_nan() => no_convert,
+                x
+            );
+            let y_is_nan = apfloat_fallback!(
+                f128, Quad, not(feature = "no-sys-f128"),
+                |x: FloatTy| x.is_nan() => no_convert,
+                y
+            );
+
+            assert_eq!(__unordtf2(x, y) != 0, x_is_nan || y_is_nan);
+
+            cmp!(f128, x, y, Quad, not(feature = "no-sys-f128"),
+                1, __lttf2;
+                1, __letf2;
+                1, __eqtf2;
+                -1, __getf2;
+                -1, __gttf2;
+                1, __netf2;
+            );
+        });
+    }
+}
+
+#[cfg(target_arch = "arm")]
+mod float_comparisons_arm {
+    use super::*;
+
+    macro_rules! cmp2 {
+        ($x:ident, $y:ident, $($unordered_val:expr, $fn_std:expr, $fn_builtins:ident);*;) => {
+            $(
+                let cmp0: i32 = if $x.is_nan() || $y.is_nan() {
+                    $unordered_val
+                } else {
+                    $fn_std as i32
+                };
+                let cmp1: i32 = $fn_builtins($x, $y);
+                if cmp0 != cmp1 {
+                    panic!("{}({}, {}): std: {}, builtins: {}", stringify!($fn_builtins), $x, $y, cmp0, cmp1);
+                }
+            )*
+        };
+    }
+
+    #[test]
+    fn cmp_f32() {
+        use compiler_builtins::float::cmp::{
+            __aeabi_fcmpeq, __aeabi_fcmpge, __aeabi_fcmpgt, __aeabi_fcmple, __aeabi_fcmplt,
+        };
+
+        fuzz_float_2(N, |x: f32, y: f32| {
+            cmp2!(x, y,
+                0, x < y, __aeabi_fcmplt;
+                0, x <= y, __aeabi_fcmple;
+                0, x == y, __aeabi_fcmpeq;
+                0, x >= y, __aeabi_fcmpge;
+                0, x > y, __aeabi_fcmpgt;
+            );
+        });
+    }
+
+    #[test]
+    fn cmp_f64() {
+        use compiler_builtins::float::cmp::{
+            __aeabi_dcmpeq, __aeabi_dcmpge, __aeabi_dcmpgt, __aeabi_dcmple, __aeabi_dcmplt,
+        };
+
+        fuzz_float_2(N, |x: f64, y: f64| {
+            cmp2!(x, y,
+                0, x < y, __aeabi_dcmplt;
+                0, x <= y, __aeabi_dcmple;
+                0, x == y, __aeabi_dcmpeq;
+                0, x >= y, __aeabi_dcmpge;
+                0, x > y, __aeabi_dcmpgt;
+            );
+        });
+    }
+}
diff --git a/library/compiler-builtins/builtins-test/tests/conv.rs b/library/compiler-builtins/builtins-test/tests/conv.rs
new file mode 100644
index 00000000000..491915d9bb1
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/conv.rs
@@ -0,0 +1,364 @@
+#![cfg_attr(f128_enabled, feature(f128))]
+#![cfg_attr(f16_enabled, feature(f16))]
+// makes configuration easier
+#![allow(unused_macros)]
+#![allow(unused_imports)]
+
+use builtins_test::*;
+use compiler_builtins::float::Float;
+use rustc_apfloat::{Float as _, FloatConvert as _};
+
+mod i_to_f {
+    use super::*;
+
+    macro_rules! i_to_f {
+        ($f_ty:ty, $apfloat_ty:ident, $sys_available:meta, $($i_ty:ty, $fn:ident);*;) => {
+            $(
+                #[test]
+                fn $fn() {
+                    use compiler_builtins::float::conv::$fn;
+                    use compiler_builtins::int::Int;
+
+                    fuzz(N, |x: $i_ty| {
+                        let f0 = apfloat_fallback!(
+                            $f_ty, $apfloat_ty, $sys_available,
+                            |x| x as $f_ty;
+                            // When the builtin is not available, we need to use a different conversion
+                            // method (since apfloat doesn't support `as` casting).
+                            |x: $i_ty| {
+                                use compiler_builtins::int::MinInt;
+
+                                let apf = if <$i_ty>::SIGNED {
+                                    FloatTy::from_i128(x.try_into().unwrap()).value
+                                } else {
+                                    FloatTy::from_u128(x.try_into().unwrap()).value
+                                };
+
+                                <$f_ty>::from_bits(apf.to_bits())
+                            },
+                            x
+                        );
+                        let f1: $f_ty = $fn(x);
+
+                        #[cfg($sys_available)] {
+                            // This makes sure that the conversion produced the best rounding possible, and does
+                            // this independent of `x as $into` rounding correctly.
+                            // This assumes that float to integer conversion is correct.
+                            let y_minus_ulp = <$f_ty>::from_bits(f1.to_bits().wrapping_sub(1)) as $i_ty;
+                            let y = f1 as $i_ty;
+                            let y_plus_ulp = <$f_ty>::from_bits(f1.to_bits().wrapping_add(1)) as $i_ty;
+                            let error_minus = <$i_ty as Int>::abs_diff(y_minus_ulp, x);
+                            let error = <$i_ty as Int>::abs_diff(y, x);
+                            let error_plus = <$i_ty as Int>::abs_diff(y_plus_ulp, x);
+
+                            // The first two conditions check that none of the two closest float values are
+                            // strictly closer in representation to `x`. The second makes sure that rounding is
+                            // towards even significand if two float values are equally close to the integer.
+                            if error_minus < error
+                                || error_plus < error
+                                || ((error_minus == error || error_plus == error)
+                                    && ((f0.to_bits() & 1) != 0))
+                            {
+                                if !cfg!(any(
+                                    target_arch = "powerpc",
+                                    target_arch = "powerpc64"
+                                )) {
+                                    panic!(
+                                        "incorrect rounding by {}({}): {}, ({}, {}, {}), errors ({}, {}, {})",
+                                        stringify!($fn),
+                                        x,
+                                        f1.to_bits(),
+                                        y_minus_ulp,
+                                        y,
+                                        y_plus_ulp,
+                                        error_minus,
+                                        error,
+                                        error_plus,
+                                    );
+                                }
+                            }
+                        }
+
+                        // Test against native conversion. We disable testing on all `x86` because of
+                        // rounding bugs with `i686`. `powerpc` also has the same rounding bug.
+                        if !Float::eq_repr(f0, f1) && !cfg!(any(
+                            target_arch = "x86",
+                            target_arch = "powerpc",
+                            target_arch = "powerpc64"
+                        )) {
+                            panic!(
+                                "{}({}): std: {:?}, builtins: {:?}",
+                                stringify!($fn),
+                                x,
+                                f0,
+                                f1,
+                            );
+                        }
+                    });
+                }
+            )*
+        };
+    }
+
+    i_to_f! { f32, Single, all(),
+        u32, __floatunsisf;
+        i32, __floatsisf;
+        u64, __floatundisf;
+        i64, __floatdisf;
+        u128, __floatuntisf;
+        i128, __floattisf;
+    }
+
+    i_to_f! { f64, Double, all(),
+        u32, __floatunsidf;
+        i32, __floatsidf;
+        u64, __floatundidf;
+        i64, __floatdidf;
+        u128, __floatuntidf;
+        i128, __floattidf;
+    }
+
+    #[cfg(not(feature = "no-f16-f128"))]
+    #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+    i_to_f! { f128, Quad, not(feature = "no-sys-f128-int-convert"),
+        u32, __floatunsitf;
+        i32, __floatsitf;
+        u64, __floatunditf;
+        i64, __floatditf;
+        u128, __floatuntitf;
+        i128, __floattitf;
+    }
+
+    #[cfg(not(feature = "no-f16-f128"))]
+    #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+    i_to_f! { f128, Quad, not(feature = "no-sys-f128-int-convert"),
+        u32, __floatunsikf;
+        i32, __floatsikf;
+        u64, __floatundikf;
+        i64, __floatdikf;
+        u128, __floatuntikf;
+        i128, __floattikf;
+    }
+}
+
+mod f_to_i {
+    use super::*;
+
+    macro_rules! f_to_i {
+        ($x:ident, $f_ty:ty, $apfloat_ty:ident, $sys_available:meta, $($i_ty:ty, $fn:ident);*;) => {
+            $(
+                // it is undefined behavior in the first place to do conversions with NaNs
+                if !apfloat_fallback!(
+                    $f_ty, $apfloat_ty, $sys_available, |x: FloatTy| x.is_nan() => no_convert, $x
+                ) {
+                    let conv0 = apfloat_fallback!(
+                        $f_ty, $apfloat_ty, $sys_available,
+                        // Use an `as` cast when the builtin is available on the system.
+                        |x| x as $i_ty;
+                        // When the builtin is not available, we need to use a different conversion
+                        // method (since apfloat doesn't support `as` casting).
+                        |x: $f_ty| {
+                            use compiler_builtins::int::MinInt;
+
+                            let apf = FloatTy::from_bits(x.to_bits().into());
+                            let bits: usize = <$i_ty>::BITS.try_into().unwrap();
+
+                            let err_fn = || panic!(
+                                "Unable to convert value {x:?} to type {}:", stringify!($i_ty)
+                            );
+
+                            if <$i_ty>::SIGNED {
+                               <$i_ty>::try_from(apf.to_i128(bits).value).ok().unwrap_or_else(err_fn)
+                            } else {
+                               <$i_ty>::try_from(apf.to_u128(bits).value).ok().unwrap_or_else(err_fn)
+                            }
+                        },
+                        $x
+                    );
+                    let conv1: $i_ty = $fn($x);
+                    if conv0 != conv1 {
+                        panic!("{}({:?}): std: {:?}, builtins: {:?}", stringify!($fn), $x, conv0, conv1);
+                    }
+                }
+            )*
+        };
+    }
+
+    #[test]
+    fn f32_to_int() {
+        use compiler_builtins::float::conv::{
+            __fixsfdi, __fixsfsi, __fixsfti, __fixunssfdi, __fixunssfsi, __fixunssfti,
+        };
+
+        fuzz_float(N, |x: f32| {
+            f_to_i!(x, f32, Single, all(),
+                u32, __fixunssfsi;
+                u64, __fixunssfdi;
+                u128, __fixunssfti;
+                i32, __fixsfsi;
+                i64, __fixsfdi;
+                i128, __fixsfti;
+            );
+        });
+    }
+
+    #[test]
+    fn f64_to_int() {
+        use compiler_builtins::float::conv::{
+            __fixdfdi, __fixdfsi, __fixdfti, __fixunsdfdi, __fixunsdfsi, __fixunsdfti,
+        };
+
+        fuzz_float(N, |x: f64| {
+            f_to_i!(x, f64, Double, all(),
+                u32, __fixunsdfsi;
+                u64, __fixunsdfdi;
+                u128, __fixunsdfti;
+                i32, __fixdfsi;
+                i64, __fixdfdi;
+                i128, __fixdfti;
+            );
+        });
+    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn f128_to_int() {
+        #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+        use compiler_builtins::float::conv::{
+            __fixkfdi as __fixtfdi, __fixkfsi as __fixtfsi, __fixkfti as __fixtfti,
+            __fixunskfdi as __fixunstfdi, __fixunskfsi as __fixunstfsi,
+            __fixunskfti as __fixunstfti,
+        };
+        #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+        use compiler_builtins::float::conv::{
+            __fixtfdi, __fixtfsi, __fixtfti, __fixunstfdi, __fixunstfsi, __fixunstfti,
+        };
+
+        fuzz_float(N, |x: f128| {
+            f_to_i!(
+                x,
+                f128,
+                Quad,
+                not(feature = "no-sys-f128-int-convert"),
+                u32, __fixunstfsi;
+                u64, __fixunstfdi;
+                u128, __fixunstfti;
+                i32, __fixtfsi;
+                i64, __fixtfdi;
+                i128, __fixtfti;
+            );
+        });
+    }
+}
+
+macro_rules! f_to_f {
+    (
+        $mod:ident,
+        $(
+            $from_ty:ty => $to_ty:ty,
+            $from_ap_ty:ident => $to_ap_ty:ident,
+            $fn:ident, $sys_available:meta
+        );+;
+    ) => {$(
+        #[test]
+        fn $fn() {
+            use compiler_builtins::float::{$mod::$fn, Float};
+            use rustc_apfloat::ieee::{$from_ap_ty, $to_ap_ty};
+
+            fuzz_float(N, |x: $from_ty| {
+                let tmp0: $to_ty = apfloat_fallback!(
+                    $from_ty,
+                    $from_ap_ty,
+                    $sys_available,
+                    |x: $from_ty| x as $to_ty;
+                    |x: $from_ty| {
+                        let from_apf = FloatTy::from_bits(x.to_bits().into());
+                        // Get `value` directly to ignore INVALID_OP
+                        let to_apf: $to_ap_ty = from_apf.convert(&mut false).value;
+                        <$to_ty>::from_bits(to_apf.to_bits().try_into().unwrap())
+                    },
+                    x
+                );
+                let tmp1: $to_ty = $fn(x);
+
+                if !Float::eq_repr(tmp0, tmp1) {
+                    panic!(
+                        "{}({:?}): std: {:?}, builtins: {:?}",
+                        stringify!($fn),
+                        x,
+                        tmp0,
+                        tmp1
+                    );
+                }
+            })
+        }
+    )+};
+}
+
+mod extend {
+    use super::*;
+
+    f_to_f! {
+        extend,
+        f32 => f64, Single => Double, __extendsfdf2, all();
+    }
+
+    #[cfg(all(f16_enabled, f128_enabled))]
+    #[cfg(not(any(
+        target_arch = "powerpc",
+        target_arch = "powerpc64",
+        target_arch = "loongarch64"
+    )))]
+    f_to_f! {
+        extend,
+        f16 => f32, Half => Single, __extendhfsf2, not(feature = "no-sys-f16");
+        f16 => f32, Half => Single, __gnu_h2f_ieee, not(feature = "no-sys-f16");
+        f16 => f64, Half => Double, __extendhfdf2, not(feature = "no-sys-f16-f64-convert");
+        f16 => f128, Half => Quad, __extendhftf2, not(feature = "no-sys-f16-f128-convert");
+        f32 => f128, Single => Quad, __extendsftf2, not(feature = "no-sys-f128");
+        f64 => f128, Double => Quad, __extenddftf2, not(feature = "no-sys-f128");
+    }
+
+    #[cfg(f128_enabled)]
+    #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+    f_to_f! {
+        extend,
+        // FIXME(#655): `f16` tests disabled until we can bootstrap symbols
+        f32 => f128, Single => Quad, __extendsfkf2, not(feature = "no-sys-f128");
+        f64 => f128, Double => Quad, __extenddfkf2, not(feature = "no-sys-f128");
+    }
+}
+
+mod trunc {
+    use super::*;
+
+    f_to_f! {
+        trunc,
+        f64 => f32, Double => Single, __truncdfsf2, all();
+    }
+
+    #[cfg(all(f16_enabled, f128_enabled))]
+    #[cfg(not(any(
+        target_arch = "powerpc",
+        target_arch = "powerpc64",
+        target_arch = "loongarch64"
+    )))]
+    f_to_f! {
+        trunc,
+        f32 => f16, Single => Half, __truncsfhf2, not(feature = "no-sys-f16");
+        f32 => f16, Single => Half, __gnu_f2h_ieee, not(feature = "no-sys-f16");
+        f64 => f16, Double => Half, __truncdfhf2, not(feature = "no-sys-f16-f64-convert");
+        f128 => f16, Quad => Half, __trunctfhf2, not(feature = "no-sys-f16-f128-convert");
+        f128 => f32, Quad => Single, __trunctfsf2, not(feature = "no-sys-f128");
+        f128 => f64, Quad => Double, __trunctfdf2, not(feature = "no-sys-f128");
+    }
+
+    #[cfg(f128_enabled)]
+    #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+    f_to_f! {
+        trunc,
+        // FIXME(#655): `f16` tests disabled until we can bootstrap symbols
+        f128 => f32, Quad => Single, __trunckfsf2, not(feature = "no-sys-f128");
+        f128 => f64, Quad => Double, __trunckfdf2, not(feature = "no-sys-f128");
+    }
+}
diff --git a/library/compiler-builtins/builtins-test/tests/div_rem.rs b/library/compiler-builtins/builtins-test/tests/div_rem.rs
new file mode 100644
index 00000000000..6c0280a3286
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/div_rem.rs
@@ -0,0 +1,165 @@
+#![feature(f128)]
+#![allow(unused_macros)]
+
+use compiler_builtins::int::sdiv::{__divmoddi4, __divmodsi4, __divmodti4};
+use compiler_builtins::int::udiv::{__udivmoddi4, __udivmodsi4, __udivmodti4, u128_divide_sparc};
+
+use builtins_test::*;
+
+// Division algorithms have by far the nastiest and largest number of edge cases, and experience shows
+// that sometimes 100_000 iterations of the random fuzzer is needed.
+
+/// Creates intensive test functions for division functions of a certain size
+macro_rules! test {
+    (
+        $n:expr, // the number of bits in a $iX or $uX
+        $uX:ident, // unsigned integer that will be shifted
+        $iX:ident, // signed version of $uX
+        $test_name:ident, // name of the test function
+        $unsigned_name:ident, // unsigned division function
+        $signed_name:ident // signed division function
+    ) => {
+        #[test]
+        fn $test_name() {
+            fuzz_2(N, |lhs, rhs| {
+                if rhs == 0 {
+                    return;
+                }
+
+                let mut rem: $uX = 0;
+                let quo: $uX = $unsigned_name(lhs, rhs, Some(&mut rem));
+                if rhs <= rem || (lhs != rhs.wrapping_mul(quo).wrapping_add(rem)) {
+                    panic!(
+                        "unsigned division function failed with lhs:{} rhs:{} \
+                        std:({}, {}) builtins:({}, {})",
+                        lhs,
+                        rhs,
+                        lhs.wrapping_div(rhs),
+                        lhs.wrapping_rem(rhs),
+                        quo,
+                        rem
+                    );
+                }
+
+                // test the signed division function also
+                let lhs = lhs as $iX;
+                let rhs = rhs as $iX;
+                let mut rem: $iX = 0;
+                let quo: $iX = $signed_name(lhs, rhs, &mut rem);
+                // We cannot just test that
+                // `lhs == rhs.wrapping_mul(quo).wrapping_add(rem)`, but also
+                // need to make sure the remainder isn't larger than the divisor
+                // and has the correct sign.
+                let incorrect_rem = if rem == 0 {
+                    false
+                } else if rhs == $iX::MIN {
+                    // `rhs.wrapping_abs()` would overflow, so handle this case
+                    // separately.
+                    (lhs.is_negative() != rem.is_negative()) || (rem == $iX::MIN)
+                } else {
+                    (lhs.is_negative() != rem.is_negative())
+                        || (rhs.wrapping_abs() <= rem.wrapping_abs())
+                };
+                if incorrect_rem || lhs != rhs.wrapping_mul(quo).wrapping_add(rem) {
+                    panic!(
+                        "signed division function failed with lhs:{} rhs:{} \
+                        std:({}, {}) builtins:({}, {})",
+                        lhs,
+                        rhs,
+                        lhs.wrapping_div(rhs),
+                        lhs.wrapping_rem(rhs),
+                        quo,
+                        rem
+                    );
+                }
+            });
+        }
+    };
+}
+
+test!(32, u32, i32, div_rem_si4, __udivmodsi4, __divmodsi4);
+test!(64, u64, i64, div_rem_di4, __udivmoddi4, __divmoddi4);
+test!(128, u128, i128, div_rem_ti4, __udivmodti4, __divmodti4);
+
+#[test]
+fn divide_sparc() {
+    fuzz_2(N, |lhs, rhs| {
+        if rhs == 0 {
+            return;
+        }
+
+        let mut rem: u128 = 0;
+        let quo: u128 = u128_divide_sparc(lhs, rhs, &mut rem);
+        if rhs <= rem || (lhs != rhs.wrapping_mul(quo).wrapping_add(rem)) {
+            panic!(
+                "u128_divide_sparc({}, {}): \
+                std:({}, {}), builtins:({}, {})",
+                lhs,
+                rhs,
+                lhs.wrapping_div(rhs),
+                lhs.wrapping_rem(rhs),
+                quo,
+                rem
+            );
+        }
+    });
+}
+
+macro_rules! float {
+    ($($f:ty, $fn:ident, $apfloat_ty:ident, $sys_available:meta);*;) => {
+        $(
+            #[test]
+            fn $fn() {
+                use compiler_builtins::float::{div::$fn, Float};
+                use core::ops::Div;
+
+                fuzz_float_2(N, |x: $f, y: $f| {
+                    let quo0: $f = apfloat_fallback!($f, $apfloat_ty, $sys_available, Div::div, x, y);
+                    let quo1: $f = $fn(x, y);
+
+                    // ARM SIMD instructions always flush subnormals to zero
+                    if cfg!(target_arch = "arm") &&
+                        ((Float::is_subnormal(quo0)) || Float::is_subnormal(quo1)) {
+                        return;
+                    }
+
+                    if !Float::eq_repr(quo0, quo1) {
+                        panic!(
+                            "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
+                            stringify!($fn),
+                            x,
+                            y,
+                            quo0,
+                            quo1
+                        );
+                    }
+                });
+            }
+        )*
+    };
+}
+
+#[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
+mod float_div {
+    use super::*;
+
+    float! {
+        f32, __divsf3, Single, all();
+        f64, __divdf3, Double, all();
+    }
+
+    #[cfg(not(feature = "no-f16-f128"))]
+    #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+    float! {
+        f128, __divtf3, Quad,
+        // FIXME(llvm): there is a bug in LLVM rt.
+        // See <https://github.com/llvm/llvm-project/issues/91840>.
+        not(any(feature = "no-sys-f128", all(target_arch = "aarch64", target_os = "linux")));
+    }
+
+    #[cfg(not(feature = "no-f16-f128"))]
+    #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+    float! {
+        f128, __divkf3, Quad, not(feature = "no-sys-f128");
+    }
+}
diff --git a/library/compiler-builtins/builtins-test/tests/float_pow.rs b/library/compiler-builtins/builtins-test/tests/float_pow.rs
new file mode 100644
index 00000000000..8209543e666
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/float_pow.rs
@@ -0,0 +1,72 @@
+#![allow(unused_macros)]
+#![cfg_attr(f128_enabled, feature(f128))]
+#![cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
+
+use builtins_test::*;
+
+// This is approximate because of issues related to
+// https://github.com/rust-lang/rust/issues/73920.
+// TODO how do we resolve this indeterminacy?
+macro_rules! pow {
+    ($($f:ty, $tolerance:expr, $fn:ident, $sys_available:meta);*;) => {
+        $(
+            #[test]
+            // FIXME(apfloat): We skip tests if system symbols aren't available rather
+            // than providing a fallback, since `rustc_apfloat` does not provide `pow`.
+            #[cfg($sys_available)]
+            fn $fn() {
+                use compiler_builtins::float::pow::$fn;
+                use compiler_builtins::float::Float;
+                fuzz_float_2(N, |x: $f, y: $f| {
+                    if !(Float::is_subnormal(x) || Float::is_subnormal(y) || x.is_nan()) {
+                        let n = y.to_bits() & !<$f as Float>::SIG_MASK;
+                        let n = (n as <$f as Float>::SignedInt) >> <$f as Float>::SIG_BITS;
+                        let n = n as i32;
+                        let tmp0: $f = x.powi(n);
+                        let tmp1: $f = $fn(x, n);
+                        let (a, b) = if tmp0 < tmp1 {
+                            (tmp0, tmp1)
+                        } else {
+                            (tmp1, tmp0)
+                        };
+
+                        let good = if a == b {
+                            // handles infinity equality
+                            true
+                        } else if a < $tolerance {
+                            b < $tolerance
+                        } else {
+                            let quo = b / a;
+                            (quo < (1. + $tolerance)) && (quo > (1. - $tolerance))
+                        };
+
+                        assert!(
+                            good,
+                            "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
+                            stringify!($fn), x, n, tmp0, tmp1
+                        );
+                    }
+                });
+            }
+        )*
+    };
+}
+
+pow! {
+    f32, 1e-4, __powisf2, all();
+    f64, 1e-12, __powidf2, all();
+}
+
+#[cfg(f128_enabled)]
+// FIXME(f16_f128): MSVC cannot build these until `__divtf3` is available in nightly.
+#[cfg(not(target_env = "msvc"))]
+#[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+pow! {
+    f128, 1e-36, __powitf2, not(feature = "no-sys-f128");
+}
+
+#[cfg(f128_enabled)]
+#[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+pow! {
+    f128, 1e-36, __powikf2, not(feature = "no-sys-f128");
+}
diff --git a/library/compiler-builtins/builtins-test/tests/lse.rs b/library/compiler-builtins/builtins-test/tests/lse.rs
new file mode 100644
index 00000000000..53167d98fc0
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/lse.rs
@@ -0,0 +1,97 @@
+#![feature(decl_macro)] // so we can use pub(super)
+#![cfg(all(target_arch = "aarch64", target_os = "linux", not(feature = "no-asm")))]
+
+/// Translate a byte size to a Rust type.
+macro int_ty {
+    (1) => { i8 },
+    (2) => { i16 },
+    (4) => { i32 },
+    (8) => { i64 },
+    (16) => { i128 }
+}
+
+mod cas {
+    pub(super) macro test($_ordering:ident, $bytes:tt, $name:ident) {
+        #[test]
+        fn $name() {
+            builtins_test::fuzz_2(10000, |expected: super::int_ty!($bytes), new| {
+                let mut target = expected.wrapping_add(10);
+                assert_eq!(
+                    unsafe {
+                        compiler_builtins::aarch64_linux::$name::$name(expected, new, &mut target)
+                    },
+                    expected.wrapping_add(10),
+                    "return value should always be the previous value",
+                );
+                assert_eq!(
+                    target,
+                    expected.wrapping_add(10),
+                    "shouldn't have changed target"
+                );
+
+                target = expected;
+                assert_eq!(
+                    unsafe {
+                        compiler_builtins::aarch64_linux::$name::$name(expected, new, &mut target)
+                    },
+                    expected
+                );
+                assert_eq!(target, new, "should have updated target");
+            });
+        }
+    }
+}
+
+macro test_cas16($_ordering:ident, $name:ident) {
+    cas::test!($_ordering, 16, $name);
+}
+
+mod swap {
+    pub(super) macro test($_ordering:ident, $bytes:tt, $name:ident) {
+        #[test]
+        fn $name() {
+            builtins_test::fuzz_2(10000, |left: super::int_ty!($bytes), mut right| {
+                let orig_right = right;
+                assert_eq!(
+                    unsafe { compiler_builtins::aarch64_linux::$name::$name(left, &mut right) },
+                    orig_right
+                );
+                assert_eq!(left, right);
+            });
+        }
+    }
+}
+
+macro_rules! test_op {
+    ($mod:ident, $( $op:tt )* ) => {
+        mod $mod {
+            pub(super) macro test {
+                ($_ordering:ident, $bytes:tt, $name:ident) => {
+                    #[test]
+                    fn $name() {
+                        builtins_test::fuzz_2(10000, |old, val| {
+                            let mut target = old;
+                            let op: fn(super::int_ty!($bytes), super::int_ty!($bytes)) -> _ = $($op)*;
+                            let expected = op(old, val);
+                            assert_eq!(old, unsafe { compiler_builtins::aarch64_linux::$name::$name(val, &mut target) }, "{} should return original value", stringify!($name));
+                            assert_eq!(expected, target, "{} should store to target", stringify!($name));
+                        });
+                    }
+                }
+            }
+        }
+    };
+}
+
+test_op!(add, |left, right| left.wrapping_add(right));
+test_op!(clr, |left, right| left & !right);
+test_op!(xor, std::ops::BitXor::bitxor);
+test_op!(or, std::ops::BitOr::bitor);
+
+compiler_builtins::foreach_cas!(cas::test);
+compiler_builtins::foreach_cas16!(test_cas16);
+compiler_builtins::foreach_swp!(swap::test);
+compiler_builtins::foreach_ldadd!(add::test);
+compiler_builtins::foreach_ldclr!(clr::test);
+compiler_builtins::foreach_ldeor!(xor::test);
+compiler_builtins::foreach_ldset!(or::test);
diff --git a/library/compiler-builtins/builtins-test/tests/mem.rs b/library/compiler-builtins/builtins-test/tests/mem.rs
new file mode 100644
index 00000000000..d838ef159a0
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/mem.rs
@@ -0,0 +1,286 @@
+extern crate compiler_builtins;
+use compiler_builtins::mem::{memcmp, memcpy, memmove, memset};
+
+// Native word size in bytes; used below to construct aligned and misaligned offsets.
+const WORD_SIZE: usize = core::mem::size_of::<usize>();
+
+/// `memcpy` with a sub-word (3-byte) length, once copying toward lower and once
+/// toward higher addresses within the same array (the regions do not overlap).
+/// Also checks the C contract that the destination pointer is returned.
+#[test]
+fn memcpy_3() {
+    let mut arr: [u8; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
+    unsafe {
+        // src above dst
+        let src = arr.as_ptr().offset(9);
+        let dst = arr.as_mut_ptr().offset(1);
+        assert_eq!(memcpy(dst, src, 3), dst);
+        assert_eq!(arr, [0, 9, 10, 11, 4, 5, 6, 7, 8, 9, 10, 11]);
+    }
+    arr = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
+    unsafe {
+        // src below dst
+        let src = arr.as_ptr().offset(1);
+        let dst = arr.as_mut_ptr().offset(9);
+        assert_eq!(memcpy(dst, src, 3), dst);
+        assert_eq!(arr, [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3]);
+    }
+}
+
+/// `memcpy` with a multi-word length (10 bytes) into a separate destination
+/// buffer, from two different (odd) source offsets; the tail of `dst` must be
+/// left untouched.
+#[test]
+fn memcpy_10() {
+    let arr: [u8; 18] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17];
+    let mut dst: [u8; 12] = [0; 12];
+    unsafe {
+        let src = arr.as_ptr().offset(1);
+        assert_eq!(memcpy(dst.as_mut_ptr(), src, 10), dst.as_mut_ptr());
+        assert_eq!(dst, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 0]);
+    }
+    unsafe {
+        let src = arr.as_ptr().offset(8);
+        assert_eq!(memcpy(dst.as_mut_ptr(), src, 10), dst.as_mut_ptr());
+        assert_eq!(dst, [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 0, 0]);
+    }
+}
+
+/// Large `memcpy` whose buffers span multiple pages. The `start`/`end` fields
+/// surrounding `buf` act as canaries: they must still be zero afterwards,
+/// catching any write past either end of the destination.
+#[test]
+fn memcpy_big() {
+    // Make the arrays cross 3 pages
+    const SIZE: usize = 8193;
+    let src: [u8; SIZE] = [22; SIZE];
+    struct Dst {
+        start: usize,
+        buf: [u8; SIZE],
+        end: usize,
+    }
+
+    let mut dst = Dst {
+        start: 0,
+        buf: [0; SIZE],
+        end: 0,
+    };
+    unsafe {
+        assert_eq!(
+            memcpy(dst.buf.as_mut_ptr(), src.as_ptr(), SIZE),
+            dst.buf.as_mut_ptr()
+        );
+        assert_eq!(dst.start, 0);
+        assert_eq!(dst.buf, [22; SIZE]);
+        assert_eq!(dst.end, 0);
+    }
+}
+
+/// `memmove` over overlapping regions with dst below src ("forward" copy
+/// direction is required for correctness here).
+#[test]
+fn memmove_forward() {
+    let mut arr: [u8; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
+    unsafe {
+        let src = arr.as_ptr().offset(6);
+        let dst = arr.as_mut_ptr().offset(3);
+        assert_eq!(memmove(dst, src, 5), dst);
+        assert_eq!(arr, [0, 1, 2, 6, 7, 8, 9, 10, 8, 9, 10, 11]);
+    }
+}
+
+/// `memmove` over overlapping regions with dst above src ("backward" copy
+/// direction is required so the source bytes are not clobbered mid-copy).
+#[test]
+fn memmove_backward() {
+    let mut arr: [u8; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
+    unsafe {
+        let src = arr.as_ptr().offset(3);
+        let dst = arr.as_mut_ptr().offset(6);
+        assert_eq!(memmove(dst, src, 5), dst);
+        assert_eq!(arr, [0, 1, 2, 3, 4, 5, 3, 4, 5, 6, 7, 11]);
+    }
+}
+
+/// `memset` with fill value 0: a short interior fill, then a full-array fill
+/// whose value has high bits set to confirm only the low byte is used.
+#[test]
+fn memset_zero() {
+    let mut arr: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
+    unsafe {
+        let ptr = arr.as_mut_ptr().offset(5);
+        assert_eq!(memset(ptr, 0, 2), ptr);
+        assert_eq!(arr, [0, 1, 2, 3, 4, 0, 0, 7]);
+
+        // Only the LSB matters for a memset
+        assert_eq!(memset(arr.as_mut_ptr(), 0x2000, 8), arr.as_mut_ptr());
+        assert_eq!(arr, [0, 0, 0, 0, 0, 0, 0, 0]);
+    }
+}
+
+/// Same as `memset_zero` but with a nonzero fill byte (0x2009 truncates to 9).
+#[test]
+fn memset_nonzero() {
+    let mut arr: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
+    unsafe {
+        let ptr = arr.as_mut_ptr().offset(2);
+        assert_eq!(memset(ptr, 22, 3), ptr);
+        assert_eq!(arr, [0, 1, 22, 22, 22, 5, 6, 7]);
+
+        // Only the LSB matters for a memset
+        assert_eq!(memset(arr.as_mut_ptr(), 0x2009, 8), arr.as_mut_ptr());
+        assert_eq!(arr, [9, 9, 9, 9, 9, 9, 9, 9]);
+    }
+}
+
+/// `memcmp` must return 0 for equal buffers at every length 0..=255.
+/// `arr1 @ arr2` binds two independent copies of the same generated buffer.
+#[test]
+fn memcmp_eq() {
+    let arr1 @ arr2 = gen_arr::<256>();
+    for i in 0..256 {
+        unsafe {
+            assert_eq!(memcmp(arr1.0.as_ptr(), arr2.0.as_ptr(), i), 0);
+            assert_eq!(memcmp(arr2.0.as_ptr(), arr1.0.as_ptr(), i), 0);
+        }
+    }
+}
+
+/// For buffers differing at index `i`, `memcmp` over any length `k > i` must
+/// return a value whose sign matches the ordering of the first differing byte.
+#[test]
+fn memcmp_ne() {
+    let arr1 @ arr2 = gen_arr::<256>();
+    // Reduce iteration count in Miri as it is too slow otherwise.
+    let limit = if cfg!(miri) { 64 } else { 256 };
+    for i in 0..limit {
+        let mut diff_arr = arr1;
+        diff_arr.0[i] = 127;
+        let expect = diff_arr.0[i].cmp(&arr2.0[i]);
+        for k in i + 1..limit {
+            let result = unsafe { memcmp(diff_arr.0.as_ptr(), arr2.0.as_ptr(), k) };
+            // Compare signs only; memcmp's magnitude is unspecified.
+            assert_eq!(expect, result.cmp(&0));
+        }
+    }
+}
+
+/// Byte buffer forced to `usize` alignment by the zero-sized `[usize; 0]`
+/// field, so tests can construct precisely aligned and misaligned pointers.
+#[derive(Clone, Copy)]
+struct AlignedStorage<const N: usize>([u8; N], [usize; 0]);
+
+/// Returns an aligned buffer filled with the pattern `0, 1, 2, ..., (N-1) as u8`.
+fn gen_arr<const N: usize>() -> AlignedStorage<N> {
+    let mut ret = AlignedStorage::<N>([0; N], []);
+    for i in 0..N {
+        ret.0[i] = i as u8;
+    }
+    ret
+}
+
+// Forward (dst < src) overlapping memmove, exercising the alignment cases of
+// the word-at-a-time implementation. "misaligned" means src and dst are at
+// different offsets modulo the word size; the 17-byte length forces both a
+// bulk phase and a byte tail. Results are checked against `copy_within`.
+
+/// Forward overlapping move where the destination is not word-aligned.
+#[test]
+fn memmove_forward_misaligned_nonaligned_start() {
+    let mut arr = gen_arr::<32>();
+    let mut reference = arr;
+    unsafe {
+        let src = arr.0.as_ptr().offset(6);
+        let dst = arr.0.as_mut_ptr().offset(3);
+        assert_eq!(memmove(dst, src, 17), dst);
+        reference.0.copy_within(6..6 + 17, 3);
+        assert_eq!(arr.0, reference.0);
+    }
+}
+
+/// Forward overlapping move where the destination starts word-aligned
+/// (offset 0) but the source does not.
+#[test]
+fn memmove_forward_misaligned_aligned_start() {
+    let mut arr = gen_arr::<32>();
+    let mut reference = arr;
+    unsafe {
+        let src = arr.0.as_ptr().offset(6);
+        let dst = arr.0.as_mut_ptr().add(0);
+        assert_eq!(memmove(dst, src, 17), dst);
+        reference.0.copy_within(6..6 + 17, 0);
+        assert_eq!(arr.0, reference.0);
+    }
+}
+
+/// Forward overlapping move where src and dst share the same alignment
+/// (they differ by exactly one word).
+#[test]
+fn memmove_forward_aligned() {
+    let mut arr = gen_arr::<32>();
+    let mut reference = arr;
+    unsafe {
+        let src = arr.0.as_ptr().add(3 + WORD_SIZE);
+        let dst = arr.0.as_mut_ptr().add(3);
+        assert_eq!(memmove(dst, src, 17), dst);
+        reference
+            .0
+            .copy_within(3 + WORD_SIZE..3 + WORD_SIZE + 17, 3);
+        assert_eq!(arr.0, reference.0);
+    }
+}
+
+// Backward (dst > src) overlapping memmove, same alignment matrix as the
+// forward tests above but exercising the copy-from-the-end code path.
+
+/// Backward overlapping move where the destination is not word-aligned.
+#[test]
+fn memmove_backward_misaligned_nonaligned_start() {
+    let mut arr = gen_arr::<32>();
+    let mut reference = arr;
+    unsafe {
+        let src = arr.0.as_ptr().offset(3);
+        let dst = arr.0.as_mut_ptr().offset(6);
+        assert_eq!(memmove(dst, src, 17), dst);
+        reference.0.copy_within(3..3 + 17, 6);
+        assert_eq!(arr.0, reference.0);
+    }
+}
+
+/// Backward overlapping move where the destination starts word-aligned but
+/// the source does not.
+#[test]
+fn memmove_backward_misaligned_aligned_start() {
+    let mut arr = gen_arr::<32>();
+    let mut reference = arr;
+    unsafe {
+        let src = arr.0.as_ptr().offset(3);
+        let dst = arr.0.as_mut_ptr().add(WORD_SIZE);
+        assert_eq!(memmove(dst, src, 17), dst);
+        reference.0.copy_within(3..3 + 17, WORD_SIZE);
+        assert_eq!(arr.0, reference.0);
+    }
+}
+
+/// Backward overlapping move where src and dst share the same alignment
+/// (they differ by exactly one word).
+#[test]
+fn memmove_backward_aligned() {
+    let mut arr = gen_arr::<32>();
+    let mut reference = arr;
+    unsafe {
+        let src = arr.0.as_ptr().add(3);
+        let dst = arr.0.as_mut_ptr().add(3 + WORD_SIZE);
+        assert_eq!(memmove(dst, src, 17), dst);
+        reference.0.copy_within(3..3 + 17, 3 + WORD_SIZE);
+        assert_eq!(arr.0, reference.0);
+    }
+}
+
+/// Out-of-bounds check for the misaligned memmove paths, intended for Miri.
+#[test]
+fn memmove_misaligned_bounds() {
+    // The above tests have the downside that the addresses surrounding the range-to-copy are all
+    // still in-bounds, so Miri would not actually complain about OOB accesses. So we also test with
+    // an array that has just the right size. We test a few times to avoid it being accidentally
+    // aligned.
+    for _ in 0..8 {
+        let mut arr1 = [0u8; 17];
+        let mut arr2 = [0u8; 17];
+        unsafe {
+            // Copy both ways so we hit both the forward and backward cases.
+            memmove(arr1.as_mut_ptr(), arr2.as_mut_ptr(), 17);
+            memmove(arr2.as_mut_ptr(), arr1.as_mut_ptr(), 17);
+        }
+    }
+}
+
+// memset alignment matrix, checked against `core::ptr::write_bytes` on an
+// identical reference buffer. The 17-byte length forces both a word-sized
+// bulk phase and a byte tail.
+
+/// Fill starting at an address that is not word-aligned.
+#[test]
+fn memset_backward_misaligned_nonaligned_start() {
+    let mut arr = gen_arr::<32>();
+    let mut reference = arr;
+    unsafe {
+        let ptr = arr.0.as_mut_ptr().offset(6);
+        assert_eq!(memset(ptr, 0xCC, 17), ptr);
+        core::ptr::write_bytes(reference.0.as_mut_ptr().add(6), 0xCC, 17);
+        assert_eq!(arr.0, reference.0);
+    }
+}
+
+/// Fill starting exactly one word into the aligned buffer.
+#[test]
+fn memset_backward_misaligned_aligned_start() {
+    let mut arr = gen_arr::<32>();
+    let mut reference = arr;
+    unsafe {
+        let ptr = arr.0.as_mut_ptr().add(WORD_SIZE);
+        assert_eq!(memset(ptr, 0xCC, 17), ptr);
+        core::ptr::write_bytes(reference.0.as_mut_ptr().add(WORD_SIZE), 0xCC, 17);
+        assert_eq!(arr.0, reference.0);
+    }
+}
+
+/// Fill starting one word plus three bytes in.
+#[test]
+fn memset_backward_aligned() {
+    let mut arr = gen_arr::<32>();
+    let mut reference = arr;
+    unsafe {
+        let ptr = arr.0.as_mut_ptr().add(3 + WORD_SIZE);
+        assert_eq!(memset(ptr, 0xCC, 17), ptr);
+        core::ptr::write_bytes(reference.0.as_mut_ptr().add(3 + WORD_SIZE), 0xCC, 17);
+        assert_eq!(arr.0, reference.0);
+    }
+}
diff --git a/library/compiler-builtins/builtins-test/tests/misc.rs b/library/compiler-builtins/builtins-test/tests/misc.rs
new file mode 100644
index 00000000000..b8c75c02653
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/misc.rs
@@ -0,0 +1,208 @@
+// makes configuration easier
+#![allow(unused_macros)]
+
+use builtins_test::*;
+
+/// Make sure that the edge case tester and randomized tester don't break, and list examples of
+/// fuzz values for documentation purposes.
+///
+/// The first 37 values are deterministic edge cases (powers of two and runs of
+/// ones, shifted); the rest come from the seeded random stage, so they are
+/// stable across runs.
+#[test]
+fn fuzz_values() {
+    const VALS: [u16; 47] = [
+        0b0, // edge cases
+        0b1111111111111111,
+        0b1111111111111110,
+        0b1111111111111100,
+        0b1111111110000000,
+        0b1111111100000000,
+        0b1110000000000000,
+        0b1100000000000000,
+        0b1000000000000000,
+        0b111111111111111,
+        0b111111111111110,
+        0b111111111111100,
+        0b111111110000000,
+        0b111111100000000,
+        0b110000000000000,
+        0b100000000000000,
+        0b11111111111111,
+        0b11111111111110,
+        0b11111111111100,
+        0b11111110000000,
+        0b11111100000000,
+        0b10000000000000,
+        0b111111111,
+        0b111111110,
+        0b111111100,
+        0b110000000,
+        0b100000000,
+        0b11111111,
+        0b11111110,
+        0b11111100,
+        0b10000000,
+        0b111,
+        0b110,
+        0b100,
+        0b11,
+        0b10,
+        0b1,
+        0b1010110100000, // beginning of random fuzzing
+        0b1100011001011010,
+        0b1001100101001111,
+        0b1101010100011010,
+        0b100010001,
+        0b1000000000000000,
+        0b1100000000000101,
+        0b1100111101010101,
+        0b1100010111111111,
+        0b1111110101111111,
+    ];
+    // Check that `fuzz` produces exactly this sequence, in order.
+    let mut i = 0;
+    fuzz(10, |x: u16| {
+        assert_eq!(x, VALS[i]);
+        i += 1;
+    });
+}
+
+/// Compare the builtins' count-leading-zeros implementations (`__clz*` plus
+/// the generic and RISC-V variants) against `<int>::leading_zeros` for 32-,
+/// 64- and 128-bit inputs. Zero is skipped: the intrinsic's result is
+/// undefined for it.
+#[test]
+fn leading_zeros() {
+    use compiler_builtins::int::leading_zeros::{leading_zeros_default, leading_zeros_riscv};
+    // 32-bit
+    {
+        use compiler_builtins::int::leading_zeros::__clzsi2;
+        fuzz(N, |x: u32| {
+            if x == 0 {
+                return; // undefined value for an intrinsic
+            }
+            let lz = x.leading_zeros() as usize;
+            let lz0 = __clzsi2(x);
+            let lz1 = leading_zeros_default(x);
+            let lz2 = leading_zeros_riscv(x);
+            if lz0 != lz {
+                panic!("__clzsi2({}): std: {}, builtins: {}", x, lz, lz0);
+            }
+            if lz1 != lz {
+                panic!(
+                    "leading_zeros_default({}): std: {}, builtins: {}",
+                    x, lz, lz1
+                );
+            }
+            if lz2 != lz {
+                panic!("leading_zeros_riscv({}): std: {}, builtins: {}", x, lz, lz2);
+            }
+        });
+    }
+
+    // 64-bit
+    {
+        use compiler_builtins::int::leading_zeros::__clzdi2;
+        fuzz(N, |x: u64| {
+            if x == 0 {
+                return; // undefined value for an intrinsic
+            }
+            let lz = x.leading_zeros() as usize;
+            let lz0 = __clzdi2(x);
+            let lz1 = leading_zeros_default(x);
+            let lz2 = leading_zeros_riscv(x);
+            if lz0 != lz {
+                panic!("__clzdi2({}): std: {}, builtins: {}", x, lz, lz0);
+            }
+            if lz1 != lz {
+                panic!(
+                    "leading_zeros_default({}): std: {}, builtins: {}",
+                    x, lz, lz1
+                );
+            }
+            if lz2 != lz {
+                panic!("leading_zeros_riscv({}): std: {}, builtins: {}", x, lz, lz2);
+            }
+        });
+    }
+
+    // 128-bit (only the `__clzti2` symbol exists at this width)
+    {
+        use compiler_builtins::int::leading_zeros::__clzti2;
+        fuzz(N, |x: u128| {
+            if x == 0 {
+                return; // undefined value for an intrinsic
+            }
+            let lz = x.leading_zeros() as usize;
+            let lz0 = __clzti2(x);
+            if lz0 != lz {
+                panic!("__clzti2({}): std: {}, builtins: {}", x, lz, lz0);
+            }
+        });
+    }
+}
+
+/// Compare the builtins' count-trailing-zeros implementations (`__ctz*` and
+/// the generic `trailing_zeros`) against `<int>::trailing_zeros` for 32-, 64-
+/// and 128-bit inputs. Zero is skipped: the intrinsic's result is undefined
+/// for it.
+#[test]
+fn trailing_zeros() {
+    use compiler_builtins::int::trailing_zeros::{__ctzdi2, __ctzsi2, __ctzti2, trailing_zeros};
+    fuzz(N, |x: u32| {
+        if x == 0 {
+            return; // undefined value for an intrinsic
+        }
+        let tz = x.trailing_zeros() as usize;
+        let tz0 = __ctzsi2(x);
+        let tz1 = trailing_zeros(x);
+        if tz0 != tz {
+            panic!("__ctzsi2({}): std: {}, builtins: {}", x, tz, tz0);
+        }
+        if tz1 != tz {
+            panic!("trailing_zeros({}): std: {}, builtins: {}", x, tz, tz1);
+        }
+    });
+    fuzz(N, |x: u64| {
+        if x == 0 {
+            return; // undefined value for an intrinsic
+        }
+        let tz = x.trailing_zeros() as usize;
+        let tz0 = __ctzdi2(x);
+        let tz1 = trailing_zeros(x);
+        if tz0 != tz {
+            panic!("__ctzdi2({}): std: {}, builtins: {}", x, tz, tz0);
+        }
+        if tz1 != tz {
+            panic!("trailing_zeros({}): std: {}, builtins: {}", x, tz, tz1);
+        }
+    });
+    // 128-bit (only the `__ctzti2` symbol exists at this width)
+    fuzz(N, |x: u128| {
+        if x == 0 {
+            return; // undefined value for an intrinsic
+        }
+        let tz = x.trailing_zeros() as usize;
+        let tz0 = __ctzti2(x);
+        if tz0 != tz {
+            panic!("__ctzti2({}): std: {}, builtins: {}", x, tz, tz0);
+        }
+    });
+}
+
+/// Byte-swap intrinsics (`__bswapsi2`/`__bswapdi2`/`__bswapti2`) checked
+/// against `<int>::swap_bytes`, plus a few fixed-pattern sanity values.
+#[test]
+fn bswap() {
+    use compiler_builtins::int::bswap::{__bswapdi2, __bswapsi2};
+    fuzz(N, |x: u32| {
+        assert_eq!(x.swap_bytes(), __bswapsi2(x));
+    });
+    fuzz(N, |x: u64| {
+        assert_eq!(x.swap_bytes(), __bswapdi2(x));
+    });
+
+    assert_eq!(__bswapsi2(0x12345678u32), 0x78563412u32);
+    assert_eq!(__bswapsi2(0x00000001u32), 0x01000000u32);
+    assert_eq!(__bswapdi2(0x123456789ABCDEF0u64), 0xF0DEBC9A78563412u64);
+    assert_eq!(__bswapdi2(0x0200000001000000u64), 0x0000000100000002u64);
+
+    // The 128-bit variant is only built for these pointer widths.
+    #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+    {
+        use compiler_builtins::int::bswap::__bswapti2;
+        fuzz(N, |x: u128| {
+            assert_eq!(x.swap_bytes(), __bswapti2(x));
+        });
+
+        assert_eq!(
+            __bswapti2(0x123456789ABCDEF013579BDF02468ACEu128),
+            0xCE8A4602DF9B5713F0DEBC9A78563412u128
+        );
+        assert_eq!(
+            __bswapti2(0x04000000030000000200000001000000u128),
+            0x00000001000000020000000300000004u128
+        );
+    }
+}
diff --git a/library/compiler-builtins/builtins-test/tests/mul.rs b/library/compiler-builtins/builtins-test/tests/mul.rs
new file mode 100644
index 00000000000..198cacb3489
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/mul.rs
@@ -0,0 +1,156 @@
+#![allow(unused_macros)]
+#![cfg_attr(f128_enabled, feature(f128))]
+
+use builtins_test::*;
+
+// Wrapping integer multiplication intrinsics, checked against `wrapping_mul`.
+mod int_mul {
+    use super::*;
+
+    // Expands to one `#[test]` per `type, intrinsic` pair.
+    macro_rules! mul {
+        ($($i:ty, $fn:ident);*;) => {
+            $(
+                #[test]
+                fn $fn() {
+                    use compiler_builtins::int::mul::$fn;
+
+                    fuzz_2(N, |x: $i, y: $i| {
+                        let mul0 = x.wrapping_mul(y);
+                        let mul1: $i = $fn(x, y);
+                        if mul0 != mul1 {
+                            panic!(
+                                "{}({}, {}): std: {}, builtins: {}",
+                                stringify!($fn), x, y, mul0, mul1
+                            );
+                        }
+                    });
+
+                }
+            )*
+        };
+    }
+
+    mul! {
+        u64, __muldi3;
+        i128, __multi3;
+    }
+}
+
+// Overflow-reporting multiplication intrinsics, checked against
+// `overflowing_mul`. The `__mulo*i4` family reports overflow through an
+// out-parameter `i32` (nonzero means overflowed).
+mod int_overflowing_mul {
+    use super::*;
+
+    // Expands to one `#[test]` per `type, intrinsic` pair.
+    macro_rules! overflowing_mul {
+        ($($i:ty, $fn:ident);*;) => {
+            $(
+                #[test]
+                fn $fn() {
+                    use compiler_builtins::int::mul::$fn;
+
+                    fuzz_2(N, |x: $i, y: $i| {
+                        let (mul0, o0) = x.overflowing_mul(y);
+                        let mut o1 = 0i32;
+                        let mul1: $i = $fn(x, y, &mut o1);
+                        let o1 = o1 != 0;
+                        if mul0 != mul1 || o0 != o1 {
+                            panic!(
+                                "{}({}, {}): std: ({}, {}), builtins: ({}, {})",
+                                stringify!($fn), x, y, mul0, o0, mul1, o1
+                            );
+                        }
+                    });
+                }
+            )*
+        };
+    }
+
+    overflowing_mul! {
+        i32, __mulosi4;
+        i64, __mulodi4;
+        i128, __muloti4;
+    }
+
+    /// The Rust-specific 128-bit helpers, covering both the unsigned and the
+    /// signed variant with the same fuzzed input pair.
+    #[test]
+    fn overflowing_mul_u128() {
+        use compiler_builtins::int::mul::{__rust_i128_mulo, __rust_u128_mulo};
+
+        fuzz_2(N, |x: u128, y: u128| {
+            let mut o1 = 0;
+            let (mul0, o0) = x.overflowing_mul(y);
+            let mul1 = __rust_u128_mulo(x, y, &mut o1);
+            if mul0 != mul1 || i32::from(o0) != o1 {
+                panic!(
+                    "__rust_u128_mulo({}, {}): std: ({}, {}), builtins: ({}, {})",
+                    x, y, mul0, o0, mul1, o1
+                );
+            }
+            // Reinterpret the same bits as signed and repeat.
+            let x = x as i128;
+            let y = y as i128;
+            let (mul0, o0) = x.overflowing_mul(y);
+            let mul1 = __rust_i128_mulo(x, y, &mut o1);
+            if mul0 != mul1 || i32::from(o0) != o1 {
+                panic!(
+                    "__rust_i128_mulo({}, {}): std: ({}, {}), builtins: ({}, {})",
+                    x, y, mul0, o0, mul1, o1
+                );
+            }
+        });
+    }
+}
+
+// Expands to one `#[test]` per entry comparing a float-multiplication builtin
+// against a reference product. `$apfloat_ty` is the softfloat type and
+// `$sys_available` a cfg predicate for when the host has a native impl;
+// `apfloat_fallback!` picks between the two. `eq_repr` compares bit patterns,
+// so NaN payloads must match too.
+macro_rules! float_mul {
+    ($($f:ty, $fn:ident, $apfloat_ty:ident, $sys_available:meta);*;) => {
+        $(
+            #[test]
+            fn $fn() {
+                use compiler_builtins::float::{mul::$fn, Float};
+                use core::ops::Mul;
+
+                fuzz_float_2(N, |x: $f, y: $f| {
+                    let mul0 = apfloat_fallback!($f, $apfloat_ty, $sys_available, Mul::mul, x, y);
+                    let mul1: $f = $fn(x, y);
+                    if !Float::eq_repr(mul0, mul1) {
+                        panic!(
+                            "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
+                            stringify!($fn), x, y, mul0, mul1
+                        );
+                    }
+                });
+            }
+        )*
+    };
+}
+
+// f32/f64 multiplication. Skipped on x86 without SSE (cfg below) — presumably
+// because x87 excess precision breaks bit-exact comparison; confirm if needed.
+#[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
+mod float_mul {
+    use super::*;
+
+    // FIXME(#616): Stop ignoring arches that don't have native support once fix for builtins is in
+    // nightly.
+    float_mul! {
+        f32, __mulsf3, Single, not(target_arch = "arm");
+        f64, __muldf3, Double, not(target_arch = "arm");
+    }
+}
+
+// f128 multiplication on non-PowerPC targets (gated on the `f128_enabled` cfg
+// set by the build script).
+#[cfg(f128_enabled)]
+#[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
+#[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+mod float_mul_f128 {
+    use super::*;
+
+    float_mul! {
+        f128, __multf3, Quad,
+        // FIXME(llvm): there is a bug in LLVM rt.
+        // See <https://github.com/llvm/llvm-project/issues/91840>.
+        not(any(feature = "no-sys-f128", all(target_arch = "aarch64", target_os = "linux")));
+    }
+}
+
+// PowerPC uses a differently-named symbol (`__mulkf3`) for IEEE binary128.
+#[cfg(f128_enabled)]
+#[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+mod float_mul_f128_ppc {
+    use super::*;
+
+    float_mul! {
+        f128, __mulkf3, Quad, not(feature = "no-sys-f128");
+    }
+}
diff --git a/library/compiler-builtins/builtins-test/tests/shift.rs b/library/compiler-builtins/builtins-test/tests/shift.rs
new file mode 100644
index 00000000000..0f2483855e5
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/shift.rs
@@ -0,0 +1,35 @@
+use builtins_test::*;
+
+// Expands to one `#[test]` per entry comparing a shift builtin against the
+// corresponding std wrapping-shift method on fuzzed `(value, shift-amount)`
+// pairs. The signedness of `$i` selects arithmetic vs. logical right shift.
+macro_rules! shift {
+    ($($i:ty, $fn_std:ident, $fn_builtins:ident);*;) => {
+        $(
+            #[test]
+            fn $fn_builtins() {
+                use compiler_builtins::int::shift::$fn_builtins;
+
+                fuzz_shift(|x: $i, s: u32| {
+                    let tmp0: $i = x.$fn_std(s);
+                    let tmp1: $i = $fn_builtins(x, s);
+                    if tmp0 != tmp1 {
+                        panic!(
+                            "{}({}, {}): std: {}, builtins: {}",
+                            stringify!($fn_builtins), x, s, tmp0, tmp1
+                        );
+                    }
+                });
+            }
+        )*
+    };
+}
+
+// __ashl* = shift left; __ashr* = arithmetic (signed) shift right;
+// __lshr* = logical (unsigned) shift right — at 32, 64 and 128 bits.
+shift! {
+    u32, wrapping_shl, __ashlsi3;
+    u64, wrapping_shl, __ashldi3;
+    u128, wrapping_shl, __ashlti3;
+    i32, wrapping_shr, __ashrsi3;
+    i64, wrapping_shr, __ashrdi3;
+    i128, wrapping_shr, __ashrti3;
+    u32, wrapping_shr, __lshrsi3;
+    u64, wrapping_shr, __lshrdi3;
+    u128, wrapping_shr, __lshrti3;
+}