Diffstat (limited to 'library/compiler-builtins')
-rw-r--r--library/compiler-builtins/.editorconfig16
-rw-r--r--library/compiler-builtins/.git-blame-ignore-revs6
-rw-r--r--library/compiler-builtins/.github/workflows/main.yaml344
-rw-r--r--library/compiler-builtins/.github/workflows/publish.yaml25
-rw-r--r--library/compiler-builtins/.gitignore16
-rw-r--r--library/compiler-builtins/.gitmodules4
-rw-r--r--library/compiler-builtins/.release-plz.toml13
-rw-r--r--library/compiler-builtins/.rustfmt.toml4
-rw-r--r--library/compiler-builtins/CONTRIBUTING.md167
-rw-r--r--library/compiler-builtins/Cargo.toml50
-rw-r--r--library/compiler-builtins/LICENSE.txt275
-rw-r--r--library/compiler-builtins/PUBLISHING.md16
-rw-r--r--library/compiler-builtins/README.md27
-rw-r--r--library/compiler-builtins/builtins-test-intrinsics/Cargo.toml19
-rw-r--r--library/compiler-builtins/builtins-test-intrinsics/build.rs11
-rw-r--r--library/compiler-builtins/builtins-test-intrinsics/src/main.rs697
-rw-r--r--library/compiler-builtins/builtins-test/Cargo.toml99
-rw-r--r--library/compiler-builtins/builtins-test/benches/float_add.rs93
-rw-r--r--library/compiler-builtins/builtins-test/benches/float_cmp.rs207
-rw-r--r--library/compiler-builtins/builtins-test/benches/float_conv.rs688
-rw-r--r--library/compiler-builtins/builtins-test/benches/float_div.rs93
-rw-r--r--library/compiler-builtins/builtins-test/benches/float_extend.rs133
-rw-r--r--library/compiler-builtins/builtins-test/benches/float_mul.rs93
-rw-r--r--library/compiler-builtins/builtins-test/benches/float_pow.rs49
-rw-r--r--library/compiler-builtins/builtins-test/benches/float_sub.rs93
-rw-r--r--library/compiler-builtins/builtins-test/benches/float_trunc.rs146
-rw-r--r--library/compiler-builtins/builtins-test/benches/mem.rs364
-rw-r--r--library/compiler-builtins/builtins-test/benches/mem_icount.rs500
-rw-r--r--library/compiler-builtins/builtins-test/build.rs120
-rw-r--r--library/compiler-builtins/builtins-test/src/bench.rs366
-rw-r--r--library/compiler-builtins/builtins-test/src/lib.rs337
-rw-r--r--library/compiler-builtins/builtins-test/tests/addsub.rs143
-rw-r--r--library/compiler-builtins/builtins-test/tests/aeabi_memclr.rs60
-rw-r--r--library/compiler-builtins/builtins-test/tests/aeabi_memcpy.rs71
-rw-r--r--library/compiler-builtins/builtins-test/tests/aeabi_memset.rs240
-rw-r--r--library/compiler-builtins/builtins-test/tests/big.rs134
-rw-r--r--library/compiler-builtins/builtins-test/tests/cmp.rs184
-rw-r--r--library/compiler-builtins/builtins-test/tests/conv.rs364
-rw-r--r--library/compiler-builtins/builtins-test/tests/div_rem.rs164
-rw-r--r--library/compiler-builtins/builtins-test/tests/float_pow.rs72
-rw-r--r--library/compiler-builtins/builtins-test/tests/lse.rs97
-rw-r--r--library/compiler-builtins/builtins-test/tests/mem.rs286
-rw-r--r--library/compiler-builtins/builtins-test/tests/misc.rs202
-rw-r--r--library/compiler-builtins/builtins-test/tests/mul.rs150
-rw-r--r--library/compiler-builtins/builtins-test/tests/shift.rs35
-rwxr-xr-xlibrary/compiler-builtins/ci/bench-icount.sh58
-rwxr-xr-xlibrary/compiler-builtins/ci/ci-util.py438
-rw-r--r--library/compiler-builtins/ci/docker/aarch64-unknown-linux-gnu/Dockerfile16
-rw-r--r--library/compiler-builtins/ci/docker/arm-unknown-linux-gnueabi/Dockerfile15
-rw-r--r--library/compiler-builtins/ci/docker/arm-unknown-linux-gnueabihf/Dockerfile15
-rw-r--r--library/compiler-builtins/ci/docker/armv7-unknown-linux-gnueabihf/Dockerfile15
-rw-r--r--library/compiler-builtins/ci/docker/i586-unknown-linux-gnu/Dockerfile6
-rw-r--r--library/compiler-builtins/ci/docker/i686-unknown-linux-gnu/Dockerfile6
-rw-r--r--library/compiler-builtins/ci/docker/loongarch64-unknown-linux-gnu/Dockerfile14
-rw-r--r--library/compiler-builtins/ci/docker/mips-unknown-linux-gnu/Dockerfile16
-rw-r--r--library/compiler-builtins/ci/docker/mips64-unknown-linux-gnuabi64/Dockerfile20
-rw-r--r--library/compiler-builtins/ci/docker/mips64el-unknown-linux-gnuabi64/Dockerfile19
-rw-r--r--library/compiler-builtins/ci/docker/mipsel-unknown-linux-gnu/Dockerfile16
-rw-r--r--library/compiler-builtins/ci/docker/powerpc-unknown-linux-gnu/Dockerfile16
-rw-r--r--library/compiler-builtins/ci/docker/powerpc64-unknown-linux-gnu/Dockerfile16
-rw-r--r--library/compiler-builtins/ci/docker/powerpc64le-unknown-linux-gnu/Dockerfile17
-rw-r--r--library/compiler-builtins/ci/docker/riscv64gc-unknown-linux-gnu/Dockerfile16
-rw-r--r--library/compiler-builtins/ci/docker/thumbv6m-none-eabi/Dockerfile9
-rw-r--r--library/compiler-builtins/ci/docker/thumbv7em-none-eabi/Dockerfile9
-rw-r--r--library/compiler-builtins/ci/docker/thumbv7em-none-eabihf/Dockerfile9
-rw-r--r--library/compiler-builtins/ci/docker/thumbv7m-none-eabi/Dockerfile9
-rw-r--r--library/compiler-builtins/ci/docker/wasm32-unknown-unknown/Dockerfile8
-rw-r--r--library/compiler-builtins/ci/docker/x86_64-unknown-linux-gnu/Dockerfile6
-rwxr-xr-xlibrary/compiler-builtins/ci/download-compiler-rt.sh10
-rwxr-xr-xlibrary/compiler-builtins/ci/miri.sh18
-rwxr-xr-xlibrary/compiler-builtins/ci/run-docker.sh111
-rwxr-xr-xlibrary/compiler-builtins/ci/run-extensive.sh24
-rwxr-xr-xlibrary/compiler-builtins/ci/run.sh302
-rw-r--r--library/compiler-builtins/compiler-builtins/CHANGELOG.md168
-rw-r--r--library/compiler-builtins/compiler-builtins/Cargo.toml64
-rw-r--r--library/compiler-builtins/compiler-builtins/README.md436
-rw-r--r--library/compiler-builtins/compiler-builtins/build.rs712
-rw-r--r--library/compiler-builtins/compiler-builtins/configure.rs136
-rw-r--r--library/compiler-builtins/compiler-builtins/src/aarch64.rs21
-rw-r--r--library/compiler-builtins/compiler-builtins/src/aarch64_linux.rs273
-rw-r--r--library/compiler-builtins/compiler-builtins/src/arm.rs280
-rw-r--r--library/compiler-builtins/compiler-builtins/src/arm_linux.rs290
-rw-r--r--library/compiler-builtins/compiler-builtins/src/avr.rs23
-rw-r--r--library/compiler-builtins/compiler-builtins/src/float/add.rs209
-rw-r--r--library/compiler-builtins/compiler-builtins/src/float/cmp.rs248
-rw-r--r--library/compiler-builtins/compiler-builtins/src/float/conv.rs489
-rw-r--r--library/compiler-builtins/compiler-builtins/src/float/div.rs635
-rw-r--r--library/compiler-builtins/compiler-builtins/src/float/extend.rs123
-rw-r--r--library/compiler-builtins/compiler-builtins/src/float/mod.rs15
-rw-r--r--library/compiler-builtins/compiler-builtins/src/float/mul.rs200
-rw-r--r--library/compiler-builtins/compiler-builtins/src/float/pow.rs40
-rw-r--r--library/compiler-builtins/compiler-builtins/src/float/sub.rs24
-rw-r--r--library/compiler-builtins/compiler-builtins/src/float/traits.rs189
-rw-r--r--library/compiler-builtins/compiler-builtins/src/float/trunc.rs169
-rw-r--r--library/compiler-builtins/compiler-builtins/src/hexagon.rs55
-rw-r--r--library/compiler-builtins/compiler-builtins/src/hexagon/dfaddsub.s321
-rw-r--r--library/compiler-builtins/compiler-builtins/src/hexagon/dfdiv.s372
-rw-r--r--library/compiler-builtins/compiler-builtins/src/hexagon/dffma.s534
-rw-r--r--library/compiler-builtins/compiler-builtins/src/hexagon/dfminmax.s45
-rw-r--r--library/compiler-builtins/compiler-builtins/src/hexagon/dfmul.s309
-rw-r--r--library/compiler-builtins/compiler-builtins/src/hexagon/dfsqrt.s277
-rw-r--r--library/compiler-builtins/compiler-builtins/src/hexagon/divdi3.s64
-rw-r--r--library/compiler-builtins/compiler-builtins/src/hexagon/divsi3.s53
-rw-r--r--library/compiler-builtins/compiler-builtins/src/hexagon/fastmath2_dlib_asm.s266
-rw-r--r--library/compiler-builtins/compiler-builtins/src/hexagon/fastmath2_ldlib_asm.s187
-rw-r--r--library/compiler-builtins/compiler-builtins/src/hexagon/func_macro.s12
-rw-r--r--library/compiler-builtins/compiler-builtins/src/hexagon/memcpy_forward_vp4cp4n2.s91
-rw-r--r--library/compiler-builtins/compiler-builtins/src/hexagon/memcpy_likely_aligned.s42
-rw-r--r--library/compiler-builtins/compiler-builtins/src/hexagon/moddi3.s63
-rw-r--r--library/compiler-builtins/compiler-builtins/src/hexagon/modsi3.s44
-rw-r--r--library/compiler-builtins/compiler-builtins/src/hexagon/sfdiv_opt.s42
-rw-r--r--library/compiler-builtins/compiler-builtins/src/hexagon/sfsqrt_opt.s49
-rw-r--r--library/compiler-builtins/compiler-builtins/src/hexagon/udivdi3.s50
-rw-r--r--library/compiler-builtins/compiler-builtins/src/hexagon/udivmoddi4.s50
-rw-r--r--library/compiler-builtins/compiler-builtins/src/hexagon/udivmodsi4.s39
-rw-r--r--library/compiler-builtins/compiler-builtins/src/hexagon/udivsi3.s36
-rw-r--r--library/compiler-builtins/compiler-builtins/src/hexagon/umoddi3.s53
-rw-r--r--library/compiler-builtins/compiler-builtins/src/hexagon/umodsi3.s34
-rw-r--r--library/compiler-builtins/compiler-builtins/src/int/addsub.rs104
-rw-r--r--library/compiler-builtins/compiler-builtins/src/int/big.rs295
-rw-r--r--library/compiler-builtins/compiler-builtins/src/int/bswap.rs19
-rw-r--r--library/compiler-builtins/compiler-builtins/src/int/leading_zeros.rs164
-rw-r--r--library/compiler-builtins/compiler-builtins/src/int/mod.rs18
-rw-r--r--library/compiler-builtins/compiler-builtins/src/int/mul.rs142
-rw-r--r--library/compiler-builtins/compiler-builtins/src/int/sdiv.rs205
-rw-r--r--library/compiler-builtins/compiler-builtins/src/int/shift.rs116
-rw-r--r--library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/asymmetric.rs69
-rw-r--r--library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/binary_long.rs552
-rw-r--r--library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/delegate.rs317
-rw-r--r--library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/mod.rs320
-rw-r--r--library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/norm_shift.rs106
-rw-r--r--library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/trifecta.rs386
-rw-r--r--library/compiler-builtins/compiler-builtins/src/int/trailing_zeros.rs69
-rw-r--r--library/compiler-builtins/compiler-builtins/src/int/traits.rs411
-rw-r--r--library/compiler-builtins/compiler-builtins/src/int/udiv.rs199
-rw-r--r--library/compiler-builtins/compiler-builtins/src/lib.miri.rs5
-rw-r--r--library/compiler-builtins/compiler-builtins/src/lib.rs84
-rw-r--r--library/compiler-builtins/compiler-builtins/src/macros.rs486
-rw-r--r--library/compiler-builtins/compiler-builtins/src/math/mod.rs199
-rw-r--r--library/compiler-builtins/compiler-builtins/src/mem/impls.rs408
-rw-r--r--library/compiler-builtins/compiler-builtins/src/mem/mod.rs60
-rw-r--r--library/compiler-builtins/compiler-builtins/src/mem/x86_64.rs313
-rw-r--r--library/compiler-builtins/compiler-builtins/src/probestack.rs350
-rw-r--r--library/compiler-builtins/compiler-builtins/src/riscv.rs50
-rw-r--r--library/compiler-builtins/compiler-builtins/src/x86.rs53
-rw-r--r--library/compiler-builtins/compiler-builtins/src/x86_64.rs51
-rw-r--r--library/compiler-builtins/crates/libm-macros/Cargo.toml22
-rw-r--r--library/compiler-builtins/crates/libm-macros/src/enums.rs171
-rw-r--r--library/compiler-builtins/crates/libm-macros/src/lib.rs504
-rw-r--r--library/compiler-builtins/crates/libm-macros/src/parse.rs296
-rw-r--r--library/compiler-builtins/crates/libm-macros/src/shared.rs590
-rw-r--r--library/compiler-builtins/crates/libm-macros/tests/basic.rs177
-rw-r--r--library/compiler-builtins/crates/libm-macros/tests/enum.rs38
-rw-r--r--library/compiler-builtins/crates/musl-math-sys/Cargo.toml14
-rw-r--r--library/compiler-builtins/crates/musl-math-sys/build.rs350
-rw-r--r--library/compiler-builtins/crates/musl-math-sys/c_patches/alias.c40
-rw-r--r--library/compiler-builtins/crates/musl-math-sys/c_patches/features.h39
m---------library/compiler-builtins/crates/musl-math-sys/musl0
-rw-r--r--library/compiler-builtins/crates/musl-math-sys/src/lib.rs287
-rw-r--r--library/compiler-builtins/crates/panic-handler/Cargo.toml12
-rw-r--r--library/compiler-builtins/crates/panic-handler/src/lib.rs11
-rw-r--r--library/compiler-builtins/crates/util/Cargo.toml19
-rw-r--r--library/compiler-builtins/crates/util/build.rs10
-rw-r--r--library/compiler-builtins/crates/util/src/main.rs350
-rw-r--r--library/compiler-builtins/etc/function-definitions.json1071
-rw-r--r--library/compiler-builtins/etc/function-list.txt164
-rwxr-xr-xlibrary/compiler-builtins/etc/update-api-list.py361
-rw-r--r--library/compiler-builtins/libm-test/Cargo.toml74
-rw-r--r--library/compiler-builtins/libm-test/benches/icount.rs316
-rw-r--r--library/compiler-builtins/libm-test/benches/random.rs179
-rw-r--r--library/compiler-builtins/libm-test/build.rs9
-rw-r--r--library/compiler-builtins/libm-test/examples/plot_domains.rs109
-rw-r--r--library/compiler-builtins/libm-test/examples/plot_file.jl171
-rw-r--r--library/compiler-builtins/libm-test/src/domain.rs292
-rw-r--r--library/compiler-builtins/libm-test/src/f8_impl.rs505
-rw-r--r--library/compiler-builtins/libm-test/src/generate.rs50
-rw-r--r--library/compiler-builtins/libm-test/src/generate/case_list.rs896
-rw-r--r--library/compiler-builtins/libm-test/src/generate/edge_cases.rs314
-rw-r--r--library/compiler-builtins/libm-test/src/generate/random.rs128
-rw-r--r--library/compiler-builtins/libm-test/src/generate/spaced.rs258
-rw-r--r--library/compiler-builtins/libm-test/src/lib.rs107
-rw-r--r--library/compiler-builtins/libm-test/src/mpfloat.rs603
-rw-r--r--library/compiler-builtins/libm-test/src/num.rs586
-rw-r--r--library/compiler-builtins/libm-test/src/op.rs155
-rw-r--r--library/compiler-builtins/libm-test/src/precision.rs573
-rw-r--r--library/compiler-builtins/libm-test/src/run_cfg.rs385
-rw-r--r--library/compiler-builtins/libm-test/src/test_traits.rs453
-rw-r--r--library/compiler-builtins/libm-test/tests/check_coverage.rs61
-rw-r--r--library/compiler-builtins/libm-test/tests/compare_built_musl.rs106
-rw-r--r--library/compiler-builtins/libm-test/tests/multiprecision.rs79
-rw-r--r--library/compiler-builtins/libm-test/tests/standalone.rs38
-rw-r--r--library/compiler-builtins/libm-test/tests/u256.rs155
-rw-r--r--library/compiler-builtins/libm-test/tests/z_extensive/main.rs14
-rw-r--r--library/compiler-builtins/libm-test/tests/z_extensive/run.rs247
-rw-r--r--library/compiler-builtins/libm/CHANGELOG.md229
-rw-r--r--library/compiler-builtins/libm/Cargo.toml49
-rw-r--r--library/compiler-builtins/libm/LICENSE.txt258
-rw-r--r--library/compiler-builtins/libm/README.md42
-rw-r--r--library/compiler-builtins/libm/build.rs18
-rw-r--r--library/compiler-builtins/libm/configure.rs189
-rw-r--r--library/compiler-builtins/libm/src/lib.rs33
-rw-r--r--library/compiler-builtins/libm/src/libm_helper.rs244
-rw-r--r--library/compiler-builtins/libm/src/math/acos.rs112
-rw-r--r--library/compiler-builtins/libm/src/math/acosf.rs79
-rw-r--r--library/compiler-builtins/libm/src/math/acosh.rs27
-rw-r--r--library/compiler-builtins/libm/src/math/acoshf.rs26
-rw-r--r--library/compiler-builtins/libm/src/math/arch/aarch64.rs115
-rw-r--r--library/compiler-builtins/libm/src/math/arch/i586.rs37
-rw-r--r--library/compiler-builtins/libm/src/math/arch/mod.rs50
-rw-r--r--library/compiler-builtins/libm/src/math/arch/wasm32.rs50
-rw-r--r--library/compiler-builtins/libm/src/math/arch/x86.rs32
-rw-r--r--library/compiler-builtins/libm/src/math/arch/x86/detect.rs232
-rw-r--r--library/compiler-builtins/libm/src/math/arch/x86/fma.rs135
-rw-r--r--library/compiler-builtins/libm/src/math/asin.rs115
-rw-r--r--library/compiler-builtins/libm/src/math/asinf.rs68
-rw-r--r--library/compiler-builtins/libm/src/math/asinh.rs36
-rw-r--r--library/compiler-builtins/libm/src/math/asinhf.rs35
-rw-r--r--library/compiler-builtins/libm/src/math/atan.rs182
-rw-r--r--library/compiler-builtins/libm/src/math/atan2.rs131
-rw-r--r--library/compiler-builtins/libm/src/math/atan2f.rs90
-rw-r--r--library/compiler-builtins/libm/src/math/atanf.rs108
-rw-r--r--library/compiler-builtins/libm/src/math/atanh.rs33
-rw-r--r--library/compiler-builtins/libm/src/math/atanhf.rs33
-rw-r--r--library/compiler-builtins/libm/src/math/cbrt.rs219
-rw-r--r--library/compiler-builtins/libm/src/math/cbrtf.rs75
-rw-r--r--library/compiler-builtins/libm/src/math/ceil.rs46
-rw-r--r--library/compiler-builtins/libm/src/math/copysign.rs88
-rw-r--r--library/compiler-builtins/libm/src/math/copysignf.rs8
-rw-r--r--library/compiler-builtins/libm/src/math/copysignf128.rs8
-rw-r--r--library/compiler-builtins/libm/src/math/copysignf16.rs8
-rw-r--r--library/compiler-builtins/libm/src/math/cos.rs77
-rw-r--r--library/compiler-builtins/libm/src/math/cosf.rs86
-rw-r--r--library/compiler-builtins/libm/src/math/cosh.rs36
-rw-r--r--library/compiler-builtins/libm/src/math/coshf.rs36
-rw-r--r--library/compiler-builtins/libm/src/math/erf.rs314
-rw-r--r--library/compiler-builtins/libm/src/math/erff.rs226
-rw-r--r--library/compiler-builtins/libm/src/math/exp.rs150
-rw-r--r--library/compiler-builtins/libm/src/math/exp10.rs23
-rw-r--r--library/compiler-builtins/libm/src/math/exp10f.rs23
-rw-r--r--library/compiler-builtins/libm/src/math/exp2.rs394
-rw-r--r--library/compiler-builtins/libm/src/math/exp2f.rs135
-rw-r--r--library/compiler-builtins/libm/src/math/expf.rs97
-rw-r--r--library/compiler-builtins/libm/src/math/expm1.rs144
-rw-r--r--library/compiler-builtins/libm/src/math/expm1f.rs134
-rw-r--r--library/compiler-builtins/libm/src/math/expo2.rs14
-rw-r--r--library/compiler-builtins/libm/src/math/fabs.rs116
-rw-r--r--library/compiler-builtins/libm/src/math/fabsf.rs39
-rw-r--r--library/compiler-builtins/libm/src/math/fabsf128.rs31
-rw-r--r--library/compiler-builtins/libm/src/math/fabsf16.rs31
-rw-r--r--library/compiler-builtins/libm/src/math/fdim.rs53
-rw-r--r--library/compiler-builtins/libm/src/math/fdimf.rs12
-rw-r--r--library/compiler-builtins/libm/src/math/fdimf128.rs12
-rw-r--r--library/compiler-builtins/libm/src/math/fdimf16.rs12
-rw-r--r--library/compiler-builtins/libm/src/math/floor.rs46
-rw-r--r--library/compiler-builtins/libm/src/math/floorf.rs13
-rw-r--r--library/compiler-builtins/libm/src/math/floorf128.rs7
-rw-r--r--library/compiler-builtins/libm/src/math/floorf16.rs7
-rw-r--r--library/compiler-builtins/libm/src/math/fma.rs171
-rw-r--r--library/compiler-builtins/libm/src/math/fmin_fmax.rs167
-rw-r--r--library/compiler-builtins/libm/src/math/fminimum_fmaximum.rs163
-rw-r--r--library/compiler-builtins/libm/src/math/fminimum_fmaximum_num.rs163
-rw-r--r--library/compiler-builtins/libm/src/math/fmod.rs25
-rw-r--r--library/compiler-builtins/libm/src/math/fmodf.rs5
-rw-r--r--library/compiler-builtins/libm/src/math/fmodf128.rs5
-rw-r--r--library/compiler-builtins/libm/src/math/fmodf16.rs5
-rw-r--r--library/compiler-builtins/libm/src/math/frexp.rs21
-rw-r--r--library/compiler-builtins/libm/src/math/frexpf.rs22
-rw-r--r--library/compiler-builtins/libm/src/math/generic/ceil.rs174
-rw-r--r--library/compiler-builtins/libm/src/math/generic/copysign.rs11
-rw-r--r--library/compiler-builtins/libm/src/math/generic/fabs.rs8
-rw-r--r--library/compiler-builtins/libm/src/math/generic/fdim.rs6
-rw-r--r--library/compiler-builtins/libm/src/math/generic/floor.rs157
-rw-r--r--library/compiler-builtins/libm/src/math/generic/fma.rs278
-rw-r--r--library/compiler-builtins/libm/src/math/generic/fma_wide.rs73
-rw-r--r--library/compiler-builtins/libm/src/math/generic/fmax.rs24
-rw-r--r--library/compiler-builtins/libm/src/math/generic/fmaximum.rs28
-rw-r--r--library/compiler-builtins/libm/src/math/generic/fmaximum_num.rs27
-rw-r--r--library/compiler-builtins/libm/src/math/generic/fmin.rs24
-rw-r--r--library/compiler-builtins/libm/src/math/generic/fminimum.rs28
-rw-r--r--library/compiler-builtins/libm/src/math/generic/fminimum_num.rs27
-rw-r--r--library/compiler-builtins/libm/src/math/generic/fmod.rs68
-rw-r--r--library/compiler-builtins/libm/src/math/generic/mod.rs42
-rw-r--r--library/compiler-builtins/libm/src/math/generic/rint.rs130
-rw-r--r--library/compiler-builtins/libm/src/math/generic/round.rs83
-rw-r--r--library/compiler-builtins/libm/src/math/generic/scalbn.rs121
-rw-r--r--library/compiler-builtins/libm/src/math/generic/sqrt.rs541
-rw-r--r--library/compiler-builtins/libm/src/math/generic/trunc.rs148
-rw-r--r--library/compiler-builtins/libm/src/math/hypot.rs74
-rw-r--r--library/compiler-builtins/libm/src/math/hypotf.rs43
-rw-r--r--library/compiler-builtins/libm/src/math/ilogb.rs32
-rw-r--r--library/compiler-builtins/libm/src/math/ilogbf.rs28
-rw-r--r--library/compiler-builtins/libm/src/math/j0.rs426
-rw-r--r--library/compiler-builtins/libm/src/math/j0f.rs363
-rw-r--r--library/compiler-builtins/libm/src/math/j1.rs418
-rw-r--r--library/compiler-builtins/libm/src/math/j1f.rs384
-rw-r--r--library/compiler-builtins/libm/src/math/jn.rs339
-rw-r--r--library/compiler-builtins/libm/src/math/jnf.rs253
-rw-r--r--library/compiler-builtins/libm/src/math/k_cos.rs62
-rw-r--r--library/compiler-builtins/libm/src/math/k_cosf.rs29
-rw-r--r--library/compiler-builtins/libm/src/math/k_expo2.rs14
-rw-r--r--library/compiler-builtins/libm/src/math/k_expo2f.rs14
-rw-r--r--library/compiler-builtins/libm/src/math/k_sin.rs57
-rw-r--r--library/compiler-builtins/libm/src/math/k_sinf.rs30
-rw-r--r--library/compiler-builtins/libm/src/math/k_tan.rs105
-rw-r--r--library/compiler-builtins/libm/src/math/k_tanf.rs46
-rw-r--r--library/compiler-builtins/libm/src/math/ldexp.rs21
-rw-r--r--library/compiler-builtins/libm/src/math/ldexpf.rs4
-rw-r--r--library/compiler-builtins/libm/src/math/ldexpf128.rs4
-rw-r--r--library/compiler-builtins/libm/src/math/ldexpf16.rs4
-rw-r--r--library/compiler-builtins/libm/src/math/lgamma.rs8
-rw-r--r--library/compiler-builtins/libm/src/math/lgamma_r.rs321
-rw-r--r--library/compiler-builtins/libm/src/math/lgammaf.rs8
-rw-r--r--library/compiler-builtins/libm/src/math/lgammaf_r.rs256
-rw-r--r--library/compiler-builtins/libm/src/math/log.rs118
-rw-r--r--library/compiler-builtins/libm/src/math/log10.rs118
-rw-r--r--library/compiler-builtins/libm/src/math/log10f.rs92
-rw-r--r--library/compiler-builtins/libm/src/math/log1p.rs144
-rw-r--r--library/compiler-builtins/libm/src/math/log1pf.rs99
-rw-r--r--library/compiler-builtins/libm/src/math/log2.rs107
-rw-r--r--library/compiler-builtins/libm/src/math/log2f.rs88
-rw-r--r--library/compiler-builtins/libm/src/math/logf.rs66
-rw-r--r--library/compiler-builtins/libm/src/math/mod.rs394
-rw-r--r--library/compiler-builtins/libm/src/math/modf.rs35
-rw-r--r--library/compiler-builtins/libm/src/math/modff.rs34
-rw-r--r--library/compiler-builtins/libm/src/math/nextafter.rs37
-rw-r--r--library/compiler-builtins/libm/src/math/nextafterf.rs37
-rw-r--r--library/compiler-builtins/libm/src/math/pow.rs624
-rw-r--r--library/compiler-builtins/libm/src/math/powf.rs343
-rw-r--r--library/compiler-builtins/libm/src/math/rem_pio2.rs235
-rw-r--r--library/compiler-builtins/libm/src/math/rem_pio2_large.rs468
-rw-r--r--library/compiler-builtins/libm/src/math/rem_pio2f.rs67
-rw-r--r--library/compiler-builtins/libm/src/math/remainder.rs5
-rw-r--r--library/compiler-builtins/libm/src/math/remainderf.rs5
-rw-r--r--library/compiler-builtins/libm/src/math/remquo.rs106
-rw-r--r--library/compiler-builtins/libm/src/math/remquof.rs93
-rw-r--r--library/compiler-builtins/libm/src/math/rint.rs51
-rw-r--r--library/compiler-builtins/libm/src/math/round.rs25
-rw-r--r--library/compiler-builtins/libm/src/math/roundeven.rs36
-rw-r--r--library/compiler-builtins/libm/src/math/roundf.rs5
-rw-r--r--library/compiler-builtins/libm/src/math/roundf128.rs5
-rw-r--r--library/compiler-builtins/libm/src/math/roundf16.rs5
-rw-r--r--library/compiler-builtins/libm/src/math/scalbn.rs87
-rw-r--r--library/compiler-builtins/libm/src/math/scalbnf.rs4
-rw-r--r--library/compiler-builtins/libm/src/math/scalbnf128.rs4
-rw-r--r--library/compiler-builtins/libm/src/math/scalbnf16.rs4
-rw-r--r--library/compiler-builtins/libm/src/math/sin.rs95
-rw-r--r--library/compiler-builtins/libm/src/math/sincos.rs137
-rw-r--r--library/compiler-builtins/libm/src/math/sincosf.rs176
-rw-r--r--library/compiler-builtins/libm/src/math/sinf.rs96
-rw-r--r--library/compiler-builtins/libm/src/math/sinh.rs51
-rw-r--r--library/compiler-builtins/libm/src/math/sinhf.rs30
-rw-r--r--library/compiler-builtins/libm/src/math/sqrt.rs51
-rw-r--r--library/compiler-builtins/libm/src/math/sqrtf.rs15
-rw-r--r--library/compiler-builtins/libm/src/math/sqrtf128.rs5
-rw-r--r--library/compiler-builtins/libm/src/math/sqrtf16.rs11
-rw-r--r--library/compiler-builtins/libm/src/math/support/big.rs257
-rw-r--r--library/compiler-builtins/libm/src/math/support/big/tests.rs277
-rw-r--r--library/compiler-builtins/libm/src/math/support/env.rs130
-rw-r--r--library/compiler-builtins/libm/src/math/support/feature_detect.rs211
-rw-r--r--library/compiler-builtins/libm/src/math/support/float_traits.rs551
-rw-r--r--library/compiler-builtins/libm/src/math/support/hex_float.rs1181
-rw-r--r--library/compiler-builtins/libm/src/math/support/int_traits.rs455
-rw-r--r--library/compiler-builtins/libm/src/math/support/macros.rs157
-rw-r--r--library/compiler-builtins/libm/src/math/support/mod.rs32
-rw-r--r--library/compiler-builtins/libm/src/math/tan.rs74
-rw-r--r--library/compiler-builtins/libm/src/math/tanf.rs81
-rw-r--r--library/compiler-builtins/libm/src/math/tanh.rs53
-rw-r--r--library/compiler-builtins/libm/src/math/tanhf.rs38
-rw-r--r--library/compiler-builtins/libm/src/math/tgamma.rs209
-rw-r--r--library/compiler-builtins/libm/src/math/tgammaf.rs7
-rw-r--r--library/compiler-builtins/libm/src/math/trunc.rs53
-rw-r--r--library/compiler-builtins/libm/src/math/truncf.rs23
-rw-r--r--library/compiler-builtins/libm/src/math/truncf128.rs7
-rw-r--r--library/compiler-builtins/libm/src/math/truncf16.rs7
-rw-r--r--library/compiler-builtins/thumbv6m-linux-eabi.json28
-rw-r--r--library/compiler-builtins/thumbv7em-linux-eabi.json27
-rw-r--r--library/compiler-builtins/thumbv7em-linux-eabihf.json28
-rw-r--r--library/compiler-builtins/thumbv7m-linux-eabi.json27
378 files changed, 52996 insertions, 0 deletions
diff --git a/library/compiler-builtins/.editorconfig b/library/compiler-builtins/.editorconfig
new file mode 100644
index 00000000000..f0735cedfbd
--- /dev/null
+++ b/library/compiler-builtins/.editorconfig
@@ -0,0 +1,16 @@
+# EditorConfig helps developers define and maintain consistent
+# coding styles between different editors and IDEs
+# editorconfig.org
+
+root = true
+
+[*]
+end_of_line = lf
+charset = utf-8
+trim_trailing_whitespace = true
+insert_final_newline = true
+indent_style = space
+indent_size = 4
+
+[*.yml]
+indent_size = 2
diff --git a/library/compiler-builtins/.git-blame-ignore-revs b/library/compiler-builtins/.git-blame-ignore-revs
new file mode 100644
index 00000000000..2ede10da53d
--- /dev/null
+++ b/library/compiler-builtins/.git-blame-ignore-revs
@@ -0,0 +1,6 @@
+# Use `git config blame.ignorerevsfile .git-blame-ignore-revs` to make
+# `git blame` ignore the following commits.
+
+# Reformat with a new `.rustfmt.toml`
+# In rust-lang/libm this was 5882cabb83c30bf7c36023f9a55a80583636b0e8
+4bb07a6275cc628ef81c65ac971dc6479963322f
diff --git a/library/compiler-builtins/.github/workflows/main.yaml b/library/compiler-builtins/.github/workflows/main.yaml
new file mode 100644
index 00000000000..d13dd6b0f64
--- /dev/null
+++ b/library/compiler-builtins/.github/workflows/main.yaml
@@ -0,0 +1,344 @@
+name: CI
+on:
+  push: { branches: [master] }
+  pull_request:
+
+concurrency:
+  # Make sure that new pushes cancel running jobs
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+env:
+  CARGO_TERM_COLOR: always
+  RUSTDOCFLAGS: -Dwarnings
+  RUSTFLAGS: -Dwarnings
+  RUST_BACKTRACE: full
+  BENCHMARK_RUSTC: nightly-2025-01-16 # Pin the toolchain for reproducible results
+
+jobs:
+  # Determine which tests should be run based on changed files.
+  calculate_vars:
+    name: Calculate workflow variables
+    runs-on: ubuntu-24.04
+    timeout-minutes: 10
+    env:
+      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      PR_NUMBER: ${{ github.event.pull_request.number }}
+    outputs:
+      extensive_matrix: ${{ steps.script.outputs.extensive_matrix }}
+      may_skip_libm_ci: ${{ steps.script.outputs.may_skip_libm_ci }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 500
+      - name: Fetch pull request ref
+        run: git fetch origin "$GITHUB_REF:$GITHUB_REF"
+        if: github.event_name == 'pull_request'
+      - run: python3 ci/ci-util.py generate-matrix >> "$GITHUB_OUTPUT"
+        id: script
+
+  test:
+    name: Build and test
+    timeout-minutes: 60
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+        - target: aarch64-apple-darwin
+          os: macos-15
+        - target: aarch64-unknown-linux-gnu
+          os: ubuntu-24.04-arm
+        - target: aarch64-pc-windows-msvc
+          os: windows-2025
+          test_verbatim: 1
+          build_only: 1
+        - target: arm-unknown-linux-gnueabi
+          os: ubuntu-24.04
+        - target: arm-unknown-linux-gnueabihf
+          os: ubuntu-24.04
+        - target: armv7-unknown-linux-gnueabihf
+          os: ubuntu-24.04
+        - target: i586-unknown-linux-gnu
+          os: ubuntu-24.04
+        - target: i686-unknown-linux-gnu
+          os: ubuntu-24.04
+        - target: loongarch64-unknown-linux-gnu
+          os: ubuntu-24.04
+        - target: powerpc-unknown-linux-gnu
+          os: ubuntu-24.04
+        - target: powerpc64-unknown-linux-gnu
+          os: ubuntu-24.04
+        - target: powerpc64le-unknown-linux-gnu
+          os: ubuntu-24.04
+        - target: riscv64gc-unknown-linux-gnu
+          os: ubuntu-24.04
+        - target: thumbv6m-none-eabi
+          os: ubuntu-24.04
+        - target: thumbv7em-none-eabi
+          os: ubuntu-24.04
+        - target: thumbv7em-none-eabihf
+          os: ubuntu-24.04
+        - target: thumbv7m-none-eabi
+          os: ubuntu-24.04
+        - target: wasm32-unknown-unknown
+          os: ubuntu-24.04
+        - target: x86_64-unknown-linux-gnu
+          os: ubuntu-24.04
+        - target: x86_64-apple-darwin
+          os: macos-13
+        - target: i686-pc-windows-msvc
+          os: windows-2025
+          test_verbatim: 1
+        - target: x86_64-pc-windows-msvc
+          os: windows-2025
+          test_verbatim: 1
+        - target: i686-pc-windows-gnu
+          os: windows-2025
+          channel: nightly-i686-gnu
+        - target: x86_64-pc-windows-gnu
+          os: windows-2025
+          channel: nightly-x86_64-gnu
+    runs-on: ${{ matrix.os }}
+    needs: [calculate_vars]
+    env:
+      BUILD_ONLY: ${{ matrix.build_only }}
+      TEST_VERBATIM: ${{ matrix.test_verbatim }}
+      MAY_SKIP_LIBM_CI: ${{ needs.calculate_vars.outputs.may_skip_libm_ci }}
+    steps:
+    - name: Print runner information
+      run: uname -a
+    - uses: actions/checkout@v4
+      with:
+        submodules: true
+    - name: Install Rust (rustup)
+      shell: bash
+      run: |
+        channel="nightly"
+        # Account for channels that have required components (MinGW)
+        [ -n "${{ matrix.channel }}" ] && channel="${{ matrix.channel }}"
+        rustup update "$channel" --no-self-update
+        rustup default "$channel"
+        rustup target add "${{ matrix.target }}"
+        rustup component add llvm-tools-preview
+    - uses: taiki-e/install-action@nextest
+    - uses: Swatinem/rust-cache@v2
+      with:
+        key: ${{ matrix.target }}
+    - name: Cache Docker layers
+      uses: actions/cache@v4
+      if: matrix.os == 'ubuntu-24.04'
+      with:
+        path: /tmp/.buildx-cache
+        key: ${{ matrix.target }}-buildx-${{ github.sha }}
+        restore-keys: ${{ matrix.target }}-buildx-
+    # Configure buildx to use Docker layer caching
+    - uses: docker/setup-buildx-action@v3
+      if: matrix.os == 'ubuntu-24.04'
+
+    - name: Cache compiler-rt
+      id: cache-compiler-rt
+      uses: actions/cache@v4
+      with:
+        path: compiler-rt
+        key: ${{ runner.os }}-compiler-rt-${{ hashFiles('ci/download-compiler-rt.sh') }}
+    - name: Download compiler-rt reference sources
+      if: steps.cache-compiler-rt.outputs.cache-hit != 'true'
+      run: ./ci/download-compiler-rt.sh
+      shell: bash
+    - run: echo "RUST_COMPILER_RT_ROOT=$(realpath ./compiler-rt)" >> "$GITHUB_ENV"
+      shell: bash
+
+    - name: Verify API list
+      if: matrix.os == 'ubuntu-24.04'
+      run: python3 etc/update-api-list.py --check
+
+    # Non-linux tests just use our raw script
+    - name: Run locally
+      if: matrix.os != 'ubuntu-24.04'
+      shell: bash
+      run: ./ci/run.sh ${{ matrix.target }}
+
+    # Otherwise we use our docker containers to run builds
+    - name: Run in Docker
+      if: matrix.os == 'ubuntu-24.04'
+      run: ./ci/run-docker.sh ${{ matrix.target }}
+
+    - name: Print test logs if available
+      if: always()
+      run: if [ -f "target/test-log.txt" ]; then cat target/test-log.txt; fi
+      shell: bash
+
+    # Workaround to keep Docker cache smaller
+    # https://github.com/docker/build-push-action/issues/252
+    # https://github.com/moby/buildkit/issues/1896
+    - name: Move Docker cache
+      if: matrix.os == 'ubuntu-24.04'
+      run: |
+        rm -rf /tmp/.buildx-cache
+        mv /tmp/.buildx-cache-new /tmp/.buildx-cache
+
+  clippy:
+    name: Clippy
+    runs-on: ubuntu-24.04
+    timeout-minutes: 10
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        submodules: true
+    # Unlike rustfmt, stable clippy does not work on code with nightly features.
+    - name: Install nightly `clippy`
+      run: |
+        rustup set profile minimal
+        rustup default nightly
+        rustup component add clippy
+    - uses: Swatinem/rust-cache@v2
+    - run: cargo clippy --workspace --all-targets
+
+  benchmarks:
+    name: Benchmarks
+    runs-on: ubuntu-24.04
+    timeout-minutes: 20
+    steps:
+    - uses: actions/checkout@master
+      with:
+        submodules: true
+    - uses: taiki-e/install-action@cargo-binstall
+
+    - name: Set up dependencies
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y valgrind gdb libc6-dbg # Needed for iai-callgrind
+        rustup update "$BENCHMARK_RUSTC" --no-self-update
+        rustup default "$BENCHMARK_RUSTC"
+        # Install the version of iai-callgrind-runner that is specified in Cargo.toml
+        iai_version="$(cargo metadata --format-version=1 --features icount |
+           jq -r '.packages[] | select(.name == "iai-callgrind").version')"
+        cargo binstall -y iai-callgrind-runner --version "$iai_version"
+        sudo apt-get install valgrind
+    - uses: Swatinem/rust-cache@v2
+
+    - name: Run icount benchmarks
+      env:
+        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        PR_NUMBER: ${{ github.event.pull_request.number }}
+      run: ./ci/bench-icount.sh
+
+    - name: Upload the benchmark baseline
+      uses: actions/upload-artifact@v4
+      with:
+        name: ${{ env.BASELINE_NAME }}
+        path: ${{ env.BASELINE_NAME }}.tar.xz
+    
+    - name: Run wall time benchmarks
+      run: |
+        # Always use the same seed for benchmarks. Ideally we should switch to a
+        # non-random generator.
+        export LIBM_SEED=benchesbenchesbenchesbencheswoo!
+        cargo bench --package libm-test \
+          --no-default-features \
+          --features short-benchmarks,build-musl,libm/force-soft-floats
+
+    - name: Print test logs if available
+      if: always()
+      run: if [ -f "target/test-log.txt" ]; then cat target/test-log.txt; fi
+      shell: bash
+
+  miri:
+    name: Miri
+    runs-on: ubuntu-24.04
+    timeout-minutes: 10
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        submodules: true
+    - name: Install Rust (rustup)
+      run: rustup update nightly --no-self-update && rustup default nightly
+      shell: bash
+    - run: rustup component add miri
+    - run: cargo miri setup
+    - uses: Swatinem/rust-cache@v2
+    - run: ./ci/miri.sh
+
+  msrv:
+    name: Check libm MSRV
+    runs-on: ubuntu-24.04
+    timeout-minutes: 10
+    env:
+      RUSTFLAGS: # No need to check warnings on old MSRV, unset `-Dwarnings`
+    steps:
+    - uses: actions/checkout@master
+    - name: Install Rust
+      run: |
+        msrv="$(perl -ne 'print if s/rust-version\s*=\s*"(.*)"/\1/g' libm/Cargo.toml)"
+        echo "MSRV: $msrv"
+        rustup update "$msrv" --no-self-update && rustup default "$msrv"
+    - uses: Swatinem/rust-cache@v2
+    - run: |
+        # FIXME(msrv): Remove the workspace Cargo.toml so 1.63 cargo doesn't see
+        # `edition = "2024"` and get spooked.
+        rm Cargo.toml
+        cargo build --manifest-path libm/Cargo.toml
+
+  rustfmt:
+    name: Rustfmt
+    runs-on: ubuntu-24.04
+    timeout-minutes: 10
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        submodules: true
+    - name: Install stable `rustfmt`
+      run: rustup set profile minimal && rustup default stable && rustup component add rustfmt
+    - run: cargo fmt -- --check
+
+  extensive:
+    name: Extensive tests for ${{ matrix.ty }}
+    needs:
+      # Wait on `clippy` so we have some confidence that the crate will build
+      - clippy
+      - calculate_vars
+    runs-on: ubuntu-24.04
+    timeout-minutes: 240 # 4 hours
+    strategy:
+      matrix:
+        # Use the output from `calculate_vars` to create the matrix
+        # FIXME: it would be better to run all jobs (i.e. all types) but mark those that
+        # didn't change as skipped, rather than completely excluding the job. However,
+        # this is not currently possible https://github.com/actions/runner/issues/1985.
+        include: ${{ fromJSON(needs.calculate_vars.outputs.extensive_matrix).extensive_matrix }}
+    env:
+      TO_TEST: ${{ matrix.to_test }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: true
+      - name: Install Rust
+        run: |
+          rustup update nightly --no-self-update
+          rustup default nightly
+      - uses: Swatinem/rust-cache@v2
+      - name: Run extensive tests
+        run: ./ci/run-extensive.sh
+      - name: Print test logs if available
+        run: if [ -f "target/test-log.txt" ]; then cat target/test-log.txt; fi
+        shell: bash
+
+  success:
+    needs:
+      - benchmarks
+      - clippy
+      - extensive
+      - miri
+      - msrv
+      - rustfmt
+      - test
+    runs-on: ubuntu-24.04
+    timeout-minutes: 10
+    # GitHub branch protection is exceedingly silly and treats "jobs skipped because a dependency
+    # failed" as success. So we have to do some contortions to ensure the job fails if any of its
+    # dependencies fails.
+    if: always() # make sure this is never "skipped"
+    steps:
+      # Manually check the status of all dependencies. `if: failure()` does not work.
+      - name: check if any dependency failed
+        run: jq --exit-status 'all(.result == "success")' <<< '${{ toJson(needs) }}'
diff --git a/library/compiler-builtins/.github/workflows/publish.yaml b/library/compiler-builtins/.github/workflows/publish.yaml
new file mode 100644
index 00000000000..85a33c039d2
--- /dev/null
+++ b/library/compiler-builtins/.github/workflows/publish.yaml
@@ -0,0 +1,25 @@
+name: Release-plz
+
+permissions:
+  pull-requests: write
+  contents: write
+
+on:
+  push: { branches: [master] }
+
+jobs:
+  release-plz:
+    name: Release-plz
+    runs-on: ubuntu-24.04
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Install Rust (rustup)
+        run: rustup update nightly --no-self-update && rustup default nightly
+      - name: Run release-plz
+        uses: MarcoIeni/release-plz-action@v0.5
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
diff --git a/library/compiler-builtins/.gitignore b/library/compiler-builtins/.gitignore
new file mode 100644
index 00000000000..5287a6c72be
--- /dev/null
+++ b/library/compiler-builtins/.gitignore
@@ -0,0 +1,16 @@
+# Rust files
+Cargo.lock
+target
+
+# Sources for external files
+compiler-rt
+*.tar.gz
+
+# Benchmark cache
+baseline-*
+iai-home
+
+# Temporary files
+*.bk
+*.rs.bk
+.#*
diff --git a/library/compiler-builtins/.gitmodules b/library/compiler-builtins/.gitmodules
new file mode 100644
index 00000000000..792ed9ab21f
--- /dev/null
+++ b/library/compiler-builtins/.gitmodules
@@ -0,0 +1,4 @@
+[submodule "crates/musl-math-sys/musl"]
+	path = crates/musl-math-sys/musl
+	url = https://git.musl-libc.org/git/musl
+	shallow = true
diff --git a/library/compiler-builtins/.release-plz.toml b/library/compiler-builtins/.release-plz.toml
new file mode 100644
index 00000000000..8023ade9bfd
--- /dev/null
+++ b/library/compiler-builtins/.release-plz.toml
@@ -0,0 +1,13 @@
+[workspace]
+# As part of the release process, we delete `libm/Cargo.toml`. Since
+# this is only run in CI, we shouldn't need to worry about it.
+allow_dirty = true
+publish_allow_dirty = true
+
+[[package]]
+name = "compiler_builtins"
+semver_check = false
+changelog_include = ["libm"] # libm is included as part of builtins
+
+[[package]]
+name = "libm"
diff --git a/library/compiler-builtins/.rustfmt.toml b/library/compiler-builtins/.rustfmt.toml
new file mode 100644
index 00000000000..79ac399c1b6
--- /dev/null
+++ b/library/compiler-builtins/.rustfmt.toml
@@ -0,0 +1,4 @@
+# This matches rustc
+style_edition = "2024"
+group_imports = "StdExternalCrate"
+imports_granularity = "Module"
diff --git a/library/compiler-builtins/CONTRIBUTING.md b/library/compiler-builtins/CONTRIBUTING.md
new file mode 100644
index 00000000000..9f67cfc3157
--- /dev/null
+++ b/library/compiler-builtins/CONTRIBUTING.md
@@ -0,0 +1,167 @@
+# How to contribute
+
+## compiler-builtins
+
+1. From the [pending list](compiler-builtins/README.md#progress), pick one or
+   more intrinsics.
+2. Port the version from [`compiler-rt`] and, if applicable, their
+   [tests][rt-tests]. Note that this crate has generic implementations for a lot
+   of routines, which may be usable without porting the entire implementation.
+3. Add a test to `builtins-test`, comparing the behavior of the ported
+   intrinsic(s) with their implementation on the testing host (a rough sketch
+   of such a test follows this list).
+4. Add the intrinsic to `builtins-test-intrinsics/src/main.rs` to verify it can
+   be linked on all targets.
+5. Send a Pull Request (PR) :tada:.
+
+[`compiler-rt`]: https://github.com/llvm/llvm-project/tree/b6820c35c59a4da3e59c11f657093ffbd79ae1db/compiler-rt/lib/builtins
+[rt-tests]: https://github.com/llvm/llvm-project/tree/b6820c35c59a4da3e59c11f657093ffbd79ae1db/compiler-rt/test/builtins
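+
+As a loose illustration of step 3, here is a minimal sketch of a
+host-comparison test. `my_addsf3` is a hypothetical stand-in for a ported
+routine (it is not a function in this repository), and the real tests in
+`builtins-test` generally fuzz many inputs rather than using a fixed list.
+
+```rust
+// Hypothetical stand-in for a ported soft-float routine such as `__addsf3`.
+// A real port would implement the operation without using the host's `+`.
+fn my_addsf3(a: f32, b: f32) -> f32 {
+    a + b
+}
+
+#[test]
+fn matches_host_addition() {
+    for (a, b) in [(0.5f32, 1.25), (-0.0, 0.0), (f32::MAX, f32::MAX)] {
+        // Compare bit patterns so that signed zeros (and, in a fuller test,
+        // NaN payloads) are checked exactly rather than by float equality.
+        assert_eq!(my_addsf3(a, b).to_bits(), (a + b).to_bits());
+    }
+}
+```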
+
+## Porting Reminders
+
+1. [Rust][prec-rust] and [C][prec-c] have slightly different operator
+   precedence. C evaluates comparisons (`== !=`) before bitwise operations
+   (`& | ^`), while Rust evaluates the bitwise operations first.
+2. C assumes wrapping operations everywhere. Rust panics on overflow when in
+   debug mode. Consider using the [Wrapping][wrap-ty] type or the explicit
+   [wrapping_*][wrap-fn] functions where applicable; see the sketch after this
+   list.
+3. Note [C implicit casts][casts], especially integer promotion. Rust is much
+   more explicit about casting, so be sure that any cast which affects the
+   output is ported to the Rust implementation.
+4. Rust has [many functions][i32] for integer or floating point manipulation in
+   the standard library. Consider using one of these functions rather than
+   porting a new one.
+
+[prec-rust]: https://doc.rust-lang.org/reference/expressions.html#expression-precedence
+[prec-c]: http://en.cppreference.com/w/c/language/operator_precedence
+[wrap-ty]: https://doc.rust-lang.org/core/num/struct.Wrapping.html
+[wrap-fn]: https://doc.rust-lang.org/std/primitive.i32.html#method.wrapping_add
+[casts]: http://en.cppreference.com/w/cpp/language/implicit_conversion
+[i32]: https://doc.rust-lang.org/std/primitive.i32.html
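+
+As a rough sketch (not code from this repository) of reminders 1 and 2, the
+following illustrates the precedence difference and the use of wrapping
+arithmetic; the names and values are made up purely for illustration.
+
+```rust
+fn main() {
+    let x: u32 = 0b1010;
+    let mask: u32 = 0b0010;
+
+    // C parses `x & mask == 0` as `x & (mask == 0)` because `==` binds more
+    // tightly than `&`, while Rust parses the same text as `(x & mask) == 0`.
+    // Parenthesize explicitly so a port preserves the original C meaning.
+    let c_meaning = x & ((mask == 0) as u32); // what the C expression computes
+    let rust_meaning = (x & mask) == 0; // what the unmodified Rust text computes
+    assert_eq!(c_meaning, 0);
+    assert!(!rust_meaning);
+
+    // Unsigned arithmetic wraps in C, but plain `+` panics on overflow in
+    // debug builds of Rust; use the explicit wrapping methods when porting.
+    assert_eq!(u32::MAX.wrapping_add(1), 0);
+}
+```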
+
+## Tips and tricks
+
+- _IMPORTANT_ The code in this crate will end up being used in the `core` crate
+  so it can **not** have any external dependencies (other than a subset of
+  `core` itself).
+- Only use relative imports within the `math` directory / module, e.g.
+  `use self::fabs::fabs` or `use super::k_cos`. Absolute imports from core are
+  OK, e.g. `use core::u64`.
+- To reinterpret a float as an integer use the `to_bits` method (see the
+  sketch after this list). The MUSL code uses the `GET_FLOAT_WORD` macro, or a
+  union, to do this operation.
+- To reinterpret an integer as a float use the `f32::from_bits` constructor. The
+  MUSL code uses the `SET_FLOAT_WORD` macro, or a union, to do this operation.
+- You may use other methods from core like `f64::is_nan`, etc. as appropriate.
+- Rust does not have hex float literals. This crate provides the `hf16!`,
+  `hf32!`, `hf64!`, and `hf128!` macros, which convert string literals to
+  floats at compile time.
+
+  ```rust
+  assert_eq!(hf32!("0x1.ffep+8").to_bits(), 0x43fff000);
+  assert_eq!(hf64!("0x1.ffep+8").to_bits(), 0x407ffe0000000000);
+  ```
+
+- Rust code panics on arithmetic overflows when not optimized. You may need to
+  use the [`Wrapping`] newtype to avoid this problem, or individual methods like
+  [`wrapping_add`].
+
+[`Wrapping`]: https://doc.rust-lang.org/std/num/struct.Wrapping.html
+[`wrapping_add`]: https://doc.rust-lang.org/std/primitive.u32.html#method.wrapping_add
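+
+The `to_bits`/`from_bits` tips above correspond to MUSL's `GET_FLOAT_WORD` and
+`SET_FLOAT_WORD` macros. A minimal sketch, with values chosen only for
+illustration:
+
+```rust
+fn main() {
+    // `GET_FLOAT_WORD` equivalent: read the raw bit pattern of an f32.
+    let x = 1.0f32;
+    let word: u32 = x.to_bits();
+    assert_eq!(word, 0x3f80_0000);
+
+    // `SET_FLOAT_WORD` equivalent: build an f32 from a bit pattern, here
+    // flipping the sign bit directly.
+    let negated = f32::from_bits(word ^ 0x8000_0000);
+    assert_eq!(negated, -1.0);
+}
+```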
+
+## Testing
+
+Testing for these crates can be somewhat complex, so feel free to rely on CI.
+
+The easiest way to replicate CI testing is using Docker. This can be done by
+running `./ci/run-docker.sh [target]`. If no target is specified, all targets
+will be run.
+
+Tests can also be run without Docker:
+
+```sh
+# Run basic tests
+#
+# --no-default-features always needs to be passed, an unfortunate limitation
+# since the `#![compiler_builtins]` feature is enabled by default.
+cargo test --workspace --no-default-features
+
+# Test with all interesting features
+cargo test --workspace --no-default-features \
+    --features arch,unstable-float,unstable-intrinsics,mem
+
+# Run with more detailed tests for libm
+cargo test --workspace --no-default-features \
+    --features arch,unstable-float,unstable-intrinsics,mem \
+    --features build-mpfr,build-musl \
+    --profile release-checked
+```
+
+The multiprecision tests use the [`rug`] crate for bindings to MPFR. MPFR can be
+difficult to build on non-Unix systems; refer to [`gmp_mpfr_sys`] for help.
+
+`build-musl` does not build with MSVC, Wasm, or Thumb.
+
+[`rug`]: https://docs.rs/rug/latest/rug/
+[`gmp_mpfr_sys`]: https://docs.rs/gmp-mpfr-sys/1.6.4/gmp_mpfr_sys/
+
+In order to run all tests, some dependencies may be required:
+
+```sh
+# Allow testing compiler-builtins
+./ci/download-compiler-rt.sh
+
+# Optional, initialize musl for `--features build-musl`
+git submodule init
+git submodule update
+
+# `--release` enables more test cases
+cargo test --release
+```
+
+### Extensive tests
+
+Libm also has tests that are exhaustive (for single-argument `f32` and 1- or 2-
+argument `f16`) or extensive (for all other float and argument combinations).
+These take quite a long time to run, but are launched in CI when relevant files
+are changed.
+
+Exhaustive tests can be selected by passing an environment variable:
+
+```sh
+LIBM_EXTENSIVE_TESTS=sqrt,sqrtf cargo test --features build-mpfr \
+    --test z_extensive \
+    --profile release-checked
+
+# Run all tests for one type
+LIBM_EXTENSIVE_TESTS=all_f16 cargo test ...
+
+# Ensure `f64` tests can run exhaustively. The estimated completion time for a
+# single test is 57306 years on my machine, so this may be worth skipping.
+LIBM_EXTENSIVE_TESTS=all LIBM_EXTENSIVE_ITERATIONS=18446744073709551615 cargo test ...
+```
+
+## Benchmarking
+
+Regular wall time benchmarks can be run with `cargo bench`:
+
+```sh
+cargo bench --no-default-features \
+    --features arch,unstable-float,unstable-intrinsics,mem \
+    --features benchmarking-reports
+```
+
+There are also benchmarks that check instruction count, behind the `icount`
+feature. These require [`iai-callgrind-runner`] (installed via Cargo) and
+[Valgrind] to be installed, so they only run on a limited set of platforms.
+
+Instruction count benchmarks are run as part of CI to flag performance
+regressions.
+
+```sh
+cargo bench --no-default-features \
+    --features arch,unstable-float,unstable-intrinsics,mem \
+    --features icount \
+    --bench icount --bench mem_icount
+```
+
+[`iai-callgrind-runner`]: https://crates.io/crates/iai-callgrind-runner
+[Valgrind]: https://valgrind.org/
diff --git a/library/compiler-builtins/Cargo.toml b/library/compiler-builtins/Cargo.toml
new file mode 100644
index 00000000000..b39ec8a25da
--- /dev/null
+++ b/library/compiler-builtins/Cargo.toml
@@ -0,0 +1,50 @@
+[workspace]
+resolver = "2"
+members = [
+    "builtins-test",
+    "compiler-builtins",
+    "crates/libm-macros",
+    "crates/musl-math-sys",
+    "crates/panic-handler",
+    "crates/util",
+    "libm",
+    "libm-test",
+]
+
+default-members = [
+    "builtins-test",
+    "compiler-builtins",
+    "crates/libm-macros",
+    "libm",
+    "libm-test",
+]
+
+exclude = [
+    # `builtins-test-intrinsics` needs the feature `compiler-builtins` enabled
+    # and `mangled-names` disabled, which is the opposite of what is needed for
+    # other tests, so it makes sense to keep it out of the workspace.
+    "builtins-test-intrinsics",
+]
+
+[profile.release]
+panic = "abort"
+
+[profile.dev]
+panic = "abort"
+
+# Release mode with debug assertions
+[profile.release-checked]
+inherits = "release"
+debug-assertions = true
+overflow-checks = true
+
+# Release with maximum optimizations, which is very slow to build. This is also
+# what is needed to check `no-panic`.
+[profile.release-opt]
+inherits = "release"
+codegen-units = 1
+lto = "fat"
+
+[profile.bench]
+# Required for iai-callgrind
+debug = true
diff --git a/library/compiler-builtins/LICENSE.txt b/library/compiler-builtins/LICENSE.txt
new file mode 100644
index 00000000000..00ae6140bd5
--- /dev/null
+++ b/library/compiler-builtins/LICENSE.txt
@@ -0,0 +1,275 @@
+The compiler-builtins crate is available for use under both the MIT license
+and the Apache-2.0 license with the LLVM exception (MIT AND Apache-2.0 WITH
+LLVM-exception).
+
+The libm crate is available for use under the MIT license.
+
+As a contributor, you agree that your code may be used under any of the
+following: the MIT license, the Apache-2.0 license, or the Apache-2.0 license
+with the LLVM exception. In other words, original (non-derivative) work is
+licensed under MIT OR Apache-2.0 OR Apache-2.0 WITH LLVM-exception. This is
+the default license for all other source in this repository.
+
+Text of the relevant licenses is provided below:
+
+------------------------------------------------------------------------------
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+    1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+    2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+    3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+    4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+    5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+    6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+    7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+    8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+    9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+    END OF TERMS AND CONDITIONS
+
+    APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+
+---- LLVM Exceptions to the Apache 2.0 License ----
+
+As an exception, if, as a result of your compiling your source code, portions
+of this Software are embedded into an Object form of such source code, you
+may redistribute such embedded portions in such Object form without complying
+with the conditions of Sections 4(a), 4(b) and 4(d) of the License.
+
+In addition, if you combine or link compiled forms of this Software with
+software that is licensed under the GPLv2 ("Combined Software") and if a
+court of competent jurisdiction determines that the patent provision (Section
+3), the indemnity provision (Section 9) or other Section of the License
+conflicts with the conditions of the GPLv2, you may retroactively and
+prospectively choose to deem waived or otherwise exclude such Section(s) of
+the License, but only in their entirety and only with respect to the Combined
+Software.
+------------------------------------------------------------------------------
+
+Portions of this software are derived from third-party works licensed under
+terms compatible with the above Apache-2.0 WITH LLVM-exception AND MIT
+license:
+
+* compiler-builtins is derived from LLVM's compiler-rt (https://llvm.org/).
+  Work derived from compiler-rt prior to 2019-01-19 is usable under the MIT
+  license, with the following copyright:
+
+      Copyright (c) 2009-2016 by the contributors listed in CREDITS.TXT
+
+  The relevant CREDITS.TXT is located at
+  https://github.com/llvm/llvm-project/blob/main/compiler-rt/CREDITS.TXT.
+
+* Work derived from compiler-rt after 2019-01-19 is usable under the
+  Apache-2.0 license with the LLVM exception.
+
+* The bundled `math` module is from the libm crate, usable under the MIT
+  license. For further details and copyrights, see libm/LICENSE.txt at
+  https://github.com/rust-lang/compiler-builtins.
+
+Additionally, some source files may contain comments with specific copyrights
+or licenses.
diff --git a/library/compiler-builtins/PUBLISHING.md b/library/compiler-builtins/PUBLISHING.md
new file mode 100644
index 00000000000..3df682ab04a
--- /dev/null
+++ b/library/compiler-builtins/PUBLISHING.md
@@ -0,0 +1,16 @@
+# Publishing to crates.io
+
+Publishing `compiler-builtins` to crates.io unfortunately takes a few steps.
+It's not great, but it works for now. PRs to improve this process would be
+greatly appreciated!
+
+1. Make sure you've got a clean working tree and it's updated with the latest
+   changes on `master`
+2. Edit `Cargo.toml` to bump the version number
+3. Commit this change
+4. Run `git tag <version>` to create a tag for this version
+5. Delete the `libm/Cargo.toml` file
+6. Run `cargo +nightly publish`
+7. Push the tag
+8. Push the commit
+9. Undo changes to `Cargo.toml` and the `libm` submodule
diff --git a/library/compiler-builtins/README.md b/library/compiler-builtins/README.md
new file mode 100644
index 00000000000..3130ff7b77d
--- /dev/null
+++ b/library/compiler-builtins/README.md
@@ -0,0 +1,27 @@
+# `compiler-builtins` and `libm`
+
+This repository contains two main crates:
+
+* `compiler-builtins`: symbols that the compiler expects to be available at
+  link time
+* `libm`: a Rust implementation of C math libraries, used to provide
+  implementations in `core`.
+
+More details are at [compiler-builtins/README.md](compiler-builtins/README.md)
+and [libm/README.md](libm/README.md).
+
+For instructions on contributing, see [CONTRIBUTING.md](CONTRIBUTING.md).
+
+## License
+
+* `libm` may be used under the [MIT License]
+* `compiler-builtins` may be used under the [MIT License] and the
+  [Apache License, Version 2.0] with the LLVM exception.
+* All original contributions must be under all of: the MIT license, the
+  Apache-2.0 license, and the Apache-2.0 license with the LLVM exception.
+
+More details are in [LICENSE.txt](LICENSE.txt) and
+[libm/LICENSE.txt](libm/LICENSE.txt).
+
+[MIT License]: https://opensource.org/license/mit
+[Apache License, Version 2.0]: https://www.apache.org/licenses/LICENSE-2.0
diff --git a/library/compiler-builtins/builtins-test-intrinsics/Cargo.toml b/library/compiler-builtins/builtins-test-intrinsics/Cargo.toml
new file mode 100644
index 00000000000..6e10628a41b
--- /dev/null
+++ b/library/compiler-builtins/builtins-test-intrinsics/Cargo.toml
@@ -0,0 +1,19 @@
+[package]
+name = "builtins-test-intrinsics"
+version = "0.1.0"
+edition = "2021"
+publish = false
+license = "MIT OR Apache-2.0"
+
+[dependencies]
+compiler_builtins = { path = "../compiler-builtins", features = ["compiler-builtins"]}
+panic-handler = { path = "../crates/panic-handler" }
+
+[features]
+c = ["compiler_builtins/c"]
+
+[profile.release]
+panic = "abort"
+
+[profile.dev]
+panic = "abort"
diff --git a/library/compiler-builtins/builtins-test-intrinsics/build.rs b/library/compiler-builtins/builtins-test-intrinsics/build.rs
new file mode 100644
index 00000000000..89b126ff2b2
--- /dev/null
+++ b/library/compiler-builtins/builtins-test-intrinsics/build.rs
@@ -0,0 +1,11 @@
+mod builtins_configure {
+    include!("../compiler-builtins/configure.rs");
+}
+
+fn main() {
+    println!("cargo::rerun-if-changed=../configure.rs");
+
+    let target = builtins_configure::Target::from_env();
+    builtins_configure::configure_f16_f128(&target);
+    builtins_configure::configure_aliases(&target);
+}
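
The `f16_enabled`/`f128_enabled` cfg flags checked throughout the rest of this diff come out of this configure step. Below is a minimal sketch of what such a step can look like; the real `configure.rs` is not part of this diff, and the `Target` shape and the always-on enablement policy here are assumptions for illustration only.

```rust
// Hypothetical sketch only; the actual logic lives in compiler-builtins/configure.rs.
pub struct Target {
    pub arch: String,
}

impl Target {
    pub fn from_env() -> Self {
        // Cargo exposes the target architecture to build scripts via this variable.
        Self {
            arch: std::env::var("CARGO_CFG_TARGET_ARCH").unwrap(),
        }
    }
}

pub fn configure_f16_f128(_target: &Target) {
    // Declare the custom cfgs so `--check-cfg` lints accept them...
    println!("cargo::rustc-check-cfg=cfg(f16_enabled)");
    println!("cargo::rustc-check-cfg=cfg(f128_enabled)");
    // ...and (placeholder policy) turn them on unconditionally. The real crate keys
    // this off per-target knowledge of `f16`/`f128` backend support.
    println!("cargo::rustc-cfg=f16_enabled");
    println!("cargo::rustc-cfg=f128_enabled");
}
```
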
diff --git a/library/compiler-builtins/builtins-test-intrinsics/src/main.rs b/library/compiler-builtins/builtins-test-intrinsics/src/main.rs
new file mode 100644
index 00000000000..1fa7b00916f
--- /dev/null
+++ b/library/compiler-builtins/builtins-test-intrinsics/src/main.rs
@@ -0,0 +1,697 @@
+// By compiling this file we check that all the intrinsics we care about continue to be provided by
+// the `compiler_builtins` crate regardless of the changes we make to it. If we, by mistake, stop
+// compiling a C implementation and forget to implement that intrinsic in Rust, this file will fail
+// to link due to the missing intrinsic (symbol).
+
+#![allow(unused_features)]
+#![allow(internal_features)]
+#![deny(dead_code)]
+#![feature(allocator_api)]
+#![feature(f128)]
+#![feature(f16)]
+#![feature(lang_items)]
+#![no_std]
+#![no_main]
+
+extern crate panic_handler;
+
+#[cfg(all(not(thumb), not(windows), not(target_arch = "wasm32")))]
+#[link(name = "c")]
+extern "C" {}
+
+// Every function in this module will be lowered to an intrinsic by LLVM if the platform
+// doesn't have native support for the operation used in the function. ARM has a naming
+// convention for its intrinsics that's different from other architectures; that's why some
+// functions have an additional comment: the function name is the ARM name for the intrinsic
+// and the comment is the non-ARM name for the intrinsic.
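+//
+// For example, `aeabi_f2d` below is the ARM (AEABI) name for the conversion that other
+// targets expose as `extendsfdf2`.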
+mod intrinsics {
+    /* f16 operations */
+
+    #[cfg(f16_enabled)]
+    pub fn extendhfsf(x: f16) -> f32 {
+        x as f32
+    }
+
+    #[cfg(f16_enabled)]
+    pub fn extendhfdf(x: f16) -> f64 {
+        x as f64
+    }
+
+    #[cfg(all(
+        f16_enabled,
+        f128_enabled,
+        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
+    ))]
+    pub fn extendhftf(x: f16) -> f128 {
+        x as f128
+    }
+
+    /* f32 operations */
+
+    #[cfg(f16_enabled)]
+    pub fn truncsfhf(x: f32) -> f16 {
+        x as f16
+    }
+
+    // extendsfdf2
+    pub fn aeabi_f2d(x: f32) -> f64 {
+        x as f64
+    }
+
+    #[cfg(f128_enabled)]
+    pub fn extendsftf(x: f32) -> f128 {
+        x as f128
+    }
+
+    // fixsfsi
+    pub fn aeabi_f2iz(x: f32) -> i32 {
+        x as i32
+    }
+
+    // fixsfdi
+    pub fn aeabi_f2lz(x: f32) -> i64 {
+        x as i64
+    }
+
+    pub fn fixsfti(x: f32) -> i128 {
+        x as i128
+    }
+
+    // fixunssfsi
+    pub fn aeabi_f2uiz(x: f32) -> u32 {
+        x as u32
+    }
+
+    // fixunssfdi
+    pub fn aeabi_f2ulz(x: f32) -> u64 {
+        x as u64
+    }
+
+    pub fn fixunssfti(x: f32) -> u128 {
+        x as u128
+    }
+
+    // addsf3
+    pub fn aeabi_fadd(a: f32, b: f32) -> f32 {
+        a + b
+    }
+
+    // eqsf2
+    pub fn aeabi_fcmpeq(a: f32, b: f32) -> bool {
+        a == b
+    }
+
+    // gtsf2
+    pub fn aeabi_fcmpgt(a: f32, b: f32) -> bool {
+        a > b
+    }
+
+    // ltsf2
+    pub fn aeabi_fcmplt(a: f32, b: f32) -> bool {
+        a < b
+    }
+
+    // divsf3
+    pub fn aeabi_fdiv(a: f32, b: f32) -> f32 {
+        a / b
+    }
+
+    // mulsf3
+    pub fn aeabi_fmul(a: f32, b: f32) -> f32 {
+        a * b
+    }
+
+    // subsf3
+    pub fn aeabi_fsub(a: f32, b: f32) -> f32 {
+        a - b
+    }
+
+    /* f64 operations */
+
+    // truncdfsf2
+    pub fn aeabi_d2f(x: f64) -> f32 {
+        x as f32
+    }
+
+    // fixdfsi
+    pub fn aeabi_d2i(x: f64) -> i32 {
+        x as i32
+    }
+
+    // fixdfdi
+    pub fn aeabi_d2l(x: f64) -> i64 {
+        x as i64
+    }
+
+    pub fn fixdfti(x: f64) -> i128 {
+        x as i128
+    }
+
+    // fixunsdfsi
+    pub fn aeabi_d2uiz(x: f64) -> u32 {
+        x as u32
+    }
+
+    // fixunsdfdi
+    pub fn aeabi_d2ulz(x: f64) -> u64 {
+        x as u64
+    }
+
+    pub fn fixunsdfti(x: f64) -> u128 {
+        x as u128
+    }
+
+    // adddf3
+    pub fn aeabi_dadd(a: f64, b: f64) -> f64 {
+        a + b
+    }
+
+    // eqdf2
+    pub fn aeabi_dcmpeq(a: f64, b: f64) -> bool {
+        a == b
+    }
+
+    // gtdf2
+    pub fn aeabi_dcmpgt(a: f64, b: f64) -> bool {
+        a > b
+    }
+
+    // ltdf2
+    pub fn aeabi_dcmplt(a: f64, b: f64) -> bool {
+        a < b
+    }
+
+    // divdf3
+    pub fn aeabi_ddiv(a: f64, b: f64) -> f64 {
+        a / b
+    }
+
+    // muldf3
+    pub fn aeabi_dmul(a: f64, b: f64) -> f64 {
+        a * b
+    }
+
+    // subdf3
+    pub fn aeabi_dsub(a: f64, b: f64) -> f64 {
+        a - b
+    }
+
+    /* f128 operations */
+
+    #[cfg(all(
+        f16_enabled,
+        f128_enabled,
+        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
+    ))]
+    pub fn trunctfhf(x: f128) -> f16 {
+        x as f16
+    }
+
+    #[cfg(f128_enabled)]
+    pub fn trunctfsf(x: f128) -> f32 {
+        x as f32
+    }
+
+    #[cfg(f128_enabled)]
+    pub fn trunctfdf(x: f128) -> f64 {
+        x as f64
+    }
+
+    #[cfg(all(
+        f128_enabled,
+        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
+    ))]
+    pub fn fixtfsi(x: f128) -> i32 {
+        x as i32
+    }
+
+    #[cfg(all(
+        f128_enabled,
+        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
+    ))]
+    pub fn fixtfdi(x: f128) -> i64 {
+        x as i64
+    }
+
+    #[cfg(all(
+        f128_enabled,
+        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
+    ))]
+    pub fn fixtfti(x: f128) -> i128 {
+        x as i128
+    }
+
+    #[cfg(all(
+        f128_enabled,
+        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
+    ))]
+    pub fn fixunstfsi(x: f128) -> u32 {
+        x as u32
+    }
+
+    #[cfg(all(
+        f128_enabled,
+        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
+    ))]
+    pub fn fixunstfdi(x: f128) -> u64 {
+        x as u64
+    }
+
+    #[cfg(all(
+        f128_enabled,
+        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
+    ))]
+    pub fn fixunstfti(x: f128) -> u128 {
+        x as u128
+    }
+
+    #[cfg(f128_enabled)]
+    pub fn addtf(a: f128, b: f128) -> f128 {
+        a + b
+    }
+
+    #[cfg(f128_enabled)]
+    pub fn eqtf(a: f128, b: f128) -> bool {
+        a == b
+    }
+
+    #[cfg(f128_enabled)]
+    pub fn gttf(a: f128, b: f128) -> bool {
+        a > b
+    }
+
+    #[cfg(f128_enabled)]
+    pub fn lttf(a: f128, b: f128) -> bool {
+        a < b
+    }
+
+    #[cfg(f128_enabled)]
+    pub fn multf(a: f128, b: f128) -> f128 {
+        a * b
+    }
+
+    #[cfg(f128_enabled)]
+    pub fn divtf(a: f128, b: f128) -> f128 {
+        a / b
+    }
+
+    #[cfg(f128_enabled)]
+    pub fn subtf(a: f128, b: f128) -> f128 {
+        a - b
+    }
+
+    /* i32 operations */
+
+    // floatsisf
+    pub fn aeabi_i2f(x: i32) -> f32 {
+        x as f32
+    }
+
+    // floatsidf
+    pub fn aeabi_i2d(x: i32) -> f64 {
+        x as f64
+    }
+
+    #[cfg(f128_enabled)]
+    pub fn floatsitf(x: i32) -> f128 {
+        x as f128
+    }
+
+    pub fn aeabi_idiv(a: i32, b: i32) -> i32 {
+        a.wrapping_div(b)
+    }
+
+    pub fn aeabi_idivmod(a: i32, b: i32) -> i32 {
+        a % b
+    }
+
+    /* i64 operations */
+
+    // floatdisf
+    pub fn aeabi_l2f(x: i64) -> f32 {
+        x as f32
+    }
+
+    // floatdidf
+    pub fn aeabi_l2d(x: i64) -> f64 {
+        x as f64
+    }
+
+    #[cfg(f128_enabled)]
+    pub fn floatditf(x: i64) -> f128 {
+        x as f128
+    }
+
+    pub fn mulodi4(a: i64, b: i64) -> i64 {
+        a * b
+    }
+
+    // divdi3
+    pub fn aeabi_ldivmod(a: i64, b: i64) -> i64 {
+        a / b
+    }
+
+    pub fn moddi3(a: i64, b: i64) -> i64 {
+        a % b
+    }
+
+    // muldi3
+    pub fn aeabi_lmul(a: i64, b: i64) -> i64 {
+        a.wrapping_mul(b)
+    }
+
+    /* i128 operations */
+
+    pub fn floattisf(x: i128) -> f32 {
+        x as f32
+    }
+
+    pub fn floattidf(x: i128) -> f64 {
+        x as f64
+    }
+
+    #[cfg(f128_enabled)]
+    pub fn floattitf(x: i128) -> f128 {
+        x as f128
+    }
+
+    pub fn lshrti3(a: i128, b: usize) -> i128 {
+        a >> b
+    }
+
+    pub fn divti3(a: i128, b: i128) -> i128 {
+        a / b
+    }
+
+    pub fn modti3(a: i128, b: i128) -> i128 {
+        a % b
+    }
+
+    /* u32 operations */
+
+    // floatunsisf
+    pub fn aeabi_ui2f(x: u32) -> f32 {
+        x as f32
+    }
+
+    // floatunsidf
+    pub fn aeabi_ui2d(x: u32) -> f64 {
+        x as f64
+    }
+
+    #[cfg(f128_enabled)]
+    pub fn floatunsitf(x: u32) -> f128 {
+        x as f128
+    }
+
+    pub fn aeabi_uidiv(a: u32, b: u32) -> u32 {
+        a / b
+    }
+
+    pub fn aeabi_uidivmod(a: u32, b: u32) -> u32 {
+        a % b
+    }
+
+    /* u64 operations */
+
+    // floatundisf
+    pub fn aeabi_ul2f(x: u64) -> f32 {
+        x as f32
+    }
+
+    // floatundidf
+    pub fn aeabi_ul2d(x: u64) -> f64 {
+        x as f64
+    }
+
+    #[cfg(f128_enabled)]
+    pub fn floatunditf(x: u64) -> f128 {
+        x as f128
+    }
+
+    // udivdi3
+    pub fn aeabi_uldivmod(a: u64, b: u64) -> u64 {
+        a / b
+    }
+
+    pub fn umoddi3(a: u64, b: u64) -> u64 {
+        a % b
+    }
+
+    /* u128 operations */
+
+    pub fn floatuntisf(x: u128) -> f32 {
+        x as f32
+    }
+
+    pub fn floatuntidf(x: u128) -> f64 {
+        x as f64
+    }
+
+    #[cfg(f128_enabled)]
+    pub fn floatuntitf(x: u128) -> f128 {
+        x as f128
+    }
+
+    pub fn muloti4(a: u128, b: u128) -> Option<u128> {
+        a.checked_mul(b)
+    }
+
+    pub fn multi3(a: u128, b: u128) -> u128 {
+        a.wrapping_mul(b)
+    }
+
+    pub fn ashlti3(a: u128, b: usize) -> u128 {
+        a << b
+    }
+
+    pub fn ashrti3(a: u128, b: usize) -> u128 {
+        a >> b
+    }
+
+    pub fn udivti3(a: u128, b: u128) -> u128 {
+        a / b
+    }
+
+    pub fn umodti3(a: u128, b: u128) -> u128 {
+        a % b
+    }
+}
+
+fn run() {
+    use core::hint::black_box as bb;
+
+    use intrinsics::*;
+
+    // FIXME(f16_f128): some PPC f128 <-> int conversion functions have the wrong names
+
+    #[cfg(f128_enabled)]
+    bb(addtf(bb(2.), bb(2.)));
+    bb(aeabi_d2f(bb(2.)));
+    bb(aeabi_d2i(bb(2.)));
+    bb(aeabi_d2l(bb(2.)));
+    bb(aeabi_d2uiz(bb(2.)));
+    bb(aeabi_d2ulz(bb(2.)));
+    bb(aeabi_dadd(bb(2.), bb(3.)));
+    bb(aeabi_dcmpeq(bb(2.), bb(3.)));
+    bb(aeabi_dcmpgt(bb(2.), bb(3.)));
+    bb(aeabi_dcmplt(bb(2.), bb(3.)));
+    bb(aeabi_ddiv(bb(2.), bb(3.)));
+    bb(aeabi_dmul(bb(2.), bb(3.)));
+    bb(aeabi_dsub(bb(2.), bb(3.)));
+    bb(aeabi_f2d(bb(2.)));
+    bb(aeabi_f2iz(bb(2.)));
+    bb(aeabi_f2lz(bb(2.)));
+    bb(aeabi_f2uiz(bb(2.)));
+    bb(aeabi_f2ulz(bb(2.)));
+    bb(aeabi_fadd(bb(2.), bb(3.)));
+    bb(aeabi_fcmpeq(bb(2.), bb(3.)));
+    bb(aeabi_fcmpgt(bb(2.), bb(3.)));
+    bb(aeabi_fcmplt(bb(2.), bb(3.)));
+    bb(aeabi_fdiv(bb(2.), bb(3.)));
+    bb(aeabi_fmul(bb(2.), bb(3.)));
+    bb(aeabi_fsub(bb(2.), bb(3.)));
+    bb(aeabi_i2d(bb(2)));
+    bb(aeabi_i2f(bb(2)));
+    bb(aeabi_idiv(bb(2), bb(3)));
+    bb(aeabi_idivmod(bb(2), bb(3)));
+    bb(aeabi_l2d(bb(2)));
+    bb(aeabi_l2f(bb(2)));
+    bb(aeabi_ldivmod(bb(2), bb(3)));
+    bb(aeabi_lmul(bb(2), bb(3)));
+    bb(aeabi_ui2d(bb(2)));
+    bb(aeabi_ui2f(bb(2)));
+    bb(aeabi_uidiv(bb(2), bb(3)));
+    bb(aeabi_uidivmod(bb(2), bb(3)));
+    bb(aeabi_ul2d(bb(2)));
+    bb(aeabi_ul2f(bb(2)));
+    bb(aeabi_uldivmod(bb(2), bb(3)));
+    bb(ashlti3(bb(2), bb(2)));
+    bb(ashrti3(bb(2), bb(2)));
+    #[cfg(f128_enabled)]
+    bb(divtf(bb(2.), bb(2.)));
+    bb(divti3(bb(2), bb(2)));
+    #[cfg(f128_enabled)]
+    bb(eqtf(bb(2.), bb(2.)));
+    #[cfg(f16_enabled)]
+    bb(extendhfdf(bb(2.)));
+    #[cfg(f16_enabled)]
+    bb(extendhfsf(bb(2.)));
+    #[cfg(all(
+        f16_enabled,
+        f128_enabled,
+        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
+    ))]
+    bb(extendhftf(bb(2.)));
+    #[cfg(f128_enabled)]
+    bb(extendsftf(bb(2.)));
+    bb(fixdfti(bb(2.)));
+    bb(fixsfti(bb(2.)));
+    #[cfg(all(
+        f128_enabled,
+        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
+    ))]
+    bb(fixtfdi(bb(2.)));
+    #[cfg(all(
+        f128_enabled,
+        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
+    ))]
+    bb(fixtfsi(bb(2.)));
+    #[cfg(all(
+        f128_enabled,
+        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
+    ))]
+    bb(fixtfti(bb(2.)));
+    bb(fixunsdfti(bb(2.)));
+    bb(fixunssfti(bb(2.)));
+    #[cfg(all(
+        f128_enabled,
+        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
+    ))]
+    bb(fixunstfdi(bb(2.)));
+    #[cfg(all(
+        f128_enabled,
+        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
+    ))]
+    bb(fixunstfsi(bb(2.)));
+    #[cfg(all(
+        f128_enabled,
+        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
+    ))]
+    bb(fixunstfti(bb(2.)));
+    #[cfg(f128_enabled)]
+    bb(floatditf(bb(2)));
+    #[cfg(f128_enabled)]
+    bb(floatsitf(bb(2)));
+    bb(floattidf(bb(2)));
+    bb(floattisf(bb(2)));
+    #[cfg(f128_enabled)]
+    bb(floattitf(bb(2)));
+    #[cfg(f128_enabled)]
+    bb(floatunditf(bb(2)));
+    #[cfg(f128_enabled)]
+    bb(floatunsitf(bb(2)));
+    bb(floatuntidf(bb(2)));
+    bb(floatuntisf(bb(2)));
+    #[cfg(f128_enabled)]
+    bb(floatuntitf(bb(2)));
+    #[cfg(f128_enabled)]
+    bb(gttf(bb(2.), bb(2.)));
+    bb(lshrti3(bb(2), bb(2)));
+    #[cfg(f128_enabled)]
+    bb(lttf(bb(2.), bb(2.)));
+    bb(moddi3(bb(2), bb(3)));
+    bb(modti3(bb(2), bb(2)));
+    bb(mulodi4(bb(2), bb(3)));
+    bb(muloti4(bb(2), bb(2)));
+    #[cfg(f128_enabled)]
+    bb(multf(bb(2.), bb(2.)));
+    bb(multi3(bb(2), bb(2)));
+    #[cfg(f128_enabled)]
+    bb(subtf(bb(2.), bb(2.)));
+    #[cfg(f16_enabled)]
+    bb(truncsfhf(bb(2.)));
+    #[cfg(f128_enabled)]
+    bb(trunctfdf(bb(2.)));
+    #[cfg(all(
+        f16_enabled,
+        f128_enabled,
+        not(any(target_arch = "powerpc", target_arch = "powerpc64"))
+    ))]
+    bb(trunctfhf(bb(2.)));
+    #[cfg(f128_enabled)]
+    bb(trunctfsf(bb(2.)));
+    bb(udivti3(bb(2), bb(2)));
+    bb(umoddi3(bb(2), bb(3)));
+    bb(umodti3(bb(2), bb(2)));
+
+    something_with_a_dtor(&|| assert_eq!(bb(1), 1));
+
+    // FIXME(#802): This should be re-enabled once a workaround is found.
+    // extern "C" {
+    //     fn rust_begin_unwind(x: usize);
+    // }
+
+    // unsafe {
+    //     rust_begin_unwind(0);
+    // }
+}
+
+fn something_with_a_dtor(f: &dyn Fn()) {
+    struct A<'a>(&'a (dyn Fn() + 'a));
+
+    impl Drop for A<'_> {
+        fn drop(&mut self) {
+            (self.0)();
+        }
+    }
+    let _a = A(f);
+    f();
+}
+
+#[unsafe(no_mangle)]
+#[cfg(not(thumb))]
+fn main(_argc: core::ffi::c_int, _argv: *const *const u8) -> core::ffi::c_int {
+    run();
+    0
+}
+
+#[unsafe(no_mangle)]
+#[cfg(thumb)]
+pub fn _start() -> ! {
+    run();
+    loop {}
+}
+
+#[cfg(windows)]
+#[link(name = "kernel32")]
+#[link(name = "msvcrt")]
+extern "C" {}
+
+// ARM targets need these symbols
+#[unsafe(no_mangle)]
+pub fn __aeabi_unwind_cpp_pr0() {}
+
+#[unsafe(no_mangle)]
+pub fn __aeabi_unwind_cpp_pr1() {}
+
+#[cfg(not(any(windows, target_os = "cygwin")))]
+#[allow(non_snake_case)]
+#[unsafe(no_mangle)]
+pub fn _Unwind_Resume() {}
+
+#[cfg(not(any(windows, target_os = "cygwin")))]
+#[lang = "eh_personality"]
+pub extern "C" fn eh_personality() {}
+
+#[cfg(any(all(windows, target_env = "gnu"), target_os = "cygwin"))]
+mod mingw_unwinding {
+    #[unsafe(no_mangle)]
+    pub fn rust_eh_personality() {}
+    #[unsafe(no_mangle)]
+    pub fn rust_eh_unwind_resume() {}
+    #[unsafe(no_mangle)]
+    pub fn rust_eh_register_frames() {}
+    #[unsafe(no_mangle)]
+    pub fn rust_eh_unregister_frames() {}
+}
diff --git a/library/compiler-builtins/builtins-test/Cargo.toml b/library/compiler-builtins/builtins-test/Cargo.toml
new file mode 100644
index 00000000000..10978c0bb7e
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/Cargo.toml
@@ -0,0 +1,99 @@
+[package]
+name = "builtins-test"
+version = "0.1.0"
+authors = ["Alex Crichton <alex@alexcrichton.com>"]
+edition = "2024"
+publish = false
+license = "MIT AND Apache-2.0 WITH LLVM-exception AND (MIT OR Apache-2.0)"
+
+[dependencies]
+# For fuzzing tests we want a deterministic seedable RNG. We also eliminate potential
+# problems with system RNGs on the variety of platforms this crate is tested on.
+# `xoshiro128**` is used for its quality, size, and speed at generating `u32` shift amounts.
+rand_xoshiro = "0.6"
+# To compare float builtins against
+rustc_apfloat = "0.2.1"
+# Really a dev dependency, but dev dependencies can't be optional
+iai-callgrind = { version = "0.14.0", optional = true }
+
+[dependencies.compiler_builtins]
+path = "../compiler-builtins"
+default-features = false
+features = ["unstable-public-internals"]
+
+[dev-dependencies]
+criterion = { version = "0.5.1", default-features = false, features = ["cargo_bench_support"] }
+paste = "1.0.15"
+
+[target.'cfg(all(target_arch = "arm", not(any(target_env = "gnu", target_env = "musl")), target_os = "linux"))'.dev-dependencies]
+test = { git = "https://github.com/japaric/utest" }
+utest-cortex-m-qemu = { default-features = false, git = "https://github.com/japaric/utest" }
+utest-macros = { git = "https://github.com/japaric/utest" }
+
+[features]
+default = ["mangled-names"]
+c = ["compiler_builtins/c"]
+no-asm = ["compiler_builtins/no-asm"]
+no-f16-f128 = ["compiler_builtins/no-f16-f128"]
+mem = ["compiler_builtins/mem"]
+mangled-names = ["compiler_builtins/mangled-names"]
+# Skip tests that rely on f128 symbols being available on the system
+no-sys-f128 = ["no-sys-f128-int-convert", "no-sys-f16-f128-convert"]
+# Some platforms have some f128 functions but everything except integer conversions
+no-sys-f128-int-convert = []
+no-sys-f16-f128-convert = []
+no-sys-f16-f64-convert = []
+# Skip tests that rely on f16 symbols being available on the system
+no-sys-f16 = ["no-sys-f16-f64-convert"]
+
+# Enable icount benchmarks (requires iai-callgrind and valgrind)
+icount = ["dep:iai-callgrind"]
+
+# Enable report generation without bringing in more dependencies by default
+benchmarking-reports = ["criterion/plotters", "criterion/html_reports"]
+
+# NOTE: benchmarks must be run with `--no-default-features` or with
+# `-p builtins-test`, otherwise the default `compiler-builtins` feature
+# of the `compiler_builtins` crate gets activated, resulting in linker
+# errors.
+
+[[bench]]
+name = "float_add"
+harness = false
+
+[[bench]]
+name = "float_sub"
+harness = false
+
+[[bench]]
+name = "float_mul"
+harness = false
+
+[[bench]]
+name = "float_div"
+harness = false
+
+[[bench]]
+name = "float_cmp"
+harness = false
+
+[[bench]]
+name = "float_conv"
+harness = false
+
+[[bench]]
+name = "float_extend"
+harness = false
+
+[[bench]]
+name = "float_trunc"
+harness = false
+
+[[bench]]
+name = "float_pow"
+harness = false
+
+[[bench]]
+name = "mem_icount"
+harness = false
+required-features = ["icount"]
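
The manifest above notes that fuzz tests use a deterministic, seedable RNG (`rand_xoshiro`) instead of a system RNG. A minimal sketch of that idea follows; the helper name and the fixed seed are purely illustrative and this is not code from `builtins-test`.

```rust
use rand_xoshiro::Xoshiro128StarStar;
use rand_xoshiro::rand_core::{RngCore, SeedableRng};

/// Produce `n` shift amounts in `0..128`; the fixed seed makes every run identical,
/// so a failing fuzz case reproduces on any host.
fn shift_amounts(n: usize) -> Vec<u32> {
    let mut rng = Xoshiro128StarStar::seed_from_u64(0xbeef);
    (0..n).map(|_| rng.next_u32() % 128).collect()
}

fn main() {
    // Deterministic: two independent runs see exactly the same sequence.
    assert_eq!(shift_amounts(4), shift_amounts(4));
}
```
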
diff --git a/library/compiler-builtins/builtins-test/benches/float_add.rs b/library/compiler-builtins/builtins-test/benches/float_add.rs
new file mode 100644
index 00000000000..197f90b319d
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/benches/float_add.rs
@@ -0,0 +1,93 @@
+#![cfg_attr(f128_enabled, feature(f128))]
+
+use builtins_test::float_bench;
+use compiler_builtins::float::add;
+use criterion::{Criterion, criterion_main};
+
+float_bench! {
+    name: add_f32,
+    sig: (a: f32, b: f32) -> f32,
+    crate_fn: add::__addsf3,
+    sys_fn: __addsf3,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            asm!(
+                "addss {a}, {b}",
+                a = inout(xmm_reg) a,
+                b = in(xmm_reg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            asm!(
+                "fadd {a:s}, {a:s}, {b:s}",
+                a = inout(vreg) a,
+                b = in(vreg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+    ],
+}
+
+float_bench! {
+    name: add_f64,
+    sig: (a: f64, b: f64) -> f64,
+    crate_fn: add::__adddf3,
+    sys_fn: __adddf3,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            asm!(
+                "addsd {a}, {b}",
+                a = inout(xmm_reg) a,
+                b = in(xmm_reg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            asm!(
+                "fadd {a:d}, {a:d}, {b:d}",
+                a = inout(vreg) a,
+                b = in(vreg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+    ],
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: add_f128,
+    sig: (a: f128, b: f128) -> f128,
+    crate_fn: add::__addtf3,
+    crate_fn_ppc: add::__addkf3,
+    sys_fn: __addtf3,
+    sys_fn_ppc: __addkf3,
+    sys_available: not(feature = "no-sys-f128"),
+    asm: []
+}
+
+pub fn float_add() {
+    let mut criterion = Criterion::default().configure_from_args();
+
+    add_f32(&mut criterion);
+    add_f64(&mut criterion);
+
+    #[cfg(f128_enabled)]
+    {
+        add_f128(&mut criterion);
+    }
+}
+
+criterion_main!(float_add);
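
Each `float_bench!` invocation above names the crate implementation (`crate_fn`) and the system symbol (`sys_fn`) it is measured against, plus optional hand-written `asm` baselines. A rough sketch of that comparison is below, assuming edition-2024 Rust and using purely illustrative function names; the real macro is provided by the `builtins-test` crate (not shown in this section) and also handles `sys_available` gating and the `asm` variants.

```rust
use std::hint::black_box;

use compiler_builtins::float::add;
use criterion::{Criterion, criterion_main};

unsafe extern "C" {
    // System symbol (libgcc / compiler-rt) that the crate implementation is measured against.
    fn __addsf3(a: f32, b: f32) -> f32;
}

fn add_f32_sketch(c: &mut Criterion) {
    let (a, b) = (black_box(2.0_f32), black_box(3.0_f32));
    c.bench_function("add_f32 crate", |bench| bench.iter(|| add::__addsf3(a, b)));
    c.bench_function("add_f32 sys", |bench| bench.iter(|| unsafe { __addsf3(a, b) }));
}

pub fn float_add_sketch() {
    let mut criterion = Criterion::default().configure_from_args();
    add_f32_sketch(&mut criterion);
}

criterion_main!(float_add_sketch);
```
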
diff --git a/library/compiler-builtins/builtins-test/benches/float_cmp.rs b/library/compiler-builtins/builtins-test/benches/float_cmp.rs
new file mode 100644
index 00000000000..42d6652397d
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/benches/float_cmp.rs
@@ -0,0 +1,207 @@
+#![cfg_attr(f128_enabled, feature(f128))]
+
+use builtins_test::float_bench;
+use compiler_builtins::float::cmp;
+use criterion::{Criterion, criterion_main};
+
+/// `gt` symbols are allowed to return differing results; they are only compared
+/// against 0.
+fn gt_res_eq(a: i32, b: i32) -> bool {
+    let a_lt_0 = a <= 0;
+    let b_lt_0 = b <= 0;
+    (a_lt_0 && b_lt_0) || (!a_lt_0 && !b_lt_0)
+}
+
+float_bench! {
+    name: cmp_f32_gt,
+    sig: (a: f32, b: f32) -> i32,
+    crate_fn: cmp::__gtsf2,
+    sys_fn: __gtsf2,
+    sys_available: all(),
+    output_eq: gt_res_eq,
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            let ret: i32;
+            asm!(
+                "xor     {ret:e}, {ret:e}",
+                "ucomiss {a}, {b}",
+                "seta    {ret:l}",
+                a = in(xmm_reg) a,
+                b = in(xmm_reg) b,
+                ret = out(reg) ret,
+                options(nomem, nostack, pure)
+            );
+
+            ret
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            let ret: i32;
+            asm!(
+                "fcmp    {a:s}, {b:s}",
+                "cset    {ret:w}, gt",
+                a = in(vreg) a,
+                b = in(vreg) b,
+                ret = out(reg) ret,
+                options(nomem, nostack),
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: cmp_f32_unord,
+    sig: (a: f32, b: f32) -> i32,
+    crate_fn: cmp::__unordsf2,
+    sys_fn: __unordsf2,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            let ret: i32;
+            asm!(
+                "xor     {ret:e}, {ret:e}",
+                "ucomiss {a}, {b}",
+                "setp    {ret:l}",
+                a = in(xmm_reg) a,
+                b = in(xmm_reg) b,
+                ret = out(reg) ret,
+                options(nomem, nostack, pure)
+            );
+
+            ret
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            let ret: i32;
+            asm!(
+                "fcmp    {a:s}, {b:s}",
+                "cset    {ret:w}, vs",
+                a = in(vreg) a,
+                b = in(vreg) b,
+                ret = out(reg) ret,
+                options(nomem, nostack, pure)
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: cmp_f64_gt,
+    sig: (a: f64, b: f64) -> i32,
+    crate_fn: cmp::__gtdf2,
+    sys_fn: __gtdf2,
+    sys_available: all(),
+    output_eq: gt_res_eq,
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            let ret: i32;
+            asm!(
+                "xor     {ret:e}, {ret:e}",
+                "ucomisd {a}, {b}",
+                "seta    {ret:l}",
+                a = in(xmm_reg) a,
+                b = in(xmm_reg) b,
+                ret = out(reg) ret,
+                options(nomem, nostack, pure)
+            );
+
+            ret
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            let ret: i32;
+            asm!(
+                "fcmp    {a:d}, {b:d}",
+                "cset {ret:w}, gt",
+                a = in(vreg) a,
+                b = in(vreg) b,
+                ret = out(reg) ret,
+                options(nomem, nostack, pure)
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: cmp_f64_unord,
+    sig: (a: f64, b: f64) -> i32,
+    crate_fn: cmp::__unorddf2,
+    sys_fn: __unorddf2,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            let ret: i32;
+            asm!(
+                "xor     {ret:e}, {ret:e}",
+                "ucomisd {a}, {b}",
+                "setp    {ret:l}",
+                a = in(xmm_reg) a,
+                b = in(xmm_reg) b,
+                ret = out(reg) ret,
+                options(nomem, nostack, pure)
+            );
+
+            ret
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            let ret: i32;
+            asm!(
+                "fcmp    {a:d}, {b:d}",
+                "cset    {ret:w}, vs",
+                a = in(vreg) a,
+                b = in(vreg) b,
+                ret = out(reg) ret,
+                options(nomem, nostack, pure)
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: cmp_f128_gt,
+    sig: (a: f128, b: f128) -> i32,
+    crate_fn: cmp::__gttf2,
+    crate_fn_ppc: cmp::__gtkf2,
+    sys_fn: __gttf2,
+    sys_fn_ppc: __gtkf2,
+    sys_available: not(feature = "no-sys-f128"),
+    output_eq: gt_res_eq,
+    asm: []
+}
+
+float_bench! {
+    name: cmp_f128_unord,
+    sig: (a: f128, b: f128) -> i32,
+    crate_fn: cmp::__unordtf2,
+    crate_fn_ppc: cmp::__unordkf2,
+    sys_fn: __unordtf2,
+    sys_fn_ppc: __unordkf2,
+    sys_available: not(feature = "no-sys-f128"),
+    asm: []
+}
+
+pub fn float_cmp() {
+    let mut criterion = Criterion::default().configure_from_args();
+
+    cmp_f32_gt(&mut criterion);
+    cmp_f32_unord(&mut criterion);
+    cmp_f64_gt(&mut criterion);
+    cmp_f64_unord(&mut criterion);
+
+    #[cfg(f128_enabled)]
+    {
+        cmp_f128_gt(&mut criterion);
+        cmp_f128_unord(&mut criterion);
+    }
+}
+
+criterion_main!(float_cmp);
diff --git a/library/compiler-builtins/builtins-test/benches/float_conv.rs b/library/compiler-builtins/builtins-test/benches/float_conv.rs
new file mode 100644
index 00000000000..d4a7346d1d5
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/benches/float_conv.rs
@@ -0,0 +1,688 @@
+#![allow(improper_ctypes)]
+#![cfg_attr(f128_enabled, feature(f128))]
+
+use builtins_test::float_bench;
+use compiler_builtins::float::conv;
+use criterion::{Criterion, criterion_main};
+
+/* unsigned int -> float */
+
+float_bench! {
+    name: conv_u32_f32,
+    sig: (a: u32) -> f32,
+    crate_fn: conv::__floatunsisf,
+    sys_fn: __floatunsisf,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            let ret: f32;
+            asm!(
+                "mov {tmp:e}, {a:e}",
+                "cvtsi2ss {ret}, {tmp}",
+                a = in(reg) a,
+                tmp = out(reg) _,
+                ret = lateout(xmm_reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f32;
+            asm!(
+                "ucvtf {ret:s}, {a:w}",
+                a = in(reg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: conv_u32_f64,
+    sig: (a: u32) -> f64,
+    crate_fn: conv::__floatunsidf,
+    sys_fn: __floatunsidf,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            let ret: f64;
+            asm!(
+                "mov {tmp:e}, {a:e}",
+                "cvtsi2sd {ret}, {tmp}",
+                a = in(reg) a,
+                tmp = out(reg) _,
+                ret = lateout(xmm_reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f64;
+            asm!(
+                "ucvtf {ret:d}, {a:w}",
+                a = in(reg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: conv_u32_f128,
+    sig: (a: u32) -> f128,
+    crate_fn: conv::__floatunsitf,
+    crate_fn_ppc: conv::__floatunsikf,
+    sys_fn: __floatunsitf,
+    sys_fn_ppc: __floatunsikf,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: []
+}
+
+float_bench! {
+    name: conv_u64_f32,
+    sig: (a: u64) -> f32,
+    crate_fn: conv::__floatundisf,
+    sys_fn: __floatundisf,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f32;
+            asm!(
+                "ucvtf {ret:s}, {a:x}",
+                a = in(reg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: conv_u64_f64,
+    sig: (a: u64) -> f64,
+    crate_fn: conv::__floatundidf,
+    sys_fn: __floatundidf,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f64;
+            asm!(
+                "ucvtf {ret:d}, {a:x}",
+                a = in(reg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: conv_u64_f128,
+    sig: (a: u64) -> f128,
+    crate_fn: conv::__floatunditf,
+    crate_fn_ppc: conv::__floatundikf,
+    sys_fn: __floatunditf,
+    sys_fn_ppc: __floatundikf,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: []
+}
+
+float_bench! {
+    name: conv_u128_f32,
+    sig: (a: u128) -> f32,
+    crate_fn: conv::__floatuntisf,
+    sys_fn: __floatuntisf,
+    sys_available: all(),
+    asm: []
+}
+
+float_bench! {
+    name: conv_u128_f64,
+    sig: (a: u128) -> f64,
+    crate_fn: conv::__floatuntidf,
+    sys_fn: __floatuntidf,
+    sys_available: all(),
+    asm: []
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: conv_u128_f128,
+    sig: (a: u128) -> f128,
+    crate_fn: conv::__floatuntitf,
+    crate_fn_ppc: conv::__floatuntikf,
+    sys_fn: __floatuntitf,
+    sys_fn_ppc: __floatuntikf,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: []
+}
+
+/* signed int -> float */
+
+float_bench! {
+    name: conv_i32_f32,
+    sig: (a: i32) -> f32,
+    crate_fn: conv::__floatsisf,
+    sys_fn: __floatsisf,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            let ret: f32;
+            asm!(
+                "cvtsi2ss    {ret}, {a:e}",
+                a = in(reg) a,
+                ret = lateout(xmm_reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f32;
+            asm!(
+                "scvtf {ret:s}, {a:w}",
+                a = in(reg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: conv_i32_f64,
+    sig: (a: i32) -> f64,
+    crate_fn: conv::__floatsidf,
+    sys_fn: __floatsidf,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            let ret: f64;
+            asm!(
+                "cvtsi2sd    {ret}, {a:e}",
+                a = in(reg) a,
+                ret = lateout(xmm_reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+
+
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f64;
+            asm!(
+                "scvtf {ret:d}, {a:w}",
+                a = in(reg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: conv_i32_f128,
+    sig: (a: i32) -> f128,
+    crate_fn: conv::__floatsitf,
+    crate_fn_ppc: conv::__floatsikf,
+    sys_fn: __floatsitf,
+    sys_fn_ppc: __floatsikf,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: []
+}
+
+float_bench! {
+    name: conv_i64_f32,
+    sig: (a: i64) -> f32,
+    crate_fn: conv::__floatdisf,
+    sys_fn: __floatdisf,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            let ret: f32;
+            asm!(
+                "cvtsi2ss    {ret}, {a:r}",
+                a = in(reg) a,
+                ret = lateout(xmm_reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f32;
+            asm!(
+                "scvtf {ret:s}, {a:x}",
+                a = in(reg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: conv_i64_f64,
+    sig: (a: i64) -> f64,
+    crate_fn: conv::__floatdidf,
+    sys_fn: __floatdidf,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            let ret: f64;
+            asm!(
+                "cvtsi2sd    {ret}, {a:r}",
+                a = in(reg) a,
+                ret = lateout(xmm_reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+
+
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f64;
+            asm!(
+                "scvtf {ret:d}, {a:x}",
+                a = in(reg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: conv_i64_f128,
+    sig: (a: i64) -> f128,
+    crate_fn: conv::__floatditf,
+    crate_fn_ppc: conv::__floatdikf,
+    sys_fn: __floatditf,
+    sys_fn_ppc: __floatdikf,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: []
+}
+
+float_bench! {
+    name: conv_i128_f32,
+    sig: (a: i128) -> f32,
+    crate_fn: conv::__floattisf,
+    sys_fn: __floattisf,
+    sys_available: all(),
+    asm: []
+}
+
+float_bench! {
+    name: conv_i128_f64,
+    sig: (a: i128) -> f64,
+    crate_fn: conv::__floattidf,
+    sys_fn: __floattidf,
+    sys_available: all(),
+    asm: []
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: conv_i128_f128,
+    sig: (a: i128) -> f128,
+    crate_fn: conv::__floattitf,
+    crate_fn_ppc: conv::__floattikf,
+    sys_fn: __floattitf,
+    sys_fn_ppc: __floattikf,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: []
+}
+
+/* float -> unsigned int */
+
+#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
+float_bench! {
+    name: conv_f32_u32,
+    sig: (a: f32) -> u32,
+    crate_fn: conv::__fixunssfsi,
+    sys_fn: __fixunssfsi,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: u32;
+            asm!(
+                "fcvtzu {ret:w}, {a:s}",
+                a = in(vreg) a,
+                ret = lateout(reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
+float_bench! {
+    name: conv_f32_u64,
+    sig: (a: f32) -> u64,
+    crate_fn: conv::__fixunssfdi,
+    sys_fn: __fixunssfdi,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: u64;
+            asm!(
+                "fcvtzu {ret:x}, {a:s}",
+                a = in(vreg) a,
+                ret = lateout(reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
+float_bench! {
+    name: conv_f32_u128,
+    sig: (a: f32) -> u128,
+    crate_fn: conv::__fixunssfti,
+    sys_fn: __fixunssfti,
+    sys_available: all(),
+    asm: []
+}
+
+float_bench! {
+    name: conv_f64_u32,
+    sig: (a: f64) -> u32,
+    crate_fn: conv::__fixunsdfsi,
+    sys_fn: __fixunsdfsi,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: u32;
+            asm!(
+                "fcvtzu {ret:w}, {a:d}",
+                a = in(vreg) a,
+                ret = lateout(reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: conv_f64_u64,
+    sig: (a: f64) -> u64,
+    crate_fn: conv::__fixunsdfdi,
+    sys_fn: __fixunsdfdi,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: u64;
+            asm!(
+                "fcvtzu {ret:x}, {a:d}",
+                a = in(vreg) a,
+                ret = lateout(reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: conv_f64_u128,
+    sig: (a: f64) -> u128,
+    crate_fn: conv::__fixunsdfti,
+    sys_fn: __fixunsdfti,
+    sys_available: all(),
+    asm: []
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: conv_f128_u32,
+    sig: (a: f128) -> u32,
+    crate_fn: conv::__fixunstfsi,
+    crate_fn_ppc: conv::__fixunskfsi,
+    sys_fn: __fixunstfsi,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: []
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: conv_f128_u64,
+    sig: (a: f128) -> u64,
+    crate_fn: conv::__fixunstfdi,
+    crate_fn_ppc: conv::__fixunskfdi,
+    sys_fn: __fixunstfdi,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: []
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: conv_f128_u128,
+    sig: (a: f128) -> u128,
+    crate_fn: conv::__fixunstfti,
+    crate_fn_ppc: conv::__fixunskfti,
+    sys_fn: __fixunstfti,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: []
+}
+
+/* float -> signed int */
+
+#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
+float_bench! {
+    name: conv_f32_i32,
+    sig: (a: f32) -> i32,
+    crate_fn: conv::__fixsfsi,
+    sys_fn: __fixsfsi,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: i32;
+            asm!(
+                "fcvtzs {ret:w}, {a:s}",
+                a = in(vreg) a,
+                ret = lateout(reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
+float_bench! {
+    name: conv_f32_i64,
+    sig: (a: f32) -> i64,
+    crate_fn: conv::__fixsfdi,
+    sys_fn: __fixsfdi,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: i64;
+            asm!(
+                "fcvtzs {ret:x}, {a:s}",
+                a = in(vreg) a,
+                ret = lateout(reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
+float_bench! {
+    name: conv_f32_i128,
+    sig: (a: f32) -> i128,
+    crate_fn: conv::__fixsfti,
+    sys_fn: __fixsfti,
+    sys_available: all(),
+    asm: []
+}
+
+float_bench! {
+    name: conv_f64_i32,
+    sig: (a: f64) -> i32,
+    crate_fn: conv::__fixdfsi,
+    sys_fn: __fixdfsi,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: i32;
+            asm!(
+                "fcvtzs {ret:w}, {a:d}",
+                a = in(vreg) a,
+                ret = lateout(reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: conv_f64_i64,
+    sig: (a: f64) -> i64,
+    crate_fn: conv::__fixdfdi,
+    sys_fn: __fixdfdi,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: i64;
+            asm!(
+                "fcvtzs {ret:x}, {a:d}",
+                a = in(vreg) a,
+                ret = lateout(reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: conv_f64_i128,
+    sig: (a: f64) -> i128,
+    crate_fn: conv::__fixdfti,
+    sys_fn: __fixdfti,
+    sys_available: all(),
+    asm: []
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: conv_f128_i32,
+    sig: (a: f128) -> i32,
+    crate_fn: conv::__fixtfsi,
+    crate_fn_ppc: conv::__fixkfsi,
+    sys_fn: __fixtfsi,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: []
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: conv_f128_i64,
+    sig: (a: f128) -> i64,
+    crate_fn: conv::__fixtfdi,
+    crate_fn_ppc: conv::__fixkfdi,
+    sys_fn: __fixtfdi,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: []
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: conv_f128_i128,
+    sig: (a: f128) -> i128,
+    crate_fn: conv::__fixtfti,
+    crate_fn_ppc: conv::__fixkfti,
+    sys_fn: __fixtfti,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: []
+}
+
+pub fn float_conv() {
+    let mut criterion = Criterion::default().configure_from_args();
+
+    conv_u32_f32(&mut criterion);
+    conv_u32_f64(&mut criterion);
+    conv_u64_f32(&mut criterion);
+    conv_u64_f64(&mut criterion);
+    conv_u128_f32(&mut criterion);
+    conv_u128_f64(&mut criterion);
+    conv_i32_f32(&mut criterion);
+    conv_i32_f64(&mut criterion);
+    conv_i64_f32(&mut criterion);
+    conv_i64_f64(&mut criterion);
+    conv_i128_f32(&mut criterion);
+    conv_i128_f64(&mut criterion);
+    conv_f64_u32(&mut criterion);
+    conv_f64_u64(&mut criterion);
+    conv_f64_u128(&mut criterion);
+    conv_f64_i32(&mut criterion);
+    conv_f64_i64(&mut criterion);
+    conv_f64_i128(&mut criterion);
+
+    #[cfg(f128_enabled)]
+    // FIXME: ppc64le has a sporadic overflow panic in the crate functions
+    // <https://github.com/rust-lang/compiler-builtins/issues/617#issuecomment-2125914639>
+    #[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
+    {
+        conv_u32_f128(&mut criterion);
+        conv_u64_f128(&mut criterion);
+        conv_u128_f128(&mut criterion);
+        conv_i32_f128(&mut criterion);
+        conv_i64_f128(&mut criterion);
+        conv_i128_f128(&mut criterion);
+        conv_f128_u32(&mut criterion);
+        conv_f128_u64(&mut criterion);
+        conv_f128_u128(&mut criterion);
+        conv_f128_i32(&mut criterion);
+        conv_f128_i64(&mut criterion);
+        conv_f128_i128(&mut criterion);
+    }
+}
+
+criterion_main!(float_conv);
diff --git a/library/compiler-builtins/builtins-test/benches/float_div.rs b/library/compiler-builtins/builtins-test/benches/float_div.rs
new file mode 100644
index 00000000000..d5b0ad0fd40
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/benches/float_div.rs
@@ -0,0 +1,93 @@
+#![cfg_attr(f128_enabled, feature(f128))]
+
+use builtins_test::float_bench;
+use compiler_builtins::float::div;
+use criterion::{Criterion, criterion_main};
+
+float_bench! {
+    name: div_f32,
+    sig: (a: f32, b: f32) -> f32,
+    crate_fn: div::__divsf3,
+    sys_fn: __divsf3,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            asm!(
+                "divss {a}, {b}",
+                a = inout(xmm_reg) a,
+                b = in(xmm_reg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            asm!(
+                "fdiv {a:s}, {a:s}, {b:s}",
+                a = inout(vreg) a,
+                b = in(vreg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+    ],
+}
+
+float_bench! {
+    name: div_f64,
+    sig: (a: f64, b: f64) -> f64,
+    crate_fn: div::__divdf3,
+    sys_fn: __divdf3,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            asm!(
+                "divsd {a}, {b}",
+                a = inout(xmm_reg) a,
+                b = in(xmm_reg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            asm!(
+                "fdiv {a:d}, {a:d}, {b:d}",
+                a = inout(vreg) a,
+                b = in(vreg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+    ],
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: div_f128,
+    sig: (a: f128, b: f128) -> f128,
+    crate_fn: div::__divtf3,
+    crate_fn_ppc: div::__divkf3,
+    sys_fn: __divtf3,
+    sys_fn_ppc: __divkf3,
+    sys_available: not(feature = "no-sys-f128"),
+    asm: []
+}
+
+pub fn float_div() {
+    let mut criterion = Criterion::default().configure_from_args();
+
+    div_f32(&mut criterion);
+    div_f64(&mut criterion);
+
+    #[cfg(f128_enabled)]
+    {
+        div_f128(&mut criterion);
+    }
+}
+
+criterion_main!(float_div);
diff --git a/library/compiler-builtins/builtins-test/benches/float_extend.rs b/library/compiler-builtins/builtins-test/benches/float_extend.rs
new file mode 100644
index 00000000000..fc44e80c9e1
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/benches/float_extend.rs
@@ -0,0 +1,133 @@
+#![allow(unused_variables)] // "unused" f16 registers
+#![cfg_attr(f128_enabled, feature(f128))]
+#![cfg_attr(f16_enabled, feature(f16))]
+
+use builtins_test::float_bench;
+use compiler_builtins::float::extend;
+use criterion::{Criterion, criterion_main};
+
+#[cfg(f16_enabled)]
+float_bench! {
+    name: extend_f16_f32,
+    sig: (a: f16) -> f32,
+    crate_fn: extend::__extendhfsf2,
+    sys_fn: __extendhfsf2,
+    sys_available: not(feature = "no-sys-f16"),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f32;
+            asm!(
+                "fcvt    {ret:s}, {a:h}",
+                a = in(vreg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+#[cfg(f16_enabled)]
+float_bench! {
+    name: extend_f16_f64,
+    sig: (a: f16) -> f64,
+    crate_fn: extend::__extendhfdf2,
+    sys_fn: __extendhfdf2,
+    sys_available: not(feature = "no-sys-f16-f64-convert"),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f64;
+            asm!(
+                "fcvt    {ret:d}, {a:h}",
+                a = in(vreg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+#[cfg(all(f16_enabled, f128_enabled))]
+float_bench! {
+    name: extend_f16_f128,
+    sig: (a: f16) -> f128,
+    crate_fn: extend::__extendhftf2,
+    crate_fn_ppc: extend::__extendhfkf2,
+    sys_fn: __extendhftf2,
+    sys_fn_ppc: __extendhfkf2,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: [],
+}
+
+float_bench! {
+    name: extend_f32_f64,
+    sig: (a: f32) -> f64,
+    crate_fn: extend::__extendsfdf2,
+    sys_fn: __extendsfdf2,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f64;
+            asm!(
+                "fcvt    {ret:d}, {a:s}",
+                a = in(vreg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: extend_f32_f128,
+    sig: (a: f32) -> f128,
+    crate_fn: extend::__extendsftf2,
+    crate_fn_ppc: extend::__extendsfkf2,
+    sys_fn: __extendsftf2,
+    sys_fn_ppc: __extendsfkf2,
+    sys_available: not(feature = "no-sys-f128"),
+    asm: [],
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: extend_f64_f128,
+    sig: (a: f64) -> f128,
+    crate_fn: extend::__extenddftf2,
+    crate_fn_ppc: extend::__extenddfkf2,
+    sys_fn: __extenddftf2,
+    sys_fn_ppc: __extenddfkf2,
+    sys_available: not(feature = "no-sys-f128"),
+    asm: [],
+}
+
+pub fn float_extend() {
+    let mut criterion = Criterion::default().configure_from_args();
+
+    // FIXME(#655): `f16` tests disabled until we can bootstrap symbols
+    #[cfg(f16_enabled)]
+    #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+    {
+        extend_f16_f32(&mut criterion);
+        extend_f16_f64(&mut criterion);
+
+        #[cfg(f128_enabled)]
+        extend_f16_f128(&mut criterion);
+    }
+
+    extend_f32_f64(&mut criterion);
+
+    #[cfg(f128_enabled)]
+    {
+        extend_f32_f128(&mut criterion);
+        extend_f64_f128(&mut criterion);
+    }
+}
+
+criterion_main!(float_extend);
diff --git a/library/compiler-builtins/builtins-test/benches/float_mul.rs b/library/compiler-builtins/builtins-test/benches/float_mul.rs
new file mode 100644
index 00000000000..a7a2d34aa04
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/benches/float_mul.rs
@@ -0,0 +1,93 @@
+#![cfg_attr(f128_enabled, feature(f128))]
+
+use builtins_test::float_bench;
+use compiler_builtins::float::mul;
+use criterion::{Criterion, criterion_main};
+
+float_bench! {
+    name: mul_f32,
+    sig: (a: f32, b: f32) -> f32,
+    crate_fn: mul::__mulsf3,
+    sys_fn: __mulsf3,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            asm!(
+                "mulss {a}, {b}",
+                a = inout(xmm_reg) a,
+                b = in(xmm_reg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            asm!(
+                "fmul {a:s}, {a:s}, {b:s}",
+                a = inout(vreg) a,
+                b = in(vreg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+    ],
+}
+
+float_bench! {
+    name: mul_f64,
+    sig: (a: f64, b: f64) -> f64,
+    crate_fn: mul::__muldf3,
+    sys_fn: __muldf3,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            asm!(
+                "mulsd {a}, {b}",
+                a = inout(xmm_reg) a,
+                b = in(xmm_reg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            asm!(
+                "fmul {a:d}, {a:d}, {b:d}",
+                a = inout(vreg) a,
+                b = in(vreg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+    ],
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: mul_f128,
+    sig: (a: f128, b: f128) -> f128,
+    crate_fn: mul::__multf3,
+    crate_fn_ppc: mul::__mulkf3,
+    sys_fn: __multf3,
+    sys_fn_ppc: __mulkf3,
+    sys_available: not(feature = "no-sys-f128"),
+    asm: []
+}
+
+pub fn float_mul() {
+    let mut criterion = Criterion::default().configure_from_args();
+
+    mul_f32(&mut criterion);
+    mul_f64(&mut criterion);
+
+    #[cfg(f128_enabled)]
+    {
+        mul_f128(&mut criterion);
+    }
+}
+
+criterion_main!(float_mul);
diff --git a/library/compiler-builtins/builtins-test/benches/float_pow.rs b/library/compiler-builtins/builtins-test/benches/float_pow.rs
new file mode 100644
index 00000000000..64e37dd3241
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/benches/float_pow.rs
@@ -0,0 +1,49 @@
+#![cfg_attr(f128_enabled, feature(f128))]
+
+use builtins_test::float_bench;
+use compiler_builtins::float::pow;
+use criterion::{Criterion, criterion_main};
+
+float_bench! {
+    name: powi_f32,
+    sig: (a: f32, b: i32) -> f32,
+    crate_fn: pow::__powisf2,
+    sys_fn: __powisf2,
+    sys_available: all(),
+    asm: [],
+}
+
+float_bench! {
+    name: powi_f64,
+    sig: (a: f64, b: i32) -> f64,
+    crate_fn: pow::__powidf2,
+    sys_fn: __powidf2,
+    sys_available: all(),
+    asm: [],
+}
+
+// FIXME(f16_f128): can be changed to only `f128_enabled` once `__multf3` and `__divtf3` are
+// distributed by nightly.
+#[cfg(all(f128_enabled, not(feature = "no-sys-f128")))]
+float_bench! {
+    name: powi_f128,
+    sig: (a: f128, b: i32) -> f128,
+    crate_fn: pow::__powitf2,
+    crate_fn_ppc: pow::__powikf2,
+    sys_fn: __powitf2,
+    sys_fn_ppc: __powikf2,
+    sys_available: not(feature = "no-sys-f128"),
+    asm: []
+}
+
+pub fn float_pow() {
+    let mut criterion = Criterion::default().configure_from_args();
+
+    powi_f32(&mut criterion);
+    powi_f64(&mut criterion);
+
+    #[cfg(all(f128_enabled, not(feature = "no-sys-f128")))]
+    powi_f128(&mut criterion);
+}
+
+criterion_main!(float_pow);
diff --git a/library/compiler-builtins/builtins-test/benches/float_sub.rs b/library/compiler-builtins/builtins-test/benches/float_sub.rs
new file mode 100644
index 00000000000..8bae294cd56
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/benches/float_sub.rs
@@ -0,0 +1,93 @@
+#![cfg_attr(f128_enabled, feature(f128))]
+
+use builtins_test::float_bench;
+use compiler_builtins::float::sub;
+use criterion::{Criterion, criterion_main};
+
+float_bench! {
+    name: sub_f32,
+    sig: (a: f32, b: f32) -> f32,
+    crate_fn: sub::__subsf3,
+    sys_fn: __subsf3,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            asm!(
+                "subss {a}, {b}",
+                a = inout(xmm_reg) a,
+                b = in(xmm_reg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            asm!(
+                "fsub {a:s}, {a:s}, {b:s}",
+                a = inout(vreg) a,
+                b = in(vreg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+    ],
+}
+
+float_bench! {
+    name: sub_f64,
+    sig: (a: f64, b: f64) -> f64,
+    crate_fn: sub::__subdf3,
+    sys_fn: __subdf3,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            asm!(
+                "subsd {a}, {b}",
+                a = inout(xmm_reg) a,
+                b = in(xmm_reg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            asm!(
+                "fsub {a:d}, {a:d}, {b:d}",
+                a = inout(vreg) a,
+                b = in(vreg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+    ],
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: sub_f128,
+    sig: (a: f128, b: f128) -> f128,
+    crate_fn: sub::__subtf3,
+    crate_fn_ppc: sub::__subkf3,
+    sys_fn: __subtf3,
+    sys_fn_ppc: __subkf3,
+    sys_available: not(feature = "no-sys-f128"),
+    asm: []
+}
+
+pub fn float_sub() {
+    let mut criterion = Criterion::default().configure_from_args();
+
+    sub_f32(&mut criterion);
+    sub_f64(&mut criterion);
+
+    #[cfg(f128_enabled)]
+    {
+        sub_f128(&mut criterion);
+    }
+}
+
+criterion_main!(float_sub);
diff --git a/library/compiler-builtins/builtins-test/benches/float_trunc.rs b/library/compiler-builtins/builtins-test/benches/float_trunc.rs
new file mode 100644
index 00000000000..43310c7cfc8
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/benches/float_trunc.rs
@@ -0,0 +1,146 @@
+#![cfg_attr(f128_enabled, feature(f128))]
+#![cfg_attr(f16_enabled, feature(f16))]
+
+use builtins_test::float_bench;
+use compiler_builtins::float::trunc;
+use criterion::{Criterion, criterion_main};
+
+#[cfg(f16_enabled)]
+float_bench! {
+    name: trunc_f32_f16,
+    sig: (a: f32) -> f16,
+    crate_fn: trunc::__truncsfhf2,
+    sys_fn: __truncsfhf2,
+    sys_available: not(feature = "no-sys-f16"),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f16;
+            asm!(
+                "fcvt    {ret:h}, {a:s}",
+                a = in(vreg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+#[cfg(f16_enabled)]
+float_bench! {
+    name: trunc_f64_f16,
+    sig: (a: f64) -> f16,
+    crate_fn: trunc::__truncdfhf2,
+    sys_fn: __truncdfhf2,
+    sys_available: not(feature = "no-sys-f16-f64-convert"),
+    asm: [
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f16;
+            asm!(
+                "fcvt    {ret:h}, {a:d}",
+                a = in(vreg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: trunc_f64_f32,
+    sig: (a: f64) -> f32,
+    crate_fn: trunc::__truncdfsf2,
+    sys_fn: __truncdfsf2,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            let ret: f32;
+            asm!(
+                "cvtsd2ss {ret}, {a}",
+                a = in(xmm_reg) a,
+                ret = lateout(xmm_reg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            let ret: f32;
+            asm!(
+                "fcvt    {ret:s}, {a:d}",
+                a = in(vreg) a,
+                ret = lateout(vreg) ret,
+                options(nomem, nostack, pure),
+            );
+
+            ret
+        };
+    ],
+}
+
+#[cfg(all(f16_enabled, f128_enabled))]
+float_bench! {
+    name: trunc_f128_f16,
+    sig: (a: f128) -> f16,
+    crate_fn: trunc::__trunctfhf2,
+    crate_fn_ppc: trunc::__trunckfhf2,
+    sys_fn: __trunctfhf2,
+    sys_fn_ppc: __trunckfhf2,
+    sys_available: not(feature = "no-sys-f16-f128-convert"),
+    asm: [],
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: trunc_f128_f32,
+    sig: (a: f128) -> f32,
+    crate_fn: trunc::__trunctfsf2,
+    crate_fn_ppc: trunc::__trunckfsf2,
+    sys_fn: __trunctfsf2,
+    sys_fn_ppc: __trunckfsf2,
+    sys_available: not(feature = "no-sys-f128"),
+    asm: [],
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: trunc_f128_f64,
+    sig: (a: f128) -> f64,
+    crate_fn: trunc::__trunctfdf2,
+    crate_fn_ppc: trunc::__trunckfdf2,
+    sys_fn: __trunctfdf2,
+    sys_fn_ppc: __trunckfdf2,
+    sys_available: not(feature = "no-sys-f128"),
+    asm: [],
+}
+
+pub fn float_trunc() {
+    let mut criterion = Criterion::default().configure_from_args();
+
+    // FIXME(#655): `f16` tests disabled until we can bootstrap symbols
+    #[cfg(f16_enabled)]
+    #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+    {
+        trunc_f32_f16(&mut criterion);
+        trunc_f64_f16(&mut criterion);
+    }
+
+    trunc_f64_f32(&mut criterion);
+
+    #[cfg(f128_enabled)]
+    {
+        // FIXME(#655): `f16` tests disabled until we can bootstrap symbols
+        #[cfg(f16_enabled)]
+        #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+        trunc_f128_f16(&mut criterion);
+
+        trunc_f128_f32(&mut criterion);
+        trunc_f128_f64(&mut criterion);
+    }
+}
+
+criterion_main!(float_trunc);
diff --git a/library/compiler-builtins/builtins-test/benches/mem.rs b/library/compiler-builtins/builtins-test/benches/mem.rs
new file mode 100644
index 00000000000..3f83926b6c5
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/benches/mem.rs
@@ -0,0 +1,364 @@
+#![feature(test)]
+
+extern crate test;
+use test::{Bencher, black_box};
+
+extern crate compiler_builtins;
+use compiler_builtins::mem::{memcmp, memcpy, memmove, memset};
+
+const WORD_SIZE: usize = core::mem::size_of::<usize>();
+
+struct AlignedVec {
+    vec: Vec<usize>,
+    size: usize,
+}
+
+impl AlignedVec {
+    fn new(fill: u8, size: usize) -> Self {
+        let mut broadcast = fill as usize;
+        let mut bits = 8;
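+        // Broadcast the fill byte to every byte of the word by repeated doubling,
+        // e.g. 0xAB -> 0xABAB -> 0xABABABAB -> ...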
+        while bits < WORD_SIZE * 8 {
+            broadcast |= broadcast << bits;
+            bits *= 2;
+        }
+
+        // Round up to a whole number of words so the buffer holds at least `size` bytes.
+        let vec = vec![broadcast; (size + WORD_SIZE - 1) / WORD_SIZE];
+        AlignedVec { vec, size }
+    }
+}
+
+impl core::ops::Deref for AlignedVec {
+    type Target = [u8];
+    fn deref(&self) -> &[u8] {
+        unsafe { core::slice::from_raw_parts(self.vec.as_ptr() as *const u8, self.size) }
+    }
+}
+
+impl core::ops::DerefMut for AlignedVec {
+    fn deref_mut(&mut self) -> &mut [u8] {
+        unsafe { core::slice::from_raw_parts_mut(self.vec.as_mut_ptr() as *mut u8, self.size) }
+    }
+}
+
+fn memcpy_builtin(b: &mut Bencher, n: usize, offset1: usize, offset2: usize) {
+    let v1 = AlignedVec::new(1, n + offset1);
+    let mut v2 = AlignedVec::new(0, n + offset2);
+    b.bytes = n as u64;
+    b.iter(|| {
+        let src: &[u8] = black_box(&v1[offset1..]);
+        let dst: &mut [u8] = black_box(&mut v2[offset2..]);
+        dst.copy_from_slice(src);
+    })
+}
+
+fn memcpy_rust(b: &mut Bencher, n: usize, offset1: usize, offset2: usize) {
+    let v1 = AlignedVec::new(1, n + offset1);
+    let mut v2 = AlignedVec::new(0, n + offset2);
+    b.bytes = n as u64;
+    b.iter(|| {
+        let src: &[u8] = black_box(&v1[offset1..]);
+        let dst: &mut [u8] = black_box(&mut v2[offset2..]);
+        unsafe { memcpy(dst.as_mut_ptr(), src.as_ptr(), n) }
+    })
+}
+
+fn memset_builtin(b: &mut Bencher, n: usize, offset: usize) {
+    let mut v1 = AlignedVec::new(0, n + offset);
+    b.bytes = n as u64;
+    b.iter(|| {
+        let dst: &mut [u8] = black_box(&mut v1[offset..]);
+        let val: u8 = black_box(27);
+        for b in dst {
+            *b = val;
+        }
+    })
+}
+
+fn memset_rust(b: &mut Bencher, n: usize, offset: usize) {
+    let mut v1 = AlignedVec::new(0, n + offset);
+    b.bytes = n as u64;
+    b.iter(|| {
+        let dst: &mut [u8] = black_box(&mut v1[offset..]);
+        let val = black_box(27);
+        unsafe { memset(dst.as_mut_ptr(), val, n) }
+    })
+}
+
+fn memcmp_builtin(b: &mut Bencher, n: usize) {
+    let v1 = AlignedVec::new(0, n);
+    let mut v2 = AlignedVec::new(0, n);
+    v2[n - 1] = 1;
+    b.bytes = n as u64;
+    b.iter(|| {
+        let s1: &[u8] = black_box(&v1);
+        let s2: &[u8] = black_box(&v2);
+        s1.cmp(s2)
+    })
+}
+
+fn memcmp_builtin_unaligned(b: &mut Bencher, n: usize) {
+    let v1 = AlignedVec::new(0, n);
+    let mut v2 = AlignedVec::new(0, n);
+    v2[n - 1] = 1;
+    b.bytes = n as u64;
+    b.iter(|| {
+        let s1: &[u8] = black_box(&v1[0..]);
+        let s2: &[u8] = black_box(&v2[1..]);
+        s1.cmp(s2)
+    })
+}
+
+fn memcmp_rust(b: &mut Bencher, n: usize) {
+    let v1 = AlignedVec::new(0, n);
+    let mut v2 = AlignedVec::new(0, n);
+    v2[n - 1] = 1;
+    b.bytes = n as u64;
+    b.iter(|| {
+        let s1: &[u8] = black_box(&v1);
+        let s2: &[u8] = black_box(&v2);
+        unsafe { memcmp(s1.as_ptr(), s2.as_ptr(), n) }
+    })
+}
+
+fn memcmp_rust_unaligned(b: &mut Bencher, n: usize) {
+    let v1 = AlignedVec::new(0, n);
+    let mut v2 = AlignedVec::new(0, n);
+    v2[n - 1] = 1;
+    b.bytes = n as u64;
+    b.iter(|| {
+        let s1: &[u8] = black_box(&v1[0..]);
+        let s2: &[u8] = black_box(&v2[1..]);
+        unsafe { memcmp(s1.as_ptr(), s2.as_ptr(), n - 1) }
+    })
+}
+
+fn memmove_builtin(b: &mut Bencher, n: usize, offset: usize) {
+    let mut v = AlignedVec::new(0, n + n / 2 + offset);
+    b.bytes = n as u64;
+    b.iter(|| {
+        let s: &mut [u8] = black_box(&mut v);
+        s.copy_within(0..n, n / 2 + offset);
+    })
+}
+
+fn memmove_rust(b: &mut Bencher, n: usize, offset: usize) {
+    let mut v = AlignedVec::new(0, n + n / 2 + offset);
+    b.bytes = n as u64;
+    b.iter(|| {
+        let dst: *mut u8 = black_box(&mut v[n / 2 + offset..]).as_mut_ptr();
+        let src: *const u8 = black_box(&v).as_ptr();
+        unsafe { memmove(dst, src, n) };
+    })
+}
+
+#[bench]
+fn memcpy_builtin_4096(b: &mut Bencher) {
+    memcpy_builtin(b, 4096, 0, 0)
+}
+#[bench]
+fn memcpy_rust_4096(b: &mut Bencher) {
+    memcpy_rust(b, 4096, 0, 0)
+}
+#[bench]
+fn memcpy_builtin_1048576(b: &mut Bencher) {
+    memcpy_builtin(b, 1048576, 0, 0)
+}
+#[bench]
+fn memcpy_rust_1048576(b: &mut Bencher) {
+    memcpy_rust(b, 1048576, 0, 0)
+}
+#[bench]
+fn memcpy_builtin_4096_offset(b: &mut Bencher) {
+    memcpy_builtin(b, 4096, 65, 65)
+}
+#[bench]
+fn memcpy_rust_4096_offset(b: &mut Bencher) {
+    memcpy_rust(b, 4096, 65, 65)
+}
+#[bench]
+fn memcpy_builtin_1048576_offset(b: &mut Bencher) {
+    memcpy_builtin(b, 1048576, 65, 65)
+}
+#[bench]
+fn memcpy_rust_1048576_offset(b: &mut Bencher) {
+    memcpy_rust(b, 1048576, 65, 65)
+}
+#[bench]
+fn memcpy_builtin_4096_misalign(b: &mut Bencher) {
+    memcpy_builtin(b, 4096, 65, 66)
+}
+#[bench]
+fn memcpy_rust_4096_misalign(b: &mut Bencher) {
+    memcpy_rust(b, 4096, 65, 66)
+}
+#[bench]
+fn memcpy_builtin_1048576_misalign(b: &mut Bencher) {
+    memcpy_builtin(b, 1048576, 65, 66)
+}
+#[bench]
+fn memcpy_rust_1048576_misalign(b: &mut Bencher) {
+    memcpy_rust(b, 1048576, 65, 66)
+}
+
+#[bench]
+fn memset_builtin_4096(b: &mut Bencher) {
+    memset_builtin(b, 4096, 0)
+}
+#[bench]
+fn memset_rust_4096(b: &mut Bencher) {
+    memset_rust(b, 4096, 0)
+}
+#[bench]
+fn memset_builtin_1048576(b: &mut Bencher) {
+    memset_builtin(b, 1048576, 0)
+}
+#[bench]
+fn memset_rust_1048576(b: &mut Bencher) {
+    memset_rust(b, 1048576, 0)
+}
+#[bench]
+fn memset_builtin_4096_offset(b: &mut Bencher) {
+    memset_builtin(b, 4096, 65)
+}
+#[bench]
+fn memset_rust_4096_offset(b: &mut Bencher) {
+    memset_rust(b, 4096, 65)
+}
+#[bench]
+fn memset_builtin_1048576_offset(b: &mut Bencher) {
+    memset_builtin(b, 1048576, 65)
+}
+#[bench]
+fn memset_rust_1048576_offset(b: &mut Bencher) {
+    memset_rust(b, 1048576, 65)
+}
+
+#[bench]
+fn memcmp_builtin_8(b: &mut Bencher) {
+    memcmp_builtin(b, 8)
+}
+#[bench]
+fn memcmp_rust_8(b: &mut Bencher) {
+    memcmp_rust(b, 8)
+}
+#[bench]
+fn memcmp_builtin_16(b: &mut Bencher) {
+    memcmp_builtin(b, 16)
+}
+#[bench]
+fn memcmp_rust_16(b: &mut Bencher) {
+    memcmp_rust(b, 16)
+}
+#[bench]
+fn memcmp_builtin_32(b: &mut Bencher) {
+    memcmp_builtin(b, 32)
+}
+#[bench]
+fn memcmp_rust_32(b: &mut Bencher) {
+    memcmp_rust(b, 32)
+}
+#[bench]
+fn memcmp_builtin_64(b: &mut Bencher) {
+    memcmp_builtin(b, 64)
+}
+#[bench]
+fn memcmp_rust_64(b: &mut Bencher) {
+    memcmp_rust(b, 64)
+}
+#[bench]
+fn memcmp_builtin_4096(b: &mut Bencher) {
+    memcmp_builtin(b, 4096)
+}
+#[bench]
+fn memcmp_rust_4096(b: &mut Bencher) {
+    memcmp_rust(b, 4096)
+}
+#[bench]
+fn memcmp_builtin_1048576(b: &mut Bencher) {
+    memcmp_builtin(b, 1048576)
+}
+#[bench]
+fn memcmp_rust_1048576(b: &mut Bencher) {
+    memcmp_rust(b, 1048576)
+}
+#[bench]
+fn memcmp_builtin_unaligned_7(b: &mut Bencher) {
+    memcmp_builtin_unaligned(b, 8)
+}
+#[bench]
+fn memcmp_rust_unaligned_7(b: &mut Bencher) {
+    memcmp_rust_unaligned(b, 8)
+}
+#[bench]
+fn memcmp_builtin_unaligned_15(b: &mut Bencher) {
+    memcmp_builtin_unaligned(b, 16)
+}
+#[bench]
+fn memcmp_rust_unaligned_15(b: &mut Bencher) {
+    memcmp_rust_unaligned(b, 16)
+}
+#[bench]
+fn memcmp_builtin_unaligned_31(b: &mut Bencher) {
+    memcmp_builtin_unaligned(b, 32)
+}
+#[bench]
+fn memcmp_rust_unaligned_31(b: &mut Bencher) {
+    memcmp_rust_unaligned(b, 32)
+}
+#[bench]
+fn memcmp_builtin_unaligned_63(b: &mut Bencher) {
+    memcmp_builtin_unaligned(b, 64)
+}
+#[bench]
+fn memcmp_rust_unaligned_63(b: &mut Bencher) {
+    memcmp_rust_unaligned(b, 64)
+}
+#[bench]
+fn memcmp_builtin_unaligned_4095(b: &mut Bencher) {
+    memcmp_builtin_unaligned(b, 4096)
+}
+#[bench]
+fn memcmp_rust_unaligned_4095(b: &mut Bencher) {
+    memcmp_rust_unaligned(b, 4096)
+}
+#[bench]
+fn memcmp_builtin_unaligned_1048575(b: &mut Bencher) {
+    memcmp_builtin_unaligned(b, 1048576)
+}
+#[bench]
+fn memcmp_rust_unaligned_1048575(b: &mut Bencher) {
+    memcmp_rust_unaligned(b, 1048576)
+}
+
+#[bench]
+fn memmove_builtin_4096(b: &mut Bencher) {
+    memmove_builtin(b, 4096, 0)
+}
+#[bench]
+fn memmove_rust_4096(b: &mut Bencher) {
+    memmove_rust(b, 4096, 0)
+}
+#[bench]
+fn memmove_builtin_1048576(b: &mut Bencher) {
+    memmove_builtin(b, 1048576, 0)
+}
+#[bench]
+fn memmove_rust_1048576(b: &mut Bencher) {
+    memmove_rust(b, 1048576, 0)
+}
+#[bench]
+fn memmove_builtin_4096_misalign(b: &mut Bencher) {
+    memmove_builtin(b, 4096, 1)
+}
+#[bench]
+fn memmove_rust_4096_misalign(b: &mut Bencher) {
+    memmove_rust(b, 4096, 1)
+}
+#[bench]
+fn memmove_builtin_1048576_misalign(b: &mut Bencher) {
+    memmove_builtin(b, 1048576, 1)
+}
+#[bench]
+fn memmove_rust_1048576_misalign(b: &mut Bencher) {
+    memmove_rust(b, 1048576, 1)
+}
diff --git a/library/compiler-builtins/builtins-test/benches/mem_icount.rs b/library/compiler-builtins/builtins-test/benches/mem_icount.rs
new file mode 100644
index 00000000000..bd88cf80c7d
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/benches/mem_icount.rs
@@ -0,0 +1,500 @@
+//! Benchmarks that use Callgrind (via `iai_callgrind`) to report instruction count metrics. This
+//! is stable enough to be tested in CI.
+
+use std::hint::black_box;
+use std::{ops, slice};
+
+use compiler_builtins::mem::{memcmp, memcpy, memmove, memset};
+use iai_callgrind::{library_benchmark, library_benchmark_group, main};
+
+const PAGE_SIZE: usize = 0x1000; // 4 KiB
+const MAX_ALIGN: usize = 512; // assume we may use avx512 operations one day
+const MEG1: usize = 1 << 20; // 1 MiB
+
+#[derive(Clone)]
+#[repr(C, align(0x1000))]
+struct Page([u8; PAGE_SIZE]);
+
+/// A buffer that is page-aligned by default, with an optional offset to create a
+/// misalignment.
+struct AlignedSlice {
+    buf: Box<[Page]>,
+    len: usize,
+    offset: usize,
+}
+
+impl AlignedSlice {
+    /// Allocate a page-aligned buffer with at least `len` bytes, shifted by `offset` bytes
+    /// from page alignment.
+    fn new_zeroed(len: usize, offset: usize) -> Self {
+        assert!(offset < PAGE_SIZE);
+        let total_len = len + offset;
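+        // Round up to a whole number of pages; e.g. `len = 4096` with `offset = 65` needs
+        // 4161 bytes, i.e. two pages.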
+        let items = (total_len / PAGE_SIZE) + if total_len % PAGE_SIZE > 0 { 1 } else { 0 };
+        let buf = vec![Page([0u8; PAGE_SIZE]); items].into_boxed_slice();
+        AlignedSlice { buf, len, offset }
+    }
+}
+
+impl ops::Deref for AlignedSlice {
+    type Target = [u8];
+    fn deref(&self) -> &Self::Target {
+        unsafe { slice::from_raw_parts(self.buf.as_ptr().cast::<u8>().add(self.offset), self.len) }
+    }
+}
+
+impl ops::DerefMut for AlignedSlice {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        unsafe {
+            slice::from_raw_parts_mut(
+                self.buf.as_mut_ptr().cast::<u8>().add(self.offset),
+                self.len,
+            )
+        }
+    }
+}
+
+mod mcpy {
+    use super::*;
+
+    struct Cfg {
+        len: usize,
+        s_off: usize,
+        d_off: usize,
+    }
+
+    fn setup(cfg: Cfg) -> (usize, AlignedSlice, AlignedSlice) {
+        let Cfg { len, s_off, d_off } = cfg;
+        println!("bytes: {len}, src offset: {s_off}, dst offset: {d_off}");
+        let mut src = AlignedSlice::new_zeroed(len, s_off);
+        let dst = AlignedSlice::new_zeroed(len, d_off);
+        src.fill(1);
+        (len, src, dst)
+    }
+
+    #[library_benchmark]
+    #[benches::aligned(
+        // Both aligned
+        args = [
+            Cfg { len: 16, s_off: 0, d_off: 0 },
+            Cfg { len: 32, s_off: 0, d_off: 0 },
+            Cfg { len: 64, s_off: 0, d_off: 0 },
+            Cfg { len: 512, s_off: 0, d_off: 0 },
+            Cfg { len: 4096, s_off: 0, d_off: 0 },
+            Cfg { len: MEG1, s_off: 0, d_off: 0 },
+        ],
+        setup = setup,
+    )]
+    #[benches::offset(
+        // Both at the same offset
+        args = [
+            Cfg { len: 16, s_off: 65, d_off: 65 },
+            Cfg { len: 32, s_off: 65, d_off: 65 },
+            Cfg { len: 64, s_off: 65, d_off: 65 },
+            Cfg { len: 512, s_off: 65, d_off: 65 },
+            Cfg { len: 4096, s_off: 65, d_off: 65 },
+            Cfg { len: MEG1, s_off: 65, d_off: 65 },
+        ],
+        setup = setup,
+    )]
+    #[benches::misaligned(
+        // `src` and `dst` both misaligned by different amounts
+        args = [
+            Cfg { len: 16, s_off: 65, d_off: 66 },
+            Cfg { len: 32, s_off: 65, d_off: 66 },
+            Cfg { len: 64, s_off: 65, d_off: 66 },
+            Cfg { len: 512, s_off: 65, d_off: 66 },
+            Cfg { len: 4096, s_off: 65, d_off: 66 },
+            Cfg { len: MEG1, s_off: 65, d_off: 66 },
+        ],
+        setup = setup,
+    )]
+    fn bench((len, mut dst, src): (usize, AlignedSlice, AlignedSlice)) {
+        unsafe {
+            black_box(memcpy(
+                black_box(dst.as_mut_ptr()),
+                black_box(src.as_ptr()),
+                black_box(len),
+            ));
+        }
+    }
+
+    library_benchmark_group!(name = memcpy; benchmarks = bench);
+}
+
+mod mset {
+    use super::*;
+
+    struct Cfg {
+        len: usize,
+        offset: usize,
+    }
+
+    fn setup(Cfg { len, offset }: Cfg) -> (usize, AlignedSlice) {
+        println!("bytes: {len}, offset: {offset}");
+        (len, AlignedSlice::new_zeroed(len, offset))
+    }
+
+    #[library_benchmark]
+    #[benches::aligned(
+        args = [
+            Cfg { len: 16, offset: 0 },
+            Cfg { len: 32, offset: 0 },
+            Cfg { len: 64, offset: 0 },
+            Cfg { len: 512, offset: 0 },
+            Cfg { len: 4096, offset: 0 },
+            Cfg { len: MEG1, offset: 0 },
+        ],
+        setup = setup,
+    )]
+    #[benches::offset(
+        args = [
+            Cfg { len: 16, offset: 65 },
+            Cfg { len: 32, offset: 65 },
+            Cfg { len: 64, offset: 65 },
+            Cfg { len: 512, offset: 65 },
+            Cfg { len: 4096, offset: 65 },
+            Cfg { len: MEG1, offset: 65 },
+        ],
+        setup = setup,
+    )]
+    fn bench((len, mut dst): (usize, AlignedSlice)) {
+        unsafe {
+            black_box(memset(
+                black_box(dst.as_mut_ptr()),
+                black_box(27),
+                black_box(len),
+            ));
+        }
+    }
+
+    library_benchmark_group!(name = memset; benchmarks = bench);
+}
+
+mod mcmp {
+    use super::*;
+
+    struct Cfg {
+        len: usize,
+        s_off: usize,
+        d_off: usize,
+    }
+
+    fn setup(cfg: Cfg) -> (usize, AlignedSlice, AlignedSlice) {
+        let Cfg { len, s_off, d_off } = cfg;
+        println!("bytes: {len}, src offset: {s_off}, dst offset: {d_off}");
+        let b1 = AlignedSlice::new_zeroed(len, s_off);
+        let mut b2 = AlignedSlice::new_zeroed(len, d_off);
+        b2[len - 1] = 1;
+        (len, b1, b2)
+    }
+
+    #[library_benchmark]
+    #[benches::aligned(
+        // Both aligned
+        args = [
+            Cfg { len: 16, s_off: 0, d_off: 0 },
+            Cfg { len: 32, s_off: 0, d_off: 0 },
+            Cfg { len: 64, s_off: 0, d_off: 0 },
+            Cfg { len: 512, s_off: 0, d_off: 0 },
+            Cfg { len: 4096, s_off: 0, d_off: 0 },
+            Cfg { len: MEG1, s_off: 0, d_off: 0 },
+        ],
+        setup = setup
+    )]
+    #[benches::offset(
+        // Both at the same offset
+        args = [
+            Cfg { len: 16, s_off: 65, d_off: 65 },
+            Cfg { len: 32, s_off: 65, d_off: 65 },
+            Cfg { len: 64, s_off: 65, d_off: 65 },
+            Cfg { len: 512, s_off: 65, d_off: 65 },
+            Cfg { len: 4096, s_off: 65, d_off: 65 },
+            Cfg { len: MEG1, s_off: 65, d_off: 65 },
+        ],
+        setup = setup
+    )]
+    #[benches::misaligned(
+        // `src` and `dst` both misaligned by different amounts
+        args = [
+            Cfg { len: 16, s_off: 65, d_off: 66 },
+            Cfg { len: 32, s_off: 65, d_off: 66 },
+            Cfg { len: 64, s_off: 65, d_off: 66 },
+            Cfg { len: 512, s_off: 65, d_off: 66 },
+            Cfg { len: 4096, s_off: 65, d_off: 66 },
+            Cfg { len: MEG1, s_off: 65, d_off: 66 },
+        ],
+        setup = setup
+    )]
+    fn bench((len, mut dst, src): (usize, AlignedSlice, AlignedSlice)) {
+        unsafe {
+            black_box(memcmp(
+                black_box(dst.as_mut_ptr()),
+                black_box(src.as_ptr()),
+                black_box(len),
+            ));
+        }
+    }
+
+    library_benchmark_group!(name = memcmp; benchmarks = bench);
+}
+
+mod mmove {
+    use Spread::{Aligned, Large, Medium, Small};
+
+    use super::*;
+
+    struct Cfg {
+        len: usize,
+        spread: Spread,
+        off: usize,
+    }
+
+    enum Spread {
+        /// `src` and `dst` are close and have the same alignment (or offset).
+        Aligned,
+        /// `src` and `dst` are close.
+        Small,
+        /// `src` and `dst` are halfway offset in the buffer.
+        Medium,
+        /// `src` and `dst` only overlap by a single byte.
+        Large,
+    }
+
+    // Note that `Small` and `Large` describe the distance between `src` and `dst`, not the
+    // amount of overlap: a small spread leaves the regions almost fully overlapping, while a
+    // large spread leaves only a single byte of overlap.
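+    // For example, with `len = 4096`: Aligned -> 512 (`MAX_ALIGN`), Small -> 1, Medium -> 2049,
+    // Large -> 4095.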
+    fn calculate_spread(len: usize, spread: Spread) -> usize {
+        match spread {
+            // Note that this case doesn't make sense for lengths of `MAX_ALIGN` (512 bytes) or
+            // less, since there would be no overlap between `src` and `dst`.
+            Aligned => {
+                assert!(len > MAX_ALIGN, "aligned memmove would have no overlap");
+                MAX_ALIGN
+            }
+            Small => 1,
+            Medium => (len / 2) + 1, // add 1 so all are misaligned
+            Large => len - 1,
+        }
+    }
+
+    fn setup_forward(cfg: Cfg) -> (usize, usize, AlignedSlice) {
+        let Cfg { len, spread, off } = cfg;
+        let spread = calculate_spread(len, spread);
+        println!("bytes: {len}, spread: {spread}, offset: {off}, forward");
+        assert!(spread < len, "memmove tests should have some overlap");
+        let mut buf = AlignedSlice::new_zeroed(len + spread, off);
+        let mut fill: usize = 0;
+        buf[..len].fill_with(|| {
+            fill += 1;
+            fill as u8
+        });
+        (len, spread, buf)
+    }
+
+    fn setup_backward(cfg: Cfg) -> (usize, usize, AlignedSlice) {
+        let Cfg { len, spread, off } = cfg;
+        let spread = calculate_spread(len, spread);
+        println!("bytes: {len}, spread: {spread}, offset: {off}, backward");
+        assert!(spread < len, "memmove tests should have some overlap");
+        let mut buf = AlignedSlice::new_zeroed(len + spread, off);
+        let mut fill: usize = 0;
+        buf[spread..].fill_with(|| {
+            fill += 1;
+            fill as u8
+        });
+        (len, spread, buf)
+    }
+
+    #[library_benchmark]
+    #[benches::aligned(
+        args = [
+            // Don't test small spreads since there is no overlap
+            Cfg { len: 4096, spread: Aligned, off: 0 },
+            Cfg { len: MEG1, spread: Aligned, off: 0 },
+        ],
+        setup = setup_forward
+    )]
+    #[benches::small_spread(
+        args = [
+            Cfg { len: 16, spread: Small, off: 0 },
+            Cfg { len: 32, spread: Small, off: 0 },
+            Cfg { len: 64, spread: Small, off: 0 },
+            Cfg { len: 512, spread: Small, off: 0 },
+            Cfg { len: 4096, spread: Small, off: 0 },
+            Cfg { len: MEG1, spread: Small, off: 0 },
+        ],
+        setup = setup_forward
+    )]
+    #[benches::medium_spread(
+        args = [
+            Cfg { len: 16, spread: Medium, off: 0 },
+            Cfg { len: 32, spread: Medium, off: 0 },
+            Cfg { len: 64, spread: Medium, off: 0 },
+            Cfg { len: 512, spread: Medium, off: 0 },
+            Cfg { len: 4096, spread: Medium, off: 0 },
+            Cfg { len: MEG1, spread: Medium, off: 0 },
+        ],
+        setup = setup_forward
+    )]
+    #[benches::large_spread(
+        args = [
+            Cfg { len: 16, spread: Large, off: 0 },
+            Cfg { len: 32, spread: Large, off: 0 },
+            Cfg { len: 64, spread: Large, off: 0 },
+            Cfg { len: 512, spread: Large, off: 0 },
+            Cfg { len: 4096, spread: Large, off: 0 },
+            Cfg { len: MEG1, spread: Large, off: 0 },
+        ],
+        setup = setup_forward
+    )]
+    #[benches::aligned_off(
+        args = [
+            Cfg { len: 4096, spread: Aligned, off: 65 },
+            Cfg { len: MEG1, spread: Aligned, off: 65 },
+        ],
+        setup = setup_forward
+    )]
+    #[benches::small_spread_off(
+        args = [
+            Cfg { len: 16, spread: Small, off: 65 },
+            Cfg { len: 32, spread: Small, off: 65 },
+            Cfg { len: 64, spread: Small, off: 65 },
+            Cfg { len: 512, spread: Small, off: 65 },
+            Cfg { len: 4096, spread: Small, off: 65 },
+            Cfg { len: MEG1, spread: Small, off: 65 },
+        ],
+        setup = setup_forward
+    )]
+    #[benches::medium_spread_off(
+        args = [
+            Cfg { len: 16, spread: Medium, off: 65 },
+            Cfg { len: 32, spread: Medium, off: 65 },
+            Cfg { len: 64, spread: Medium, off: 65 },
+            Cfg { len: 512, spread: Medium, off: 65 },
+            Cfg { len: 4096, spread: Medium, off: 65 },
+            Cfg { len: MEG1, spread: Medium, off: 65 },
+        ],
+        setup = setup_forward
+    )]
+    #[benches::large_spread_off(
+        args = [
+            Cfg { len: 16, spread: Large, off: 65 },
+            Cfg { len: 32, spread: Large, off: 65 },
+            Cfg { len: 64, spread: Large, off: 65 },
+            Cfg { len: 512, spread: Large, off: 65 },
+            Cfg { len: 4096, spread: Large, off: 65 },
+            Cfg { len: MEG1, spread: Large, off: 65 },
+        ],
+        setup = setup_forward
+    )]
+    fn forward((len, spread, mut buf): (usize, usize, AlignedSlice)) {
+        // Test moving from the start of the buffer toward the end
+        unsafe {
+            black_box(memmove(
+                black_box(buf[spread..].as_mut_ptr()),
+                black_box(buf.as_ptr()),
+                black_box(len),
+            ));
+        }
+    }
+
+    #[library_benchmark]
+    #[benches::aligned(
+        args = [
+            // Don't test small spreads since there is no overlap
+            Cfg { len: 4096, spread: Aligned, off: 0 },
+            Cfg { len: MEG1, spread: Aligned, off: 0 },
+        ],
+        setup = setup_backward
+    )]
+    #[benches::small_spread(
+        args = [
+            Cfg { len: 16, spread: Small, off: 0 },
+            Cfg { len: 32, spread: Small, off: 0 },
+            Cfg { len: 64, spread: Small, off: 0 },
+            Cfg { len: 512, spread: Small, off: 0 },
+            Cfg { len: 4096, spread: Small, off: 0 },
+            Cfg { len: MEG1, spread: Small, off: 0 },
+        ],
+        setup = setup_backward
+    )]
+    #[benches::medium_spread(
+        args = [
+            Cfg { len: 16, spread: Medium, off: 0 },
+            Cfg { len: 32, spread: Medium, off: 0 },
+            Cfg { len: 64, spread: Medium, off: 0 },
+            Cfg { len: 512, spread: Medium, off: 0 },
+            Cfg { len: 4096, spread: Medium, off: 0 },
+            Cfg { len: MEG1, spread: Medium, off: 0 },
+        ],
+        setup = setup_backward
+    )]
+    #[benches::large_spread(
+        args = [
+            Cfg { len: 16, spread: Large, off: 0 },
+            Cfg { len: 32, spread: Large, off: 0 },
+            Cfg { len: 64, spread: Large, off: 0 },
+            Cfg { len: 512, spread: Large, off: 0 },
+            Cfg { len: 4096, spread: Large, off: 0 },
+            Cfg { len: MEG1, spread: Large, off: 0 },
+        ],
+        setup = setup_backward
+    )]
+    #[benches::aligned_off(
+        args = [
+            // Don't test small spreads since there is no overlap
+            Cfg { len: 4096, spread: Aligned, off: 65 },
+            Cfg { len: MEG1, spread: Aligned, off: 65 },
+        ],
+        setup = setup_backward
+    )]
+    #[benches::small_spread_off(
+        args = [
+            Cfg { len: 16, spread: Small, off: 65 },
+            Cfg { len: 32, spread: Small, off: 65 },
+            Cfg { len: 64, spread: Small, off: 65 },
+            Cfg { len: 512, spread: Small, off: 65 },
+            Cfg { len: 4096, spread: Small, off: 65 },
+            Cfg { len: MEG1, spread: Small, off: 65 },
+        ],
+        setup = setup_backward
+    )]
+    #[benches::medium_spread_off(
+        args = [
+            Cfg { len: 16, spread: Medium, off: 65 },
+            Cfg { len: 32, spread: Medium, off: 65 },
+            Cfg { len: 64, spread: Medium, off: 65 },
+            Cfg { len: 512, spread: Medium, off: 65 },
+            Cfg { len: 4096, spread: Medium, off: 65 },
+            Cfg { len: MEG1, spread: Medium, off: 65 },
+        ],
+        setup = setup_backward
+    )]
+    #[benches::large_spread_off(
+        args = [
+            Cfg { len: 16, spread: Large, off: 65 },
+            Cfg { len: 32, spread: Large, off: 65 },
+            Cfg { len: 64, spread: Large, off: 65 },
+            Cfg { len: 512, spread: Large, off: 65 },
+            Cfg { len: 4096, spread: Large, off: 65 },
+            Cfg { len: MEG1, spread: Large, off: 65 },
+        ],
+        setup = setup_backward
+    )]
+    fn backward((len, spread, mut buf): (usize, usize, AlignedSlice)) {
+        // Test moving from the end of the buffer toward the start
+        unsafe {
+            black_box(memmove(
+                black_box(buf.as_mut_ptr()),
+                black_box(buf[spread..].as_ptr()),
+                black_box(len),
+            ));
+        }
+    }
+
+    library_benchmark_group!(name = memmove; benchmarks = forward, backward);
+}
+
+use mcmp::memcmp;
+use mcpy::memcpy;
+use mmove::memmove;
+use mset::memset;
+
+main!(library_benchmark_groups = memcpy, memset, memcmp, memmove);
diff --git a/library/compiler-builtins/builtins-test/build.rs b/library/compiler-builtins/builtins-test/build.rs
new file mode 100644
index 00000000000..e8f4eb4dd22
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/build.rs
@@ -0,0 +1,120 @@
+use std::collections::HashSet;
+
+mod builtins_configure {
+    include!("../compiler-builtins/configure.rs");
+}
+
+/// Features to enable
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+enum Feature {
+    NoSysF128,
+    NoSysF128IntConvert,
+    NoSysF16,
+    NoSysF16F64Convert,
+    NoSysF16F128Convert,
+}
+
+impl Feature {
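+    /// Features implied by this one; e.g. a system without any `f128` symbols also lacks the
+    /// f128 <-> int and f16 <-> f128 conversion routines.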
+    fn implies(self) -> &'static [Self] {
+        match self {
+            Self::NoSysF128 => [Self::NoSysF128IntConvert, Self::NoSysF16F128Convert].as_slice(),
+            Self::NoSysF128IntConvert => [].as_slice(),
+            Self::NoSysF16 => [Self::NoSysF16F64Convert, Self::NoSysF16F128Convert].as_slice(),
+            Self::NoSysF16F64Convert => [].as_slice(),
+            Self::NoSysF16F128Convert => [].as_slice(),
+        }
+    }
+}
+
+fn main() {
+    println!("cargo::rerun-if-changed=../configure.rs");
+
+    let target = builtins_configure::Target::from_env();
+    let mut features = HashSet::new();
+
+    // These platforms do not have f128 symbols available in their system libraries, so
+    // skip related tests.
+    if target.arch == "arm"
+        || target.vendor == "apple"
+        || target.env == "msvc"
+        // GCC and LLVM disagree on the ABI of `f16` and `f128` with MinGW. See
+        // <https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115054>.
+        || (target.os == "windows" && target.env == "gnu")
+        // FIXME(llvm): There is an ABI incompatibility between GCC and Clang on 32-bit x86.
+        // See <https://github.com/llvm/llvm-project/issues/77401>.
+        || target.arch == "x86"
+        // 32-bit PowerPC and 64-bit LE gets code generated that Qemu cannot handle. See
+        // <https://github.com/rust-lang/compiler-builtins/pull/606#issuecomment-2105635926>.
+        || target.arch == "powerpc"
+        || target.arch == "powerpc64le"
+        // FIXME: We get different results from the builtin functions. See
+        // <https://github.com/rust-lang/compiler-builtins/pull/606#issuecomment-2105657287>.
+        || target.arch == "powerpc64"
+    {
+        features.insert(Feature::NoSysF128);
+    }
+
+    if target.arch == "x86" {
+        // 32-bit x86 does not have `__fixunstfti`/`__fixtfti` but does have everything else
+        features.insert(Feature::NoSysF128IntConvert);
+        // FIXME: 32-bit x86 has a bug in `f128 -> f16` system libraries
+        features.insert(Feature::NoSysF16F128Convert);
+    }
+
+    // These platforms do not have f16 symbols available in their system libraries, so
+    // skip related tests. Most of these are missing `f16 <-> f32` conversion routines.
+    if (target.arch == "aarch64" && target.os == "linux")
+        || target.arch.starts_with("arm")
+        || target.arch == "powerpc"
+        || target.arch == "powerpc64"
+        || target.arch == "powerpc64le"
+        || target.arch == "loongarch64"
+        || (target.arch == "x86" && !target.has_feature("sse"))
+        || target.os == "windows"
+        // Linking says "error: function signature mismatch: __extendhfsf2" and seems to
+        // think the signature is either `(i32) -> f32` or `(f32) -> f32`. See
+        // <https://github.com/llvm/llvm-project/issues/96438>.
+        || target.arch == "wasm32"
+        || target.arch == "wasm64"
+    {
+        features.insert(Feature::NoSysF16);
+    }
+
+    // These platforms are missing either `__extendhfdf2` or `__truncdfhf2`.
+    if target.vendor == "apple" || target.os == "windows" {
+        features.insert(Feature::NoSysF16F64Convert);
+    }
+
+    // Add implied features. Collecting into a `Vec` first is required so that `features` is not
+    // borrowed while it is being extended.
+    features.extend(
+        features
+            .iter()
+            .flat_map(|x| x.implies())
+            .copied()
+            .collect::<Vec<_>>(),
+    );
+
+    for feature in features {
+        let (name, warning) = match feature {
+            Feature::NoSysF128 => ("no-sys-f128", "using apfloat fallback for f128"),
+            Feature::NoSysF128IntConvert => (
+                "no-sys-f128-int-convert",
+                "using apfloat fallback for f128 <-> int conversions",
+            ),
+            Feature::NoSysF16F64Convert => (
+                "no-sys-f16-f64-convert",
+                "using apfloat fallback for f16 <-> f64 conversions",
+            ),
+            Feature::NoSysF16F128Convert => (
+                "no-sys-f16-f128-convert",
+                "using apfloat fallback for f16 <-> f128 conversions",
+            ),
+            Feature::NoSysF16 => ("no-sys-f16", "using apfloat fallback for f16"),
+        };
+        println!("cargo:warning={warning}");
+        println!("cargo:rustc-cfg=feature=\"{name}\"");
+    }
+
+    builtins_configure::configure_aliases(&target);
+    builtins_configure::configure_f16_f128(&target);
+}
diff --git a/library/compiler-builtins/builtins-test/src/bench.rs b/library/compiler-builtins/builtins-test/src/bench.rs
new file mode 100644
index 00000000000..2348f6bc973
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/src/bench.rs
@@ -0,0 +1,366 @@
+use alloc::vec::Vec;
+use core::cell::RefCell;
+
+use compiler_builtins::float::Float;
+
+/// Fuzz with this many items to check that the functions produce equal results
+pub const CHECK_ITER_ITEMS: u32 = 10_000;
+/// Benchmark with this many items to get a variety of inputs
+pub const BENCH_ITER_ITEMS: u32 = 500;
+
+/// Still run benchmarks/tests but don't check correctness between compiler-builtins and
+/// builtin system functions
+pub fn skip_sys_checks(test_name: &str) -> bool {
+    const ALWAYS_SKIPPED: &[&str] = &[
+        // FIXME(f16_f128): system symbols have incorrect results
+        // <https://github.com/rust-lang/compiler-builtins/issues/617>
+        "extend_f16_f32",
+        "trunc_f32_f16",
+        "trunc_f64_f16",
+        // FIXME(#616): re-enable once fix is in nightly
+        // <https://github.com/rust-lang/compiler-builtins/issues/616>
+        "mul_f32",
+        "mul_f64",
+    ];
+
+    // FIXME(f16_f128): error on LE ppc64. There are more tests that are cfg-ed out completely
+    // in their benchmark modules due to runtime panics.
+    // <https://github.com/rust-lang/compiler-builtins/issues/617#issuecomment-2125914639>
+    const PPC64LE_SKIPPED: &[&str] = &["extend_f32_f128"];
+
+    // FIXME(f16_f128): system symbols have incorrect results
+    // <https://github.com/rust-lang/compiler-builtins/issues/617#issuecomment-2125914639>
+    const X86_NO_SSE_SKIPPED: &[&str] = &[
+        "add_f128", "sub_f128", "mul_f128", "div_f128", "powi_f32", "powi_f64",
+    ];
+
+    // FIXME(f16_f128): Wide multiply carry bug in `compiler-rt`, re-enable when nightly no longer
+    // uses `compiler-rt` version.
+    // <https://github.com/llvm/llvm-project/issues/91840>
+    const AARCH64_SKIPPED: &[&str] = &["mul_f128", "div_f128"];
+
+    // FIXME(llvm): system symbols have incorrect results on Windows
+    // <https://github.com/rust-lang/compiler-builtins/issues/617#issuecomment-2121359807>
+    const WINDOWS_SKIPPED: &[&str] = &[
+        "conv_f32_u128",
+        "conv_f32_i128",
+        "conv_f64_u128",
+        "conv_f64_i128",
+    ];
+
+    if cfg!(target_arch = "arm") {
+        // The Arm symbols need a different ABI that our macro doesn't handle, so just skip them.
+        return true;
+    }
+
+    if ALWAYS_SKIPPED.contains(&test_name) {
+        return true;
+    }
+
+    if cfg!(all(target_arch = "powerpc64", target_endian = "little"))
+        && PPC64LE_SKIPPED.contains(&test_name)
+    {
+        return true;
+    }
+
+    if cfg!(all(target_arch = "x86", not(target_feature = "sse")))
+        && X86_NO_SSE_SKIPPED.contains(&test_name)
+    {
+        return true;
+    }
+
+    if cfg!(target_arch = "aarch64") && AARCH64_SKIPPED.contains(&test_name) {
+        return true;
+    }
+
+    if cfg!(target_family = "windows") && WINDOWS_SKIPPED.contains(&test_name) {
+        return true;
+    }
+
+    false
+}
+
+/// Still run benchmarks/tests but don't check correctness between compiler-builtins and
+/// assembly functions
+pub fn skip_asm_checks(_test_name: &str) -> bool {
+    // Nothing to skip at this time
+    false
+}
+
+/// Create a comparison of the system symbol, compiler_builtins, and optionally handwritten
+/// assembly.
+///
+/// # Safety
+///
+/// The signature must be correct and any assembly must be sound.
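+///
+/// # Example
+///
+/// A minimal illustrative invocation, mirroring `powi_f32` in `benches/float_pow.rs`:
+///
+/// ```ignore
+/// float_bench! {
+///     name: powi_f32,
+///     sig: (a: f32, b: i32) -> f32,
+///     crate_fn: pow::__powisf2,
+///     sys_fn: __powisf2,
+///     sys_available: all(),
+///     asm: [],
+/// }
+/// ```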
+#[macro_export]
+macro_rules! float_bench {
+    (
+        // Name of this benchmark
+        name: $name:ident,
+        // The function signature to be tested
+        sig: ($($arg:ident: $arg_ty:ty),*) -> $ret_ty:ty,
+        // Path to the crate in compiler_builtins
+        crate_fn: $crate_fn:path,
+        // Optional alias on ppc
+        $( crate_fn_ppc: $crate_fn_ppc:path, )?
+        // Name of the system symbol
+        sys_fn: $sys_fn:ident,
+        // Optional alias on ppc
+        $( sys_fn_ppc: $sys_fn_ppc:path, )?
+        // Meta saying whether the system symbol is available
+        sys_available: $sys_available:meta,
+        // An optional function used to check that two results are equal; if not provided,
+        // `$ret_ty::check_eq` is used
+        $( output_eq: $output_eq:expr, )?
+        // Assembly implementations, if any.
+        asm: [
+            $(
+                #[cfg($asm_meta:meta)] {
+                    $($asm_tt:tt)*
+                }
+            );*
+            $(;)?
+        ]
+        $(,)?
+    ) => {paste::paste! {
+        // SAFETY: macro invocation must use the correct signature
+        #[cfg($sys_available)]
+        unsafe extern "C" {
+            /// Binding for the system function
+            #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+            fn $sys_fn($($arg: $arg_ty),*) -> $ret_ty;
+
+            #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+            float_bench! { @coalesce_fn $($sys_fn_ppc)? =>
+                fn $sys_fn($($arg: $arg_ty),*) -> $ret_ty;
+            }
+        }
+
+        fn $name(c: &mut Criterion) {
+            use core::hint::black_box;
+            use compiler_builtins::float::Float;
+            use $crate::bench::TestIO;
+
+            #[inline(never)] // equalize with external calls
+            fn crate_fn($($arg: $arg_ty),*) -> $ret_ty {
+                #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+                let target_crate_fn = $crate_fn;
+
+                // On PPC, use an alias if specified
+                #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+                let target_crate_fn = float_bench!(@coalesce $($crate_fn_ppc)?, $crate_fn);
+
+                target_crate_fn( $($arg),* )
+            }
+
+            #[inline(always)] // already a branch
+            #[cfg($sys_available)]
+            fn sys_fn($($arg: $arg_ty),*) -> $ret_ty {
+                #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+                let target_sys_fn = $sys_fn;
+
+                // On PPC, use an alias if specified
+                #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+                let target_sys_fn = float_bench!(@coalesce $($sys_fn_ppc)?, $sys_fn);
+
+                unsafe { target_sys_fn( $($arg),* ) }
+            }
+
+            #[inline(never)] // equalize with external calls
+            #[cfg(any( $($asm_meta),* ))]
+            fn asm_fn($(mut $arg: $arg_ty),*) -> $ret_ty {
+                use core::arch::asm;
+                $(
+                    #[cfg($asm_meta)]
+                    unsafe { $($asm_tt)* }
+                )*
+            }
+
+            let testvec = <($($arg_ty),*)>::make_testvec($crate::bench::CHECK_ITER_ITEMS);
+            let benchvec = <($($arg_ty),*)>::make_testvec($crate::bench::BENCH_ITER_ITEMS);
+            let test_name = stringify!($name);
+            let check_eq = float_bench!(@coalesce $($output_eq)?, $ret_ty::check_eq);
+
+            // Verify math lines up. We run the crate functions even if we don't validate the
+            // output here to make sure there are no panics or crashes.
+
+            #[cfg($sys_available)]
+            for ($($arg),*) in testvec.iter().copied() {
+                let crate_res = crate_fn($($arg),*);
+                let sys_res = sys_fn($($arg),*);
+
+                if $crate::bench::skip_sys_checks(test_name) {
+                    continue;
+                }
+
+                assert!(
+                    check_eq(crate_res, sys_res),
+                    "{test_name}{:?}: crate: {crate_res:?}, sys: {sys_res:?}",
+                    ($($arg),* ,)
+                );
+            }
+
+            #[cfg(any( $($asm_meta),* ))]
+            {
+                for ($($arg),*) in testvec.iter().copied() {
+                    let crate_res = crate_fn($($arg),*);
+                    let asm_res = asm_fn($($arg),*);
+
+                    if $crate::bench::skip_asm_checks(test_name) {
+                        continue;
+                    }
+
+                    assert!(
+                        check_eq(crate_res, asm_res),
+                        "{test_name}{:?}: crate: {crate_res:?}, asm: {asm_res:?}",
+                        ($($arg),* ,)
+                    );
+                }
+            }
+
+            let mut group = c.benchmark_group(test_name);
+            group.bench_function("compiler-builtins", |b| b.iter(|| {
+                for ($($arg),*) in benchvec.iter().copied() {
+                    black_box(crate_fn( $(black_box($arg)),* ));
+                }
+            }));
+
+            #[cfg($sys_available)]
+            group.bench_function("system", |b| b.iter(|| {
+                for ($($arg),*) in benchvec.iter().copied() {
+                    black_box(sys_fn( $(black_box($arg)),* ));
+                }
+            }));
+
+            #[cfg(any( $($asm_meta),* ))]
+            group.bench_function(&format!(
+                "assembly ({} {})", std::env::consts::ARCH, std::env::consts::FAMILY
+            ), |b| b.iter(|| {
+                for ($($arg),*) in benchvec.iter().copied() {
+                    black_box(asm_fn( $(black_box($arg)),* ));
+                }
+            }));
+
+            group.finish();
+        }
+    }};
+
+    // Allow overriding a default
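+    // e.g. `float_bench!(@coalesce $($output_eq)?, $ret_ty::check_eq)` expands to the
+    // caller-provided comparison when one was given, otherwise to `$ret_ty::check_eq`.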
+    (@coalesce $specified:expr, $default:expr) => { $specified };
+    (@coalesce, $default:expr) => { $default };
+
+    // Allow overriding a function name
+    (@coalesce_fn $specified:ident => fn $default_name:ident $($tt:tt)+) => {
+        fn $specified $($tt)+
+    };
+    (@coalesce_fn => fn $default_name:ident $($tt:tt)+) => {
+        fn $default_name $($tt)+
+    };
+}
+
+/// A type used as either an input or output to/from a benchmark function.
+pub trait TestIO: Sized {
+    fn make_testvec(len: u32) -> Vec<Self>;
+    fn check_eq(a: Self, b: Self) -> bool;
+}
+
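+// Note on the implementations below: the tuple impls only ever provide `make_testvec`; their
+// `check_eq` is left as `unimplemented!()` because tuples are used purely as benchmark inputs,
+// never as outputs that need comparison.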
+macro_rules! impl_testio {
+    (float $($f_ty:ty),+) => {$(
+        impl TestIO for $f_ty {
+            fn make_testvec(len: u32) -> Vec<Self> {
+                // refcell because fuzz_* takes a `Fn`
+                let ret = RefCell::new(Vec::new());
+                crate::fuzz_float(len, |a| ret.borrow_mut().push(a));
+                ret.into_inner()
+            }
+
+            fn check_eq(a: Self, b: Self) -> bool {
+                Float::eq_repr(a, b)
+            }
+        }
+
+        impl TestIO for ($f_ty, $f_ty) {
+            fn make_testvec(len: u32) -> Vec<Self> {
+                // refcell because fuzz_* takes a `Fn`
+                let ret = RefCell::new(Vec::new());
+                crate::fuzz_float_2(len, |a, b| ret.borrow_mut().push((a, b)));
+                ret.into_inner()
+            }
+
+            fn check_eq(_a: Self, _b: Self) -> bool {
+                unimplemented!()
+            }
+        }
+    )*};
+
+    (int $($i_ty:ty),+) => {$(
+        impl TestIO for $i_ty {
+            fn make_testvec(len: u32) -> Vec<Self> {
+                // refcell because fuzz_* takes a `Fn`
+                let ret = RefCell::new(Vec::new());
+                crate::fuzz(len, |a| ret.borrow_mut().push(a));
+                ret.into_inner()
+            }
+
+            fn check_eq(a: Self, b: Self) -> bool {
+                a == b
+            }
+        }
+
+        impl TestIO for ($i_ty, $i_ty) {
+            fn make_testvec(len: u32) -> Vec<Self> {
+                // refcell because fuzz_* takes a `Fn`
+                let ret = RefCell::new(Vec::new());
+                crate::fuzz_2(len, |a, b| ret.borrow_mut().push((a, b)));
+                ret.into_inner()
+            }
+
+            fn check_eq(_a: Self, _b: Self) -> bool {
+                unimplemented!()
+            }
+        }
+    )*};
+
+    ((float, int) ($f_ty:ty, $i_ty:ty)) => {
+        impl TestIO for ($f_ty, $i_ty) {
+            fn make_testvec(len: u32) -> Vec<Self> {
+                // refcell because fuzz_* takes a `Fn`
+                let ivec = RefCell::new(Vec::new());
+                let fvec = RefCell::new(Vec::new());
+
+                crate::fuzz(len.isqrt(), |a| ivec.borrow_mut().push(a));
+                crate::fuzz_float(len.isqrt(), |a| fvec.borrow_mut().push(a));
+
+                let mut ret = Vec::new();
+                let ivec = ivec.into_inner();
+                let fvec = fvec.into_inner();
+
+                for f in fvec {
+                    for i in &ivec {
+                        ret.push((f, *i));
+                    }
+                }
+
+                ret
+            }
+
+            fn check_eq(_a: Self, _b: Self) -> bool {
+                unimplemented!()
+            }
+        }
+    }
+}
+
+#[cfg(f16_enabled)]
+impl_testio!(float f16);
+impl_testio!(float f32, f64);
+#[cfg(f128_enabled)]
+impl_testio!(float f128);
+impl_testio!(int i16, i32, i64, i128);
+impl_testio!(int u16, u32, u64, u128);
+impl_testio!((float, int)(f32, i32));
+impl_testio!((float, int)(f64, i32));
+#[cfg(f128_enabled)]
+impl_testio!((float, int)(f128, i32));
diff --git a/library/compiler-builtins/builtins-test/src/lib.rs b/library/compiler-builtins/builtins-test/src/lib.rs
new file mode 100644
index 00000000000..c596ac21380
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/src/lib.rs
@@ -0,0 +1,337 @@
+//! This crate is for integration testing and fuzz testing of functions in `compiler-builtins`. This
+//! includes publicly documented intrinsics and some internal alternative implementation functions
+//! such as `usize_leading_zeros_riscv` (which are tested because they are configured for
+//! architectures not tested by the CI).
+//!
+//! The general idea is to use a combination of edge case testing and randomized fuzz testing. The
+//! edge case testing is crucial for checking cases such as both inputs being equal, or inputs equal
+//! to special values like `i128::MIN`, which the random fuzzer is unlikely to encounter on its own.
+//! The randomized fuzz testing is specially designed to cover wide swaths of the search space in as
+//! few iterations as possible. See `fuzz_values` in `builtins-test/tests/misc.rs` for an example.
+//!
+//! Some floating point tests are disabled for specific architectures, because they do not have
+//! correct rounding.
+#![no_std]
+#![cfg_attr(f128_enabled, feature(f128))]
+#![cfg_attr(f16_enabled, feature(f16))]
+
+pub mod bench;
+extern crate alloc;
+
+use compiler_builtins::float::Float;
+use compiler_builtins::int::{Int, MinInt};
+use rand_xoshiro::Xoshiro128StarStar;
+use rand_xoshiro::rand_core::{RngCore, SeedableRng};
+
+/// Sets the number of fuzz iterations run for most tests. In practice, the vast majority of bugs
+/// are caught by the edge case testers. Most of the remaining bugs triggered by more complex
+/// sequences are caught well within 10_000 fuzz iterations. For classes of algorithms like division
+/// that are vulnerable to rare edge cases, 1_000_000 iterations give more confidence. In practical
+/// CI, however, we only want to run the more strenuous test once to catch algorithmic-level bugs,
+/// and run the 10_000-iteration test on most targets; target-dependent bugs usually involve
+/// miscompilation or misconfiguration that breaks algorithms in quickly caught ways. We therefore
+/// use `N = 1_000_000` iterations on `x86_64` targets (and only when debug assertions are disabled,
+/// since tests without `--release` would take too long), which are likely to have fast hardware,
+/// and `N = 10_000` on all other targets.
+pub const N: u32 = if cfg!(target_arch = "x86_64") && !cfg!(debug_assertions) {
+    1_000_000
+} else {
+    10_000
+};
+
+/// Random fuzzing step. When run several times, it results in excellent fuzzing entropy such as:
+/// 11110101010101011110111110011111
+/// 10110101010100001011101011001010
+/// 1000000000000000
+/// 10000000000000110111110000001010
+/// 1111011111111101010101111110101
+/// 101111111110100000000101000000
+/// 10000000110100000000100010101
+/// 1010101010101000
+fn fuzz_step<I: Int>(rng: &mut Xoshiro128StarStar, x: &mut I) {
+    let ones = !I::ZERO;
+    let bit_indexing_mask: u32 = I::BITS - 1;
+    // It happens that all the RNG we need can come from one call: 7 bits are needed to index a
+    // worst-case 128-bit integer, there are 4 indexes that need to be made, and 4 more bits are
+    // used for selecting operations.
+    let rng32 = rng.next_u32();
+
+    // Randomly OR, AND, and XOR randomly sized and shifted contiguous strings of ones with `x`.
+    let r0 = bit_indexing_mask & rng32;
+    let r1 = bit_indexing_mask & (rng32 >> 7);
+    let mask = ones.wrapping_shl(r0).rotate_left(r1);
+    match (rng32 >> 14) % 4 {
+        0 => *x |= mask,
+        1 => *x &= mask,
+        // both 2 and 3 to make XORs as common as ORs and ANDs combined
+        _ => *x ^= mask,
+    }
+
+    // Alternating ones and zeros (e.g. 0b1010101010101010). This catches second-order
+    // problems that might occur for algorithms with two modes of operation (potentially
+    // there is some invariant that can be broken and maintained via alternating between modes,
+    // breaking the algorithm when it reaches the end).
+    let mut alt_ones = I::ONE;
+    for _ in 0..(I::BITS / 2) {
+        alt_ones <<= 2;
+        alt_ones |= I::ONE;
+    }
+    let r0 = bit_indexing_mask & (rng32 >> 16);
+    let r1 = bit_indexing_mask & (rng32 >> 23);
+    let mask = alt_ones.wrapping_shl(r0).rotate_left(r1);
+    match rng32 >> 30 {
+        0 => *x |= mask,
+        1 => *x &= mask,
+        _ => *x ^= mask,
+    }
+}
+
+// We need macros like this, because `#![no_std]` prevents us from using iterators
+macro_rules! edge_cases {
+    ($I:ident, $case:ident, $inner:block) => {
+        for i0 in 0..$I::FUZZ_NUM {
+            let mask_lo = (!$I::UnsignedInt::ZERO).wrapping_shr($I::FUZZ_LENGTHS[i0] as u32);
+            for i1 in i0..$I::FUZZ_NUM {
+                let mask_hi =
+                    (!$I::UnsignedInt::ZERO).wrapping_shl($I::FUZZ_LENGTHS[i1 - i0] as u32);
+                let $case = $I::from_unsigned(mask_lo & mask_hi);
+                $inner
+            }
+        }
+    };
+}
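+
+// Illustrative note: each `$case` generated above is a contiguous run of ones whose length and
+// position are driven by `FUZZ_LENGTHS`, so the edge cases sweep runs of bits rather than
+// arbitrary patterns; the all-zero value is handled separately by the callers.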
+
+/// Feeds a series of fuzzing inputs to `f`. The fuzzer first uses an algorithm designed to find
+/// edge cases, followed by a more random fuzzer that runs `n` times.
+pub fn fuzz<I: Int, F: FnMut(I)>(n: u32, mut f: F)
+where
+    <I as MinInt>::UnsignedInt: Int,
+{
+    // edge case tester. Calls `f` 210 times for u128.
+    // zero gets skipped by the loop
+    f(I::ZERO);
+    edge_cases!(I, case, {
+        f(case);
+    });
+
+    // random fuzzer
+    let mut rng = Xoshiro128StarStar::seed_from_u64(0);
+    let mut x: I = MinInt::ZERO;
+    for _ in 0..n {
+        fuzz_step(&mut rng, &mut x);
+        f(x)
+    }
+}
+
+/// The same as `fuzz`, except `f` has two inputs.
+pub fn fuzz_2<I: Int, F: Fn(I, I)>(n: u32, f: F)
+where
+    <I as MinInt>::UnsignedInt: Int,
+{
+    // Check cases where the first and second inputs are zero. Both call `f` 210 times for `u128`.
+    edge_cases!(I, case, {
+        f(I::ZERO, case);
+    });
+    edge_cases!(I, case, {
+        f(case, I::ZERO);
+    });
+    // Nested edge tester. Calls `f` 44100 times for `u128`.
+    edge_cases!(I, case0, {
+        edge_cases!(I, case1, {
+            f(case0, case1);
+        })
+    });
+
+    // random fuzzer
+    let mut rng = Xoshiro128StarStar::seed_from_u64(0);
+    let mut x: I = I::ZERO;
+    let mut y: I = I::ZERO;
+    for _ in 0..n {
+        fuzz_step(&mut rng, &mut x);
+        fuzz_step(&mut rng, &mut y);
+        f(x, y)
+    }
+}
+
+/// Tester for shift functions
+pub fn fuzz_shift<I: Int, F: Fn(I, u32)>(f: F) {
+    // Shift functions are very simple and do not need anything other than shifting a small
+    // set of random patterns for every fuzz length.
+    let mut rng = Xoshiro128StarStar::seed_from_u64(0);
+    let mut x: I = MinInt::ZERO;
+    for i in 0..I::FUZZ_NUM {
+        fuzz_step(&mut rng, &mut x);
+        f(x, MinInt::ZERO);
+        f(x, I::FUZZ_LENGTHS[i] as u32);
+    }
+}
+
+fn fuzz_float_step<F: Float>(rng: &mut Xoshiro128StarStar, f: &mut F) {
+    let rng32 = rng.next_u32();
+    // we need to fuzz the different parts of the float separately, because the masking on larger
+    // significands will tend to set the exponent to all ones or all zeros frequently
+
+    // sign bit fuzzing
+    let sign = (rng32 & 1) != 0;
+
+    // exponent fuzzing. Only 4 bits for the selector needed.
+    let ones = (F::Int::ONE << F::EXP_BITS) - F::Int::ONE;
+    let r0 = (rng32 >> 1) % F::EXP_BITS;
+    let r1 = (rng32 >> 5) % F::EXP_BITS;
+    // custom rotate shift. Note that `F::Int` is unsigned, so we can shift right without smearing
+    // the sign bit.
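+    // (Effectively, this is a rotate-left of the `EXP_BITS`-wide field by `r1` bits, rather than a
+    // rotation over the full integer width.)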
+    let mask = if r1 == 0 {
+        ones.wrapping_shr(r0)
+    } else {
+        let tmp = ones.wrapping_shr(r0);
+        (tmp.wrapping_shl(r1) | tmp.wrapping_shr(F::EXP_BITS - r1)) & ones
+    };
+    let mut exp = (f.to_bits() & F::EXP_MASK) >> F::SIG_BITS;
+    match (rng32 >> 9) % 4 {
+        0 => exp |= mask,
+        1 => exp &= mask,
+        _ => exp ^= mask,
+    }
+
+    // significand fuzzing
+    let mut sig = f.to_bits() & F::SIG_MASK;
+    fuzz_step(rng, &mut sig);
+    sig &= F::SIG_MASK;
+
+    *f = F::from_parts(sign, exp, sig);
+}
+
+macro_rules! float_edge_cases {
+    ($F:ident, $case:ident, $inner:block) => {
+        for exponent in [
+            F::Int::ZERO,
+            F::Int::ONE,
+            F::Int::ONE << (F::EXP_BITS / 2),
+            (F::Int::ONE << (F::EXP_BITS - 1)) - F::Int::ONE,
+            F::Int::ONE << (F::EXP_BITS - 1),
+            (F::Int::ONE << (F::EXP_BITS - 1)) + F::Int::ONE,
+            (F::Int::ONE << F::EXP_BITS) - F::Int::ONE,
+        ]
+        .iter()
+        {
+            for significand in [
+                F::Int::ZERO,
+                F::Int::ONE,
+                F::Int::ONE << (F::SIG_BITS / 2),
+                (F::Int::ONE << (F::SIG_BITS - 1)) - F::Int::ONE,
+                F::Int::ONE << (F::SIG_BITS - 1),
+                (F::Int::ONE << (F::SIG_BITS - 1)) + F::Int::ONE,
+                (F::Int::ONE << F::SIG_BITS) - F::Int::ONE,
+            ]
+            .iter()
+            {
+                for sign in [false, true].iter() {
+                    let $case = F::from_parts(*sign, *exponent, *significand);
+                    $inner
+                }
+            }
+        }
+    };
+}
+
+pub fn fuzz_float<F: Float, E: Fn(F)>(n: u32, f: E) {
+    float_edge_cases!(F, case, {
+        f(case);
+    });
+
+    // random fuzzer
+    let mut rng = Xoshiro128StarStar::seed_from_u64(0);
+    let mut x = F::ZERO;
+    for _ in 0..n {
+        fuzz_float_step(&mut rng, &mut x);
+        f(x);
+    }
+}
+
+pub fn fuzz_float_2<F: Float, E: Fn(F, F)>(n: u32, f: E) {
+    float_edge_cases!(F, case0, {
+        float_edge_cases!(F, case1, {
+            f(case0, case1);
+        });
+    });
+
+    // random fuzzer
+    let mut rng = Xoshiro128StarStar::seed_from_u64(0);
+    let mut x = F::ZERO;
+    let mut y = F::ZERO;
+    for _ in 0..n {
+        fuzz_float_step(&mut rng, &mut x);
+        fuzz_float_step(&mut rng, &mut y);
+        f(x, y)
+    }
+}
+
+/// Perform an operation using builtin types if available, falling back to apfloat if not.
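+///
+/// Illustrative usage, mirroring the call sites in these tests (`x` and `y` are placeholders):
+/// `apfloat_fallback!(f32, Single, all(), Add::add, x, y)` adds two `f32` values with the native
+/// operator, since `all()` is always true; with a cfg that is false for the current target, the
+/// same invocation converts the arguments to `rustc_apfloat::ieee::Single`, applies the operation,
+/// and extracts `.value` from the returned `StatusAnd`.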
+#[macro_export]
+macro_rules! apfloat_fallback {
+    (
+        $float_ty:ty,
+        // Type name in `rustc_apfloat::ieee`. Not a full path, it automatically gets the prefix.
+        $apfloat_ty:ident,
+        // Cfg expression for when builtin system operations should be used
+        $sys_available:meta,
+        // The expression to run. This expression may use `FloatTy` for its signature.
+        // Optionally, the final conversion back to a float can be suppressed using
+        // `=> no_convert` (e.g. for operations that return a bool).
+        //
+        // If the apfloat needs a different operation, it can be provided here.
+        $op:expr $(=> $convert:ident)? $(; $apfloat_op:expr)?,
+        // Arguments that get passed to `$op` after converting to a float
+        $($arg:expr),+
+        $(,)?
+    ) => {{
+        #[cfg($sys_available)]
+        let ret = {
+            type FloatTy = $float_ty;
+            $op( $($arg),+ )
+        };
+
+        #[cfg(not($sys_available))]
+        let ret = {
+            use rustc_apfloat::Float;
+            type FloatTy = rustc_apfloat::ieee::$apfloat_ty;
+
+            apfloat_fallback!(@inner
+                fty: $float_ty,
+                // Apply a conversion to `FloatTy` to each arg, then pass all args to `$op`
+                op_res: $op( $(FloatTy::from_bits($arg.to_bits().into())),+ ),
+                $(apfloat_op: $apfloat_op, )?
+                $(conv_opts: $convert,)?
+                args: $($arg),+
+            )
+        };
+
+        ret
+    }};
+
+    // Operations that do not need converting back to a float
+    (@inner fty: $float_ty:ty, op_res: $val:expr, conv_opts: no_convert, args: $($_arg:expr),+) => {
+        $val
+    };
+
+    // Some apfloat operations return a `StatusAnd` that we need to extract the value from. This
+    // is the default.
+    (@inner fty: $float_ty:ty, op_res: $val:expr, args: $($_arg:expr),+) => {{
+        // ignore the status, just get the value
+        let unwrapped = $val.value;
+
+        <$float_ty>::from_bits(FloatTy::to_bits(unwrapped).try_into().unwrap())
+    }};
+
+    // This is the case where we can't use the same expression for the default builtin and the
+    // nonstandard apfloat fallback (e.g. `as` casts in std are normal functions in apfloat), so
+    // two separate expressions must be specified.
+    (@inner
+        fty: $float_ty:ty, op_res: $_val:expr,
+        apfloat_op: $apfloat_op:expr, args: $($arg:expr),+
+    ) => {{
+        $apfloat_op($($arg),+)
+    }};
+}
diff --git a/library/compiler-builtins/builtins-test/tests/addsub.rs b/library/compiler-builtins/builtins-test/tests/addsub.rs
new file mode 100644
index 00000000000..865b9e472ab
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/addsub.rs
@@ -0,0 +1,143 @@
+#![allow(unused_macros)]
+#![cfg_attr(f128_enabled, feature(f128))]
+
+use builtins_test::*;
+
+mod int_addsub {
+    use super::*;
+
+    macro_rules! sum {
+        ($($i:ty, $fn_add:ident, $fn_sub:ident);*;) => {
+            $(
+                #[test]
+                fn $fn_add() {
+                    use compiler_builtins::int::addsub::{$fn_add, $fn_sub};
+
+                    fuzz_2(N, |x: $i, y: $i| {
+                        let add0 = x.wrapping_add(y);
+                        let sub0 = x.wrapping_sub(y);
+                        let add1: $i = $fn_add(x, y);
+                        let sub1: $i = $fn_sub(x, y);
+                        if add0 != add1 {
+                            panic!(
+                                "{}({}, {}): std: {}, builtins: {}",
+                                stringify!($fn_add), x, y, add0, add1
+                            );
+                        }
+                        if sub0 != sub1 {
+                            panic!(
+                                "{}({}, {}): std: {}, builtins: {}",
+                                stringify!($fn_sub), x, y, sub0, sub1
+                            );
+                        }
+                    });
+                }
+            )*
+        };
+    }
+
+    macro_rules! overflowing_sum {
+        ($($i:ty, $fn_add:ident, $fn_sub:ident);*;) => {
+            $(
+                #[test]
+                fn $fn_add() {
+                    use compiler_builtins::int::addsub::{$fn_add, $fn_sub};
+
+                    fuzz_2(N, |x: $i, y: $i| {
+                        let (add0, add_o0) = x.overflowing_add(y);
+                        let (sub0, sub_o0) = x.overflowing_sub(y);
+                        let mut add_o1 = 0;
+                        let mut sub_o1 = 0;
+                        let add1: $i = $fn_add(x, y, &mut add_o1);
+                        let sub1: $i = $fn_sub(x, y, &mut sub_o1);
+                        if add0 != add1 || i32::from(add_o0) != add_o1 {
+                            panic!(
+                                "{}({}, {}): std: {:?}, builtins: {:?}",
+                                stringify!($fn_add), x, y, (add0, add_o0), (add1, add_o1)
+                            );
+                        }
+                        if sub0 != sub1 || i32::from(sub_o0) != sub_o1 {
+                            panic!(
+                                "{}({}, {}): std: {:?}, builtins: {:?}",
+                                stringify!($fn_sub), x, y, (sub0, sub_o0), (sub1, sub_o1)
+                            );
+                        }
+                    });
+                }
+            )*
+        };
+    }
+
+    // Integer addition and subtraction are very simple, so 100 fuzzing passes should be plenty.
+    sum! {
+        u128, __rust_u128_add, __rust_u128_sub;
+        i128, __rust_i128_add, __rust_i128_sub;
+    }
+
+    overflowing_sum! {
+        u128, __rust_u128_addo, __rust_u128_subo;
+        i128, __rust_i128_addo, __rust_i128_subo;
+    }
+}
+
+macro_rules! float_sum {
+    ($($f:ty, $fn_add:ident, $fn_sub:ident, $apfloat_ty:ident, $sys_available:meta);*;) => {
+        $(
+            #[test]
+            fn $fn_add() {
+                use core::ops::{Add, Sub};
+                use compiler_builtins::float::{{add::$fn_add, sub::$fn_sub}, Float};
+
+                fuzz_float_2(N, |x: $f, y: $f| {
+                    let add0 = apfloat_fallback!($f, $apfloat_ty, $sys_available, Add::add, x, y);
+                    let sub0 = apfloat_fallback!($f, $apfloat_ty, $sys_available, Sub::sub, x, y);
+                    let add1: $f = $fn_add(x, y);
+                    let sub1: $f = $fn_sub(x, y);
+                    if !Float::eq_repr(add0, add1) {
+                        panic!(
+                            "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
+                            stringify!($fn_add), x, y, add0, add1
+                        );
+                    }
+                    if !Float::eq_repr(sub0, sub1) {
+                        panic!(
+                            "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
+                            stringify!($fn_sub), x, y, sub0, sub1
+                        );
+                    }
+                });
+            }
+        )*
+    }
+}
+
+#[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
+mod float_addsub {
+    use super::*;
+
+    float_sum! {
+        f32, __addsf3, __subsf3, Single, all();
+        f64, __adddf3, __subdf3, Double, all();
+    }
+}
+
+#[cfg(f128_enabled)]
+#[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
+#[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+mod float_addsub_f128 {
+    use super::*;
+
+    float_sum! {
+        f128, __addtf3, __subtf3, Quad, not(feature = "no-sys-f128");
+    }
+}
+
+#[cfg(f128_enabled)]
+#[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+mod float_addsub_f128_ppc {
+    use super::*;
+
+    float_sum! {
+        f128, __addkf3, __subkf3, Quad, not(feature = "no-sys-f128");
+    }
+}
diff --git a/library/compiler-builtins/builtins-test/tests/aeabi_memclr.rs b/library/compiler-builtins/builtins-test/tests/aeabi_memclr.rs
new file mode 100644
index 00000000000..bfd15a391aa
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/aeabi_memclr.rs
@@ -0,0 +1,60 @@
+#![cfg(all(
+    target_arch = "arm",
+    not(any(target_env = "gnu", target_env = "musl")),
+    target_os = "linux",
+    feature = "mem"
+))]
+#![feature(compiler_builtins_lib)]
+#![no_std]
+
+extern crate compiler_builtins;
+
+// test runner
+extern crate utest_cortex_m_qemu;
+
+// overrides `panic!`
+#[macro_use]
+extern crate utest_macros;
+
+use core::mem;
+
+macro_rules! panic {
+    ($($tt:tt)*) => {
+        upanic!($($tt)*);
+    };
+}
+
+extern "C" {
+    fn __aeabi_memclr4(dest: *mut u8, n: usize);
+    fn __aeabi_memset4(dest: *mut u8, n: usize, c: u32);
+}
+
+struct Aligned {
+    array: [u8; 8],
+    _alignment: [u32; 0],
+}
+
+impl Aligned {
+    fn new() -> Self {
+        Aligned {
+            array: [0; 8],
+            _alignment: [],
+        }
+    }
+}
+
+#[test]
+fn memclr4() {
+    let mut aligned = Aligned::new();
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+
+    for n in 0..9 {
+        unsafe {
+            __aeabi_memset4(xs.as_mut_ptr(), n, 0xff);
+            __aeabi_memclr4(xs.as_mut_ptr(), n);
+        }
+
+        assert!(xs[0..n].iter().all(|x| *x == 0));
+    }
+}
diff --git a/library/compiler-builtins/builtins-test/tests/aeabi_memcpy.rs b/library/compiler-builtins/builtins-test/tests/aeabi_memcpy.rs
new file mode 100644
index 00000000000..c892c5aba0f
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/aeabi_memcpy.rs
@@ -0,0 +1,71 @@
+#![cfg(all(
+    target_arch = "arm",
+    not(any(target_env = "gnu", target_env = "musl")),
+    target_os = "linux",
+    feature = "mem"
+))]
+#![feature(compiler_builtins_lib)]
+#![no_std]
+
+extern crate compiler_builtins;
+
+// test runner
+extern crate utest_cortex_m_qemu;
+
+// overrides `panic!`
+#[macro_use]
+extern crate utest_macros;
+
+macro_rules! panic {
+    ($($tt:tt)*) => {
+        upanic!($($tt)*);
+    };
+}
+
+extern "C" {
+    fn __aeabi_memcpy(dest: *mut u8, src: *const u8, n: usize);
+    fn __aeabi_memcpy4(dest: *mut u8, src: *const u8, n: usize);
+}
+
+struct Aligned {
+    array: [u8; 8],
+    _alignment: [u32; 0],
+}
+
+impl Aligned {
+    fn new(array: [u8; 8]) -> Self {
+        Aligned {
+            array: array,
+            _alignment: [],
+        }
+    }
+}
+
+#[test]
+fn memcpy() {
+    let mut dest = [0; 4];
+    let src = [0xde, 0xad, 0xbe, 0xef];
+
+    for n in 0..dest.len() {
+        dest.copy_from_slice(&[0; 4]);
+
+        unsafe { __aeabi_memcpy(dest.as_mut_ptr(), src.as_ptr(), n) }
+
+        assert_eq!(&dest[0..n], &src[0..n])
+    }
+}
+
+#[test]
+fn memcpy4() {
+    let mut aligned = Aligned::new([0; 8]);
+    let dest = &mut aligned.array;
+    let src = [0xde, 0xad, 0xbe, 0xef, 0xba, 0xad, 0xf0, 0x0d];
+
+    for n in 0..dest.len() {
+        dest.copy_from_slice(&[0; 8]);
+
+        unsafe { __aeabi_memcpy4(dest.as_mut_ptr(), src.as_ptr(), n) }
+
+        assert_eq!(&dest[0..n], &src[0..n])
+    }
+}
diff --git a/library/compiler-builtins/builtins-test/tests/aeabi_memset.rs b/library/compiler-builtins/builtins-test/tests/aeabi_memset.rs
new file mode 100644
index 00000000000..34ab3acc78c
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/aeabi_memset.rs
@@ -0,0 +1,240 @@
+#![cfg(all(
+    target_arch = "arm",
+    not(any(target_env = "gnu", target_env = "musl")),
+    target_os = "linux",
+    feature = "mem"
+))]
+#![feature(compiler_builtins_lib)]
+#![no_std]
+
+extern crate compiler_builtins;
+
+// test runner
+extern crate utest_cortex_m_qemu;
+
+// overrides `panic!`
+#[macro_use]
+extern crate utest_macros;
+
+use core::mem;
+
+macro_rules! panic {
+    ($($tt:tt)*) => {
+        upanic!($($tt)*);
+    };
+}
+
+extern "C" {
+    fn __aeabi_memset4(dest: *mut u8, n: usize, c: u32);
+}
+
+struct Aligned {
+    array: [u8; 8],
+    _alignment: [u32; 0],
+}
+
+impl Aligned {
+    fn new(array: [u8; 8]) -> Self {
+        Aligned {
+            array: array,
+            _alignment: [],
+        }
+    }
+}
+
+#[test]
+fn zero() {
+    let mut aligned = Aligned::new([0u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), 0, c) }
+
+    assert_eq!(*xs, [0; 8]);
+
+    let mut aligned = Aligned::new([1u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), 0, c) }
+
+    assert_eq!(*xs, [1; 8]);
+}
+
+#[test]
+fn one() {
+    let mut aligned = Aligned::new([0u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let n = 1;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0, 0, 0, 0, 0, 0, 0]);
+
+    let mut aligned = Aligned::new([1u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 1, 1, 1, 1, 1, 1, 1]);
+}
+
+#[test]
+fn two() {
+    let mut aligned = Aligned::new([0u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let n = 2;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 0, 0, 0, 0, 0, 0]);
+
+    let mut aligned = Aligned::new([1u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 1, 1, 1, 1, 1, 1]);
+}
+
+#[test]
+fn three() {
+    let mut aligned = Aligned::new([0u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let n = 3;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 0xef, 0, 0, 0, 0, 0]);
+
+    let mut aligned = Aligned::new([1u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 0xef, 1, 1, 1, 1, 1]);
+}
+
+#[test]
+fn four() {
+    let mut aligned = Aligned::new([0u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let n = 4;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 0, 0, 0, 0]);
+
+    let mut aligned = Aligned::new([1u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 1, 1, 1, 1]);
+}
+
+#[test]
+fn five() {
+    let mut aligned = Aligned::new([0u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let n = 5;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 0xef, 0, 0, 0]);
+
+    let mut aligned = Aligned::new([1u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 0xef, 1, 1, 1]);
+}
+
+#[test]
+fn six() {
+    let mut aligned = Aligned::new([0u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let n = 6;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 0, 0]);
+
+    let mut aligned = Aligned::new([1u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 1, 1]);
+}
+
+#[test]
+fn seven() {
+    let mut aligned = Aligned::new([0u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let n = 7;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 0]);
+
+    let mut aligned = Aligned::new([1u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 1]);
+}
+
+#[test]
+fn eight() {
+    let mut aligned = Aligned::new([0u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let n = 8;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 0xef]);
+
+    let mut aligned = Aligned::new([1u8; 8]);
+    assert_eq!(mem::align_of_val(&aligned), 4);
+    let xs = &mut aligned.array;
+    let c = 0xdeadbeef;
+
+    unsafe { __aeabi_memset4(xs.as_mut_ptr(), n, c) }
+
+    assert_eq!(*xs, [0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 0xef, 0xef]);
+}
diff --git a/library/compiler-builtins/builtins-test/tests/big.rs b/library/compiler-builtins/builtins-test/tests/big.rs
new file mode 100644
index 00000000000..d1ae88bd164
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/big.rs
@@ -0,0 +1,134 @@
+use compiler_builtins::int::{HInt, MinInt, i256, u256};
+
+const LOHI_SPLIT: u128 = 0xaaaaaaaaaaaaaaaaffffffffffffffff;
+
+/// Print a `u256` as hex since we can't add format implementations
+fn hexu(v: u256) -> String {
+    format!(
+        "0x{:016x}{:016x}{:016x}{:016x}",
+        v.0[3], v.0[2], v.0[1], v.0[0]
+    )
+}
+
+#[test]
+fn widen_u128() {
+    assert_eq!(u128::MAX.widen(), u256([u64::MAX, u64::MAX, 0, 0]));
+    assert_eq!(
+        LOHI_SPLIT.widen(),
+        u256([u64::MAX, 0xaaaaaaaaaaaaaaaa, 0, 0])
+    );
+}
+
+#[test]
+fn widen_i128() {
+    assert_eq!((-1i128).widen(), u256::MAX.signed());
+    assert_eq!(
+        (LOHI_SPLIT as i128).widen(),
+        i256([u64::MAX, 0xaaaaaaaaaaaaaaaa, u64::MAX, u64::MAX])
+    );
+    assert_eq!((-1i128).zero_widen().unsigned(), (u128::MAX).widen());
+}
+
+#[test]
+fn widen_mul_u128() {
+    let tests = [
+        (u128::MAX / 2, 2_u128, u256([u64::MAX - 1, u64::MAX, 0, 0])),
+        (u128::MAX, 2_u128, u256([u64::MAX - 1, u64::MAX, 1, 0])),
+        (u128::MAX, u128::MAX, u256([1, 0, u64::MAX - 1, u64::MAX])),
+        (u128::MIN, u128::MIN, u256::ZERO),
+        (1234, 0, u256::ZERO),
+        (0, 1234, u256::ZERO),
+    ];
+
+    let mut errors = Vec::new();
+    for (i, (a, b, exp)) in tests.iter().copied().enumerate() {
+        let res = a.widen_mul(b);
+        let res_z = a.zero_widen_mul(b);
+        assert_eq!(res, res_z);
+        if res != exp {
+            errors.push((i, a, b, exp, res));
+        }
+    }
+
+    for (i, a, b, exp, res) in &errors {
+        eprintln!(
+            "FAILURE ({i}): {a:#034x} * {b:#034x} = {} got {}",
+            hexu(*exp),
+            hexu(*res)
+        );
+    }
+    assert!(errors.is_empty());
+}
+
+#[test]
+fn not_u128() {
+    assert_eq!(!u256::ZERO, u256::MAX);
+}
+
+#[test]
+fn shr_u128() {
+    let only_low = [
+        1,
+        u16::MAX.into(),
+        u32::MAX.into(),
+        u64::MAX.into(),
+        u128::MAX,
+    ];
+
+    let mut errors = Vec::new();
+
+    for a in only_low {
+        for perturb in 0..10 {
+            let a = a.saturating_add(perturb);
+            for shift in 0..128 {
+                let res = a.widen() >> shift;
+                let expected = (a >> shift).widen();
+                if res != expected {
+                    errors.push((a.widen(), shift, res, expected));
+                }
+            }
+        }
+    }
+
+    let check = [
+        (
+            u256::MAX,
+            1,
+            u256([u64::MAX, u64::MAX, u64::MAX, u64::MAX >> 1]),
+        ),
+        (
+            u256::MAX,
+            5,
+            u256([u64::MAX, u64::MAX, u64::MAX, u64::MAX >> 5]),
+        ),
+        (u256::MAX, 63, u256([u64::MAX, u64::MAX, u64::MAX, 1])),
+        (u256::MAX, 64, u256([u64::MAX, u64::MAX, u64::MAX, 0])),
+        (u256::MAX, 65, u256([u64::MAX, u64::MAX, u64::MAX >> 1, 0])),
+        (u256::MAX, 127, u256([u64::MAX, u64::MAX, 1, 0])),
+        (u256::MAX, 128, u256([u64::MAX, u64::MAX, 0, 0])),
+        (u256::MAX, 129, u256([u64::MAX, u64::MAX >> 1, 0, 0])),
+        (u256::MAX, 191, u256([u64::MAX, 1, 0, 0])),
+        (u256::MAX, 192, u256([u64::MAX, 0, 0, 0])),
+        (u256::MAX, 193, u256([u64::MAX >> 1, 0, 0, 0])),
+        (u256::MAX, 191, u256([u64::MAX, 1, 0, 0])),
+        (u256::MAX, 254, u256([0b11, 0, 0, 0])),
+        (u256::MAX, 255, u256([1, 0, 0, 0])),
+    ];
+
+    for (input, shift, expected) in check {
+        let res = input >> shift;
+        if res != expected {
+            errors.push((input, shift, res, expected));
+        }
+    }
+
+    for (a, b, res, expected) in &errors {
+        eprintln!(
+            "FAILURE: {} >> {b} = {} got {}",
+            hexu(*a),
+            hexu(*expected),
+            hexu(*res),
+        );
+    }
+    assert!(errors.is_empty());
+}
diff --git a/library/compiler-builtins/builtins-test/tests/cmp.rs b/library/compiler-builtins/builtins-test/tests/cmp.rs
new file mode 100644
index 00000000000..a904dc5f7de
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/cmp.rs
@@ -0,0 +1,184 @@
+#![allow(unused_macros)]
+#![allow(unreachable_code)]
+#![cfg_attr(f128_enabled, feature(f128))]
+
+use builtins_test::*;
+
+mod float_comparisons {
+    use super::*;
+
+    macro_rules! cmp {
+        (
+            $f:ty, $x:ident, $y:ident, $apfloat_ty:ident, $sys_available:meta,
+            $($unordered_val:expr, $fn:ident);*;
+        ) => {
+            $(
+                let cmp0 = if apfloat_fallback!(
+                        $f, $apfloat_ty, $sys_available,
+                        |x: FloatTy| x.is_nan() => no_convert,
+                        $x
+                    ) || apfloat_fallback!(
+                        $f, $apfloat_ty, $sys_available,
+                        |y: FloatTy| y.is_nan() => no_convert,
+                        $y
+                    )
+                {
+                    $unordered_val
+                } else if apfloat_fallback!(
+                    $f, $apfloat_ty, $sys_available,
+                    |x, y| x < y => no_convert,
+                    $x, $y
+                ) {
+                    -1
+                } else if apfloat_fallback!(
+                    $f, $apfloat_ty, $sys_available,
+                    |x, y| x == y => no_convert,
+                    $x, $y
+                ) {
+                    0
+                } else {
+                    1
+                };
+
+                let cmp1 = $fn($x, $y);
+                if cmp0 != cmp1 {
+                    panic!(
+                        "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
+                        stringify!($fn), $x, $y, cmp0, cmp1
+                    );
+                }
+            )*
+        };
+    }
+
+    #[test]
+    fn cmp_f32() {
+        use compiler_builtins::float::cmp::{
+            __eqsf2, __gesf2, __gtsf2, __lesf2, __ltsf2, __nesf2, __unordsf2,
+        };
+
+        fuzz_float_2(N, |x: f32, y: f32| {
+            assert_eq!(__unordsf2(x, y) != 0, x.is_nan() || y.is_nan());
+            cmp!(f32, x, y, Single, all(),
+                1, __ltsf2;
+                1, __lesf2;
+                1, __eqsf2;
+                -1, __gesf2;
+                -1, __gtsf2;
+                1, __nesf2;
+            );
+        });
+    }
+
+    #[test]
+    fn cmp_f64() {
+        use compiler_builtins::float::cmp::{
+            __eqdf2, __gedf2, __gtdf2, __ledf2, __ltdf2, __nedf2, __unorddf2,
+        };
+
+        fuzz_float_2(N, |x: f64, y: f64| {
+            assert_eq!(__unorddf2(x, y) != 0, x.is_nan() || y.is_nan());
+            cmp!(f64, x, y, Double, all(),
+                1, __ltdf2;
+                1, __ledf2;
+                1, __eqdf2;
+                -1, __gedf2;
+                -1, __gtdf2;
+                1, __nedf2;
+            );
+        });
+    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn cmp_f128() {
+        #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+        use compiler_builtins::float::cmp::{
+            __eqkf2 as __eqtf2, __gekf2 as __getf2, __gtkf2 as __gttf2, __lekf2 as __letf2,
+            __ltkf2 as __lttf2, __nekf2 as __netf2, __unordkf2 as __unordtf2,
+        };
+        #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+        use compiler_builtins::float::cmp::{
+            __eqtf2, __getf2, __gttf2, __letf2, __lttf2, __netf2, __unordtf2,
+        };
+
+        fuzz_float_2(N, |x: f128, y: f128| {
+            let x_is_nan = apfloat_fallback!(
+                f128, Quad, not(feature = "no-sys-f128"),
+                |x: FloatTy| x.is_nan() => no_convert,
+                x
+            );
+            let y_is_nan = apfloat_fallback!(
+                f128, Quad, not(feature = "no-sys-f128"),
+                |x: FloatTy| x.is_nan() => no_convert,
+                y
+            );
+
+            assert_eq!(__unordtf2(x, y) != 0, x_is_nan || y_is_nan);
+
+            cmp!(f128, x, y, Quad, not(feature = "no-sys-f128"),
+                1, __lttf2;
+                1, __letf2;
+                1, __eqtf2;
+                -1, __getf2;
+                -1, __gttf2;
+                1, __netf2;
+            );
+        });
+    }
+}
+
+#[cfg(target_arch = "arm")]
+mod float_comparisons_arm {
+    use super::*;
+
+    macro_rules! cmp2 {
+        ($x:ident, $y:ident, $($unordered_val:expr, $fn_std:expr, $fn_builtins:ident);*;) => {
+            $(
+                let cmp0: i32 = if $x.is_nan() || $y.is_nan() {
+                    $unordered_val
+                } else {
+                    $fn_std as i32
+                };
+                let cmp1: i32 = $fn_builtins($x, $y);
+                if cmp0 != cmp1 {
+                    panic!("{}({}, {}): std: {}, builtins: {}", stringify!($fn_builtins), $x, $y, cmp0, cmp1);
+                }
+            )*
+        };
+    }
+
+    #[test]
+    fn cmp_f32() {
+        use compiler_builtins::float::cmp::{
+            __aeabi_fcmpeq, __aeabi_fcmpge, __aeabi_fcmpgt, __aeabi_fcmple, __aeabi_fcmplt,
+        };
+
+        fuzz_float_2(N, |x: f32, y: f32| {
+            cmp2!(x, y,
+                0, x < y, __aeabi_fcmplt;
+                0, x <= y, __aeabi_fcmple;
+                0, x == y, __aeabi_fcmpeq;
+                0, x >= y, __aeabi_fcmpge;
+                0, x > y, __aeabi_fcmpgt;
+            );
+        });
+    }
+
+    #[test]
+    fn cmp_f64() {
+        use compiler_builtins::float::cmp::{
+            __aeabi_dcmpeq, __aeabi_dcmpge, __aeabi_dcmpgt, __aeabi_dcmple, __aeabi_dcmplt,
+        };
+
+        fuzz_float_2(N, |x: f64, y: f64| {
+            cmp2!(x, y,
+                0, x < y, __aeabi_dcmplt;
+                0, x <= y, __aeabi_dcmple;
+                0, x == y, __aeabi_dcmpeq;
+                0, x >= y, __aeabi_dcmpge;
+                0, x > y, __aeabi_dcmpgt;
+            );
+        });
+    }
+}
diff --git a/library/compiler-builtins/builtins-test/tests/conv.rs b/library/compiler-builtins/builtins-test/tests/conv.rs
new file mode 100644
index 00000000000..491915d9bb1
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/conv.rs
@@ -0,0 +1,364 @@
+#![cfg_attr(f128_enabled, feature(f128))]
+#![cfg_attr(f16_enabled, feature(f16))]
+// makes configuration easier
+#![allow(unused_macros)]
+#![allow(unused_imports)]
+
+use builtins_test::*;
+use compiler_builtins::float::Float;
+use rustc_apfloat::{Float as _, FloatConvert as _};
+
+mod i_to_f {
+    use super::*;
+
+    macro_rules! i_to_f {
+        ($f_ty:ty, $apfloat_ty:ident, $sys_available:meta, $($i_ty:ty, $fn:ident);*;) => {
+            $(
+                #[test]
+                fn $fn() {
+                    use compiler_builtins::float::conv::$fn;
+                    use compiler_builtins::int::Int;
+
+                    fuzz(N, |x: $i_ty| {
+                        let f0 = apfloat_fallback!(
+                            $f_ty, $apfloat_ty, $sys_available,
+                            |x| x as $f_ty;
+                            // When the builtin is not available, we need to use a different conversion
+                            // method (since apfloat doesn't support `as` casting).
+                            |x: $i_ty| {
+                                use compiler_builtins::int::MinInt;
+
+                                let apf = if <$i_ty>::SIGNED {
+                                    FloatTy::from_i128(x.try_into().unwrap()).value
+                                } else {
+                                    FloatTy::from_u128(x.try_into().unwrap()).value
+                                };
+
+                                <$f_ty>::from_bits(apf.to_bits())
+                            },
+                            x
+                        );
+                        let f1: $f_ty = $fn(x);
+
+                        #[cfg($sys_available)] {
+                            // This makes sure that the conversion produced the best rounding
+                            // possible, independently of whether `x as $f_ty` itself rounds
+                            // correctly. It does assume that float-to-integer conversion is correct.
+                            let y_minus_ulp = <$f_ty>::from_bits(f1.to_bits().wrapping_sub(1)) as $i_ty;
+                            let y = f1 as $i_ty;
+                            let y_plus_ulp = <$f_ty>::from_bits(f1.to_bits().wrapping_add(1)) as $i_ty;
+                            let error_minus = <$i_ty as Int>::abs_diff(y_minus_ulp, x);
+                            let error = <$i_ty as Int>::abs_diff(y, x);
+                            let error_plus = <$i_ty as Int>::abs_diff(y_plus_ulp, x);
+
+                            // The first two conditions check that neither of the two closest float
+                            // values is strictly closer in representation to `x`. The third makes
+                            // sure that rounding is toward the even significand when the two float
+                            // values are equally close to the integer.
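+                            // For example (illustrative): the integer 2^53 + 1 lies exactly halfway
+                            // between the `f64` values 2^53 and 2^53 + 2, so a correctly rounded
+                            // conversion must return 2^53, the neighbor with the even significand;
+                            // either neighbor being strictly closer, or a tie resolved toward the
+                            // odd significand, is flagged below.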
+                            if error_minus < error
+                                || error_plus < error
+                                || ((error_minus == error || error_plus == error)
+                                    && ((f0.to_bits() & 1) != 0))
+                            {
+                                if !cfg!(any(
+                                    target_arch = "powerpc",
+                                    target_arch = "powerpc64"
+                                )) {
+                                    panic!(
+                                        "incorrect rounding by {}({}): {}, ({}, {}, {}), errors ({}, {}, {})",
+                                        stringify!($fn),
+                                        x,
+                                        f1.to_bits(),
+                                        y_minus_ulp,
+                                        y,
+                                        y_plus_ulp,
+                                        error_minus,
+                                        error,
+                                        error_plus,
+                                    );
+                                }
+                            }
+                        }
+
+                        // Test against native conversion. We disable testing on all `x86` because of
+                        // rounding bugs with `i686`. `powerpc` also has the same rounding bug.
+                        if !Float::eq_repr(f0, f1) && !cfg!(any(
+                            target_arch = "x86",
+                            target_arch = "powerpc",
+                            target_arch = "powerpc64"
+                        )) {
+                            panic!(
+                                "{}({}): std: {:?}, builtins: {:?}",
+                                stringify!($fn),
+                                x,
+                                f0,
+                                f1,
+                            );
+                        }
+                    });
+                }
+            )*
+        };
+    }
+
+    i_to_f! { f32, Single, all(),
+        u32, __floatunsisf;
+        i32, __floatsisf;
+        u64, __floatundisf;
+        i64, __floatdisf;
+        u128, __floatuntisf;
+        i128, __floattisf;
+    }
+
+    i_to_f! { f64, Double, all(),
+        u32, __floatunsidf;
+        i32, __floatsidf;
+        u64, __floatundidf;
+        i64, __floatdidf;
+        u128, __floatuntidf;
+        i128, __floattidf;
+    }
+
+    #[cfg(not(feature = "no-f16-f128"))]
+    #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+    i_to_f! { f128, Quad, not(feature = "no-sys-f128-int-convert"),
+        u32, __floatunsitf;
+        i32, __floatsitf;
+        u64, __floatunditf;
+        i64, __floatditf;
+        u128, __floatuntitf;
+        i128, __floattitf;
+    }
+
+    #[cfg(not(feature = "no-f16-f128"))]
+    #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+    i_to_f! { f128, Quad, not(feature = "no-sys-f128-int-convert"),
+        u32, __floatunsikf;
+        i32, __floatsikf;
+        u64, __floatundikf;
+        i64, __floatdikf;
+        u128, __floatuntikf;
+        i128, __floattikf;
+    }
+}
+
+mod f_to_i {
+    use super::*;
+
+    macro_rules! f_to_i {
+        ($x:ident, $f_ty:ty, $apfloat_ty:ident, $sys_available:meta, $($i_ty:ty, $fn:ident);*;) => {
+            $(
+                // it is undefined behavior in the first place to do conversions with NaNs
+                if !apfloat_fallback!(
+                    $f_ty, $apfloat_ty, $sys_available, |x: FloatTy| x.is_nan() => no_convert, $x
+                ) {
+                    let conv0 = apfloat_fallback!(
+                        $f_ty, $apfloat_ty, $sys_available,
+                        // Use an `as` cast when the builtin is available on the system.
+                        |x| x as $i_ty;
+                        // When the builtin is not available, we need to use a different conversion
+                        // method (since apfloat doesn't support `as` casting).
+                        |x: $f_ty| {
+                            use compiler_builtins::int::MinInt;
+
+                            let apf = FloatTy::from_bits(x.to_bits().into());
+                            let bits: usize = <$i_ty>::BITS.try_into().unwrap();
+
+                            let err_fn = || panic!(
+                                "Unable to convert value {x:?} to type {}:", stringify!($i_ty)
+                            );
+
+                            if <$i_ty>::SIGNED {
+                               <$i_ty>::try_from(apf.to_i128(bits).value).ok().unwrap_or_else(err_fn)
+                            } else {
+                               <$i_ty>::try_from(apf.to_u128(bits).value).ok().unwrap_or_else(err_fn)
+                            }
+                        },
+                        $x
+                    );
+                    let conv1: $i_ty = $fn($x);
+                    if conv0 != conv1 {
+                        panic!("{}({:?}): std: {:?}, builtins: {:?}", stringify!($fn), $x, conv0, conv1);
+                    }
+                }
+            )*
+        };
+    }
+
+    #[test]
+    fn f32_to_int() {
+        use compiler_builtins::float::conv::{
+            __fixsfdi, __fixsfsi, __fixsfti, __fixunssfdi, __fixunssfsi, __fixunssfti,
+        };
+
+        fuzz_float(N, |x: f32| {
+            f_to_i!(x, f32, Single, all(),
+                u32, __fixunssfsi;
+                u64, __fixunssfdi;
+                u128, __fixunssfti;
+                i32, __fixsfsi;
+                i64, __fixsfdi;
+                i128, __fixsfti;
+            );
+        });
+    }
+
+    #[test]
+    fn f64_to_int() {
+        use compiler_builtins::float::conv::{
+            __fixdfdi, __fixdfsi, __fixdfti, __fixunsdfdi, __fixunsdfsi, __fixunsdfti,
+        };
+
+        fuzz_float(N, |x: f64| {
+            f_to_i!(x, f64, Double, all(),
+                u32, __fixunsdfsi;
+                u64, __fixunsdfdi;
+                u128, __fixunsdfti;
+                i32, __fixdfsi;
+                i64, __fixdfdi;
+                i128, __fixdfti;
+            );
+        });
+    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn f128_to_int() {
+        #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+        use compiler_builtins::float::conv::{
+            __fixkfdi as __fixtfdi, __fixkfsi as __fixtfsi, __fixkfti as __fixtfti,
+            __fixunskfdi as __fixunstfdi, __fixunskfsi as __fixunstfsi,
+            __fixunskfti as __fixunstfti,
+        };
+        #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+        use compiler_builtins::float::conv::{
+            __fixtfdi, __fixtfsi, __fixtfti, __fixunstfdi, __fixunstfsi, __fixunstfti,
+        };
+
+        fuzz_float(N, |x: f128| {
+            f_to_i!(
+                x,
+                f128,
+                Quad,
+                not(feature = "no-sys-f128-int-convert"),
+                u32, __fixunstfsi;
+                u64, __fixunstfdi;
+                u128, __fixunstfti;
+                i32, __fixtfsi;
+                i64, __fixtfdi;
+                i128, __fixtfti;
+            );
+        });
+    }
+}
+
+macro_rules! f_to_f {
+    (
+        $mod:ident,
+        $(
+            $from_ty:ty => $to_ty:ty,
+            $from_ap_ty:ident => $to_ap_ty:ident,
+            $fn:ident, $sys_available:meta
+        );+;
+    ) => {$(
+        #[test]
+        fn $fn() {
+            use compiler_builtins::float::{$mod::$fn, Float};
+            use rustc_apfloat::ieee::{$from_ap_ty, $to_ap_ty};
+
+            fuzz_float(N, |x: $from_ty| {
+                let tmp0: $to_ty = apfloat_fallback!(
+                    $from_ty,
+                    $from_ap_ty,
+                    $sys_available,
+                    |x: $from_ty| x as $to_ty;
+                    |x: $from_ty| {
+                        let from_apf = FloatTy::from_bits(x.to_bits().into());
+                        // Get `value` directly to ignore INVALID_OP
+                        let to_apf: $to_ap_ty = from_apf.convert(&mut false).value;
+                        <$to_ty>::from_bits(to_apf.to_bits().try_into().unwrap())
+                    },
+                    x
+                );
+                let tmp1: $to_ty = $fn(x);
+
+                if !Float::eq_repr(tmp0, tmp1) {
+                    panic!(
+                        "{}({:?}): std: {:?}, builtins: {:?}",
+                        stringify!($fn),
+                        x,
+                        tmp0,
+                        tmp1
+                    );
+                }
+            })
+        }
+    )+};
+}
+
+mod extend {
+    use super::*;
+
+    f_to_f! {
+        extend,
+        f32 => f64, Single => Double, __extendsfdf2, all();
+    }
+
+    #[cfg(all(f16_enabled, f128_enabled))]
+    #[cfg(not(any(
+        target_arch = "powerpc",
+        target_arch = "powerpc64",
+        target_arch = "loongarch64"
+    )))]
+    f_to_f! {
+        extend,
+        f16 => f32, Half => Single, __extendhfsf2, not(feature = "no-sys-f16");
+        f16 => f32, Half => Single, __gnu_h2f_ieee, not(feature = "no-sys-f16");
+        f16 => f64, Half => Double, __extendhfdf2, not(feature = "no-sys-f16-f64-convert");
+        f16 => f128, Half => Quad, __extendhftf2, not(feature = "no-sys-f16-f128-convert");
+        f32 => f128, Single => Quad, __extendsftf2, not(feature = "no-sys-f128");
+        f64 => f128, Double => Quad, __extenddftf2, not(feature = "no-sys-f128");
+    }
+
+    #[cfg(f128_enabled)]
+    #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+    f_to_f! {
+        extend,
+        // FIXME(#655): `f16` tests disabled until we can bootstrap symbols
+        f32 => f128, Single => Quad, __extendsfkf2, not(feature = "no-sys-f128");
+        f64 => f128, Double => Quad, __extenddfkf2, not(feature = "no-sys-f128");
+    }
+}
+
+mod trunc {
+    use super::*;
+
+    f_to_f! {
+        trunc,
+        f64 => f32, Double => Single, __truncdfsf2, all();
+    }
+
+    #[cfg(all(f16_enabled, f128_enabled))]
+    #[cfg(not(any(
+        target_arch = "powerpc",
+        target_arch = "powerpc64",
+        target_arch = "loongarch64"
+    )))]
+    f_to_f! {
+        trunc,
+        f32 => f16, Single => Half, __truncsfhf2, not(feature = "no-sys-f16");
+        f32 => f16, Single => Half, __gnu_f2h_ieee, not(feature = "no-sys-f16");
+        f64 => f16, Double => Half, __truncdfhf2, not(feature = "no-sys-f16-f64-convert");
+        f128 => f16, Quad => Half, __trunctfhf2, not(feature = "no-sys-f16-f128-convert");
+        f128 => f32, Quad => Single, __trunctfsf2, not(feature = "no-sys-f128");
+        f128 => f64, Quad => Double, __trunctfdf2, not(feature = "no-sys-f128");
+    }
+
+    #[cfg(f128_enabled)]
+    #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+    f_to_f! {
+        trunc,
+        // FIXME(#655): `f16` tests disabled until we can bootstrap symbols
+        f128 => f32, Quad => Single, __trunckfsf2, not(feature = "no-sys-f128");
+        f128 => f64, Quad => Double, __trunckfdf2, not(feature = "no-sys-f128");
+    }
+}
diff --git a/library/compiler-builtins/builtins-test/tests/div_rem.rs b/library/compiler-builtins/builtins-test/tests/div_rem.rs
new file mode 100644
index 00000000000..5ae653cc90c
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/div_rem.rs
@@ -0,0 +1,164 @@
+#![feature(f128)]
+#![allow(unused_macros)]
+
+use builtins_test::*;
+use compiler_builtins::int::sdiv::{__divmoddi4, __divmodsi4, __divmodti4};
+use compiler_builtins::int::udiv::{__udivmoddi4, __udivmodsi4, __udivmodti4, u128_divide_sparc};
+
+// Division algorithms have by far the nastiest edge cases, and the largest number of them;
+// experience shows that sometimes 100_000 iterations of the random fuzzer are needed.
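+//
+// Note: rather than comparing directly against `/` and `%`, each check below verifies the identity
+// `lhs == rhs.wrapping_mul(quo).wrapping_add(rem)` together with a bound (and, for signed division,
+// a sign check) on the remainder.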
+
+/// Creates intensive test functions for division functions of a certain size
+macro_rules! test {
+    (
+        $n:expr, // the number of bits in a $iX or $uX
+        $uX:ident, // unsigned integer that will be shifted
+        $iX:ident, // signed version of $uX
+        $test_name:ident, // name of the test function
+        $unsigned_name:ident, // unsigned division function
+        $signed_name:ident // signed division function
+    ) => {
+        #[test]
+        fn $test_name() {
+            fuzz_2(N, |lhs, rhs| {
+                if rhs == 0 {
+                    return;
+                }
+
+                let mut rem: $uX = 0;
+                let quo: $uX = $unsigned_name(lhs, rhs, Some(&mut rem));
+                if rhs <= rem || (lhs != rhs.wrapping_mul(quo).wrapping_add(rem)) {
+                    panic!(
+                        "unsigned division function failed with lhs:{} rhs:{} \
+                        std:({}, {}) builtins:({}, {})",
+                        lhs,
+                        rhs,
+                        lhs.wrapping_div(rhs),
+                        lhs.wrapping_rem(rhs),
+                        quo,
+                        rem
+                    );
+                }
+
+                // test the signed division function also
+                let lhs = lhs as $iX;
+                let rhs = rhs as $iX;
+                let mut rem: $iX = 0;
+                let quo: $iX = $signed_name(lhs, rhs, &mut rem);
+                // It is not enough to test that
+                // `lhs == rhs.wrapping_mul(quo).wrapping_add(rem)`; we also
+                // need to make sure the remainder isn't larger than the divisor
+                // and has the correct sign.
+                let incorrect_rem = if rem == 0 {
+                    false
+                } else if rhs == $iX::MIN {
+                    // `rhs.wrapping_abs()` would overflow, so handle this case
+                    // separately.
+                    (lhs.is_negative() != rem.is_negative()) || (rem == $iX::MIN)
+                } else {
+                    (lhs.is_negative() != rem.is_negative())
+                        || (rhs.wrapping_abs() <= rem.wrapping_abs())
+                };
+                if incorrect_rem || lhs != rhs.wrapping_mul(quo).wrapping_add(rem) {
+                    panic!(
+                        "signed division function failed with lhs:{} rhs:{} \
+                        std:({}, {}) builtins:({}, {})",
+                        lhs,
+                        rhs,
+                        lhs.wrapping_div(rhs),
+                        lhs.wrapping_rem(rhs),
+                        quo,
+                        rem
+                    );
+                }
+            });
+        }
+    };
+}
+
+test!(32, u32, i32, div_rem_si4, __udivmodsi4, __divmodsi4);
+test!(64, u64, i64, div_rem_di4, __udivmoddi4, __divmoddi4);
+test!(128, u128, i128, div_rem_ti4, __udivmodti4, __divmodti4);
+
+#[test]
+fn divide_sparc() {
+    fuzz_2(N, |lhs, rhs| {
+        if rhs == 0 {
+            return;
+        }
+
+        let mut rem: u128 = 0;
+        let quo: u128 = u128_divide_sparc(lhs, rhs, &mut rem);
+        if rhs <= rem || (lhs != rhs.wrapping_mul(quo).wrapping_add(rem)) {
+            panic!(
+                "u128_divide_sparc({}, {}): \
+                std:({}, {}), builtins:({}, {})",
+                lhs,
+                rhs,
+                lhs.wrapping_div(rhs),
+                lhs.wrapping_rem(rhs),
+                quo,
+                rem
+            );
+        }
+    });
+}
+
+macro_rules! float {
+    ($($f:ty, $fn:ident, $apfloat_ty:ident, $sys_available:meta);*;) => {
+        $(
+            #[test]
+            fn $fn() {
+                use compiler_builtins::float::{div::$fn, Float};
+                use core::ops::Div;
+
+                fuzz_float_2(N, |x: $f, y: $f| {
+                    let quo0: $f = apfloat_fallback!($f, $apfloat_ty, $sys_available, Div::div, x, y);
+                    let quo1: $f = $fn(x, y);
+
+                    // ARM SIMD instructions always flush subnormals to zero
+                    if cfg!(target_arch = "arm") &&
+                        ((Float::is_subnormal(quo0)) || Float::is_subnormal(quo1)) {
+                        return;
+                    }
+
+                    if !Float::eq_repr(quo0, quo1) {
+                        panic!(
+                            "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
+                            stringify!($fn),
+                            x,
+                            y,
+                            quo0,
+                            quo1
+                        );
+                    }
+                });
+            }
+        )*
+    };
+}
+
+#[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
+mod float_div {
+    use super::*;
+
+    float! {
+        f32, __divsf3, Single, all();
+        f64, __divdf3, Double, all();
+    }
+
+    #[cfg(not(feature = "no-f16-f128"))]
+    #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+    float! {
+        f128, __divtf3, Quad,
+        // FIXME(llvm): there is a bug in LLVM rt.
+        // See <https://github.com/llvm/llvm-project/issues/91840>.
+        not(any(feature = "no-sys-f128", all(target_arch = "aarch64", target_os = "linux")));
+    }
+
+    #[cfg(not(feature = "no-f16-f128"))]
+    #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+    float! {
+        f128, __divkf3, Quad, not(feature = "no-sys-f128");
+    }
+}
diff --git a/library/compiler-builtins/builtins-test/tests/float_pow.rs b/library/compiler-builtins/builtins-test/tests/float_pow.rs
new file mode 100644
index 00000000000..8209543e666
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/float_pow.rs
@@ -0,0 +1,72 @@
+#![allow(unused_macros)]
+#![cfg_attr(f128_enabled, feature(f128))]
+#![cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
+
+use builtins_test::*;
+
+// This is approximate because of issues related to
+// https://github.com/rust-lang/rust/issues/73920.
+// TODO how do we resolve this indeterminacy?
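+//
+// As a rough illustration of the tolerance check below (not part of the test logic):
+// for `f32` the tolerance is 1e-4, so with results ordered as `a <= b` the pair is
+// accepted when `b / a < 1.0 + 1e-4`; 1.00000 vs. 1.00005 passes, 1.0 vs. 1.01 fails.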
+macro_rules! pow {
+    ($($f:ty, $tolerance:expr, $fn:ident, $sys_available:meta);*;) => {
+        $(
+            #[test]
+            // FIXME(apfloat): We skip tests if system symbols aren't available rather
+            // than providing a fallback, since `rustc_apfloat` does not provide `pow`.
+            #[cfg($sys_available)]
+            fn $fn() {
+                use compiler_builtins::float::pow::$fn;
+                use compiler_builtins::float::Float;
+                fuzz_float_2(N, |x: $f, y: $f| {
+                    if !(Float::is_subnormal(x) || Float::is_subnormal(y) || x.is_nan()) {
+                        let n = y.to_bits() & !<$f as Float>::SIG_MASK;
+                        let n = (n as <$f as Float>::SignedInt) >> <$f as Float>::SIG_BITS;
+                        let n = n as i32;
+                        let tmp0: $f = x.powi(n);
+                        let tmp1: $f = $fn(x, n);
+                        let (a, b) = if tmp0 < tmp1 {
+                            (tmp0, tmp1)
+                        } else {
+                            (tmp1, tmp0)
+                        };
+
+                        let good = if a == b {
+                            // handles infinity equality
+                            true
+                        } else if a < $tolerance {
+                            b < $tolerance
+                        } else {
+                            let quo = b / a;
+                            (quo < (1. + $tolerance)) && (quo > (1. - $tolerance))
+                        };
+
+                        assert!(
+                            good,
+                            "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
+                            stringify!($fn), x, n, tmp0, tmp1
+                        );
+                    }
+                });
+            }
+        )*
+    };
+}
+
+pow! {
+    f32, 1e-4, __powisf2, all();
+    f64, 1e-12, __powidf2, all();
+}
+
+#[cfg(f128_enabled)]
+// FIXME(f16_f128): MSVC cannot build these until `__divtf3` is available in nightly.
+#[cfg(not(target_env = "msvc"))]
+#[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+pow! {
+    f128, 1e-36, __powitf2, not(feature = "no-sys-f128");
+}
+
+#[cfg(f128_enabled)]
+#[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+pow! {
+    f128, 1e-36, __powikf2, not(feature = "no-sys-f128");
+}
diff --git a/library/compiler-builtins/builtins-test/tests/lse.rs b/library/compiler-builtins/builtins-test/tests/lse.rs
new file mode 100644
index 00000000000..53167d98fc0
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/lse.rs
@@ -0,0 +1,97 @@
+#![feature(decl_macro)] // so we can use pub(super)
+#![cfg(all(target_arch = "aarch64", target_os = "linux", not(feature = "no-asm")))]
+
+/// Translate a byte size to a Rust type.
+macro int_ty {
+    (1) => { i8 },
+    (2) => { i16 },
+    (4) => { i32 },
+    (8) => { i64 },
+    (16) => { i128 }
+}
+
+mod cas {
+    pub(super) macro test($_ordering:ident, $bytes:tt, $name:ident) {
+        #[test]
+        fn $name() {
+            builtins_test::fuzz_2(10000, |expected: super::int_ty!($bytes), new| {
+                let mut target = expected.wrapping_add(10);
+                assert_eq!(
+                    unsafe {
+                        compiler_builtins::aarch64_linux::$name::$name(expected, new, &mut target)
+                    },
+                    expected.wrapping_add(10),
+                    "return value should always be the previous value",
+                );
+                assert_eq!(
+                    target,
+                    expected.wrapping_add(10),
+                    "shouldn't have changed target"
+                );
+
+                target = expected;
+                assert_eq!(
+                    unsafe {
+                        compiler_builtins::aarch64_linux::$name::$name(expected, new, &mut target)
+                    },
+                    expected
+                );
+                assert_eq!(target, new, "should have updated target");
+            });
+        }
+    }
+}
+
+macro test_cas16($_ordering:ident, $name:ident) {
+    cas::test!($_ordering, 16, $name);
+}
+
+mod swap {
+    pub(super) macro test($_ordering:ident, $bytes:tt, $name:ident) {
+        #[test]
+        fn $name() {
+            builtins_test::fuzz_2(10000, |left: super::int_ty!($bytes), mut right| {
+                let orig_right = right;
+                assert_eq!(
+                    unsafe { compiler_builtins::aarch64_linux::$name::$name(left, &mut right) },
+                    orig_right
+                );
+                assert_eq!(left, right);
+            });
+        }
+    }
+}
+
+macro_rules! test_op {
+    ($mod:ident, $( $op:tt )* ) => {
+        mod $mod {
+            pub(super) macro test {
+                ($_ordering:ident, $bytes:tt, $name:ident) => {
+                    #[test]
+                    fn $name() {
+                        builtins_test::fuzz_2(10000, |old, val| {
+                            let mut target = old;
+                            let op: fn(super::int_ty!($bytes), super::int_ty!($bytes)) -> _ = $($op)*;
+                            let expected = op(old, val);
+                            assert_eq!(old, unsafe { compiler_builtins::aarch64_linux::$name::$name(val, &mut target) }, "{} should return original value", stringify!($name));
+                            assert_eq!(expected, target, "{} should store to target", stringify!($name));
+                        });
+                    }
+                }
+            }
+        }
+    };
+}
+
+test_op!(add, |left, right| left.wrapping_add(right));
+test_op!(clr, |left, right| left & !right);
+test_op!(xor, std::ops::BitXor::bitxor);
+test_op!(or, std::ops::BitOr::bitor);
+
+compiler_builtins::foreach_cas!(cas::test);
+compiler_builtins::foreach_cas16!(test_cas16);
+compiler_builtins::foreach_swp!(swap::test);
+compiler_builtins::foreach_ldadd!(add::test);
+compiler_builtins::foreach_ldclr!(clr::test);
+compiler_builtins::foreach_ldeor!(xor::test);
+compiler_builtins::foreach_ldset!(or::test);
diff --git a/library/compiler-builtins/builtins-test/tests/mem.rs b/library/compiler-builtins/builtins-test/tests/mem.rs
new file mode 100644
index 00000000000..d838ef159a0
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/mem.rs
@@ -0,0 +1,286 @@
+extern crate compiler_builtins;
+use compiler_builtins::mem::{memcmp, memcpy, memmove, memset};
+
+const WORD_SIZE: usize = core::mem::size_of::<usize>();
+
+#[test]
+fn memcpy_3() {
+    let mut arr: [u8; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
+    unsafe {
+        let src = arr.as_ptr().offset(9);
+        let dst = arr.as_mut_ptr().offset(1);
+        assert_eq!(memcpy(dst, src, 3), dst);
+        assert_eq!(arr, [0, 9, 10, 11, 4, 5, 6, 7, 8, 9, 10, 11]);
+    }
+    arr = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
+    unsafe {
+        let src = arr.as_ptr().offset(1);
+        let dst = arr.as_mut_ptr().offset(9);
+        assert_eq!(memcpy(dst, src, 3), dst);
+        assert_eq!(arr, [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3]);
+    }
+}
+
+#[test]
+fn memcpy_10() {
+    let arr: [u8; 18] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17];
+    let mut dst: [u8; 12] = [0; 12];
+    unsafe {
+        let src = arr.as_ptr().offset(1);
+        assert_eq!(memcpy(dst.as_mut_ptr(), src, 10), dst.as_mut_ptr());
+        assert_eq!(dst, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 0]);
+    }
+    unsafe {
+        let src = arr.as_ptr().offset(8);
+        assert_eq!(memcpy(dst.as_mut_ptr(), src, 10), dst.as_mut_ptr());
+        assert_eq!(dst, [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 0, 0]);
+    }
+}
+
+#[test]
+fn memcpy_big() {
+    // Make the arrays cross 3 pages
+    const SIZE: usize = 8193;
+    let src: [u8; SIZE] = [22; SIZE];
+    struct Dst {
+        start: usize,
+        buf: [u8; SIZE],
+        end: usize,
+    }
+
+    let mut dst = Dst {
+        start: 0,
+        buf: [0; SIZE],
+        end: 0,
+    };
+    unsafe {
+        assert_eq!(
+            memcpy(dst.buf.as_mut_ptr(), src.as_ptr(), SIZE),
+            dst.buf.as_mut_ptr()
+        );
+        assert_eq!(dst.start, 0);
+        assert_eq!(dst.buf, [22; SIZE]);
+        assert_eq!(dst.end, 0);
+    }
+}
+
+#[test]
+fn memmove_forward() {
+    let mut arr: [u8; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
+    unsafe {
+        let src = arr.as_ptr().offset(6);
+        let dst = arr.as_mut_ptr().offset(3);
+        assert_eq!(memmove(dst, src, 5), dst);
+        assert_eq!(arr, [0, 1, 2, 6, 7, 8, 9, 10, 8, 9, 10, 11]);
+    }
+}
+
+#[test]
+fn memmove_backward() {
+    let mut arr: [u8; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
+    unsafe {
+        let src = arr.as_ptr().offset(3);
+        let dst = arr.as_mut_ptr().offset(6);
+        assert_eq!(memmove(dst, src, 5), dst);
+        assert_eq!(arr, [0, 1, 2, 3, 4, 5, 3, 4, 5, 6, 7, 11]);
+    }
+}
+
+#[test]
+fn memset_zero() {
+    let mut arr: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
+    unsafe {
+        let ptr = arr.as_mut_ptr().offset(5);
+        assert_eq!(memset(ptr, 0, 2), ptr);
+        assert_eq!(arr, [0, 1, 2, 3, 4, 0, 0, 7]);
+
+        // Only the LSB matters for a memset
+        assert_eq!(memset(arr.as_mut_ptr(), 0x2000, 8), arr.as_mut_ptr());
+        assert_eq!(arr, [0, 0, 0, 0, 0, 0, 0, 0]);
+    }
+}
+
+#[test]
+fn memset_nonzero() {
+    let mut arr: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
+    unsafe {
+        let ptr = arr.as_mut_ptr().offset(2);
+        assert_eq!(memset(ptr, 22, 3), ptr);
+        assert_eq!(arr, [0, 1, 22, 22, 22, 5, 6, 7]);
+
+        // Only the LSB matters for a memset
+        assert_eq!(memset(arr.as_mut_ptr(), 0x2009, 8), arr.as_mut_ptr());
+        assert_eq!(arr, [9, 9, 9, 9, 9, 9, 9, 9]);
+    }
+}
+
+#[test]
+fn memcmp_eq() {
+    let arr1 @ arr2 = gen_arr::<256>();
+    for i in 0..256 {
+        unsafe {
+            assert_eq!(memcmp(arr1.0.as_ptr(), arr2.0.as_ptr(), i), 0);
+            assert_eq!(memcmp(arr2.0.as_ptr(), arr1.0.as_ptr(), i), 0);
+        }
+    }
+}
+
+#[test]
+fn memcmp_ne() {
+    let arr1 @ arr2 = gen_arr::<256>();
+    // Reduce iteration count in Miri as it is too slow otherwise.
+    let limit = if cfg!(miri) { 64 } else { 256 };
+    for i in 0..limit {
+        let mut diff_arr = arr1;
+        diff_arr.0[i] = 127;
+        let expect = diff_arr.0[i].cmp(&arr2.0[i]);
+        for k in i + 1..limit {
+            let result = unsafe { memcmp(diff_arr.0.as_ptr(), arr2.0.as_ptr(), k) };
+            assert_eq!(expect, result.cmp(&0));
+        }
+    }
+}
+
+#[derive(Clone, Copy)]
+struct AlignedStorage<const N: usize>([u8; N], [usize; 0]);
+
+fn gen_arr<const N: usize>() -> AlignedStorage<N> {
+    let mut ret = AlignedStorage::<N>([0; N], []);
+    for i in 0..N {
+        ret.0[i] = i as u8;
+    }
+    ret
+}
+
+#[test]
+fn memmove_forward_misaligned_nonaligned_start() {
+    let mut arr = gen_arr::<32>();
+    let mut reference = arr;
+    unsafe {
+        let src = arr.0.as_ptr().offset(6);
+        let dst = arr.0.as_mut_ptr().offset(3);
+        assert_eq!(memmove(dst, src, 17), dst);
+        reference.0.copy_within(6..6 + 17, 3);
+        assert_eq!(arr.0, reference.0);
+    }
+}
+
+#[test]
+fn memmove_forward_misaligned_aligned_start() {
+    let mut arr = gen_arr::<32>();
+    let mut reference = arr;
+    unsafe {
+        let src = arr.0.as_ptr().offset(6);
+        let dst = arr.0.as_mut_ptr().add(0);
+        assert_eq!(memmove(dst, src, 17), dst);
+        reference.0.copy_within(6..6 + 17, 0);
+        assert_eq!(arr.0, reference.0);
+    }
+}
+
+#[test]
+fn memmove_forward_aligned() {
+    let mut arr = gen_arr::<32>();
+    let mut reference = arr;
+    unsafe {
+        let src = arr.0.as_ptr().add(3 + WORD_SIZE);
+        let dst = arr.0.as_mut_ptr().add(3);
+        assert_eq!(memmove(dst, src, 17), dst);
+        reference
+            .0
+            .copy_within(3 + WORD_SIZE..3 + WORD_SIZE + 17, 3);
+        assert_eq!(arr.0, reference.0);
+    }
+}
+
+#[test]
+fn memmove_backward_misaligned_nonaligned_start() {
+    let mut arr = gen_arr::<32>();
+    let mut reference = arr;
+    unsafe {
+        let src = arr.0.as_ptr().offset(3);
+        let dst = arr.0.as_mut_ptr().offset(6);
+        assert_eq!(memmove(dst, src, 17), dst);
+        reference.0.copy_within(3..3 + 17, 6);
+        assert_eq!(arr.0, reference.0);
+    }
+}
+
+#[test]
+fn memmove_backward_misaligned_aligned_start() {
+    let mut arr = gen_arr::<32>();
+    let mut reference = arr;
+    unsafe {
+        let src = arr.0.as_ptr().offset(3);
+        let dst = arr.0.as_mut_ptr().add(WORD_SIZE);
+        assert_eq!(memmove(dst, src, 17), dst);
+        reference.0.copy_within(3..3 + 17, WORD_SIZE);
+        assert_eq!(arr.0, reference.0);
+    }
+}
+
+#[test]
+fn memmove_backward_aligned() {
+    let mut arr = gen_arr::<32>();
+    let mut reference = arr;
+    unsafe {
+        let src = arr.0.as_ptr().add(3);
+        let dst = arr.0.as_mut_ptr().add(3 + WORD_SIZE);
+        assert_eq!(memmove(dst, src, 17), dst);
+        reference.0.copy_within(3..3 + 17, 3 + WORD_SIZE);
+        assert_eq!(arr.0, reference.0);
+    }
+}
+
+#[test]
+fn memmove_misaligned_bounds() {
+    // The above tests have the downside that the addresses surrounding the range-to-copy are all
+    // still in-bounds, so Miri would not actually complain about OOB accesses. So we also test with
+    // an array that has just the right size. We test a few times to avoid it being accidentally
+    // aligned.
+    for _ in 0..8 {
+        let mut arr1 = [0u8; 17];
+        let mut arr2 = [0u8; 17];
+        unsafe {
+            // Copy both ways so we hit both the forward and backward cases.
+            memmove(arr1.as_mut_ptr(), arr2.as_mut_ptr(), 17);
+            memmove(arr2.as_mut_ptr(), arr1.as_mut_ptr(), 17);
+        }
+    }
+}
+
+#[test]
+fn memset_backward_misaligned_nonaligned_start() {
+    let mut arr = gen_arr::<32>();
+    let mut reference = arr;
+    unsafe {
+        let ptr = arr.0.as_mut_ptr().offset(6);
+        assert_eq!(memset(ptr, 0xCC, 17), ptr);
+        core::ptr::write_bytes(reference.0.as_mut_ptr().add(6), 0xCC, 17);
+        assert_eq!(arr.0, reference.0);
+    }
+}
+
+#[test]
+fn memset_backward_misaligned_aligned_start() {
+    let mut arr = gen_arr::<32>();
+    let mut reference = arr;
+    unsafe {
+        let ptr = arr.0.as_mut_ptr().add(WORD_SIZE);
+        assert_eq!(memset(ptr, 0xCC, 17), ptr);
+        core::ptr::write_bytes(reference.0.as_mut_ptr().add(WORD_SIZE), 0xCC, 17);
+        assert_eq!(arr.0, reference.0);
+    }
+}
+
+#[test]
+fn memset_backward_aligned() {
+    let mut arr = gen_arr::<32>();
+    let mut reference = arr;
+    unsafe {
+        let ptr = arr.0.as_mut_ptr().add(3 + WORD_SIZE);
+        assert_eq!(memset(ptr, 0xCC, 17), ptr);
+        core::ptr::write_bytes(reference.0.as_mut_ptr().add(3 + WORD_SIZE), 0xCC, 17);
+        assert_eq!(arr.0, reference.0);
+    }
+}
diff --git a/library/compiler-builtins/builtins-test/tests/misc.rs b/library/compiler-builtins/builtins-test/tests/misc.rs
new file mode 100644
index 00000000000..64a9d56f36b
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/misc.rs
@@ -0,0 +1,202 @@
+// makes configuration easier
+#![allow(unused_macros)]
+
+use builtins_test::*;
+
+/// Make sure that the edge case tester and randomized tester don't break, and list examples of
+/// fuzz values for documentation purposes.
+#[test]
+fn fuzz_values() {
+    const VALS: [u16; 47] = [
+        0b0, // edge cases
+        0b1111111111111111,
+        0b1111111111111110,
+        0b1111111111111100,
+        0b1111111110000000,
+        0b1111111100000000,
+        0b1110000000000000,
+        0b1100000000000000,
+        0b1000000000000000,
+        0b111111111111111,
+        0b111111111111110,
+        0b111111111111100,
+        0b111111110000000,
+        0b111111100000000,
+        0b110000000000000,
+        0b100000000000000,
+        0b11111111111111,
+        0b11111111111110,
+        0b11111111111100,
+        0b11111110000000,
+        0b11111100000000,
+        0b10000000000000,
+        0b111111111,
+        0b111111110,
+        0b111111100,
+        0b110000000,
+        0b100000000,
+        0b11111111,
+        0b11111110,
+        0b11111100,
+        0b10000000,
+        0b111,
+        0b110,
+        0b100,
+        0b11,
+        0b10,
+        0b1,
+        0b1010110100000, // beginning of random fuzzing
+        0b1100011001011010,
+        0b1001100101001111,
+        0b1101010100011010,
+        0b100010001,
+        0b1000000000000000,
+        0b1100000000000101,
+        0b1100111101010101,
+        0b1100010111111111,
+        0b1111110101111111,
+    ];
+    let mut i = 0;
+    fuzz(10, |x: u16| {
+        assert_eq!(x, VALS[i]);
+        i += 1;
+    });
+}
+
+#[test]
+fn leading_zeros() {
+    use compiler_builtins::int::leading_zeros::{leading_zeros_default, leading_zeros_riscv};
+    {
+        use compiler_builtins::int::leading_zeros::__clzsi2;
+        fuzz(N, |x: u32| {
+            if x == 0 {
+                return; // undefined value for an intrinsic
+            }
+            let lz = x.leading_zeros() as usize;
+            let lz0 = __clzsi2(x);
+            let lz1 = leading_zeros_default(x);
+            let lz2 = leading_zeros_riscv(x);
+            if lz0 != lz {
+                panic!("__clzsi2({x}): std: {lz}, builtins: {lz0}");
+            }
+            if lz1 != lz {
+                panic!("leading_zeros_default({x}): std: {lz}, builtins: {lz1}");
+            }
+            if lz2 != lz {
+                panic!("leading_zeros_riscv({x}): std: {lz}, builtins: {lz2}");
+            }
+        });
+    }
+
+    {
+        use compiler_builtins::int::leading_zeros::__clzdi2;
+        fuzz(N, |x: u64| {
+            if x == 0 {
+                return; // undefined value for an intrinsic
+            }
+            let lz = x.leading_zeros() as usize;
+            let lz0 = __clzdi2(x);
+            let lz1 = leading_zeros_default(x);
+            let lz2 = leading_zeros_riscv(x);
+            if lz0 != lz {
+                panic!("__clzdi2({x}): std: {lz}, builtins: {lz0}");
+            }
+            if lz1 != lz {
+                panic!("leading_zeros_default({x}): std: {lz}, builtins: {lz1}");
+            }
+            if lz2 != lz {
+                panic!("leading_zeros_riscv({x}): std: {lz}, builtins: {lz2}");
+            }
+        });
+    }
+
+    {
+        use compiler_builtins::int::leading_zeros::__clzti2;
+        fuzz(N, |x: u128| {
+            if x == 0 {
+                return; // undefined value for an intrinsic
+            }
+            let lz = x.leading_zeros() as usize;
+            let lz0 = __clzti2(x);
+            if lz0 != lz {
+                panic!("__clzti2({x}): std: {lz}, builtins: {lz0}");
+            }
+        });
+    }
+}
+
+#[test]
+fn trailing_zeros() {
+    use compiler_builtins::int::trailing_zeros::{__ctzdi2, __ctzsi2, __ctzti2, trailing_zeros};
+    fuzz(N, |x: u32| {
+        if x == 0 {
+            return; // undefined value for an intrinsic
+        }
+        let tz = x.trailing_zeros() as usize;
+        let tz0 = __ctzsi2(x);
+        let tz1 = trailing_zeros(x);
+        if tz0 != tz {
+            panic!("__ctzsi2({x}): std: {tz}, builtins: {tz0}");
+        }
+        if tz1 != tz {
+            panic!("trailing_zeros({x}): std: {tz}, builtins: {tz1}");
+        }
+    });
+    fuzz(N, |x: u64| {
+        if x == 0 {
+            return; // undefined value for an intrinsic
+        }
+        let tz = x.trailing_zeros() as usize;
+        let tz0 = __ctzdi2(x);
+        let tz1 = trailing_zeros(x);
+        if tz0 != tz {
+            panic!("__ctzdi2({x}): std: {tz}, builtins: {tz0}");
+        }
+        if tz1 != tz {
+            panic!("trailing_zeros({x}): std: {tz}, builtins: {tz1}");
+        }
+    });
+    fuzz(N, |x: u128| {
+        if x == 0 {
+            return; // undefined value for an intrinsic
+        }
+        let tz = x.trailing_zeros() as usize;
+        let tz0 = __ctzti2(x);
+        if tz0 != tz {
+            panic!("__ctzti2({x}): std: {tz}, builtins: {tz0}");
+        }
+    });
+}
+
+#[test]
+fn bswap() {
+    use compiler_builtins::int::bswap::{__bswapdi2, __bswapsi2};
+    fuzz(N, |x: u32| {
+        assert_eq!(x.swap_bytes(), __bswapsi2(x));
+    });
+    fuzz(N, |x: u64| {
+        assert_eq!(x.swap_bytes(), __bswapdi2(x));
+    });
+
+    assert_eq!(__bswapsi2(0x12345678u32), 0x78563412u32);
+    assert_eq!(__bswapsi2(0x00000001u32), 0x01000000u32);
+    assert_eq!(__bswapdi2(0x123456789ABCDEF0u64), 0xF0DEBC9A78563412u64);
+    assert_eq!(__bswapdi2(0x0200000001000000u64), 0x0000000100000002u64);
+
+    #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+    {
+        use compiler_builtins::int::bswap::__bswapti2;
+        fuzz(N, |x: u128| {
+            assert_eq!(x.swap_bytes(), __bswapti2(x));
+        });
+
+        assert_eq!(
+            __bswapti2(0x123456789ABCDEF013579BDF02468ACEu128),
+            0xCE8A4602DF9B5713F0DEBC9A78563412u128
+        );
+        assert_eq!(
+            __bswapti2(0x04000000030000000200000001000000u128),
+            0x00000001000000020000000300000004u128
+        );
+    }
+}
diff --git a/library/compiler-builtins/builtins-test/tests/mul.rs b/library/compiler-builtins/builtins-test/tests/mul.rs
new file mode 100644
index 00000000000..58bc9ab4ac9
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/mul.rs
@@ -0,0 +1,150 @@
+#![allow(unused_macros)]
+#![cfg_attr(f128_enabled, feature(f128))]
+
+use builtins_test::*;
+
+mod int_mul {
+    use super::*;
+
+    macro_rules! mul {
+        ($($i:ty, $fn:ident);*;) => {
+            $(
+                #[test]
+                fn $fn() {
+                    use compiler_builtins::int::mul::$fn;
+
+                    fuzz_2(N, |x: $i, y: $i| {
+                        let mul0 = x.wrapping_mul(y);
+                        let mul1: $i = $fn(x, y);
+                        if mul0 != mul1 {
+                            panic!(
+                                "{func}({x}, {y}): std: {mul0}, builtins: {mul1}",
+                                func = stringify!($fn),
+                            );
+                        }
+                    });
+
+                }
+            )*
+        };
+    }
+
+    mul! {
+        u64, __muldi3;
+        i128, __multi3;
+    }
+}
+
+mod int_overflowing_mul {
+    use super::*;
+
+    macro_rules! overflowing_mul {
+        ($($i:ty, $fn:ident);*;) => {
+            $(
+                #[test]
+                fn $fn() {
+                    use compiler_builtins::int::mul::$fn;
+
+                    fuzz_2(N, |x: $i, y: $i| {
+                        let (mul0, o0) = x.overflowing_mul(y);
+                        let mut o1 = 0i32;
+                        let mul1: $i = $fn(x, y, &mut o1);
+                        let o1 = o1 != 0;
+                        if mul0 != mul1 || o0 != o1 {
+                            panic!(
+                                "{func}({x}, {y}): std: ({mul0}, {o0}), builtins: ({mul1}, {o1})",
+                                func = stringify!($fn),
+                            );
+                        }
+                    });
+                }
+            )*
+        };
+    }
+
+    overflowing_mul! {
+        i32, __mulosi4;
+        i64, __mulodi4;
+        i128, __muloti4;
+    }
+
+    #[test]
+    fn overflowing_mul_u128() {
+        use compiler_builtins::int::mul::{__rust_i128_mulo, __rust_u128_mulo};
+
+        fuzz_2(N, |x: u128, y: u128| {
+            let mut o1 = 0;
+            let (mul0, o0) = x.overflowing_mul(y);
+            let mul1 = __rust_u128_mulo(x, y, &mut o1);
+            if mul0 != mul1 || i32::from(o0) != o1 {
+                panic!("__rust_u128_mulo({x}, {y}): std: ({mul0}, {o0}), builtins: ({mul1}, {o1})",);
+            }
+            let x = x as i128;
+            let y = y as i128;
+            let (mul0, o0) = x.overflowing_mul(y);
+            let mul1 = __rust_i128_mulo(x, y, &mut o1);
+            if mul0 != mul1 || i32::from(o0) != o1 {
+                panic!("__rust_i128_mulo({x}, {y}): std: ({mul0}, {o0}), builtins: ({mul1}, {o1})",);
+            }
+        });
+    }
+}
+
+macro_rules! float_mul {
+    ($($f:ty, $fn:ident, $apfloat_ty:ident, $sys_available:meta);*;) => {
+        $(
+            #[test]
+            fn $fn() {
+                use compiler_builtins::float::{mul::$fn, Float};
+                use core::ops::Mul;
+
+                fuzz_float_2(N, |x: $f, y: $f| {
+                    let mul0 = apfloat_fallback!($f, $apfloat_ty, $sys_available, Mul::mul, x, y);
+                    let mul1: $f = $fn(x, y);
+                    if !Float::eq_repr(mul0, mul1) {
+                        panic!(
+                            "{func}({x:?}, {y:?}): std: {mul0:?}, builtins: {mul1:?}",
+                            func = stringify!($fn),
+                        );
+                    }
+                });
+            }
+        )*
+    };
+}
+
+#[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
+mod float_mul {
+    use super::*;
+
+    // FIXME(#616): Stop ignoring arches that don't have native support once the fix for builtins
+    // is in nightly.
+    float_mul! {
+        f32, __mulsf3, Single, not(target_arch = "arm");
+        f64, __muldf3, Double, not(target_arch = "arm");
+    }
+}
+
+#[cfg(f128_enabled)]
+#[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
+#[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+mod float_mul_f128 {
+    use super::*;
+
+    float_mul! {
+        f128, __multf3, Quad,
+        // FIXME(llvm): there is a bug in LLVM rt.
+        // See <https://github.com/llvm/llvm-project/issues/91840>.
+        not(any(feature = "no-sys-f128", all(target_arch = "aarch64", target_os = "linux")));
+    }
+}
+
+#[cfg(f128_enabled)]
+#[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+mod float_mul_f128_ppc {
+    use super::*;
+
+    float_mul! {
+        f128, __mulkf3, Quad, not(feature = "no-sys-f128");
+    }
+}
diff --git a/library/compiler-builtins/builtins-test/tests/shift.rs b/library/compiler-builtins/builtins-test/tests/shift.rs
new file mode 100644
index 00000000000..0f2483855e5
--- /dev/null
+++ b/library/compiler-builtins/builtins-test/tests/shift.rs
@@ -0,0 +1,35 @@
+use builtins_test::*;
+
+macro_rules! shift {
+    ($($i:ty, $fn_std:ident, $fn_builtins:ident);*;) => {
+        $(
+            #[test]
+            fn $fn_builtins() {
+                use compiler_builtins::int::shift::$fn_builtins;
+
+                fuzz_shift(|x: $i, s: u32| {
+                    let tmp0: $i = x.$fn_std(s);
+                    let tmp1: $i = $fn_builtins(x, s);
+                    if tmp0 != tmp1 {
+                        panic!(
+                            "{}({}, {}): std: {}, builtins: {}",
+                            stringify!($fn_builtins), x, s, tmp0, tmp1
+                        );
+                    }
+                });
+            }
+        )*
+    };
+}
+
+shift! {
+    u32, wrapping_shl, __ashlsi3;
+    u64, wrapping_shl, __ashldi3;
+    u128, wrapping_shl, __ashlti3;
+    i32, wrapping_shr, __ashrsi3;
+    i64, wrapping_shr, __ashrdi3;
+    i128, wrapping_shr, __ashrti3;
+    u32, wrapping_shr, __lshrsi3;
+    u64, wrapping_shr, __lshrdi3;
+    u128, wrapping_shr, __lshrti3;
+}
diff --git a/library/compiler-builtins/ci/bench-icount.sh b/library/compiler-builtins/ci/bench-icount.sh
new file mode 100755
index 00000000000..4d93e257a6c
--- /dev/null
+++ b/library/compiler-builtins/ci/bench-icount.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+set -eux
+
+iai_home="iai-home"
+
+# Download the baseline from master
+./ci/ci-util.py locate-baseline --download --extract
+
+# Run the icount benchmarks once with the given cargo and iai-callgrind arguments
+function run_icount_benchmarks() {
+    cargo_args=(
+        "--bench" "icount"
+        "--no-default-features"
+        "--features" "unstable,unstable-float,icount"
+    )
+
+    iai_args=(
+        "--home" "$(pwd)/$iai_home"
+        "--regression=ir=5.0"
+        "--save-summary"
+    )
+
+    # Parse `cargo_arg0 cargo_arg1 -- iai_arg0 iai_arg1` syntax
+    parsing_iai_args=0
+    while [ "$#" -gt 0 ]; do
+        if [ "$parsing_iai_args" == "1" ]; then
+            iai_args+=("$1")
+        elif [ "$1" == "--" ]; then
+            parsing_iai_args=1
+        else
+            cargo_args+=("$1")
+        fi
+
+        shift
+    done
+
+    # Run iai-callgrind benchmarks
+    cargo bench "${cargo_args[@]}" -- "${iai_args[@]}"
+
+    # NB: iai-callgrind should exit on error but does not, so we inspect the summary
+    # for errors. See https://github.com/iai-callgrind/iai-callgrind/issues/337
+    if [ -n "${PR_NUMBER:-}" ]; then
+        # If this is for a pull request, ignore regressions if specified.
+        ./ci/ci-util.py check-regressions --home "$iai_home" --allow-pr-override "$PR_NUMBER"
+    else
+        ./ci/ci-util.py check-regressions --home "$iai_home" || true
+    fi
+}
+
+# Run once with softfloats, once with arch instructions enabled
+run_icount_benchmarks --features force-soft-floats -- --save-baseline=softfloat
+run_icount_benchmarks -- --save-baseline=hardfloat
+
+# Name and tar the new baseline
+name="baseline-icount-$(date -u +'%Y%m%d%H%M')-${GITHUB_SHA:0:12}"
+echo "BASELINE_NAME=$name" >>"$GITHUB_ENV"
+tar cJf "$name.tar.xz" "$iai_home"
diff --git a/library/compiler-builtins/ci/ci-util.py b/library/compiler-builtins/ci/ci-util.py
new file mode 100755
index 00000000000..d785b2e9e1d
--- /dev/null
+++ b/library/compiler-builtins/ci/ci-util.py
@@ -0,0 +1,438 @@
+#!/usr/bin/env python3
+"""Utilities for CI.
+
+This dynamically prepares a list of routines that had a source file change based on
+git history.
+"""
+
+import json
+import os
+import re
+import subprocess as sp
+import sys
+from dataclasses import dataclass
+from glob import glob, iglob
+from inspect import cleandoc
+from os import getenv
+from pathlib import Path
+from typing import TypedDict, Self
+
+USAGE = cleandoc(
+    """
+    usage:
+
+    ./ci/ci-util.py <COMMAND> [flags]
+
+    COMMAND:
+        generate-matrix
+            Calculate a matrix of which functions had source change, print that as
+            a JSON object.
+
+        locate-baseline [--download] [--extract]
+            Locate the most recent benchmark baseline available in CI and, if flags
+            specify, download and extract it. Never exits with nonzero status if
+            downloading fails.
+
+            Note that `--extract` will overwrite files in `iai-home`.
+
+        check-regressions [--home iai-home] [--allow-pr-override pr_number]
+            Check the directory passed to `--home` (`iai-home` if unspecified) for `summary.json`
+            files and see if there are any regressions. This is used as a workaround
+            for `iai-callgrind` not exiting with error status; see
+            <https://github.com/iai-callgrind/iai-callgrind/issues/337>.
+
+            If `--allow-pr-override` is specified, the regression check will not exit
+            with failure if any line in the PR starts with `allow-regressions`.
+    """
+)
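+
+# For example, `ci/bench-icount.sh` invokes this script as:
+#   ./ci/ci-util.py locate-baseline --download --extract
+#   ./ci/ci-util.py check-regressions --home "$iai_home" --allow-pr-override "$PR_NUMBER"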
+
+REPO_ROOT = Path(__file__).parent.parent
+GIT = ["git", "-C", REPO_ROOT]
+DEFAULT_BRANCH = "master"
+WORKFLOW_NAME = "CI"  # Workflow that generates the benchmark artifacts
+ARTIFACT_GLOB = "baseline-icount*"
+# Place this in a PR body to skip regression checks (must be at the start of a line).
+REGRESSION_DIRECTIVE = "ci: allow-regressions"
+# Place this in a PR body to skip extensive tests
+SKIP_EXTENSIVE_DIRECTIVE = "ci: skip-extensive"
+# Place this in a PR body to allow running a large number of extensive tests. If not
+# set, this script will error out if a threshold is exceeded in order to avoid
+# accidentally spending huge amounts of CI time.
+ALLOW_MANY_EXTENSIVE_DIRECTIVE = "ci: allow-many-extensive"
+MANY_EXTENSIVE_THRESHOLD = 20
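+# For example, a PR whose body contains the line `ci: skip-extensive` (at the start of
+# a line) is expected to skip the extensive test jobs, and one containing
+# `ci: allow-regressions` to pass the benchmark regression check despite regressions.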
+
+# Don't run exhaustive tests if these files change, even if they contain a function
+# definition.
+IGNORE_FILES = [
+    "libm/src/math/support/",
+    "libm/src/libm_helper.rs",
+    "libm/src/math/arch/intrinsics.rs",
+]
+
+# libm PR CI takes a long time and doesn't need to run unless relevant files have been
+# changed. Anything matching this regex pattern will trigger a run.
+TRIGGER_LIBM_PR_CI = ".*(libm|musl).*"
+
+TYPES = ["f16", "f32", "f64", "f128"]
+
+
+def eprint(*args, **kwargs):
+    """Print to stderr."""
+    print(*args, file=sys.stderr, **kwargs)
+
+
+@dataclass
+class PrInfo:
+    """GitHub response for PR query"""
+
+    body: str
+    commits: list[str]
+    created_at: str
+    number: int
+
+    @classmethod
+    def load(cls, pr_number: int | str) -> Self:
+        """For a given PR number, query the body and commit list"""
+        pr_info = sp.check_output(
+            [
+                "gh",
+                "pr",
+                "view",
+                str(pr_number),
+                "--json=number,commits,body,createdAt",
+                # Flatten the commit list to only hashes and rename `createdAt` to snake_case
+                "--jq=.commits |= map(.oid) | .created_at = .createdAt | del(.createdAt)",
+            ],
+            text=True,
+        )
+        eprint("PR info:", json.dumps(pr_info, indent=4))
+        return cls(**json.loads(pr_info))
+
+    def contains_directive(self, directive: str) -> bool:
+        """Return true if the provided directive is on a line in the PR body"""
+        lines = self.body.splitlines()
+        return any(line.startswith(directive) for line in lines)
+
+
+class FunctionDef(TypedDict):
+    """Type for an entry in `function-definitions.json`"""
+
+    sources: list[str]
+    type: str
+
+
+class Context:
+    gh_ref: str | None
+    changed: list[Path]
+    defs: dict[str, FunctionDef]
+
+    def __init__(self) -> None:
+        self.gh_ref = getenv("GITHUB_REF")
+        self.changed = []
+        self._init_change_list()
+
+        with open(REPO_ROOT.joinpath("etc/function-definitions.json")) as f:
+            defs = json.load(f)
+
+        defs.pop("__comment", None)
+        self.defs = defs
+
+    def _init_change_list(self):
+        """Create a list of files that have been changed. This uses GITHUB_REF if
+        available, otherwise a diff between `HEAD` and `master`.
+        """
+
+        # For pull requests, GitHub creates a ref `refs/pull/1234/merge` (1234 being
+        # the PR number), and sets this as `GITHUB_REF`.
+        ref = self.gh_ref
+        eprint(f"using ref `{ref}`")
+        if not self.is_pr():
+            # If the ref is not for `merge` then we are not in PR CI
+            eprint("No diff available for ref")
+            return
+
+        # The ref is for a dummy merge commit. We can extract the merge base by
+        # inspecting all parents (`^@`).
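+        # For example, `git rev-parse <merge_sha>^@` is expected to print two hashes:
+        # the tip of the target branch (used as `base` below) followed by the head of
+        # the PR branch (used as `incoming`).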
+        merge_sha = sp.check_output(
+            GIT + ["show-ref", "--hash", ref], text=True
+        ).strip()
+        merge_log = sp.check_output(GIT + ["log", "-1", merge_sha], text=True)
+        eprint(f"Merge:\n{merge_log}\n")
+
+        parents = (
+            sp.check_output(GIT + ["rev-parse", f"{merge_sha}^@"], text=True)
+            .strip()
+            .splitlines()
+        )
+        assert len(parents) == 2, f"expected two-parent merge but got:\n{parents}"
+        base = parents[0].strip()
+        incoming = parents[1].strip()
+
+        eprint(f"base: {base}, incoming: {incoming}")
+        textlist = sp.check_output(
+            GIT + ["diff", base, incoming, "--name-only"], text=True
+        )
+        self.changed = [Path(p) for p in textlist.splitlines()]
+
+    def is_pr(self) -> bool:
+        """Check if we are looking at a PR rather than a push."""
+        return self.gh_ref is not None and "merge" in self.gh_ref
+
+    @staticmethod
+    def _ignore_file(fname: str) -> bool:
+        return any(fname.startswith(pfx) for pfx in IGNORE_FILES)
+
+    def changed_routines(self) -> dict[str, list[str]]:
+        """Create a list of routines for which one or more files have been updated,
+        separated by type.
+        """
+        routines = set()
+        for name, meta in self.defs.items():
+            # Don't update if changes to the file should be ignored
+            sources = (f for f in meta["sources"] if not self._ignore_file(f))
+
+            # Select changed files
+            changed = [f for f in sources if Path(f) in self.changed]
+
+            if len(changed) > 0:
+                eprint(f"changed files for {name}: {changed}")
+                routines.add(name)
+
+        ret: dict[str, list[str]] = {}
+        for r in sorted(routines):
+            ret.setdefault(self.defs[r]["type"], []).append(r)
+
+        return ret
+
+    def may_skip_libm_ci(self) -> bool:
+        """If this is a PR and no libm files were changed, allow skipping libm
+        jobs."""
+
+        if self.is_pr():
+            return all(not re.match(TRIGGER_LIBM_PR_CI, str(f)) for f in self.changed)
+
+        return False
+
+    def emit_workflow_output(self):
+        """Create a JSON object a list items for each type's changed files, if any
+        did change, and the routines that were affected by the change.
+        """
+
+        pr_number = os.environ.get("PR_NUMBER")
+        skip_tests = False
+        error_on_many_tests = False
+
+        if pr_number is not None and len(pr_number) > 0:
+            pr = PrInfo.load(pr_number)
+            skip_tests = pr.contains_directive(SKIP_EXTENSIVE_DIRECTIVE)
+            error_on_many_tests = not pr.contains_directive(
+                ALLOW_MANY_EXTENSIVE_DIRECTIVE
+            )
+
+            if skip_tests:
+                eprint("Skipping all extensive tests")
+
+        changed = self.changed_routines()
+        matrix = []
+        total_to_test = 0
+
+        # Figure out which extensive tests need to run
+        for ty in TYPES:
+            ty_changed = changed.get(ty, [])
+            ty_to_test = [] if skip_tests else ty_changed
+            total_to_test += len(ty_to_test)
+
+            item = {
+                "ty": ty,
+                "changed": ",".join(ty_changed),
+                "to_test": ",".join(ty_to_test),
+            }
+
+            matrix.append(item)
+
+        ext_matrix = json.dumps({"extensive_matrix": matrix}, separators=(",", ":"))
+        may_skip = str(self.may_skip_libm_ci()).lower()
+        print(f"extensive_matrix={ext_matrix}")
+        print(f"may_skip_libm_ci={may_skip}")
+        eprint(f"extensive_matrix={ext_matrix}")
+        eprint(f"may_skip_libm_ci={may_skip}")
+        eprint(f"total extensive tests: {total_to_test}")
+
+        if error_on_many_tests and total_to_test > MANY_EXTENSIVE_THRESHOLD:
+            eprint(
+                f"More than {MANY_EXTENSIVE_THRESHOLD} tests would be run; add"
+                f" `{ALLOW_MANY_EXTENSIVE_DIRECTIVE}` to the PR body if this is"
+                " intentional. If this is refactoring that happens to touch a lot of"
+                f" files, `{SKIP_EXTENSIVE_DIRECTIVE}` can be used instead."
+            )
+            exit(1)
+
+
+def locate_baseline(flags: list[str]) -> None:
+    """Find the most recent baseline from CI, download it if specified.
+
+    This returns rather than erroring, even if the `gh` commands fail. This is to avoid
+    erroring in CI if the baseline is unavailable (artifact time limit exceeded, first
+    run on the branch, etc.).
+    """
+
+    download = False
+    extract = False
+
+    while len(flags) > 0:
+        match flags[0]:
+            case "--download":
+                download = True
+            case "--extract":
+                extract = True
+            case _:
+                eprint(USAGE)
+                exit(1)
+        flags = flags[1:]
+
+    if extract and not download:
+        eprint("cannot extract without downloading")
+        exit(1)
+
+    try:
+        # Locate the most recent job to complete with success on our branch
+        latest_job = sp.check_output(
+            [
+                "gh",
+                "run",
+                "list",
+                "--status=success",
+                f"--branch={DEFAULT_BRANCH}",
+                "--json=databaseId,url,headSha,conclusion,createdAt,"
+                "status,workflowDatabaseId,workflowName",
+                # Return the first array element matching our workflow name. NB: cannot
+                # just use `--limit=1` because jq filtering happens after limiting. We
+                # also cannot just use `--workflow` because GH gets confused by
+                # different file names in history.
+                f'--jq=[.[] | select(.workflowName == "{WORKFLOW_NAME}")][0]',
+            ],
+            text=True,
+        )
+    except sp.CalledProcessError as e:
+        eprint(f"failed to run github command: {e}")
+        return
+
+    try:
+        latest = json.loads(latest_job)
+        eprint("latest job: ", json.dumps(latest, indent=4))
+    except json.JSONDecodeError as e:
+        eprint(f"failed to decode json '{latest_job}', {e}")
+        return
+
+    if not download:
+        eprint("--download not specified, returning")
+        return
+
+    job_id = latest.get("databaseId")
+    if job_id is None:
+        eprint("skipping download step")
+        return
+
+    sp.run(
+        ["gh", "run", "download", str(job_id), f"--pattern={ARTIFACT_GLOB}"],
+        check=False,
+    )
+
+    if not extract:
+        eprint("skipping extraction step")
+        return
+
+    # Find the baseline with the most recent timestamp. GH downloads the files to e.g.
+    # `some-dirname/some-dirname.tar.xz`, so just glob the whole thing together.
+    candidate_baselines = glob(f"{ARTIFACT_GLOB}/{ARTIFACT_GLOB}")
+    if len(candidate_baselines) == 0:
+        eprint("no possible baseline directories found")
+        return
+
+    candidate_baselines.sort(reverse=True)
+    baseline_archive = candidate_baselines[0]
+    eprint(f"extracting {baseline_archive}")
+    sp.run(["tar", "xJvf", baseline_archive], check=True)
+    eprint("baseline extracted successfully")
+
+
+def check_iai_regressions(args: list[str]):
+    """Find regressions in iai summary.json files, exit with failure if any are
+    found.
+    """
+
+    iai_home_str = "iai-home"
+    pr_number = None
+
+    while len(args) > 0:
+        match args:
+            case ["--home", home, *rest]:
+                iai_home_str = home
+                args = rest
+            case ["--allow-pr-override", pr_num, *rest]:
+                pr_number = pr_num
+                args = rest
+            case _:
+                eprint(USAGE)
+                exit(1)
+
+    iai_home = Path(iai_home_str)
+
+    found_summaries = False
+    regressions: list[dict] = []
+    for summary_path in iglob("**/summary.json", root_dir=iai_home, recursive=True):
+        found_summaries = True
+        with open(iai_home / summary_path, "r") as f:
+            summary = json.load(f)
+
+        summary_regs = []
+        run = summary["callgrind_summary"]["callgrind_run"]
+        fname = summary["function_name"]
+        id = summary["id"]
+        name_entry = {"name": f"{fname}.{id}"}
+
+        for segment in run["segments"]:
+            summary_regs.extend(segment["regressions"])
+
+        summary_regs.extend(run["total"]["regressions"])
+
+        regressions.extend(name_entry | reg for reg in summary_regs)
+
+    if not found_summaries:
+        eprint(f"did not find any summary.json files within {iai_home}")
+        exit(1)
+
+    if len(regressions) == 0:
+        eprint("No regressions found")
+        return
+
+    eprint("Found regressions:", json.dumps(regressions, indent=4))
+
+    if pr_number is not None:
+        pr = PrInfo.load(pr_number)
+        if pr.contains_directive(REGRESSION_DIRECTIVE):
+            eprint("PR allows regressions, returning")
+            return
+
+    exit(1)
+
+
+def main():
+    match sys.argv[1:]:
+        case ["generate-matrix"]:
+            ctx = Context()
+            ctx.emit_workflow_output()
+        case ["locate-baseline", *flags]:
+            locate_baseline(flags)
+        case ["check-regressions", *args]:
+            check_iai_regressions(args)
+        case ["--help" | "-h"]:
+            print(USAGE)
+            exit()
+        case _:
+            eprint(USAGE)
+            exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/library/compiler-builtins/ci/docker/aarch64-unknown-linux-gnu/Dockerfile b/library/compiler-builtins/ci/docker/aarch64-unknown-linux-gnu/Dockerfile
new file mode 100644
index 00000000000..df71804ba23
--- /dev/null
+++ b/library/compiler-builtins/ci/docker/aarch64-unknown-linux-gnu/Dockerfile
@@ -0,0 +1,16 @@
+ARG IMAGE=ubuntu:24.04
+FROM $IMAGE
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    gcc libc6-dev ca-certificates \
+    gcc-aarch64-linux-gnu m4 make libc6-dev-arm64-cross \
+    qemu-user-static
+
+ENV TOOLCHAIN_PREFIX=aarch64-linux-gnu-
+ENV CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER="$TOOLCHAIN_PREFIX"gcc \
+    CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUNNER=qemu-aarch64-static \
+    AR_aarch64_unknown_linux_gnu="$TOOLCHAIN_PREFIX"ar \
+    CC_aarch64_unknown_linux_gnu="$TOOLCHAIN_PREFIX"gcc \
+    QEMU_LD_PREFIX=/usr/aarch64-linux-gnu \
+    RUST_TEST_THREADS=1
diff --git a/library/compiler-builtins/ci/docker/arm-unknown-linux-gnueabi/Dockerfile b/library/compiler-builtins/ci/docker/arm-unknown-linux-gnueabi/Dockerfile
new file mode 100644
index 00000000000..38ad1a13623
--- /dev/null
+++ b/library/compiler-builtins/ci/docker/arm-unknown-linux-gnueabi/Dockerfile
@@ -0,0 +1,15 @@
+ARG IMAGE=ubuntu:24.04
+FROM $IMAGE
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    gcc libc6-dev ca-certificates \
+    gcc-arm-linux-gnueabi libc6-dev-armel-cross qemu-user-static
+
+ENV TOOLCHAIN_PREFIX=arm-linux-gnueabi-
+ENV CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABI_LINKER="$TOOLCHAIN_PREFIX"gcc \
+    CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABI_RUNNER=qemu-arm-static \
+    AR_arm_unknown_linux_gnueabi="$TOOLCHAIN_PREFIX"ar \
+    CC_arm_unknown_linux_gnueabi="$TOOLCHAIN_PREFIX"gcc \
+    QEMU_LD_PREFIX=/usr/arm-linux-gnueabi \
+    RUST_TEST_THREADS=1
diff --git a/library/compiler-builtins/ci/docker/arm-unknown-linux-gnueabihf/Dockerfile b/library/compiler-builtins/ci/docker/arm-unknown-linux-gnueabihf/Dockerfile
new file mode 100644
index 00000000000..ffead05d5f2
--- /dev/null
+++ b/library/compiler-builtins/ci/docker/arm-unknown-linux-gnueabihf/Dockerfile
@@ -0,0 +1,15 @@
+ARG IMAGE=ubuntu:24.04
+FROM $IMAGE
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    gcc libc6-dev ca-certificates \
+    gcc-arm-linux-gnueabihf libc6-dev-armhf-cross qemu-user-static
+
+ENV TOOLCHAIN_PREFIX=arm-linux-gnueabihf-
+ENV CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABIHF_LINKER="$TOOLCHAIN_PREFIX"gcc \
+    CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABIHF_RUNNER=qemu-arm-static \
+    AR_arm_unknown_linux_gnueabihf="$TOOLCHAIN_PREFIX"ar \
+    CC_arm_unknown_linux_gnueabihf="$TOOLCHAIN_PREFIX"gcc \
+    QEMU_LD_PREFIX=/usr/arm-linux-gnueabihf \
+    RUST_TEST_THREADS=1
diff --git a/library/compiler-builtins/ci/docker/armv7-unknown-linux-gnueabihf/Dockerfile b/library/compiler-builtins/ci/docker/armv7-unknown-linux-gnueabihf/Dockerfile
new file mode 100644
index 00000000000..9ab49e46ee3
--- /dev/null
+++ b/library/compiler-builtins/ci/docker/armv7-unknown-linux-gnueabihf/Dockerfile
@@ -0,0 +1,15 @@
+ARG IMAGE=ubuntu:24.04
+FROM $IMAGE
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    gcc libc6-dev ca-certificates \
+    gcc-arm-linux-gnueabihf libc6-dev-armhf-cross qemu-user-static
+
+ENV TOOLCHAIN_PREFIX=arm-linux-gnueabihf-
+ENV CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_LINKER="$TOOLCHAIN_PREFIX"gcc \
+    CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_RUNNER=qemu-arm-static \
+    AR_armv7_unknown_linux_gnueabihf="$TOOLCHAIN_PREFIX"ar \
+    CC_armv7_unknown_linux_gnueabihf="$TOOLCHAIN_PREFIX"gcc \
+    QEMU_LD_PREFIX=/usr/arm-linux-gnueabihf \
+    RUST_TEST_THREADS=1
diff --git a/library/compiler-builtins/ci/docker/i586-unknown-linux-gnu/Dockerfile b/library/compiler-builtins/ci/docker/i586-unknown-linux-gnu/Dockerfile
new file mode 100644
index 00000000000..d12ced3257f
--- /dev/null
+++ b/library/compiler-builtins/ci/docker/i586-unknown-linux-gnu/Dockerfile
@@ -0,0 +1,6 @@
+ARG IMAGE=ubuntu:24.04
+FROM $IMAGE
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    gcc-multilib m4 make libc6-dev ca-certificates
diff --git a/library/compiler-builtins/ci/docker/i686-unknown-linux-gnu/Dockerfile b/library/compiler-builtins/ci/docker/i686-unknown-linux-gnu/Dockerfile
new file mode 100644
index 00000000000..d12ced3257f
--- /dev/null
+++ b/library/compiler-builtins/ci/docker/i686-unknown-linux-gnu/Dockerfile
@@ -0,0 +1,6 @@
+ARG IMAGE=ubuntu:24.04
+FROM $IMAGE
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    gcc-multilib m4 make libc6-dev ca-certificates
diff --git a/library/compiler-builtins/ci/docker/loongarch64-unknown-linux-gnu/Dockerfile b/library/compiler-builtins/ci/docker/loongarch64-unknown-linux-gnu/Dockerfile
new file mode 100644
index 00000000000..62b43da9e70
--- /dev/null
+++ b/library/compiler-builtins/ci/docker/loongarch64-unknown-linux-gnu/Dockerfile
@@ -0,0 +1,14 @@
+ARG IMAGE=ubuntu:24.04
+FROM $IMAGE
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    gcc libc6-dev qemu-user-static ca-certificates \
+    gcc-14-loongarch64-linux-gnu libc6-dev-loong64-cross
+
+ENV CARGO_TARGET_LOONGARCH64_UNKNOWN_LINUX_GNU_LINKER=loongarch64-linux-gnu-gcc-14 \
+    CARGO_TARGET_LOONGARCH64_UNKNOWN_LINUX_GNU_RUNNER=qemu-loongarch64-static \
+    AR_loongarch64_unknown_linux_gnu=loongarch64-linux-gnu-ar \
+    CC_loongarch64_unknown_linux_gnu=loongarch64-linux-gnu-gcc-14 \
+    QEMU_LD_PREFIX=/usr/loongarch64-linux-gnu \
+    RUST_TEST_THREADS=1
diff --git a/library/compiler-builtins/ci/docker/mips-unknown-linux-gnu/Dockerfile b/library/compiler-builtins/ci/docker/mips-unknown-linux-gnu/Dockerfile
new file mode 100644
index 00000000000..c02a9467234
--- /dev/null
+++ b/library/compiler-builtins/ci/docker/mips-unknown-linux-gnu/Dockerfile
@@ -0,0 +1,16 @@
+ARG IMAGE=ubuntu:24.04
+FROM $IMAGE
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    gcc libc6-dev ca-certificates \
+    gcc-mips-linux-gnu libc6-dev-mips-cross \
+    binfmt-support qemu-user-static qemu-system-mips
+
+ENV TOOLCHAIN_PREFIX=mips-linux-gnu-
+ENV CARGO_TARGET_MIPS_UNKNOWN_LINUX_GNU_LINKER="$TOOLCHAIN_PREFIX"gcc \
+    CARGO_TARGET_MIPS_UNKNOWN_LINUX_GNU_RUNNER=qemu-mips-static \
+    AR_mips_unknown_linux_gnu="$TOOLCHAIN_PREFIX"ar \
+    CC_mips_unknown_linux_gnu="$TOOLCHAIN_PREFIX"gcc \
+    QEMU_LD_PREFIX=/usr/mips-linux-gnu \
+    RUST_TEST_THREADS=1
diff --git a/library/compiler-builtins/ci/docker/mips64-unknown-linux-gnuabi64/Dockerfile b/library/compiler-builtins/ci/docker/mips64-unknown-linux-gnuabi64/Dockerfile
new file mode 100644
index 00000000000..6d8b96069be
--- /dev/null
+++ b/library/compiler-builtins/ci/docker/mips64-unknown-linux-gnuabi64/Dockerfile
@@ -0,0 +1,20 @@
+ARG IMAGE=ubuntu:24.04
+FROM $IMAGE
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    ca-certificates \
+    gcc \
+    gcc-mips64-linux-gnuabi64 \
+    libc6-dev \
+    libc6-dev-mips64-cross \
+    qemu-user-static \
+    qemu-system-mips
+
+ENV TOOLCHAIN_PREFIX=mips64-linux-gnuabi64-
+ENV CARGO_TARGET_MIPS64_UNKNOWN_LINUX_GNUABI64_LINKER="$TOOLCHAIN_PREFIX"gcc \
+    CARGO_TARGET_MIPS64_UNKNOWN_LINUX_GNUABI64_RUNNER=qemu-mips64-static \
+    AR_mips64_unknown_linux_gnuabi64="$TOOLCHAIN_PREFIX"ar \
+    CC_mips64_unknown_linux_gnuabi64="$TOOLCHAIN_PREFIX"gcc \
+    QEMU_LD_PREFIX=/usr/mips64-linux-gnuabi64 \
+    RUST_TEST_THREADS=1
diff --git a/library/compiler-builtins/ci/docker/mips64el-unknown-linux-gnuabi64/Dockerfile b/library/compiler-builtins/ci/docker/mips64el-unknown-linux-gnuabi64/Dockerfile
new file mode 100644
index 00000000000..7e6ac7c3b8a
--- /dev/null
+++ b/library/compiler-builtins/ci/docker/mips64el-unknown-linux-gnuabi64/Dockerfile
@@ -0,0 +1,19 @@
+ARG IMAGE=ubuntu:24.04
+FROM $IMAGE
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    ca-certificates \
+    gcc \
+    gcc-mips64el-linux-gnuabi64 \
+    libc6-dev \
+    libc6-dev-mips64el-cross \
+    qemu-user-static
+
+ENV TOOLCHAIN_PREFIX=mips64el-linux-gnuabi64-
+ENV CARGO_TARGET_MIPS64EL_UNKNOWN_LINUX_GNUABI64_LINKER="$TOOLCHAIN_PREFIX"gcc \
+    CARGO_TARGET_MIPS64EL_UNKNOWN_LINUX_GNUABI64_RUNNER=qemu-mips64el-static \
+    AR_mips64el_unknown_linux_gnuabi64="$TOOLCHAIN_PREFIX"ar \
+    CC_mips64el_unknown_linux_gnuabi64="$TOOLCHAIN_PREFIX"gcc \
+    QEMU_LD_PREFIX=/usr/mips64el-linux-gnuabi64 \
+    RUST_TEST_THREADS=1
diff --git a/library/compiler-builtins/ci/docker/mipsel-unknown-linux-gnu/Dockerfile b/library/compiler-builtins/ci/docker/mipsel-unknown-linux-gnu/Dockerfile
new file mode 100644
index 00000000000..9feadc7b5ce
--- /dev/null
+++ b/library/compiler-builtins/ci/docker/mipsel-unknown-linux-gnu/Dockerfile
@@ -0,0 +1,16 @@
+ARG IMAGE=ubuntu:24.04
+FROM $IMAGE
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    gcc libc6-dev ca-certificates \
+    gcc-mipsel-linux-gnu libc6-dev-mipsel-cross \
+    binfmt-support qemu-user-static
+
+ENV TOOLCHAIN_PREFIX=mipsel-linux-gnu-
+ENV CARGO_TARGET_MIPSEL_UNKNOWN_LINUX_GNU_LINKER="$TOOLCHAIN_PREFIX"gcc \
+    CARGO_TARGET_MIPSEL_UNKNOWN_LINUX_GNU_RUNNER=qemu-mipsel-static \
+    AR_mipsel_unknown_linux_gnu="$TOOLCHAIN_PREFIX"ar \
+    CC_mipsel_unknown_linux_gnu="$TOOLCHAIN_PREFIX"gcc \
+    QEMU_LD_PREFIX=/usr/mipsel-linux-gnu \
+    RUST_TEST_THREADS=1
diff --git a/library/compiler-builtins/ci/docker/powerpc-unknown-linux-gnu/Dockerfile b/library/compiler-builtins/ci/docker/powerpc-unknown-linux-gnu/Dockerfile
new file mode 100644
index 00000000000..84dcaf47ed5
--- /dev/null
+++ b/library/compiler-builtins/ci/docker/powerpc-unknown-linux-gnu/Dockerfile
@@ -0,0 +1,16 @@
+ARG IMAGE=ubuntu:24.04
+FROM $IMAGE
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    gcc libc6-dev qemu-user-static ca-certificates \
+    gcc-powerpc-linux-gnu libc6-dev-powerpc-cross \
+    qemu-system-ppc
+
+ENV TOOLCHAIN_PREFIX=powerpc-linux-gnu-
+ENV CARGO_TARGET_POWERPC_UNKNOWN_LINUX_GNU_LINKER="$TOOLCHAIN_PREFIX"gcc \
+    CARGO_TARGET_POWERPC_UNKNOWN_LINUX_GNU_RUNNER=qemu-ppc-static \
+    AR_powerpc_unknown_linux_gnu="$TOOLCHAIN_PREFIX"ar \
+    CC_powerpc_unknown_linux_gnu="$TOOLCHAIN_PREFIX"gcc \
+    QEMU_LD_PREFIX=/usr/powerpc-linux-gnu \
+    RUST_TEST_THREADS=1
diff --git a/library/compiler-builtins/ci/docker/powerpc64-unknown-linux-gnu/Dockerfile b/library/compiler-builtins/ci/docker/powerpc64-unknown-linux-gnu/Dockerfile
new file mode 100644
index 00000000000..b90fd5ec545
--- /dev/null
+++ b/library/compiler-builtins/ci/docker/powerpc64-unknown-linux-gnu/Dockerfile
@@ -0,0 +1,16 @@
+ARG IMAGE=ubuntu:24.04
+FROM $IMAGE
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    gcc libc6-dev ca-certificates \
+    gcc-powerpc64-linux-gnu libc6-dev-ppc64-cross \
+    binfmt-support qemu-user-static qemu-system-ppc
+
+ENV TOOLCHAIN_PREFIX=powerpc64-linux-gnu-
+ENV CARGO_TARGET_POWERPC64_UNKNOWN_LINUX_GNU_LINKER="$TOOLCHAIN_PREFIX"gcc \
+    CARGO_TARGET_POWERPC64_UNKNOWN_LINUX_GNU_RUNNER=qemu-ppc64-static \
+    AR_powerpc64_unknown_linux_gnu="$TOOLCHAIN_PREFIX"ar \
+    CC_powerpc64_unknown_linux_gnu="$TOOLCHAIN_PREFIX"gcc \
+    QEMU_LD_PREFIX=/usr/powerpc64-linux-gnu \
+    RUST_TEST_THREADS=1
diff --git a/library/compiler-builtins/ci/docker/powerpc64le-unknown-linux-gnu/Dockerfile b/library/compiler-builtins/ci/docker/powerpc64le-unknown-linux-gnu/Dockerfile
new file mode 100644
index 00000000000..e6d1d1cd0b5
--- /dev/null
+++ b/library/compiler-builtins/ci/docker/powerpc64le-unknown-linux-gnu/Dockerfile
@@ -0,0 +1,17 @@
+ARG IMAGE=ubuntu:24.04
+FROM $IMAGE
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    gcc libc6-dev qemu-user-static ca-certificates \
+    gcc-powerpc64le-linux-gnu libc6-dev-ppc64el-cross \
+    qemu-system-ppc
+
+ENV TOOLCHAIN_PREFIX=powerpc64le-linux-gnu-
+ENV CARGO_TARGET_POWERPC64LE_UNKNOWN_LINUX_GNU_LINKER="$TOOLCHAIN_PREFIX"gcc \
+    CARGO_TARGET_POWERPC64LE_UNKNOWN_LINUX_GNU_RUNNER=qemu-ppc64le-static \
+    AR_powerpc64le_unknown_linux_gnu="$TOOLCHAIN_PREFIX"ar \
+    CC_powerpc64le_unknown_linux_gnu="$TOOLCHAIN_PREFIX"gcc \
+    QEMU_CPU=POWER8 \
+    QEMU_LD_PREFIX=/usr/powerpc64le-linux-gnu \
+    RUST_TEST_THREADS=1
diff --git a/library/compiler-builtins/ci/docker/riscv64gc-unknown-linux-gnu/Dockerfile b/library/compiler-builtins/ci/docker/riscv64gc-unknown-linux-gnu/Dockerfile
new file mode 100644
index 00000000000..eeb4ed0193e
--- /dev/null
+++ b/library/compiler-builtins/ci/docker/riscv64gc-unknown-linux-gnu/Dockerfile
@@ -0,0 +1,16 @@
+ARG IMAGE=ubuntu:24.04
+FROM $IMAGE
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    gcc libc6-dev qemu-user-static ca-certificates \
+    gcc-riscv64-linux-gnu libc6-dev-riscv64-cross \
+    qemu-system-riscv64
+
+ENV TOOLCHAIN_PREFIX=riscv64-linux-gnu-
+ENV CARGO_TARGET_RISCV64GC_UNKNOWN_LINUX_GNU_LINKER="$TOOLCHAIN_PREFIX"gcc \
+    CARGO_TARGET_RISCV64GC_UNKNOWN_LINUX_GNU_RUNNER=qemu-riscv64-static \
+    AR_riscv64gc_unknown_linux_gnu="$TOOLCHAIN_PREFIX"ar \
+    CC_riscv64gc_unknown_linux_gnu="$TOOLCHAIN_PREFIX"gcc \
+    QEMU_LD_PREFIX=/usr/riscv64-linux-gnu \
+    RUST_TEST_THREADS=1
diff --git a/library/compiler-builtins/ci/docker/thumbv6m-none-eabi/Dockerfile b/library/compiler-builtins/ci/docker/thumbv6m-none-eabi/Dockerfile
new file mode 100644
index 00000000000..ad0d4351ea6
--- /dev/null
+++ b/library/compiler-builtins/ci/docker/thumbv6m-none-eabi/Dockerfile
@@ -0,0 +1,9 @@
+ARG IMAGE=ubuntu:24.04
+FROM $IMAGE
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    gcc libc6-dev ca-certificates \
+    gcc-arm-none-eabi \
+    libnewlib-arm-none-eabi
+ENV BUILD_ONLY=1
diff --git a/library/compiler-builtins/ci/docker/thumbv7em-none-eabi/Dockerfile b/library/compiler-builtins/ci/docker/thumbv7em-none-eabi/Dockerfile
new file mode 100644
index 00000000000..ad0d4351ea6
--- /dev/null
+++ b/library/compiler-builtins/ci/docker/thumbv7em-none-eabi/Dockerfile
@@ -0,0 +1,9 @@
+ARG IMAGE=ubuntu:24.04
+FROM $IMAGE
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    gcc libc6-dev ca-certificates \
+    gcc-arm-none-eabi \
+    libnewlib-arm-none-eabi
+ENV BUILD_ONLY=1
diff --git a/library/compiler-builtins/ci/docker/thumbv7em-none-eabihf/Dockerfile b/library/compiler-builtins/ci/docker/thumbv7em-none-eabihf/Dockerfile
new file mode 100644
index 00000000000..ad0d4351ea6
--- /dev/null
+++ b/library/compiler-builtins/ci/docker/thumbv7em-none-eabihf/Dockerfile
@@ -0,0 +1,9 @@
+ARG IMAGE=ubuntu:24.04
+FROM $IMAGE
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    gcc libc6-dev ca-certificates \
+    gcc-arm-none-eabi \
+    libnewlib-arm-none-eabi
+ENV BUILD_ONLY=1
diff --git a/library/compiler-builtins/ci/docker/thumbv7m-none-eabi/Dockerfile b/library/compiler-builtins/ci/docker/thumbv7m-none-eabi/Dockerfile
new file mode 100644
index 00000000000..ad0d4351ea6
--- /dev/null
+++ b/library/compiler-builtins/ci/docker/thumbv7m-none-eabi/Dockerfile
@@ -0,0 +1,9 @@
+ARG IMAGE=ubuntu:24.04
+FROM $IMAGE
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    gcc libc6-dev ca-certificates \
+    gcc-arm-none-eabi \
+    libnewlib-arm-none-eabi
+ENV BUILD_ONLY=1
diff --git a/library/compiler-builtins/ci/docker/wasm32-unknown-unknown/Dockerfile b/library/compiler-builtins/ci/docker/wasm32-unknown-unknown/Dockerfile
new file mode 100644
index 00000000000..2813d318670
--- /dev/null
+++ b/library/compiler-builtins/ci/docker/wasm32-unknown-unknown/Dockerfile
@@ -0,0 +1,8 @@
+ARG IMAGE=ubuntu:20.04
+FROM $IMAGE
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    gcc clang libc6-dev ca-certificates
+
+ENV CARGO_TARGET_WASM32_UNKNOWN_UNKNOWN_RUNNER=true
diff --git a/library/compiler-builtins/ci/docker/x86_64-unknown-linux-gnu/Dockerfile b/library/compiler-builtins/ci/docker/x86_64-unknown-linux-gnu/Dockerfile
new file mode 100644
index 00000000000..c590adcddf6
--- /dev/null
+++ b/library/compiler-builtins/ci/docker/x86_64-unknown-linux-gnu/Dockerfile
@@ -0,0 +1,6 @@
+ARG IMAGE=ubuntu:24.04
+FROM $IMAGE
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    gcc m4 make libc6-dev ca-certificates
diff --git a/library/compiler-builtins/ci/download-compiler-rt.sh b/library/compiler-builtins/ci/download-compiler-rt.sh
new file mode 100755
index 00000000000..bf7f8c24896
--- /dev/null
+++ b/library/compiler-builtins/ci/download-compiler-rt.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+# Download sources to build C versions of intrinsics. Once this script has
+# run, `RUST_COMPILER_RT_ROOT` must be set.
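+# For example, when run from the repository root the sources land in
+# `./compiler-rt`, so `export RUST_COMPILER_RT_ROOT="$PWD/compiler-rt"` works.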
+
+set -eux
+
+rust_llvm_version=20.1-2025-02-13
+
+curl -L -o code.tar.gz "https://github.com/rust-lang/llvm-project/archive/rustc/${rust_llvm_version}.tar.gz"
+tar xzf code.tar.gz --strip-components 1 llvm-project-rustc-${rust_llvm_version}/compiler-rt
diff --git a/library/compiler-builtins/ci/miri.sh b/library/compiler-builtins/ci/miri.sh
new file mode 100755
index 00000000000..7b0ea44c690
--- /dev/null
+++ b/library/compiler-builtins/ci/miri.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+set -eux
+
+# We need Tree Borrows as some of our raw pointer patterns are not
+# compatible with Stacked Borrows.
+export MIRIFLAGS="-Zmiri-tree-borrows"
+
+# One target that sets `mem-unaligned` and one that does not,
+# and a big-endian target.
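+# (Here, x86_64 sets `mem-unaligned`, armv7 does not, and s390x is the
+# big-endian target.)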
+targets=(
+    x86_64-unknown-linux-gnu
+    armv7-unknown-linux-gnueabihf
+    s390x-unknown-linux-gnu
+)
+for target in "${targets[@]}"; do
+    # Only run the `mem` tests to avoid this taking too long.
+    cargo miri test --manifest-path builtins-test/Cargo.toml --features no-asm --target "$target" -- mem
+done
diff --git a/library/compiler-builtins/ci/run-docker.sh b/library/compiler-builtins/ci/run-docker.sh
new file mode 100755
index 00000000000..d0122dee5c8
--- /dev/null
+++ b/library/compiler-builtins/ci/run-docker.sh
@@ -0,0 +1,111 @@
+#!/bin/bash
+
+# Small script to run tests for a target (or all targets) inside all the
+# respective docker images.
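+#
+# Example: `./ci/run-docker.sh aarch64-unknown-linux-gnu` tests a single
+# target; with no argument, every target under `ci/docker/` is run.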
+
+set -euxo pipefail
+
+host_arch="$(uname -m | sed 's/arm64/aarch64/')"
+
+# Directories and files that do not yet exist need to be created before
+# calling docker, otherwise docker will create them but they will be owned
+# by root.
+mkdir -p target
+cargo generate-lockfile
+cargo generate-lockfile --manifest-path builtins-test-intrinsics/Cargo.toml
+
+run() {
+    local target="$1"
+
+    echo "testing target: $target"
+
+    emulated=""
+    target_arch="$(echo "$target" | cut -d'-' -f1)"
+    if [ "$target_arch" != "$host_arch" ]; then
+        emulated=1
+        echo "target is emulated"
+    fi
+
+    run_cmd="HOME=/tmp"
+
+    if [ "${GITHUB_ACTIONS:-}" = "true" ]; then
+        # Enable Docker image caching on GHA
+        build_cmd=("buildx" "build")
+        build_args=(
+            "--cache-from" "type=local,src=/tmp/.buildx-cache"
+            "--cache-to" "type=local,dest=/tmp/.buildx-cache-new"
+            # This is the beautiful bash syntax for expanding an array but neither
+            # raising an error nor returning an empty string if the array is empty.
+            "${build_args[@]:+"${build_args[@]}"}"
+            "--load"
+        )
+    fi
+
+    if [ "$(uname -s)" = "Linux" ] && [ -z "${DOCKER_BASE_IMAGE:-}" ]; then
+        # Share the host rustc and target. Do this only on Linux and if the image
+        # isn't overridden
+        run_args=(
+            --user "$(id -u):$(id -g)"
+            -e "CARGO_HOME=/cargo"
+            -v "${HOME}/.cargo:/cargo"
+            -v "$(pwd)/target:/builtins-target"
+            -v "$(rustc --print sysroot):/rust:ro"
+        )
+        run_cmd="$run_cmd PATH=\$PATH:/rust/bin:/cargo/bin"
+    else
+        # Use rustc provided by a docker image
+        docker volume create compiler-builtins-cache
+        build_args=(
+            "--build-arg"
+            "IMAGE=${DOCKER_BASE_IMAGE:-rustlang/rust:nightly}"
+        )
+        run_args=(-v "compiler-builtins-cache:/builtins-target")
+        run_cmd="$run_cmd HOME=/tmp USING_CONTAINER_RUSTC=1"
+    fi
+
+    if [ -d compiler-rt ]; then
+        export RUST_COMPILER_RT_ROOT="/checkout/compiler-rt"
+    fi
+
+    run_cmd="$run_cmd ci/run.sh $target"
+
+    docker "${build_cmd[@]:-build}" \
+        -t "builtins-$target" \
+        "${build_args[@]:-}" \
+        "ci/docker/$target"
+    docker run \
+        --rm \
+        -e CI \
+        -e CARGO_TARGET_DIR=/builtins-target \
+        -e CARGO_TERM_COLOR \
+        -e MAY_SKIP_LIBM_CI \
+        -e RUSTFLAGS \
+        -e RUST_BACKTRACE \
+        -e RUST_COMPILER_RT_ROOT \
+        -e "EMULATED=$emulated" \
+        -v "$(pwd):/checkout:ro" \
+        -w /checkout \
+        "${run_args[@]:-}" \
+        --init \
+        "builtins-$target" \
+        sh -c "$run_cmd"
+}
+
+if [ "${1:-}" = "--help" ] || [ "$#" -gt 1 ]; then
+    set +x
+    echo "\
+    usage: ./ci/run-docker.sh [target]
+
+    you can also set DOCKER_BASE_IMAGE to use something other than the default
+    ubuntu:24.04 (or rustlang/rust:nightly).
+    "
+    exit
+fi
+
+if [ -z "${1:-}" ]; then
+    for d in ci/docker/*; do
+        run $(basename "$d")
+    done
+else
+    run "$1"
+fi
diff --git a/library/compiler-builtins/ci/run-extensive.sh b/library/compiler-builtins/ci/run-extensive.sh
new file mode 100755
index 00000000000..4ba41a026fa
--- /dev/null
+++ b/library/compiler-builtins/ci/run-extensive.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+set -euo pipefail
+
+echo "Tests to run: '$TO_TEST'"
+
+if [ -z "$TO_TEST" ]; then
+    echo "No tests to run, exiting."
+    exit
+fi
+
+set -x
+
+test_cmd=(
+    cargo test
+    --package libm-test
+    --features "build-mpfr,libm/unstable,libm/force-soft-floats"
+    --profile release-checked
+)
+
+# Run the non-extensive tests first to catch any easy failures
+"${test_cmd[@]}" -- "$TO_TEST"
+
+LIBM_EXTENSIVE_TESTS="$TO_TEST" "${test_cmd[@]}" -- extensive
diff --git a/library/compiler-builtins/ci/run.sh b/library/compiler-builtins/ci/run.sh
new file mode 100755
index 00000000000..68d13c130bc
--- /dev/null
+++ b/library/compiler-builtins/ci/run.sh
@@ -0,0 +1,302 @@
+#!/bin/bash
+
+set -eux
+
+export RUST_BACKTRACE="${RUST_BACKTRACE:-full}"
+export NEXTEST_STATUS_LEVEL=all
+
+target="${1:-}"
+
+if [ -z "$target" ]; then
+    host_target=$(rustc -vV | awk '/^host/ { print $2 }')
+    echo "Defaulted to host target $host_target"
+    target="$host_target"
+fi
+
+if [[ "$target" = *"wasm"* ]]; then
+    # Enable the random backend
+    export RUSTFLAGS="${RUSTFLAGS:-} --cfg getrandom_backend=\"wasm_js\""
+fi
+
+if [ "${USING_CONTAINER_RUSTC:-}" = 1 ]; then
+    # Install nonstandard components if we have control of the environment
+    rustup target list --installed |
+        grep -E "^$target\$" ||
+        rustup target add "$target"
+fi
+
+# Test our implementation
+if [ "${BUILD_ONLY:-}" = "1" ]; then
+    echo "no tests to run for build-only targets"
+else
+    test_builtins=(cargo test --package builtins-test --no-fail-fast --target "$target")
+    "${test_builtins[@]}"
+    "${test_builtins[@]}" --release
+    "${test_builtins[@]}" --features c
+    "${test_builtins[@]}" --features c --release
+    "${test_builtins[@]}" --features no-asm
+    "${test_builtins[@]}" --features no-asm --release
+    "${test_builtins[@]}" --features no-f16-f128
+    "${test_builtins[@]}" --features no-f16-f128 --release
+    "${test_builtins[@]}" --benches
+    "${test_builtins[@]}" --benches --release
+
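+    # When TEST_VERBATIM is set, rebuild the `c`-feature tests with a Windows
+    # verbatim (`\\?\`) target directory to catch path-handling regressions.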
+    if [ "${TEST_VERBATIM:-}" = "1" ]; then
+        verb_path=$(cmd.exe //C echo \\\\?\\%cd%\\builtins-test\\target2)
+        "${test_builtins[@]}" --target-dir "$verb_path" --features c
+    fi
+fi
+
+
+declare -a rlib_paths
+
+# Set the `rlib_paths` global array to a list of all compiler-builtins rlibs
+update_rlib_paths() {
+    if [ -d /builtins-target ]; then
+        rlib_paths=( /builtins-target/"${target}"/debug/deps/libcompiler_builtins-*.rlib )
+    else
+        rlib_paths=( target/"${target}"/debug/deps/libcompiler_builtins-*.rlib )
+    fi
+}
+
+# Remove any existing artifacts from previous tests that don't set #![compiler_builtins]
+update_rlib_paths
+rm -f "${rlib_paths[@]}"
+
+cargo build -p compiler_builtins --target "$target"
+cargo build -p compiler_builtins --target "$target" --release
+cargo build -p compiler_builtins --target "$target" --features c
+cargo build -p compiler_builtins --target "$target" --features c --release
+cargo build -p compiler_builtins --target "$target" --features no-asm
+cargo build -p compiler_builtins --target "$target" --features no-asm --release
+cargo build -p compiler_builtins --target "$target" --features no-f16-f128
+cargo build -p compiler_builtins --target "$target" --features no-f16-f128 --release
+
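+# Derive a cross-binutils prefix for this target; it is used further down to
+# pick a fallback `nm` when the sysroot does not ship `llvm-nm`.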
+PREFIX=${target//unknown-/}-
+case "$target" in
+    armv7-*)
+        PREFIX=arm-linux-gnueabihf-
+        ;;
+    thumb*)
+        PREFIX=arm-none-eabi-
+        ;;
+    *86*-*)
+        PREFIX=
+        ;;
+esac
+
+NM=$(find "$(rustc --print sysroot)" \( -name llvm-nm -o -name llvm-nm.exe \) )
+if [ "$NM" = "" ]; then
+  NM="${PREFIX}nm"
+fi
+
+# i686-pc-windows-gnu tools have a dependency on some DLLs, so run it with
+# rustup run to ensure that those are in PATH.
+TOOLCHAIN="$(rustup show active-toolchain | sed 's/ (default)//')"
+if [[ "$TOOLCHAIN" == *i686-pc-windows-gnu ]]; then
+  NM="rustup run $TOOLCHAIN $NM"
+fi
+
+# Look out for duplicated symbols when we include the compiler-rt (C) implementation
+update_rlib_paths
+for rlib in "${rlib_paths[@]}"; do
+    set +x
+    echo "================================================================"
+    echo "checking $rlib for duplicate symbols"
+    echo "================================================================"
+    set -x
+    
+    duplicates_found=0
+
+    # NOTE: On i586, it's normal for the get_pc_thunk symbol to appear several
+    # times, so ignore it.
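+    # The pipeline below lists globally defined symbols, keeps those that occur
+    # more than once, and fails if any duplicated `T __*` intrinsic remains.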
+    $NM -g --defined-only "$rlib" 2>&1 |
+      sort |
+      uniq -d |
+      grep -v __x86.get_pc_thunk |
+      grep 'T __' && duplicates_found=1
+
+    if [ "$duplicates_found" != 0 ]; then
+        echo "error: found duplicate symbols"
+        exit 1
+    else
+        echo "success; no duplicate symbols found"
+    fi
+done
+
+rm -f "${rlib_paths[@]}"
+
+build_intrinsics_test() {
+    cargo build \
+        --target "$target" --verbose \
+        --manifest-path builtins-test-intrinsics/Cargo.toml "$@"
+}
+
+# Verify that we haven't dropped any intrinsics/symbols
+build_intrinsics_test
+build_intrinsics_test --release
+build_intrinsics_test --features c
+build_intrinsics_test --features c --release
+
+# Verify that there are no undefined symbols to `panic` within our
+# implementations
+CARGO_PROFILE_DEV_LTO=true build_intrinsics_test
+CARGO_PROFILE_RELEASE_LTO=true build_intrinsics_test --release
+
+# Ensure no references to any symbols from core
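+#
+# For each rlib, the `core` symbols it references (undefined) are compared
+# against the `core` symbols it defines; anything referenced but not defined
+# locally indicates an unwanted dependency on core.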
+update_rlib_paths
+for rlib in "${rlib_paths[@]}"; do
+    set +x
+    echo "================================================================"
+    echo "checking $rlib for references to core"
+    echo "================================================================"
+    set -x
+
+    tmpdir="${CARGO_TARGET_DIR:-target}/tmp"
+    test -d "$tmpdir" || mkdir "$tmpdir"
+    defined="$tmpdir/defined_symbols.txt"
+    undefined="$tmpdir/undefined_symbols.txt"
+
+    $NM --quiet -U "$rlib" | grep 'T _ZN4core' | awk '{print $3}' | sort | uniq > "$defined"
+    $NM --quiet -u "$rlib" | grep 'U _ZN4core' | awk '{print $2}' | sort | uniq > "$undefined"
+    grep_has_results=0
+    grep -v -F -x -f "$defined" "$undefined" && grep_has_results=1
+
+    if [ "$target" = "powerpc64-unknown-linux-gnu" ]; then
+        echo "FIXME: powerpc64 fails these tests"
+    elif [ "$grep_has_results" != 0 ]; then
+        echo "error: found unexpected references to core"
+        exit 1
+    else
+        echo "success; no references to core found"
+    fi
+done
+
+# Test libm
+
+# Make sure a simple build works
+cargo check -p libm --no-default-features --target "$target"
+
+if [ "${MAY_SKIP_LIBM_CI:-}" = "true" ]; then
+    echo "skipping libm PR CI"
+    exit
+fi
+
+mflags=()
+
+# We enumerate features manually.
+mflags+=(--no-default-features)
+
+# Enable arch-specific routines when available.
+mflags+=(--features arch)
+
+# Always enable `unstable-float` since it expands available API but does not
+# change any implementations.
+mflags+=(--features unstable-float)
+
+# We need to specifically skip tests for musl-math-sys on systems that can't
+# build musl since otherwise `--all` will activate it.
+case "$target" in
+    # Can't build at all on MSVC, WASM, or thumb
+    *windows-msvc*) mflags+=(--exclude musl-math-sys) ;;
+    *wasm*) mflags+=(--exclude musl-math-sys) ;;
+    *thumb*) mflags+=(--exclude musl-math-sys) ;;
+
+    # We can build musl on MinGW but running tests gets a stack overflow
+    *windows-gnu*) ;;
+    # FIXME(#309): LE PPC crashes calling the musl version of some functions. It
+    # seems like a qemu bug but should be investigated further at some point.
+    # See <https://github.com/rust-lang/libm/issues/309>.
+    *powerpc64le*) ;;
+
+    # Everything else gets musl enabled
+    *) mflags+=(--features libm-test/build-musl) ;;
+esac
+
+
+# Configure which targets test against MPFR
+case "$target" in
+    # MSVC cannot link MPFR
+    *windows-msvc*) ;;
+    # FIXME: MinGW should be able to build MPFR, but setup in CI is nontrivial.
+    *windows-gnu*) ;;
+    # Targets that aren't cross compiled in CI work fine
+    aarch64*apple*) mflags+=(--features libm-test/build-mpfr) ;;
+    aarch64*linux*) mflags+=(--features libm-test/build-mpfr) ;;
+    i586*) mflags+=(--features libm-test/build-mpfr --features gmp-mpfr-sys/force-cross) ;;
+    i686*) mflags+=(--features libm-test/build-mpfr) ;;
+    x86_64*) mflags+=(--features libm-test/build-mpfr) ;;
+esac
+
+# FIXME: `STATUS_DLL_NOT_FOUND` testing macros on CI.
+# <https://github.com/rust-lang/rust/issues/128944>
+case "$target" in
+    *windows-gnu) mflags+=(--exclude libm-macros) ;;
+esac
+
+if [ "${BUILD_ONLY:-}" = "1" ]; then
+    # If we are on targets that can't run tests, verify that we can build.
+    cmd=(cargo build --target "$target" --package libm)
+    "${cmd[@]}"
+    "${cmd[@]}" --features unstable-intrinsics
+
+    echo "can't run tests on $target; skipping"
+else
+    mflags+=(--workspace --target "$target")
+    cmd=(cargo test "${mflags[@]}")
+    profile_flag="--profile"
+    
+    # If nextest is available, use that
+    command -v cargo-nextest && nextest=1 || nextest=0
+    if [ "$nextest" = "1" ]; then
+        cmd=(cargo nextest run --max-fail=10)
+
+        # Workaround for https://github.com/nextest-rs/nextest/issues/2066
+        if [ -f /.dockerenv ]; then
+            cfg_file="/tmp/nextest-config.toml"
+            echo "[store]" >> "$cfg_file"
+            echo "dir = \"$CARGO_TARGET_DIR/nextest\"" >> "$cfg_file"
+            cmd+=(--config-file "$cfg_file")
+        fi
+
+        # Not all configurations have tests to run on wasm
+        [[ "$target" = *"wasm"* ]] && cmd+=(--no-tests=warn)
+
+        cmd+=("${mflags[@]}")
+        profile_flag="--cargo-profile"
+    fi
+
+    # Test once without intrinsics
+    "${cmd[@]}"
+
+    # Run doctests if they were excluded by nextest
+    [ "$nextest" = "1" ] && cargo test --doc --exclude compiler_builtins "${mflags[@]}"
+
+    # Exclude the macros and util crates from the rest of the tests to save CI
+    # runtime; they shouldn't have anything feature- or opt-level-dependent.
+    cmd+=(--exclude util --exclude libm-macros)
+
+    # Test once with intrinsics enabled
+    "${cmd[@]}" --features unstable-intrinsics
+    "${cmd[@]}" --features unstable-intrinsics --benches
+
+    # Test the same in release mode, which also increases coverage. Also ensure
+    # the soft float routines are checked.
+    "${cmd[@]}" "$profile_flag" release-checked
+    "${cmd[@]}" "$profile_flag" release-checked --features force-soft-floats
+    "${cmd[@]}" "$profile_flag" release-checked --features unstable-intrinsics
+    "${cmd[@]}" "$profile_flag" release-checked --features unstable-intrinsics --benches
+
+    # Ensure that the routines do not panic.
+    # 
+    # `--tests` must be passed because no-panic is only enabled as a dev
+    # dependency. The `release-opt` profile must be used to enable LTO and a
+    # single CGU.
+    ENSURE_NO_PANIC=1 cargo build \
+        -p libm \
+        --target "$target" \
+        --no-default-features \
+        --features unstable-float \
+        --tests \
+        --profile release-opt
+fi
diff --git a/library/compiler-builtins/compiler-builtins/CHANGELOG.md b/library/compiler-builtins/compiler-builtins/CHANGELOG.md
new file mode 100644
index 00000000000..a7c01c463ca
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/CHANGELOG.md
@@ -0,0 +1,168 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [Unreleased]
+
+## [0.1.159](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.158...compiler_builtins-v0.1.159) - 2025-05-12
+
+### Other
+
+- Remove cfg(bootstrap)
+
+## [0.1.158](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.157...compiler_builtins-v0.1.158) - 2025-05-06
+
+### Other
+
+- Require `target_has_atomic = "ptr"` for runtime feature detection
+
+## [0.1.157](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.156...compiler_builtins-v0.1.157) - 2025-05-03
+
+### Other
+
+- Use runtime feature detection for fma routines on x86
+
+## [0.1.156](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.155...compiler_builtins-v0.1.156) - 2025-04-21
+
+### Other
+
+- avr: Provide `abort()`
+- Remove `unsafe` from `naked_asm!` blocks
+- Enable icount benchmarks in CI
+- Move builtins-test-intrinsics out of the workspace
+- Run `cargo fmt` on all projects
+- Flatten the `libm/libm` directory
+- Update path to libm after the merge
+
+## [0.1.155](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.154...compiler_builtins-v0.1.155) - 2025-04-17
+
+### Other
+
+- use `#[cfg(bootstrap)]` for rustc sync
+- Replace the `bl!` macro with `asm_sym`
+- __udivmod(h|q)i4
+
+## [0.1.154](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.153...compiler_builtins-v0.1.154) - 2025-04-16
+
+### Other
+
+- turn #[naked] into an unsafe attribute
+
+## [0.1.153](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.152...compiler_builtins-v0.1.153) - 2025-04-09
+
+### Other
+
+- Remove a mention of `force-soft-float` in `build.rs`
+- Revert "Disable `f16` on AArch64 without the `neon` feature"
+- Skip No More!
+- avoid out-of-bounds accesses ([#799](https://github.com/rust-lang/compiler-builtins/pull/799))
+
+## [0.1.152](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.151...compiler_builtins-v0.1.152) - 2025-03-20
+
+### Other
+
+- Remove use of `atomic_load_unordered` and undefined behaviour from `arm_linux.rs`
+- Switch repository layout to use a virtual manifest
+
+## [0.1.151](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.150...compiler_builtins-v0.1.151) - 2025-03-05
+
+### Other
+
+- Add cygwin support
+- Enable `f16` for LoongArch ([#770](https://github.com/rust-lang/compiler-builtins/pull/770))
+- Add __extendhfdf2 and add __truncdfhf2 test
+- Remove outdated information from the readme
+
+## [0.1.150](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.149...compiler_builtins-v0.1.150) - 2025-03-01
+
+### Other
+
+- Disable `f16` on AArch64 without the `neon` feature
+- Update LLVM downloads to 20.1-2025-02-13
+
+## [0.1.149](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.148...compiler_builtins-v0.1.149) - 2025-02-25
+
+### Other
+
+- Make a subset of `libm` symbols weakly available on all platforms
+
+## [0.1.148](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.147...compiler_builtins-v0.1.148) - 2025-02-24
+
+### Other
+
+- Update the `libm` submodule
+- Enable `f16` for MIPS
+- Eliminate the use of `public_test_dep!` for a third time
+
+## [0.1.147](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.146...compiler_builtins-v0.1.147) - 2025-02-19
+
+### Other
+
+- remove win64_128bit_abi_hack
+
+## [0.1.146](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.145...compiler_builtins-v0.1.146) - 2025-02-06
+
+### Other
+
+- Expose erf{,c}{,f} from libm
+
+## [0.1.145](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.144...compiler_builtins-v0.1.145) - 2025-02-04
+
+### Other
+
+- Revert "Eliminate the use of `public_test_dep!`"
+- Indentation fix to please clippy
+- Don't build out of line atomics support code for uefi
+- Add a version to some FIXMEs that will be resolved in LLVM 20
+- Remove use of the `start` feature
+
+## [0.1.144](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.143...compiler_builtins-v0.1.144) - 2025-01-15
+
+### Other
+
+- Eliminate the use of `public_test_dep!`
+
+## [0.1.143](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.142...compiler_builtins-v0.1.143) - 2025-01-15
+
+### Other
+
+- Use a C-safe return type for `__rust_[ui]128_*` overflowing intrinsics
+
+## [0.1.142](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.141...compiler_builtins-v0.1.142) - 2025-01-07
+
+### Other
+
+- Account for optimization levels other than numbers
+
+## [0.1.141](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.140...compiler_builtins-v0.1.141) - 2025-01-07
+
+### Other
+
+- Update the `libm` submodule
+- Fix new `clippy::precedence` errors
+- Rename `EXP_MAX` to `EXP_SAT`
+- Shorten prefixes for float constants
+
+## [0.1.140](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.139...compiler_builtins-v0.1.140) - 2024-12-26
+
+### Other
+
+- Disable f128 for amdgpu ([#737](https://github.com/rust-lang/compiler-builtins/pull/737))
+- Fix a bug in `abs_diff`
+- Disable `f16` on platforms that have recursion problems
+
+## [0.1.139](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.138...compiler_builtins-v0.1.139) - 2024-11-03
+
+### Other
+
+- Remove incorrect `sparcv9` match pattern from `configure_f16_f128`
+
+## [0.1.138](https://github.com/rust-lang/compiler-builtins/compare/compiler_builtins-v0.1.137...compiler_builtins-v0.1.138) - 2024-11-01
+
+### Other
+
+- Use `f16_enabled`/`f128_enabled` in `examples/intrinsics.rs` ([#724](https://github.com/rust-lang/compiler-builtins/pull/724))
+- Disable `f16` for LoongArch64 ([#722](https://github.com/rust-lang/compiler-builtins/pull/722))
diff --git a/library/compiler-builtins/compiler-builtins/Cargo.toml b/library/compiler-builtins/compiler-builtins/Cargo.toml
new file mode 100644
index 00000000000..d65a22152ef
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/Cargo.toml
@@ -0,0 +1,64 @@
+[package]
+authors = ["Jorge Aparicio <japaricious@gmail.com>"]
+name = "compiler_builtins"
+version = "0.1.159"
+license = "MIT AND Apache-2.0 WITH LLVM-exception AND (MIT OR Apache-2.0)"
+readme = "README.md"
+repository = "https://github.com/rust-lang/compiler-builtins"
+homepage = "https://github.com/rust-lang/compiler-builtins"
+documentation = "https://docs.rs/compiler_builtins"
+edition = "2021"
+description = "Compiler intrinsics used by the Rust compiler."
+links = "compiler-rt"
+
+[lib]
+bench = false
+doctest = false
+test = false
+
+[dependencies]
+# For more information on this dependency see
+# https://github.com/rust-lang/rust/tree/master/library/rustc-std-workspace-core
+core = { version = "1.0.0", optional = true, package = "rustc-std-workspace-core" }
+
+[build-dependencies]
+cc = { optional = true, version = "1.0" }
+
+[dev-dependencies]
+panic-handler = { path = "../crates/panic-handler" }
+
+[features]
+default = ["compiler-builtins"]
+
+# Enable compilation of C code in compiler-rt, filling in some more optimized
+# implementations and also filling in unimplemented intrinsics
+c = ["dep:cc"]
+
+# Workaround for the Cranelift codegen backend. Disables any implementations
+# which use inline assembly and fall back to pure Rust versions (if available).
+no-asm = []
+
+# Workaround for codegen backends which haven't yet implemented `f16` and
+# `f128` support. Disables any intrinsics which use those types.
+no-f16-f128 = []
+
+# Flag this library as the unstable compiler-builtins lib
+compiler-builtins = []
+
+# Generate memory-related intrinsics like memcpy
+mem = []
+
+# Mangle all names so this can be linked in with other versions or other
+# compiler-rt implementations. Also used for testing
+mangled-names = []
+
+# Only used in the compiler's build system
+rustc-dep-of-std = ["compiler-builtins", "dep:core"]
+
+# This makes certain traits and function specializations public that
+# are not normally public but are required by the `builtins-test`
+unstable-public-internals = []
+
+[lints.rust]
+# The cygwin config can be dropped after our benchmark toolchain is bumped
+unexpected_cfgs = { level = "warn", check-cfg = ['cfg(bootstrap)', 'cfg(target_os, values("cygwin"))'] }
diff --git a/library/compiler-builtins/compiler-builtins/README.md b/library/compiler-builtins/compiler-builtins/README.md
new file mode 100644
index 00000000000..387b70c0499
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/README.md
@@ -0,0 +1,436 @@
+# `compiler-builtins`
+
+This crate provides external symbols that the compiler expects to be available
+when building Rust projects, typically software routines for basic operations
+that do not have hardware support. It is largely a port of LLVM's
+[`compiler-rt`].
+
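+For example, on 32-bit targets without native 64-bit division, the compiler
+lowers `u64` division to a call to `__udivdi3`, which this crate provides.
+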
+It is distributed as part of Rust's sysroot. `compiler-builtins` does not need
+to be added as an explicit dependency in `Cargo.toml`.
+
+[`compiler-rt`]: https://github.com/llvm/llvm-project/tree/1b1dc505057322f4fa1110ef4f53c44347f52986/compiler-rt
+
+## Contributing
+
+See [CONTRIBUTING.md](CONTRIBUTING.md).
+
+## Progress
+
+- [x] aarch64/chkstk.S
+- [x] adddf3.c
+- [x] addsf3.c
+- [x] arm/addsf3.S
+- [x] arm/aeabi_dcmp.S
+- [x] arm/aeabi_fcmp.S
+- [x] arm/aeabi_idivmod.S
+- [x] arm/aeabi_ldivmod.S
+- [x] arm/aeabi_memcpy.S
+- [x] arm/aeabi_memmove.S
+- [x] arm/aeabi_memset.S
+- [x] arm/aeabi_uidivmod.S
+- [x] arm/aeabi_uldivmod.S
+- [ ] arm/chkstk.S
+- [ ] arm/divmodsi4.S (generic version is done)
+- [ ] arm/divsi3.S (generic version is done)
+- [ ] arm/modsi3.S (generic version is done)
+- [x] arm/softfloat-alias.list
+- [ ] arm/udivmodsi4.S (generic version is done)
+- [ ] arm/udivsi3.S (generic version is done)
+- [ ] arm/umodsi3.S (generic version is done)
+- [x] ashldi3.c
+- [x] ashrdi3.c
+- [ ] avr/divmodhi4.S
+- [ ] avr/divmodqi4.S
+- [ ] avr/mulhi3.S
+- [ ] avr/mulqi3.S
+- [ ] avr/udivmodhi4.S
+- [ ] avr/udivmodqi4.S
+- [x] bswapdi2.c
+- [x] bswapsi2.c
+- [x] bswapti2.c
+- [x] clzdi2.c
+- [x] clzsi2.c
+- [x] clzti2.c
+- [x] comparedf2.c
+- [x] comparesf2.c
+- [x] ctzdi2.c
+- [x] ctzsi2.c
+- [x] ctzti2.c
+- [x] divdf3.c
+- [x] divdi3.c
+- [x] divmoddi4.c
+- [x] divmodsi4.c
+- [x] divmodti4.c
+- [x] divsf3.c
+- [x] divsi3.c
+- [x] extendsfdf2.c
+- [x] fixdfdi.c
+- [x] fixdfsi.c
+- [x] fixsfdi.c
+- [x] fixsfsi.c
+- [x] fixunsdfdi.c
+- [x] fixunsdfsi.c
+- [x] fixunssfdi.c
+- [x] fixunssfsi.c
+- [x] floatdidf.c
+- [x] floatdisf.c
+- [x] floatsidf.c
+- [x] floatsisf.c
+- [x] floatundidf.c
+- [x] floatundisf.c
+- [x] floatunsidf.c
+- [x] floatunsisf.c
+- [ ] i386/ashldi3.S
+- [ ] i386/ashrdi3.S
+- [x] i386/chkstk.S
+- [ ] i386/divdi3.S
+- [ ] i386/lshrdi3.S
+- [ ] i386/moddi3.S
+- [ ] i386/muldi3.S
+- [ ] i386/udivdi3.S
+- [ ] i386/umoddi3.S
+- [x] lshrdi3.c
+- [x] moddi3.c
+- [x] modsi3.c
+- [x] muldf3.c
+- [x] muldi3.c
+- [x] mulodi4.c
+- [x] mulosi4.c
+- [x] mulsf3.c
+- [x] powidf2.c
+- [x] powisf2.c
+- [ ] riscv/muldi3.S
+- [ ] riscv/mulsi3.S
+- [x] subdf3.c
+- [x] subsf3.c
+- [x] truncdfsf2.c
+- [x] udivdi3.c
+- [x] udivmoddi4.c
+- [x] udivmodsi4.c
+- [x] udivsi3.c
+- [x] umoddi3.c
+- [x] umodsi3.c
+- [x] x86_64/chkstk.S
+
+These builtins are needed to support 128-bit integers.
+
+- [x] ashlti3.c
+- [x] ashrti3.c
+- [x] divti3.c
+- [x] fixdfti.c
+- [x] fixsfti.c
+- [x] fixunsdfti.c
+- [x] fixunssfti.c
+- [x] floattidf.c
+- [x] floattisf.c
+- [x] floatuntidf.c
+- [x] floatuntisf.c
+- [x] lshrti3.c
+- [x] modti3.c
+- [x] muloti4.c
+- [x] multi3.c
+- [x] udivmodti4.c
+- [x] udivti3.c
+- [x] umodti3.c
+
+These builtins are needed to support `f16` and `f128`, which are in the process
+of being added to Rust.
+
+- [x] addtf3.c
+- [x] comparetf2.c
+- [x] divtf3.c
+- [x] extenddftf2.c
+- [x] extendhfsf2.c
+- [x] extendhftf2.c
+- [x] extendsftf2.c
+- [x] fixtfdi.c
+- [x] fixtfsi.c
+- [x] fixtfti.c
+- [x] fixunstfdi.c
+- [x] fixunstfsi.c
+- [x] fixunstfti.c
+- [x] floatditf.c
+- [x] floatsitf.c
+- [x] floattitf.c
+- [x] floatunditf.c
+- [x] floatunsitf.c
+- [x] floatuntitf.c
+- [x] multf3.c
+- [x] powitf2.c
+- [x] subtf3.c
+- [x] truncdfhf2.c
+- [x] truncsfhf2.c
+- [x] trunctfdf2.c
+- [x] trunctfhf2.c
+- [x] trunctfsf2.c
+
+
+These builtins are used by the Hexagon DSP.
+
+- [ ] hexagon/common_entry_exit_abi1.S
+- [ ] hexagon/common_entry_exit_abi2.S
+- [ ] hexagon/common_entry_exit_legacy.S
+- [x] hexagon/dfaddsub.S
+- [x] hexagon/dfdiv.S
+- [x] hexagon/dffma.S
+- [x] hexagon/dfminmax.S
+- [x] hexagon/dfmul.S
+- [x] hexagon/dfsqrt.S
+- [x] hexagon/divdi3.S
+- [x] hexagon/divsi3.S
+- [x] hexagon/fastmath2_dlib_asm.S
+- [x] hexagon/fastmath2_ldlib_asm.S
+- [x] hexagon/fastmath_dlib_asm.S
+- [x] hexagon/memcpy_forward_vp4cp4n2.S
+- [x] hexagon/memcpy_likely_aligned.S
+- [x] hexagon/moddi3.S
+- [x] hexagon/modsi3.S
+- [x] hexagon/sfdiv_opt.S
+- [x] hexagon/sfsqrt_opt.S
+- [x] hexagon/udivdi3.S
+- [x] hexagon/udivmoddi4.S
+- [x] hexagon/udivmodsi4.S
+- [x] hexagon/udivsi3.S
+- [x] hexagon/umoddi3.S
+- [x] hexagon/umodsi3.S
+
+## Unimplemented functions
+
+These builtins are for x87 `f80` floating-point numbers that are not supported
+by Rust.
+
+- ~~extendxftf2.c~~
+- ~~fixunsxfdi.c~~
+- ~~fixunsxfsi.c~~
+- ~~fixunsxfti.c~~
+- ~~fixxfdi.c~~
+- ~~fixxfti.c~~
+- ~~floatdixf.c~~
+- ~~floattixf.c~~
+- ~~floatundixf.c~~
+- ~~floatuntixf.c~~
+- ~~i386/floatdixf.S~~
+- ~~i386/floatundixf.S~~
+- ~~x86_64/floatdixf.c~~
+- ~~x86_64/floatundixf.S~~
+
+These builtins are for IBM "extended double" non-IEEE 128-bit floating-point
+numbers.
+
+- ~~ppc/divtc3.c~~
+- ~~ppc/fixtfdi.c~~
+- ~~ppc/fixtfti.c~~
+- ~~ppc/fixunstfdi.c~~
+- ~~ppc/fixunstfti.c~~
+- ~~ppc/floatditf.c~~
+- ~~ppc/floattitf.c~~
+- ~~ppc/floatunditf.c~~
+- ~~ppc/gcc_qadd.c~~
+- ~~ppc/gcc_qdiv.c~~
+- ~~ppc/gcc_qmul.c~~
+- ~~ppc/gcc_qsub.c~~
+- ~~ppc/multc3.c~~
+
+These builtins are for 16-bit brain floating-point numbers that are not
+supported by Rust.
+
+- ~~truncdfbf2.c~~
+- ~~truncsfbf2.c~~
+- ~~trunctfxf2.c~~
+
+These builtins involve complex floating-point types that are not supported by
+Rust.
+
+- ~~divdc3.c~~
+- ~~divsc3.c~~
+- ~~divtc3.c~~
+- ~~divxc3.c~~
+- ~~muldc3.c~~
+- ~~mulsc3.c~~
+- ~~multc3.c~~
+- ~~mulxc3.c~~
+- ~~powixf2.c~~
+
+These builtins are never called by LLVM.
+
+- ~~absvdi2.c~~
+- ~~absvsi2.c~~
+- ~~absvti2.c~~
+- ~~addvdi3.c~~
+- ~~addvsi3.c~~
+- ~~addvti3.c~~
+- ~~arm/aeabi_cdcmp.S~~
+- ~~arm/aeabi_cdcmpeq_check_nan.c~~
+- ~~arm/aeabi_cfcmp.S~~
+- ~~arm/aeabi_cfcmpeq_check_nan.c~~
+- ~~arm/aeabi_div0.c~~
+- ~~arm/aeabi_drsub.c~~
+- ~~arm/aeabi_frsub.c~~
+- ~~arm/aeabi_memcmp.S~~
+- ~~arm/bswapdi2.S~~
+- ~~arm/bswapsi2.S~~
+- ~~arm/clzdi2.S~~
+- ~~arm/clzsi2.S~~
+- ~~arm/comparesf2.S~~
+- ~~arm/restore_vfp_d8_d15_regs.S~~
+- ~~arm/save_vfp_d8_d15_regs.S~~
+- ~~arm/switch16.S~~
+- ~~arm/switch32.S~~
+- ~~arm/switch8.S~~
+- ~~arm/switchu8.S~~
+- ~~cmpdi2.c~~
+- ~~cmpti2.c~~
+- ~~ffssi2.c~~
+- ~~ffsdi2.c~~ - this is [called by gcc][jemalloc-fail] though!
+- ~~ffsti2.c~~
+- ~~mulvdi3.c~~
+- ~~mulvsi3.c~~
+- ~~mulvti3.c~~
+- ~~negdf2.c~~
+- ~~negdi2.c~~
+- ~~negsf2.c~~
+- ~~negti2.c~~
+- ~~negvdi2.c~~
+- ~~negvsi2.c~~
+- ~~negvti2.c~~
+- ~~paritydi2.c~~
+- ~~paritysi2.c~~
+- ~~parityti2.c~~
+- ~~popcountdi2.c~~
+- ~~popcountsi2.c~~
+- ~~popcountti2.c~~
+- ~~ppc/restFP.S~~
+- ~~ppc/saveFP.S~~
+- ~~subvdi3.c~~
+- ~~subvsi3.c~~
+- ~~subvti3.c~~
+- ~~ucmpdi2.c~~
+- ~~ucmpti2.c~~
+- ~~udivmodti4.c~~
+
+[jemalloc-fail]: https://travis-ci.org/rust-lang/rust/jobs/249772758
+
+Rust only exposes atomic types on platforms that support them, and therefore does not need to fall back to software implementations.
+
+- ~~arm/sync_fetch_and_add_4.S~~
+- ~~arm/sync_fetch_and_add_8.S~~
+- ~~arm/sync_fetch_and_and_4.S~~
+- ~~arm/sync_fetch_and_and_8.S~~
+- ~~arm/sync_fetch_and_max_4.S~~
+- ~~arm/sync_fetch_and_max_8.S~~
+- ~~arm/sync_fetch_and_min_4.S~~
+- ~~arm/sync_fetch_and_min_8.S~~
+- ~~arm/sync_fetch_and_nand_4.S~~
+- ~~arm/sync_fetch_and_nand_8.S~~
+- ~~arm/sync_fetch_and_or_4.S~~
+- ~~arm/sync_fetch_and_or_8.S~~
+- ~~arm/sync_fetch_and_sub_4.S~~
+- ~~arm/sync_fetch_and_sub_8.S~~
+- ~~arm/sync_fetch_and_umax_4.S~~
+- ~~arm/sync_fetch_and_umax_8.S~~
+- ~~arm/sync_fetch_and_umin_4.S~~
+- ~~arm/sync_fetch_and_umin_8.S~~
+- ~~arm/sync_fetch_and_xor_4.S~~
+- ~~arm/sync_fetch_and_xor_8.S~~
+- ~~arm/sync_synchronize.S~~
+- ~~atomic.c~~
+- ~~atomic_flag_clear.c~~
+- ~~atomic_flag_clear_explicit.c~~
+- ~~atomic_flag_test_and_set.c~~
+- ~~atomic_flag_test_and_set_explicit.c~~
+- ~~atomic_signal_fence.c~~
+- ~~atomic_thread_fence.c~~
+
+Miscellaneous functionality that is not used by Rust.
+
+- ~~aarch64/fp_mode.c~~
+- ~~aarch64/lse.S~~ (LSE atomics)
+- ~~aarch64/sme-abi-init.c~~ (matrix extension)
+- ~~aarch64/sme-abi.S~~ (matrix extension)
+- ~~aarch64/sme-libc-routines.c~~ (matrix extension)
+- ~~apple_versioning.c~~
+- ~~arm/fp_mode.c~~
+- ~~avr/exit.S~~
+- ~~clear_cache.c~~
+- ~~cpu_model/aarch64.c~~
+- ~~cpu_model/x86.c~~
+- ~~crtbegin.c~~
+- ~~crtend.c~~
+- ~~emutls.c~~
+- ~~enable_execute_stack.c~~
+- ~~eprintf.c~~
+- ~~fp_mode.c~~ (float exception handling)
+- ~~gcc_personality_v0.c~~
+- ~~i386/fp_mode.c~~
+- ~~int_util.c~~
+- ~~loongarch/fp_mode.c~~
+- ~~os_version_check.c~~
+- ~~riscv/fp_mode.c~~
+- ~~riscv/restore.S~~ (callee-saved registers)
+- ~~riscv/save.S~~ (callee-saved registers)
+- ~~trampoline_setup.c~~
+- ~~ve/grow_stack.S~~
+- ~~ve/grow_stack_align.S~~
+
+Floating-point implementations of builtins that are only called from soft-float code. It would be better to simply use the generic soft-float versions in this case.
+
+- ~~i386/floatdidf.S~~
+- ~~i386/floatdisf.S~~
+- ~~i386/floatundidf.S~~
+- ~~i386/floatundisf.S~~
+- ~~x86_64/floatundidf.S~~
+- ~~x86_64/floatundisf.S~~
+- ~~x86_64/floatdidf.c~~
+- ~~x86_64/floatdisf.c~~
+
+Unsupported in any current target: used on old versions of 32-bit iOS with ARMv5.
+
+- ~~arm/adddf3vfp.S~~
+- ~~arm/addsf3vfp.S~~
+- ~~arm/divdf3vfp.S~~
+- ~~arm/divsf3vfp.S~~
+- ~~arm/eqdf2vfp.S~~
+- ~~arm/eqsf2vfp.S~~
+- ~~arm/extendsfdf2vfp.S~~
+- ~~arm/fixdfsivfp.S~~
+- ~~arm/fixsfsivfp.S~~
+- ~~arm/fixunsdfsivfp.S~~
+- ~~arm/fixunssfsivfp.S~~
+- ~~arm/floatsidfvfp.S~~
+- ~~arm/floatsisfvfp.S~~
+- ~~arm/floatunssidfvfp.S~~
+- ~~arm/floatunssisfvfp.S~~
+- ~~arm/gedf2vfp.S~~
+- ~~arm/gesf2vfp.S~~
+- ~~arm/gtdf2vfp.S~~
+- ~~arm/gtsf2vfp.S~~
+- ~~arm/ledf2vfp.S~~
+- ~~arm/lesf2vfp.S~~
+- ~~arm/ltdf2vfp.S~~
+- ~~arm/ltsf2vfp.S~~
+- ~~arm/muldf3vfp.S~~
+- ~~arm/mulsf3vfp.S~~
+- ~~arm/nedf2vfp.S~~
+- ~~arm/negdf2vfp.S~~
+- ~~arm/negsf2vfp.S~~
+- ~~arm/nesf2vfp.S~~
+- ~~arm/subdf3vfp.S~~
+- ~~arm/subsf3vfp.S~~
+- ~~arm/truncdfsf2vfp.S~~
+- ~~arm/unorddf2vfp.S~~
+- ~~arm/unordsf2vfp.S~~
+
+## License
+
+Usage is allowed under the [MIT License] and the [Apache License, Version 2.0]
+with the LLVM exception.
+
+[MIT License]: https://opensource.org/license/mit
+[Apache License, Version 2.0]: https://www.apache.org/licenses/LICENSE-2.0
+
+### Contribution
+
+Contributions are licensed under the MIT License, the Apache License
+Version 2.0, and the Apache License Version 2.0 with the LLVM exception.
+
+See [LICENSE.txt](../LICENSE.txt) for full details.
diff --git a/library/compiler-builtins/compiler-builtins/build.rs b/library/compiler-builtins/compiler-builtins/build.rs
new file mode 100644
index 00000000000..90d98ec7ce9
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/build.rs
@@ -0,0 +1,712 @@
+mod configure;
+
+use std::collections::BTreeMap;
+use std::env;
+use std::path::PathBuf;
+use std::sync::atomic::Ordering;
+
+use configure::{Target, configure_aliases, configure_f16_f128};
+
+fn main() {
+    println!("cargo::rerun-if-changed=build.rs");
+    println!("cargo::rerun-if-changed=configure.rs");
+
+    let target = Target::from_env();
+    let cwd = env::current_dir().unwrap();
+
+    configure_check_cfg();
+    configure_f16_f128(&target);
+    configure_aliases(&target);
+
+    configure_libm(&target);
+
+    println!("cargo:compiler-rt={}", cwd.join("compiler-rt").display());
+
+    // Emscripten's runtime includes all the builtins
+    if target.os == "emscripten" {
+        return;
+    }
+
+    // OpenBSD provides compiler_rt by default, use it instead of rebuilding it from source
+    if target.os == "openbsd" {
+        println!("cargo:rustc-link-search=native=/usr/lib");
+        println!("cargo:rustc-link-lib=compiler_rt");
+        return;
+    }
+
+    // Forcibly enable memory intrinsics on wasm & SGX as we don't have a libc to
+    // provide them.
+    if (target.triple.contains("wasm") && !target.triple.contains("wasi"))
+        || (target.triple.contains("sgx") && target.triple.contains("fortanix"))
+        || target.triple.contains("-none")
+        || target.triple.contains("nvptx")
+        || target.triple.contains("uefi")
+        || target.triple.contains("xous")
+    {
+        println!("cargo:rustc-cfg=feature=\"mem\"");
+    }
+
+    // These targets have hardware unaligned access support.
+    println!("cargo::rustc-check-cfg=cfg(feature, values(\"mem-unaligned\"))");
+    if target.arch.contains("x86_64")
+        || target.arch.contains("x86")
+        || target.arch.contains("aarch64")
+        || target.arch.contains("bpf")
+    {
+        println!("cargo:rustc-cfg=feature=\"mem-unaligned\"");
+    }
+
+    // NOTE: we are going to assume that the llvm-target, which determines our codegen options,
+    // matches the target triple. This is usually correct for our built-in targets but can break
+    // in the presence of custom targets, which can have arbitrary names.
+    let llvm_target = target.triple.split('-').collect::<Vec<_>>();
+
+    // Build missing intrinsics from compiler-rt C source code. If we're
+    // mangling names though we assume that we're also in test mode so we don't
+    // build anything and we rely on the upstream implementation of compiler-rt
+    // functions
+    if !cfg!(feature = "mangled-names") && cfg!(feature = "c") {
+        // Don't use a C compiler for these targets:
+        //
+        // * nvptx - everything is bitcode, not compatible with mixed C/Rust
+        if !target.arch.contains("nvptx") {
+            #[cfg(feature = "c")]
+            c::compile(&llvm_target, &target);
+        }
+    }
+
+    // Only emit the ARM Linux atomic emulation on pre-ARMv6 architectures. This
+    // includes the old androideabi. It is deprecated but it is available as a
+    // rustc target (arm-linux-androideabi).
+    println!("cargo::rustc-check-cfg=cfg(kernel_user_helpers)");
+    if llvm_target[0] == "armv4t"
+        || llvm_target[0] == "armv5te"
+        || target.triple == "arm-linux-androideabi"
+    {
+        println!("cargo:rustc-cfg=kernel_user_helpers")
+    }
+
+    if llvm_target[0].starts_with("aarch64") {
+        generate_aarch64_outlined_atomics();
+    }
+}
+
+/// Run configuration for `libm` since it is included directly.
+///
+/// Much of this is copied from `libm/configure.rs`.
+fn configure_libm(target: &Target) {
+    println!("cargo:rustc-check-cfg=cfg(intrinsics_enabled)");
+    println!("cargo:rustc-check-cfg=cfg(arch_enabled)");
+    println!("cargo:rustc-check-cfg=cfg(optimizations_enabled)");
+    println!("cargo:rustc-check-cfg=cfg(feature, values(\"unstable-public-internals\"))");
+
+    // Always use intrinsics
+    println!("cargo:rustc-cfg=intrinsics_enabled");
+
+    // The arch module may contain assembly.
+    if !cfg!(feature = "no-asm") {
+        println!("cargo:rustc-cfg=arch_enabled");
+    }
+
+    println!("cargo:rustc-check-cfg=cfg(optimizations_enabled)");
+    if !matches!(target.opt_level.as_str(), "0" | "1") {
+        println!("cargo:rustc-cfg=optimizations_enabled");
+    }
+
+    // Config shorthands
+    println!("cargo:rustc-check-cfg=cfg(x86_no_sse)");
+    if target.arch == "x86" && !target.features.iter().any(|f| f == "sse") {
+        // Shorthand to detect i586 targets
+        println!("cargo:rustc-cfg=x86_no_sse");
+    }
+
+    println!(
+        "cargo:rustc-env=CFG_CARGO_FEATURES={:?}",
+        target.cargo_features
+    );
+    println!("cargo:rustc-env=CFG_OPT_LEVEL={}", target.opt_level);
+    println!("cargo:rustc-env=CFG_TARGET_FEATURES={:?}", target.features);
+
+    // Activate libm's unstable features to make full use of Nightly.
+    println!("cargo:rustc-cfg=feature=\"unstable-intrinsics\"");
+}
+
+fn aarch64_symbol(ordering: Ordering) -> &'static str {
+    match ordering {
+        Ordering::Relaxed => "relax",
+        Ordering::Acquire => "acq",
+        Ordering::Release => "rel",
+        Ordering::AcqRel => "acq_rel",
+        _ => panic!("unknown symbol for {ordering:?}"),
+    }
+}
+
+/// The `concat_idents` macro is extremely annoying and doesn't allow us to define new items.
+/// Define them from the build script instead.
+/// Note that the majority of the code is still defined in `aarch64.rs` through inline macros.
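+///
+/// For example, the generated `foreach_cas!` macro invokes its argument as
+/// `$macro!( Relaxed, 1, __aarch64_cas1_relax );` and so on for every size and
+/// ordering, letting `aarch64.rs` and the tests stamp out each symbol.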
+fn generate_aarch64_outlined_atomics() {
+    use std::fmt::Write;
+    // #[macro_export] so that we can use this in tests
+    let gen_macro =
+        |name| format!("#[macro_export] macro_rules! foreach_{name} {{ ($macro:path) => {{\n");
+
+    // Generate different macros for add/clr/eor/set so that we can test them separately.
+    let sym_names = ["cas", "ldadd", "ldclr", "ldeor", "ldset", "swp"];
+    let mut macros = BTreeMap::new();
+    for sym in sym_names {
+        macros.insert(sym, gen_macro(sym));
+    }
+
+    // Only CAS supports 16 bytes, and it has a different implementation that uses a different macro.
+    let mut cas16 = gen_macro("cas16");
+
+    for ordering in [
+        Ordering::Relaxed,
+        Ordering::Acquire,
+        Ordering::Release,
+        Ordering::AcqRel,
+    ] {
+        let sym_ordering = aarch64_symbol(ordering);
+        for size in [1, 2, 4, 8] {
+            for (sym, macro_) in &mut macros {
+                let name = format!("__aarch64_{sym}{size}_{sym_ordering}");
+                writeln!(macro_, "$macro!( {ordering:?}, {size}, {name} );").unwrap();
+            }
+        }
+        let name = format!("__aarch64_cas16_{sym_ordering}");
+        writeln!(cas16, "$macro!( {ordering:?}, {name} );").unwrap();
+    }
+
+    let mut buf = String::new();
+    for macro_def in macros.values().chain(std::iter::once(&cas16)) {
+        buf += macro_def;
+        buf += "}; }\n";
+    }
+    let out_dir = PathBuf::from(std::env::var("OUT_DIR").unwrap());
+    std::fs::write(out_dir.join("outlined_atomics.rs"), buf).unwrap();
+}
+
+/// Emit directives for features we expect to support that aren't in `Cargo.toml`.
+///
+/// These are mostly cfg elements emitted by this `build.rs`.
+fn configure_check_cfg() {
+    // Functions where we can set the "optimized-c" flag
+    const HAS_OPTIMIZED_C: &[&str] = &[
+        "__ashldi3",
+        "__ashlsi3",
+        "__ashrdi3",
+        "__ashrsi3",
+        "__bswapsi2",
+        "__bswapdi2",
+        "__bswapti2",
+        "__divdi3",
+        "__divsi3",
+        "__divmoddi4",
+        "__divmodsi4",
+        "__divmodsi4",
+        "__divmodti4",
+        "__lshrdi3",
+        "__lshrsi3",
+        "__moddi3",
+        "__modsi3",
+        "__muldi3",
+        "__udivdi3",
+        "__udivmoddi4",
+        "__udivmodsi4",
+        "__udivsi3",
+        "__umoddi3",
+        "__umodsi3",
+    ];
+
+    // Build a list of all aarch64 atomic operation functions
+    let mut aarch_atomic = Vec::new();
+    for aarch_op in ["cas", "ldadd", "ldclr", "ldeor", "ldset", "swp"] {
+        let op_sizes = if aarch_op == "cas" {
+            [1, 2, 4, 8, 16].as_slice()
+        } else {
+            [1, 2, 4, 8].as_slice()
+        };
+
+        for op_size in op_sizes {
+            for ordering in ["relax", "acq", "rel", "acq_rel"] {
+                aarch_atomic.push(format!("__aarch64_{aarch_op}{op_size}_{ordering}"));
+            }
+        }
+    }
+
+    for fn_name in HAS_OPTIMIZED_C
+        .iter()
+        .copied()
+        .chain(aarch_atomic.iter().map(|s| s.as_str()))
+    {
+        println!("cargo::rustc-check-cfg=cfg({fn_name}, values(\"optimized-c\"))",);
+    }
+
+    // Rustc is unaware of sparc target features, but this does show up from
+    // `rustc --print target-features --target sparc64-unknown-linux-gnu`.
+    println!("cargo::rustc-check-cfg=cfg(target_feature, values(\"vis3\"))");
+
+    // FIXME: these come from libm and should be changed there
+    println!("cargo::rustc-check-cfg=cfg(feature, values(\"checked\"))");
+    println!("cargo::rustc-check-cfg=cfg(assert_no_panic)");
+}
+
+#[cfg(feature = "c")]
+mod c {
+    use std::collections::{BTreeMap, HashSet};
+    use std::env;
+    use std::fs::{self, File};
+    use std::io::Write;
+    use std::path::{Path, PathBuf};
+
+    use super::Target;
+
+    struct Sources {
+        // SYMBOL -> PATH TO SOURCE
+        map: BTreeMap<&'static str, &'static str>,
+    }
+
+    impl Sources {
+        fn new() -> Sources {
+            Sources {
+                map: BTreeMap::new(),
+            }
+        }
+
+        fn extend(&mut self, sources: &[(&'static str, &'static str)]) {
+            // NOTE Some intrinsics have both a generic implementation (e.g.
+            // `floatdidf.c`) and an arch optimized implementation
+            // (`x86_64/floatdidf.c`). In those cases, we keep the arch optimized
+            // implementation and discard the generic implementation. If we don't
+            // and keep both implementations, the linker will yell at us about
+            // duplicate symbols!
+            for (symbol, src) in sources {
+                if src.contains("/") {
+                    // Arch-optimized implementation (preferred)
+                    self.map.insert(symbol, src);
+                } else {
+                    // Generic implementation
+                    if !self.map.contains_key(symbol) {
+                        self.map.insert(symbol, src);
+                    }
+                }
+            }
+        }
+
+        fn remove(&mut self, symbols: &[&str]) {
+            for symbol in symbols {
+                self.map.remove(*symbol).unwrap();
+            }
+        }
+    }
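+
+    // For example (illustrative), extending with both `("__floatdidf", "floatdidf.c")` and
+    // `("__floatdidf", "x86_64/floatdidf.c")` leaves only the `x86_64/` entry in the map,
+    // regardless of the order in which the two appear.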
+
+    /// Compile intrinsics from the compiler-rt C source code
+    pub fn compile(llvm_target: &[&str], target: &Target) {
+        let mut consider_float_intrinsics = true;
+        let cfg = &mut cc::Build::new();
+
+        // AArch64 GCCs exit with an error condition when they encounter any kind of floating point
+        // code if the `nofp` and/or `nosimd` compiler flags have been set.
+        //
+        // Therefore, evaluate if those flags are present and set a boolean that causes any
+        // compiler-rt intrinsics that contain floating point source to be excluded for this target.
+        if target.arch == "aarch64" {
+            let cflags_key = String::from("CFLAGS_") + &(target.triple.replace("-", "_"));
+            if let Ok(cflags_value) = env::var(cflags_key) {
+                if cflags_value.contains("+nofp") || cflags_value.contains("+nosimd") {
+                    consider_float_intrinsics = false;
+                }
+            }
+        }
+
+        // `compiler-rt` requires `COMPILER_RT_HAS_FLOAT16` to be defined to make it use the
+        // `_Float16` type for `f16` intrinsics. This shouldn't matter, since all existing `f16`
+        // intrinsics have been ported to Rust in `compiler-builtins` (C compilers don't support
+        // `_Float16` on all targets, whereas Rust does). However, define the macro anyway to
+        // prevent issues like rust#118813 and rust#123885 from silently recurring if more
+        // `f16` intrinsics get accidentally added here in the future.
+        cfg.define("COMPILER_RT_HAS_FLOAT16", None);
+
+        cfg.warnings(false);
+
+        if target.env == "msvc" {
+            // Don't pull in extra libraries on MSVC
+            cfg.flag("/Zl");
+
+            // Emulate C99 and C++11's __func__ for MSVC prior to 2013 CTP
+            cfg.define("__func__", Some("__FUNCTION__"));
+        } else {
+            // Turn off various features of gcc and such, mostly copying
+            // compiler-rt's build system already
+            cfg.flag("-fno-builtin");
+            cfg.flag("-fvisibility=hidden");
+            cfg.flag("-ffreestanding");
+            // Avoid the following warning appearing once **per file**:
+            // clang: warning: optimization flag '-fomit-frame-pointer' is not supported for target 'armv7' [-Wignored-optimization-argument]
+            //
+            // Note that compiler-rt's build system also checks
+            //
+            // `check_cxx_compiler_flag(-fomit-frame-pointer COMPILER_RT_HAS_FOMIT_FRAME_POINTER_FLAG)`
+            //
+            // in https://github.com/rust-lang/compiler-rt/blob/c8fbcb3/cmake/config-ix.cmake#L19.
+            cfg.flag_if_supported("-fomit-frame-pointer");
+            cfg.define("VISIBILITY_HIDDEN", None);
+
+            if let "aarch64" | "arm64ec" = target.arch.as_str() {
+                // FIXME(llvm20): Older GCCs on A64 fail to build with
+                // -Werror=implicit-function-declaration due to a compiler-rt bug.
+                // With a newer LLVM we should be able to enable the flag everywhere.
+                // https://github.com/llvm/llvm-project/commit/8aa9d6206ce55bdaaf422839c351fbd63f033b89
+            } else {
+                // Avoid implicitly creating references to undefined functions
+                cfg.flag("-Werror=implicit-function-declaration");
+            }
+        }
+
+        // int_util.c tries to include stdlib.h if `_WIN32` is defined,
+        // which it is when compiling UEFI targets with clang. This is
+        // at odds with compiling with `-ffreestanding`, as the header
+        // may be incompatible or not present. Create a minimal stub
+        // header to use instead.
+        if target.os == "uefi" {
+            let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
+            let include_dir = out_dir.join("include");
+            if !include_dir.exists() {
+                fs::create_dir(&include_dir).unwrap();
+            }
+            fs::write(include_dir.join("stdlib.h"), "#include <stddef.h>").unwrap();
+            cfg.flag(&format!("-I{}", include_dir.to_str().unwrap()));
+        }
+
+        let mut sources = Sources::new();
+        sources.extend(&[
+            ("__absvdi2", "absvdi2.c"),
+            ("__absvsi2", "absvsi2.c"),
+            ("__addvdi3", "addvdi3.c"),
+            ("__addvsi3", "addvsi3.c"),
+            ("__cmpdi2", "cmpdi2.c"),
+            ("__int_util", "int_util.c"),
+            ("__mulvdi3", "mulvdi3.c"),
+            ("__mulvsi3", "mulvsi3.c"),
+            ("__negdi2", "negdi2.c"),
+            ("__negvdi2", "negvdi2.c"),
+            ("__negvsi2", "negvsi2.c"),
+            ("__paritydi2", "paritydi2.c"),
+            ("__paritysi2", "paritysi2.c"),
+            ("__popcountdi2", "popcountdi2.c"),
+            ("__popcountsi2", "popcountsi2.c"),
+            ("__subvdi3", "subvdi3.c"),
+            ("__subvsi3", "subvsi3.c"),
+            ("__ucmpdi2", "ucmpdi2.c"),
+        ]);
+
+        if consider_float_intrinsics {
+            sources.extend(&[
+                ("__divdc3", "divdc3.c"),
+                ("__divsc3", "divsc3.c"),
+                ("__muldc3", "muldc3.c"),
+                ("__mulsc3", "mulsc3.c"),
+                ("__negdf2", "negdf2.c"),
+                ("__negsf2", "negsf2.c"),
+            ]);
+        }
+
+        // On iOS and 32-bit OSX these are all just empty intrinsics, no need to
+        // include them.
+        if target.vendor != "apple" || target.arch != "x86" {
+            sources.extend(&[
+                ("__absvti2", "absvti2.c"),
+                ("__addvti3", "addvti3.c"),
+                ("__cmpti2", "cmpti2.c"),
+                ("__ffsti2", "ffsti2.c"),
+                ("__mulvti3", "mulvti3.c"),
+                ("__negti2", "negti2.c"),
+                ("__parityti2", "parityti2.c"),
+                ("__popcountti2", "popcountti2.c"),
+                ("__subvti3", "subvti3.c"),
+                ("__ucmpti2", "ucmpti2.c"),
+            ]);
+
+            if consider_float_intrinsics {
+                sources.extend(&[("__negvti2", "negvti2.c")]);
+            }
+        }
+
+        if target.vendor == "apple" {
+            sources.extend(&[
+                ("atomic_flag_clear", "atomic_flag_clear.c"),
+                ("atomic_flag_clear_explicit", "atomic_flag_clear_explicit.c"),
+                ("atomic_flag_test_and_set", "atomic_flag_test_and_set.c"),
+                (
+                    "atomic_flag_test_and_set_explicit",
+                    "atomic_flag_test_and_set_explicit.c",
+                ),
+                ("atomic_signal_fence", "atomic_signal_fence.c"),
+                ("atomic_thread_fence", "atomic_thread_fence.c"),
+            ]);
+        }
+
+        if target.env != "msvc" {
+            if target.arch == "x86" {
+                sources.extend(&[
+                    ("__ashldi3", "i386/ashldi3.S"),
+                    ("__ashrdi3", "i386/ashrdi3.S"),
+                    ("__divdi3", "i386/divdi3.S"),
+                    ("__lshrdi3", "i386/lshrdi3.S"),
+                    ("__moddi3", "i386/moddi3.S"),
+                    ("__muldi3", "i386/muldi3.S"),
+                    ("__udivdi3", "i386/udivdi3.S"),
+                    ("__umoddi3", "i386/umoddi3.S"),
+                ]);
+            }
+        }
+
+        if target.arch == "arm" && target.vendor != "apple" && target.env != "msvc" {
+            sources.extend(&[
+                ("__aeabi_div0", "arm/aeabi_div0.c"),
+                ("__aeabi_drsub", "arm/aeabi_drsub.c"),
+                ("__aeabi_frsub", "arm/aeabi_frsub.c"),
+                ("__bswapdi2", "arm/bswapdi2.S"),
+                ("__bswapsi2", "arm/bswapsi2.S"),
+                ("__divmodsi4", "arm/divmodsi4.S"),
+                ("__divsi3", "arm/divsi3.S"),
+                ("__modsi3", "arm/modsi3.S"),
+                ("__switch16", "arm/switch16.S"),
+                ("__switch32", "arm/switch32.S"),
+                ("__switch8", "arm/switch8.S"),
+                ("__switchu8", "arm/switchu8.S"),
+                ("__sync_synchronize", "arm/sync_synchronize.S"),
+                ("__udivmodsi4", "arm/udivmodsi4.S"),
+                ("__udivsi3", "arm/udivsi3.S"),
+                ("__umodsi3", "arm/umodsi3.S"),
+            ]);
+
+            if target.os == "freebsd" {
+                sources.extend(&[("__clear_cache", "clear_cache.c")]);
+            }
+
+            // First of all, aeabi_cdcmp and aeabi_cfcmp are never called by LLVM.
+            // Second, they are little-endian only, so builds fail on big-endian targets.
+            // Temporary workaround: exclude these files for big-endian targets.
+            if !llvm_target[0].starts_with("thumbeb") && !llvm_target[0].starts_with("armeb") {
+                sources.extend(&[
+                    ("__aeabi_cdcmp", "arm/aeabi_cdcmp.S"),
+                    ("__aeabi_cdcmpeq_check_nan", "arm/aeabi_cdcmpeq_check_nan.c"),
+                    ("__aeabi_cfcmp", "arm/aeabi_cfcmp.S"),
+                    ("__aeabi_cfcmpeq_check_nan", "arm/aeabi_cfcmpeq_check_nan.c"),
+                ]);
+            }
+        }
+
+        if llvm_target[0] == "armv7" {
+            sources.extend(&[
+                ("__sync_fetch_and_add_4", "arm/sync_fetch_and_add_4.S"),
+                ("__sync_fetch_and_add_8", "arm/sync_fetch_and_add_8.S"),
+                ("__sync_fetch_and_and_4", "arm/sync_fetch_and_and_4.S"),
+                ("__sync_fetch_and_and_8", "arm/sync_fetch_and_and_8.S"),
+                ("__sync_fetch_and_max_4", "arm/sync_fetch_and_max_4.S"),
+                ("__sync_fetch_and_max_8", "arm/sync_fetch_and_max_8.S"),
+                ("__sync_fetch_and_min_4", "arm/sync_fetch_and_min_4.S"),
+                ("__sync_fetch_and_min_8", "arm/sync_fetch_and_min_8.S"),
+                ("__sync_fetch_and_nand_4", "arm/sync_fetch_and_nand_4.S"),
+                ("__sync_fetch_and_nand_8", "arm/sync_fetch_and_nand_8.S"),
+                ("__sync_fetch_and_or_4", "arm/sync_fetch_and_or_4.S"),
+                ("__sync_fetch_and_or_8", "arm/sync_fetch_and_or_8.S"),
+                ("__sync_fetch_and_sub_4", "arm/sync_fetch_and_sub_4.S"),
+                ("__sync_fetch_and_sub_8", "arm/sync_fetch_and_sub_8.S"),
+                ("__sync_fetch_and_umax_4", "arm/sync_fetch_and_umax_4.S"),
+                ("__sync_fetch_and_umax_8", "arm/sync_fetch_and_umax_8.S"),
+                ("__sync_fetch_and_umin_4", "arm/sync_fetch_and_umin_4.S"),
+                ("__sync_fetch_and_umin_8", "arm/sync_fetch_and_umin_8.S"),
+                ("__sync_fetch_and_xor_4", "arm/sync_fetch_and_xor_4.S"),
+                ("__sync_fetch_and_xor_8", "arm/sync_fetch_and_xor_8.S"),
+            ]);
+        }
+
+        if llvm_target.last().unwrap().ends_with("eabihf") {
+            if !llvm_target[0].starts_with("thumbv7em")
+                && !llvm_target[0].starts_with("thumbv8m.main")
+            {
+                // The FPU options chosen for these architectures in cc-rs, i.e.:
+                //     -mfpu=fpv4-sp-d16 for thumbv7em
+                //     -mfpu=fpv5-sp-d16 for thumbv8m.main
+                // do not support double-precision floating-point conversions, so the files
+                // that contain such instructions are not included for these targets.
+                sources.extend(&[
+                    ("__fixdfsivfp", "arm/fixdfsivfp.S"),
+                    ("__fixunsdfsivfp", "arm/fixunsdfsivfp.S"),
+                    ("__floatsidfvfp", "arm/floatsidfvfp.S"),
+                    ("__floatunssidfvfp", "arm/floatunssidfvfp.S"),
+                ]);
+            }
+
+            sources.extend(&[
+                ("__fixsfsivfp", "arm/fixsfsivfp.S"),
+                ("__fixunssfsivfp", "arm/fixunssfsivfp.S"),
+                ("__floatsisfvfp", "arm/floatsisfvfp.S"),
+                ("__floatunssisfvfp", "arm/floatunssisfvfp.S"),
+                ("__floatunssisfvfp", "arm/floatunssisfvfp.S"),
+                ("__restore_vfp_d8_d15_regs", "arm/restore_vfp_d8_d15_regs.S"),
+                ("__save_vfp_d8_d15_regs", "arm/save_vfp_d8_d15_regs.S"),
+                ("__negdf2vfp", "arm/negdf2vfp.S"),
+                ("__negsf2vfp", "arm/negsf2vfp.S"),
+            ]);
+        }
+
+        if (target.arch == "aarch64" || target.arch == "arm64ec") && consider_float_intrinsics {
+            sources.extend(&[
+                ("__comparetf2", "comparetf2.c"),
+                ("__fe_getround", "fp_mode.c"),
+                ("__fe_raise_inexact", "fp_mode.c"),
+            ]);
+
+            if target.os != "windows" && target.os != "cygwin" {
+                sources.extend(&[("__multc3", "multc3.c")]);
+            }
+        }
+
+        if target.arch == "mips" || target.arch == "riscv32" || target.arch == "riscv64" {
+            sources.extend(&[("__bswapsi2", "bswapsi2.c")]);
+        }
+
+        if target.arch == "mips64" {
+            sources.extend(&[("__netf2", "comparetf2.c"), ("__fe_getround", "fp_mode.c")]);
+        }
+
+        if target.arch == "loongarch64" {
+            sources.extend(&[("__netf2", "comparetf2.c"), ("__fe_getround", "fp_mode.c")]);
+        }
+
+        // Remove the assembly implementations that won't compile for the target
+        if llvm_target[0] == "thumbv6m" || llvm_target[0] == "thumbv8m.base" || target.os == "uefi"
+        {
+            let mut to_remove = Vec::new();
+            for (k, v) in sources.map.iter() {
+                if v.ends_with(".S") {
+                    to_remove.push(*k);
+                }
+            }
+            sources.remove(&to_remove);
+        }
+
+        if llvm_target[0] == "thumbv7m" || llvm_target[0] == "thumbv7em" {
+            sources.remove(&["__aeabi_cdcmp", "__aeabi_cfcmp"]);
+        }
+
+        // Android and Cygwin use emulated TLS, so we need a runtime support function.
+        if target.os == "android" || target.os == "cygwin" {
+            sources.extend(&[("__emutls_get_address", "emutls.c")]);
+        }
+
+        // Work around a bug in the NDK headers (fixed in
+        // https://r.android.com/2038949 which will be released in a future
+        // NDK version) by providing a definition of LONG_BIT.
+        if target.os == "android" {
+            cfg.define("LONG_BIT", "(8 * sizeof(long))");
+        }
+
+        // OpenHarmony also uses emulated TLS.
+        if target.env == "ohos" {
+            sources.extend(&[("__emutls_get_address", "emutls.c")]);
+        }
+
+        // When compiling the C code we require the user to tell us where the
+        // source code is, and this is largely done so when we're compiling as
+        // part of rust-lang/rust we can use the same llvm-project repository as
+        // rust-lang/rust.
+        let root = match env::var_os("RUST_COMPILER_RT_ROOT") {
+            Some(s) => PathBuf::from(s),
+            None => {
+                panic!(
+                    "RUST_COMPILER_RT_ROOT is not set. You may need to run \
+                    `ci/download-compiler-rt.sh`."
+                );
+            }
+        };
+        if !root.exists() {
+            panic!("RUST_COMPILER_RT_ROOT={} does not exist", root.display());
+        }
+
+        // Support deterministic builds by remapping the __FILE__ prefix if the
+        // compiler supports it.  This fixes the nondeterminism caused by the
+        // use of that macro in lib/builtins/int_util.h in compiler-rt.
+        cfg.flag_if_supported(&format!("-ffile-prefix-map={}=.", root.display()));
+
+        // Include out-of-line atomics for aarch64, which are all generated by supplying different
+        // sets of flags to the same source file.
+        // Note: Out-of-line aarch64 atomics are not supported by the msvc toolchain (#430) and
+        // on uefi.
+        let src_dir = root.join("lib/builtins");
+        if target.arch == "aarch64" && target.env != "msvc" && target.os != "uefi" {
+            // See below for why we're building these as separate libraries.
+            build_aarch64_out_of_line_atomics_libraries(&src_dir, cfg);
+
+            // Some run-time CPU feature detection is necessary, as well.
+            let cpu_model_src = if src_dir.join("cpu_model.c").exists() {
+                "cpu_model.c"
+            } else {
+                "cpu_model/aarch64.c"
+            };
+            sources.extend(&[("__aarch64_have_lse_atomics", cpu_model_src)]);
+        }
+
+        let mut added_sources = HashSet::new();
+        for (sym, src) in sources.map.iter() {
+            let src = src_dir.join(src);
+            if added_sources.insert(src.clone()) {
+                cfg.file(&src);
+                println!("cargo:rerun-if-changed={}", src.display());
+            }
+            println!("cargo:rustc-cfg={}=\"optimized-c\"", sym);
+        }
+
+        cfg.compile("libcompiler-rt.a");
+    }
+
+    fn build_aarch64_out_of_line_atomics_libraries(builtins_dir: &Path, cfg: &mut cc::Build) {
+        let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
+        let outlined_atomics_file = builtins_dir.join("aarch64").join("lse.S");
+        println!("cargo:rerun-if-changed={}", outlined_atomics_file.display());
+
+        cfg.include(&builtins_dir);
+
+        for instruction_type in &["cas", "swp", "ldadd", "ldclr", "ldeor", "ldset"] {
+            for size in &[1, 2, 4, 8, 16] {
+                if *size == 16 && *instruction_type != "cas" {
+                    continue;
+                }
+
+                for (model_number, model_name) in
+                    &[(1, "relax"), (2, "acq"), (3, "rel"), (4, "acq_rel")]
+                {
+                    // The original compiler-rt build system compiles the same
+                    // source file multiple times with different compiler
+                    // options. Here we do something slightly different: we
+                    // create multiple .S files with the proper #defines and
+                    // then include the original file.
+                    //
+                    // This is needed because the cc crate doesn't allow us to
+                    // override the name of object files and libtool requires
+                    // all objects in an archive to have unique names.
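+                    //
+                    // For example (illustrative), `lse_cas1_relax.S` ends up containing:
+                    //
+                    //     #define L_cas
+                    //     #define SIZE 1
+                    //     #define MODEL 1
+                    //     #include "<canonicalized path to>/aarch64/lse.S"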
+                    let path =
+                        out_dir.join(format!("lse_{}{}_{}.S", instruction_type, size, model_name));
+                    let mut file = File::create(&path).unwrap();
+                    writeln!(file, "#define L_{}", instruction_type).unwrap();
+                    writeln!(file, "#define SIZE {}", size).unwrap();
+                    writeln!(file, "#define MODEL {}", model_number).unwrap();
+                    writeln!(
+                        file,
+                        "#include \"{}\"",
+                        outlined_atomics_file.canonicalize().unwrap().display()
+                    )
+                    .unwrap();
+                    drop(file);
+                    cfg.file(path);
+
+                    let sym = format!("__aarch64_{}{}_{}", instruction_type, size, model_name);
+                    println!("cargo:rustc-cfg={}=\"optimized-c\"", sym);
+                }
+            }
+        }
+    }
+}
diff --git a/library/compiler-builtins/compiler-builtins/configure.rs b/library/compiler-builtins/compiler-builtins/configure.rs
new file mode 100644
index 00000000000..d825f35a9aa
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/configure.rs
@@ -0,0 +1,136 @@
+// Configuration that is shared between `compiler_builtins` and `builtins_test`.
+
+use std::env;
+
+#[derive(Debug)]
+#[allow(dead_code)]
+pub struct Target {
+    pub triple: String,
+    pub triple_split: Vec<String>,
+    pub opt_level: String,
+    pub cargo_features: Vec<String>,
+    pub os: String,
+    pub arch: String,
+    pub vendor: String,
+    pub env: String,
+    pub pointer_width: u8,
+    pub little_endian: bool,
+    pub features: Vec<String>,
+}
+
+impl Target {
+    pub fn from_env() -> Self {
+        let triple = env::var("TARGET").unwrap();
+        let triple_split = triple.split('-').map(ToOwned::to_owned).collect();
+        let little_endian = match env::var("CARGO_CFG_TARGET_ENDIAN").unwrap().as_str() {
+            "little" => true,
+            "big" => false,
+            x => panic!("unknown endian {x}"),
+        };
+        let cargo_features = env::vars()
+            .filter_map(|(name, _value)| name.strip_prefix("CARGO_FEATURE_").map(ToOwned::to_owned))
+            .map(|s| s.to_lowercase().replace("_", "-"))
+            .collect();
+
+        Self {
+            triple,
+            triple_split,
+            os: env::var("CARGO_CFG_TARGET_OS").unwrap(),
+            opt_level: env::var("OPT_LEVEL").unwrap(),
+            cargo_features,
+            arch: env::var("CARGO_CFG_TARGET_ARCH").unwrap(),
+            vendor: env::var("CARGO_CFG_TARGET_VENDOR").unwrap(),
+            env: env::var("CARGO_CFG_TARGET_ENV").unwrap(),
+            pointer_width: env::var("CARGO_CFG_TARGET_POINTER_WIDTH")
+                .unwrap()
+                .parse()
+                .unwrap(),
+            little_endian,
+            features: env::var("CARGO_CFG_TARGET_FEATURE")
+                .unwrap_or_default()
+                .split(",")
+                .map(ToOwned::to_owned)
+                .collect(),
+        }
+    }
+
+    #[allow(dead_code)]
+    pub fn has_feature(&self, feature: &str) -> bool {
+        self.features.iter().any(|f| f == feature)
+    }
+}
+
+pub fn configure_aliases(target: &Target) {
+    // To compile builtins-test-intrinsics for thumb targets, where there is no libc
+    println!("cargo::rustc-check-cfg=cfg(thumb)");
+    if target.triple_split[0].starts_with("thumb") {
+        println!("cargo:rustc-cfg=thumb")
+    }
+
+    // compiler-rt `cfg`s away some intrinsics for thumbv6m and thumbv8m.base because
+    // these targets do not have full Thumb-2 support but only original Thumb-1.
+    // We have to cfg our code accordingly.
+    println!("cargo::rustc-check-cfg=cfg(thumb_1)");
+    if target.triple_split[0] == "thumbv6m" || target.triple_split[0] == "thumbv8m.base" {
+        println!("cargo:rustc-cfg=thumb_1")
+    }
+}
+
+/// Configure whether or not `f16` and `f128` support should be enabled.
+pub fn configure_f16_f128(target: &Target) {
+    // Set whether or not `f16` and `f128` are supported at a basic level by LLVM. This only means
+    // that the backend will not crash when using these types and generates code that can be called
+    // without crashing (no infinite recursion). This does not mean that the platform doesn't have
+    // ABI or other bugs.
+    //
+    // We do this here rather than in `rust-lang/rust` because configuring via cargo features is
+    // not straightforward.
+    //
+    // Original source of this list:
+    // <https://github.com/rust-lang/compiler-builtins/pull/652#issuecomment-2266151350>
+    let f16_enabled = match target.arch.as_str() {
+        // Unsupported <https://github.com/llvm/llvm-project/issues/94434>
+        "arm64ec" => false,
+        // Selection failure <https://github.com/llvm/llvm-project/issues/50374>
+        "s390x" => false,
+        // Infinite recursion <https://github.com/llvm/llvm-project/issues/97981>
+        "csky" => false,
+        "hexagon" => false,
+        "powerpc" | "powerpc64" => false,
+        "sparc" | "sparc64" => false,
+        "wasm32" | "wasm64" => false,
+        // Most everything else works as of LLVM 19
+        _ => true,
+    };
+
+    let f128_enabled = match target.arch.as_str() {
+        // Unsupported (libcall is not supported) <https://github.com/llvm/llvm-project/issues/121122>
+        "amdgpu" => false,
+        // Unsupported <https://github.com/llvm/llvm-project/issues/94434>
+        "arm64ec" => false,
+        // FIXME(llvm20): fixed by <https://github.com/llvm/llvm-project/pull/117525>
+        "mips64" | "mips64r6" => false,
+        // Selection failure <https://github.com/llvm/llvm-project/issues/95471>
+        "nvptx64" => false,
+        // Selection failure <https://github.com/llvm/llvm-project/issues/101545>
+        "powerpc64" if &target.os == "aix" => false,
+        // Selection failure <https://github.com/llvm/llvm-project/issues/41838>
+        "sparc" => false,
+        // Most everything else works as of LLVM 19
+        _ => true,
+    };
+
+    // If the feature is set, disable these types.
+    let disable_both = env::var_os("CARGO_FEATURE_NO_F16_F128").is_some();
+
+    println!("cargo::rustc-check-cfg=cfg(f16_enabled)");
+    println!("cargo::rustc-check-cfg=cfg(f128_enabled)");
+
+    if f16_enabled && !disable_both {
+        println!("cargo::rustc-cfg=f16_enabled");
+    }
+
+    if f128_enabled && !disable_both {
+        println!("cargo::rustc-cfg=f128_enabled");
+    }
+}
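+
+// A minimal sketch of how a build script might consume this shared module (illustrative; it
+// assumes the build scripts pull this file in with `include!`):
+//
+//     include!("configure.rs");
+//
+//     fn main() {
+//         let target = Target::from_env();
+//         configure_aliases(&target);
+//         configure_f16_f128(&target);
+//     }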
diff --git a/library/compiler-builtins/compiler-builtins/src/aarch64.rs b/library/compiler-builtins/compiler-builtins/src/aarch64.rs
new file mode 100644
index 00000000000..80392187c89
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/aarch64.rs
@@ -0,0 +1,21 @@
+#![allow(unused_imports)]
+
+use core::intrinsics;
+
+intrinsics! {
+    #[unsafe(naked)]
+    #[cfg(all(target_os = "uefi", not(feature = "no-asm")))]
+    pub unsafe extern "C" fn __chkstk() {
+        core::arch::naked_asm!(
+            ".p2align 2",
+            "lsl    x16, x15, #4",
+            "mov    x17, sp",
+            "1:",
+            "sub    x17, x17, 4096",
+            "subs   x16, x16, 4096",
+            "ldr    xzr, [x17]",
+            "b.gt   1b",
+            "ret",
+        );
+    }
+}
diff --git a/library/compiler-builtins/compiler-builtins/src/aarch64_linux.rs b/library/compiler-builtins/compiler-builtins/src/aarch64_linux.rs
new file mode 100644
index 00000000000..e238d0237eb
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/aarch64_linux.rs
@@ -0,0 +1,273 @@
+//! Aarch64 targets have two possible implementations for atomics:
+//! 1. Load-Locked, Store-Conditional (LL/SC), older and slower.
+//! 2. Large System Extensions (LSE), newer and faster.
+//! To avoid breaking backwards compatibility, C toolchains introduced the concept of "outlined
+//! atomics", where atomic operations call into the compiler runtime, which dispatches between the
+//! two implementations depending on which is supported by the current CPU.
+//! See https://community.arm.com/arm-community-blogs/b/tools-software-ides-blog/posts/making-the-most-of-the-arm-architecture-in-gcc-10#:~:text=out%20of%20line%20atomics for more discussion.
+//!
+//! Currently we only support LL/SC, because LSE requires `getauxval` from libc in order to do runtime detection.
+//! Use the `compiler-rt` intrinsics if you want LSE support.
+//!
+//! Ported from `aarch64/lse.S` in LLVM's compiler-rt.
+//!
+//! Generate functions for each of the following symbols:
+//!  __aarch64_casM_ORDER
+//!  __aarch64_swpN_ORDER
+//!  __aarch64_ldaddN_ORDER
+//!  __aarch64_ldclrN_ORDER
+//!  __aarch64_ldeorN_ORDER
+//!  __aarch64_ldsetN_ORDER
+//! for N = {1, 2, 4, 8}, M = {1, 2, 4, 8, 16}, ORDER = { relax, acq, rel, acq_rel }
+//!
+//! The original `lse.S` has some truly horrifying code that expects to be compiled multiple times with different constants.
+//! We do something similar, but with macro arguments.
+#![cfg_attr(feature = "c", allow(unused_macros))] // avoid putting the macros into a submodule
+
+// We don't do runtime dispatch so we don't have to worry about the `__aarch64_have_lse_atomics` global ctor.
+
+/// Translate a byte size to a Rust type.
+#[rustfmt::skip]
+macro_rules! int_ty {
+    (1) => { i8 };
+    (2) => { i16 };
+    (4) => { i32 };
+    (8) => { i64 };
+    (16) => { i128 };
+}
+
+/// Given a byte size and a register number, return a register of the appropriate size.
+///
+/// See <https://developer.arm.com/documentation/102374/0101/Registers-in-AArch64---general-purpose-registers>.
+#[rustfmt::skip]
+macro_rules! reg {
+    (1, $num:literal) => { concat!("w", $num) };
+    (2, $num:literal) => { concat!("w", $num) };
+    (4, $num:literal) => { concat!("w", $num) };
+    (8, $num:literal) => { concat!("x", $num) };
+}
+
+/// Given an atomic ordering, translate it to the acquire suffix for the ldxr aarch64 ASM instruction.
+#[rustfmt::skip]
+macro_rules! acquire {
+    (Relaxed) => { "" };
+    (Acquire) => { "a" };
+    (Release) => { "" };
+    (AcqRel) => { "a" };
+}
+
+/// Given an atomic ordering, translate it to the release suffix for the stxr aarch64 ASM instruction.
+#[rustfmt::skip]
+macro_rules! release {
+    (Relaxed) => { "" };
+    (Acquire) => { "" };
+    (Release) => { "l" };
+    (AcqRel) => { "l" };
+}
+
+/// Given a size in bytes, translate it to the byte suffix for an aarch64 ASM instruction.
+#[rustfmt::skip]
+macro_rules! size {
+    (1) => { "b" };
+    (2) => { "h" };
+    (4) => { "" };
+    (8) => { "" };
+    (16) => { "" };
+}
+
+/// Given a byte size, translate it to an Unsigned eXTend instruction
+/// with the correct semantics.
+///
+/// See <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/UXTB--Unsigned-Extend-Byte--an-alias-of-UBFM->
+#[rustfmt::skip]
+macro_rules! uxt {
+    (1) => { "uxtb" };
+    (2) => { "uxth" };
+    ($_:tt) => { "mov" };
+}
+
+/// Given an atomic ordering and byte size, translate it to a LoaD eXclusive Register instruction
+/// with the correct semantics.
+///
+/// See <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/LDXR--Load-Exclusive-Register->.
+macro_rules! ldxr {
+    ($ordering:ident, $bytes:tt) => {
+        concat!("ld", acquire!($ordering), "xr", size!($bytes))
+    };
+}
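+// For example, `ldxr!(Acquire, 2)` expands to `"ldaxrh"` and `ldxr!(Relaxed, 8)` to `"ldxr"`.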
+
+/// Given an atomic ordering and byte size, translate it to a STore eXclusive Register instruction
+/// with the correct semantics.
+///
+/// See <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/STXR--Store-Exclusive-Register->.
+macro_rules! stxr {
+    ($ordering:ident, $bytes:tt) => {
+        concat!("st", release!($ordering), "xr", size!($bytes))
+    };
+}
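+// For example, `stxr!(Release, 1)` expands to `"stlxrb"` and `stxr!(Relaxed, 4)` to `"stxr"`.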
+
+/// Given an atomic ordering, translate it to a LoaD eXclusive Pair of registers instruction
+/// with the correct semantics.
+///
+/// See <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/LDXP--Load-Exclusive-Pair-of-Registers->
+macro_rules! ldxp {
+    ($ordering:ident) => {
+        concat!("ld", acquire!($ordering), "xp")
+    };
+}
+
+/// Given an atomic ordering, translate it to a STore eXclusive Pair of registers instruction
+/// with the correct semantics.
+///
+/// See <https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/STXP--Store-Exclusive-Pair-of-registers->.
+macro_rules! stxp {
+    ($ordering:ident) => {
+        concat!("st", release!($ordering), "xp")
+    };
+}
+
+/// See <https://doc.rust-lang.org/stable/std/sync/atomic/struct.AtomicI8.html#method.compare_and_swap>.
+macro_rules! compare_and_swap {
+    ($ordering:ident, $bytes:tt, $name:ident) => {
+        intrinsics! {
+            #[maybe_use_optimized_c_shim]
+            #[unsafe(naked)]
+            pub unsafe extern "C" fn $name (
+                expected: int_ty!($bytes), desired: int_ty!($bytes), ptr: *mut int_ty!($bytes)
+            ) -> int_ty!($bytes) {
+                // We can't use `AtomicI8::compare_and_swap`; we *are* compare_and_swap.
+                core::arch::naked_asm! {
+                    // UXT s(tmp0), s(0)
+                    concat!(uxt!($bytes), " ", reg!($bytes, 16), ", ", reg!($bytes, 0)),
+                    "0:",
+                    // LDXR   s(0), [x2]
+                    concat!(ldxr!($ordering, $bytes), " ", reg!($bytes, 0), ", [x2]"),
+                    // cmp    s(0), s(tmp0)
+                    concat!("cmp ", reg!($bytes, 0), ", ", reg!($bytes, 16)),
+                    "bne    1f",
+                    // STXR   w(tmp1), s(1), [x2]
+                    concat!(stxr!($ordering, $bytes), " w17, ", reg!($bytes, 1), ", [x2]"),
+                    "cbnz   w17, 0b",
+                    "1:",
+                    "ret",
+                }
+            }
+        }
+    };
+}
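+
+// As an illustration, `compare_and_swap!(Relaxed, 1, __aarch64_cas1_relax)` emits a naked function
+// whose body is roughly:
+//
+//     uxtb  w16, w0        // zero-extend the expected value into a scratch register
+//     0:
+//     ldxrb w0, [x2]       // load-exclusive the current value
+//     cmp   w0, w16
+//     bne   1f             // mismatch: bail out, returning the observed value
+//     stxrb w17, w1, [x2]  // try to store the desired value
+//     cbnz  w17, 0b        // the exclusive store failed; retry
+//     1:
+//     ret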
+
+// i128 uses a completely different impl, so it has its own macro.
+macro_rules! compare_and_swap_i128 {
+    ($ordering:ident, $name:ident) => {
+        intrinsics! {
+            #[maybe_use_optimized_c_shim]
+            #[unsafe(naked)]
+            pub unsafe extern "C" fn $name (
+                expected: i128, desired: i128, ptr: *mut i128
+            ) -> i128 {
+                core::arch::naked_asm! {
+                    "mov    x16, x0",
+                    "mov    x17, x1",
+                    "0:",
+                    // LDXP   x0, x1, [x4]
+                    concat!(ldxp!($ordering), " x0, x1, [x4]"),
+                    "cmp    x0, x16",
+                    "ccmp   x1, x17, #0, eq",
+                    "bne    1f",
+                    // STXP   w(tmp2), x2, x3, [x4]
+                    concat!(stxp!($ordering), " w15, x2, x3, [x4]"),
+                    "cbnz   w15, 0b",
+                    "1:",
+                    "ret",
+                }
+            }
+        }
+    };
+}
+
+/// See <https://doc.rust-lang.org/stable/std/sync/atomic/struct.AtomicI8.html#method.swap>.
+macro_rules! swap {
+    ($ordering:ident, $bytes:tt, $name:ident) => {
+        intrinsics! {
+            #[maybe_use_optimized_c_shim]
+            #[unsafe(naked)]
+            pub unsafe extern "C" fn $name (
+                left: int_ty!($bytes), right_ptr: *mut int_ty!($bytes)
+            ) -> int_ty!($bytes) {
+                core::arch::naked_asm! {
+                    // mov    s(tmp0), s(0)
+                    concat!("mov ", reg!($bytes, 16), ", ", reg!($bytes, 0)),
+                    "0:",
+                    // LDXR   s(0), [x1]
+                    concat!(ldxr!($ordering, $bytes), " ", reg!($bytes, 0), ", [x1]"),
+                    // STXR   w(tmp1), s(tmp0), [x1]
+                    concat!(stxr!($ordering, $bytes), " w17, ", reg!($bytes, 16), ", [x1]"),
+                    "cbnz   w17, 0b",
+                    "ret",
+                }
+            }
+        }
+    };
+}
+
+/// See (e.g.) <https://doc.rust-lang.org/stable/std/sync/atomic/struct.AtomicI8.html#method.fetch_add>.
+macro_rules! fetch_op {
+    ($ordering:ident, $bytes:tt, $name:ident, $op:literal) => {
+        intrinsics! {
+            #[maybe_use_optimized_c_shim]
+            #[unsafe(naked)]
+            pub unsafe extern "C" fn $name (
+                val: int_ty!($bytes), ptr: *mut int_ty!($bytes)
+            ) -> int_ty!($bytes) {
+                core::arch::naked_asm! {
+                    // mov    s(tmp0), s(0)
+                    concat!("mov ", reg!($bytes, 16), ", ", reg!($bytes, 0)),
+                    "0:",
+                    // LDXR   s(0), [x1]
+                    concat!(ldxr!($ordering, $bytes), " ", reg!($bytes, 0), ", [x1]"),
+                    // OP     s(tmp1), s(0), s(tmp0)
+                    concat!($op, " ", reg!($bytes, 17), ", ", reg!($bytes, 0), ", ", reg!($bytes, 16)),
+                    // STXR   w(tmp2), s(tmp1), [x1]
+                    concat!(stxr!($ordering, $bytes), " w15, ", reg!($bytes, 17), ", [x1]"),
+                    "cbnz  w15, 0b",
+                    "ret",
+                }
+            }
+        }
+    }
+}
+
+// We need a single macro to pass to `foreach_ldadd`.
+macro_rules! add {
+    ($ordering:ident, $bytes:tt, $name:ident) => {
+        fetch_op! { $ordering, $bytes, $name, "add" }
+    };
+}
+
+macro_rules! and {
+    ($ordering:ident, $bytes:tt, $name:ident) => {
+        fetch_op! { $ordering, $bytes, $name, "bic" }
+    };
+}
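+// Note: the `__aarch64_ldclr*` intrinsics clear the bits that are set in the operand, so the
+// fetch op is `bic` (`old & !val`) rather than `and`.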
+
+macro_rules! xor {
+    ($ordering:ident, $bytes:tt, $name:ident) => {
+        fetch_op! { $ordering, $bytes, $name, "eor" }
+    };
+}
+
+macro_rules! or {
+    ($ordering:ident, $bytes:tt, $name:ident) => {
+        fetch_op! { $ordering, $bytes, $name, "orr" }
+    };
+}
+
+// See `generate_aarch64_outlined_atomics` in build.rs.
+include!(concat!(env!("OUT_DIR"), "/outlined_atomics.rs"));
+foreach_cas!(compare_and_swap);
+foreach_cas16!(compare_and_swap_i128);
+foreach_swp!(swap);
+foreach_ldadd!(add);
+foreach_ldclr!(and);
+foreach_ldeor!(xor);
+foreach_ldset!(or);
diff --git a/library/compiler-builtins/compiler-builtins/src/arm.rs b/library/compiler-builtins/compiler-builtins/src/arm.rs
new file mode 100644
index 00000000000..a9107e3cdfd
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/arm.rs
@@ -0,0 +1,280 @@
+#![cfg(not(feature = "no-asm"))]
+
+// Interfaces used by naked trampolines.
+extern "C" {
+    fn __udivmodsi4(a: u32, b: u32, rem: *mut u32) -> u32;
+    fn __udivmoddi4(a: u64, b: u64, rem: *mut u64) -> u64;
+    fn __divmoddi4(a: i64, b: i64, rem: *mut i64) -> i64;
+}
+
+extern "aapcs" {
+    // AAPCS is not always the correct ABI for these intrinsics, but we only use this to
+    // forward another `__aeabi_` call so it doesn't matter.
+    fn __aeabi_idiv(a: i32, b: i32) -> i32;
+}
+
+intrinsics! {
+    // NOTE This function and the ones below are implemented using assembly because they are using a
+    // custom calling convention which can't be implemented using a normal Rust function.
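+    // For example, AEABI specifies that `__aeabi_uidivmod` returns the quotient in r0 and the
+    // remainder in r1 (the 64-bit variants use r0-r3), so these shims marshal the results of the
+    // plain C-ABI helpers above into those registers.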
+    #[unsafe(naked)]
+    #[cfg(not(target_env = "msvc"))]
+    pub unsafe extern "C" fn __aeabi_uidivmod() {
+        core::arch::naked_asm!(
+            "push {{lr}}",
+            "sub sp, sp, #4",
+            "mov r2, sp",
+            "bl {trampoline}",
+            "ldr r1, [sp]",
+            "add sp, sp, #4",
+            "pop {{pc}}",
+            trampoline = sym crate::arm::__udivmodsi4
+        );
+    }
+
+    #[unsafe(naked)]
+    pub unsafe extern "C" fn __aeabi_uldivmod() {
+        core::arch::naked_asm!(
+            "push {{r4, lr}}",
+            "sub sp, sp, #16",
+            "add r4, sp, #8",
+            "str r4, [sp]",
+            "bl {trampoline}",
+            "ldr r2, [sp, #8]",
+            "ldr r3, [sp, #12]",
+            "add sp, sp, #16",
+            "pop {{r4, pc}}",
+            trampoline = sym crate::arm::__udivmoddi4
+        );
+    }
+
+    #[unsafe(naked)]
+    pub unsafe extern "C" fn __aeabi_idivmod() {
+        core::arch::naked_asm!(
+            "push {{r0, r1, r4, lr}}",
+            "bl {trampoline}",
+            "pop {{r1, r2}}",
+            "muls r2, r2, r0",
+            "subs r1, r1, r2",
+            "pop {{r4, pc}}",
+            trampoline = sym crate::arm::__aeabi_idiv,
+        );
+    }
+
+    #[unsafe(naked)]
+    pub unsafe extern "C" fn __aeabi_ldivmod() {
+        core::arch::naked_asm!(
+            "push {{r4, lr}}",
+            "sub sp, sp, #16",
+            "add r4, sp, #8",
+            "str r4, [sp]",
+            "bl {trampoline}",
+            "ldr r2, [sp, #8]",
+            "ldr r3, [sp, #12]",
+            "add sp, sp, #16",
+            "pop {{r4, pc}}",
+            trampoline = sym crate::arm::__divmoddi4,
+        );
+    }
+
+    // FIXME(arm): The `*4` and `*8` variants should be defined as aliases.
+
+    /// `memcpy` provided with the `aapcs` ABI.
+    ///
+    /// # Safety
+    ///
+    /// Usual `memcpy` requirements apply.
+    #[cfg(not(target_vendor = "apple"))]
+    pub unsafe extern "aapcs" fn __aeabi_memcpy(dst: *mut u8, src: *const u8, n: usize) {
+        // SAFETY: memcpy preconditions apply.
+        unsafe { crate::mem::memcpy(dst, src, n) };
+    }
+
+    /// `memcpy` for 4-byte alignment.
+    ///
+    /// # Safety
+    ///
+    /// Usual `memcpy` requirements apply. Additionally, `dest` and `src` must be aligned to
+    /// four bytes.
+    #[cfg(not(target_vendor = "apple"))]
+    pub unsafe extern "aapcs" fn __aeabi_memcpy4(dst: *mut u8, src: *const u8, n: usize) {
+        // We are guaranteed 4-alignment, so accessing at u32 is okay.
+        let mut dst = dst.cast::<u32>();
+        let mut src = src.cast::<u32>();
+        debug_assert!(dst.is_aligned());
+        debug_assert!(src.is_aligned());
+        let mut n = n;
+
+        while n >= 4 {
+            // SAFETY: `dst` and `src` are both valid for at least 4 bytes, from
+            // `memcpy` preconditions and the loop guard.
+            unsafe { *dst = *src };
+
+            // FIXME(addr): if we can make this end-of-address-space safe without losing
+            // performance, we may want to consider that.
+            // SAFETY: memcpy is not expected to work at the end of the address space
+            unsafe {
+                dst = dst.offset(1);
+                src = src.offset(1);
+            }
+
+            n -= 4;
+        }
+
+        // SAFETY: `dst` and `src` will still be valid for `n` bytes
+        unsafe { __aeabi_memcpy(dst.cast::<u8>(), src.cast::<u8>(), n) };
+    }
+
+    /// `memcpy` for 8-byte alignment.
+    ///
+    /// # Safety
+    ///
+    /// Usual `memcpy` requirements apply. Additionally, `dest` and `src` must be aligned to
+    /// eight bytes.
+    #[cfg(not(target_vendor = "apple"))]
+    pub unsafe extern "aapcs" fn __aeabi_memcpy8(dst: *mut u8, src: *const u8, n: usize) {
+        debug_assert!(dst.addr() & 7 == 0);
+        debug_assert!(src.addr() & 7 == 0);
+
+        // SAFETY: memcpy preconditions apply, less strict alignment.
+        unsafe { __aeabi_memcpy4(dst, src, n) };
+    }
+
+    /// `memmove` provided with the `aapcs` ABI.
+    ///
+    /// # Safety
+    ///
+    /// Usual `memmove` requirements apply.
+    #[cfg(not(target_vendor = "apple"))]
+    pub unsafe extern "aapcs" fn __aeabi_memmove(dst: *mut u8, src: *const u8, n: usize) {
+        // SAFETY: memmove preconditions apply.
+        unsafe { crate::mem::memmove(dst, src, n) };
+    }
+
+    /// `memmove` for 4-byte alignment.
+    ///
+    /// # Safety
+    ///
+    /// Usual `memmove` requirements apply. Additionally, `dest` and `src` must be aligned to
+    /// four bytes.
+    #[cfg(not(any(target_vendor = "apple", target_env = "msvc")))]
+    pub unsafe extern "aapcs" fn __aeabi_memmove4(dst: *mut u8, src: *const u8, n: usize) {
+        debug_assert!(dst.addr() & 3 == 0);
+        debug_assert!(src.addr() & 3 == 0);
+
+        // SAFETY: same preconditions, less strict alignment.
+        unsafe { __aeabi_memmove(dst, src, n) };
+    }
+
+    /// `memmove` for 8-byte alignment.
+    ///
+    /// # Safety
+    ///
+    /// Usual `memmove` requirements apply. Additionally, `dst` and `src` must be aligned to
+    /// eight bytes.
+    #[cfg(not(any(target_vendor = "apple", target_env = "msvc")))]
+    pub unsafe extern "aapcs" fn __aeabi_memmove8(dst: *mut u8, src: *const u8, n: usize) {
+        debug_assert!(dst.addr() & 7 == 0);
+        debug_assert!(src.addr() & 7 == 0);
+
+        // SAFETY: memmove preconditions apply, less strict alignment.
+        unsafe { __aeabi_memmove(dst, src, n) };
+    }
+
+    /// `memset` provided with the `aapcs` ABI.
+    ///
+    /// # Safety
+    ///
+    /// Usual `memset` requirements apply.
+    #[cfg(not(target_vendor = "apple"))]
+    pub unsafe extern "aapcs" fn __aeabi_memset(dst: *mut u8, n: usize, c: i32) {
+        // Note the different argument order
+        // SAFETY: memset preconditions apply.
+        unsafe { crate::mem::memset(dst, c, n) };
+    }
+
+    /// `memset` for 4-byte alignment.
+    ///
+    /// # Safety
+    ///
+    /// Usual `memset` requirements apply. Additionally, `dst` must be aligned to four bytes.
+    #[cfg(not(target_vendor = "apple"))]
+    pub unsafe extern "aapcs" fn __aeabi_memset4(dst: *mut u8, n: usize, c: i32) {
+        let mut dst = dst.cast::<u32>();
+        debug_assert!(dst.is_aligned());
+        let mut n = n;
+
+        let byte = (c as u32) & 0xff;
+        let c = (byte << 24) | (byte << 16) | (byte << 8) | byte;
+
+        while n >= 4 {
+            // SAFETY: `dst` is valid for at least 4 bytes, from `memset` preconditions and
+            // the loop guard.
+            unsafe { *dst = c };
+
+            // FIXME(addr): if we can make this end-of-address-space safe without losing
+            // performance, we may want to consider that.
+            // SAFETY: memset is not expected to work at the end of the address space
+            unsafe {
+                dst = dst.offset(1);
+            }
+            n -= 4;
+        }
+
+        // SAFETY: `dst` will still be valid for `n` bytes
+        unsafe { __aeabi_memset(dst.cast::<u8>(), n, byte as i32) };
+    }
+
+    /// `memset` for 8-byte alignment.
+    ///
+    /// # Safety
+    ///
+    /// Usual `memset` requirements apply. Additionally, `dst` must be aligned to eight bytes.
+    #[cfg(not(target_vendor = "apple"))]
+    pub unsafe extern "aapcs" fn __aeabi_memset8(dst: *mut u8, n: usize, c: i32) {
+        debug_assert!(dst.addr() & 7 == 0);
+
+        // SAFETY: memset preconditions apply, less strict alignment.
+        unsafe { __aeabi_memset4(dst, n, c) };
+    }
+
+    /// `memclr` provided with the `aapcs` ABI.
+    ///
+    /// # Safety
+    ///
+    /// Usual `memclr` requirements apply.
+    #[cfg(not(target_vendor = "apple"))]
+    pub unsafe extern "aapcs" fn __aeabi_memclr(dst: *mut u8, n: usize) {
+        // SAFETY: memclr preconditions apply, less strict alignment.
+        unsafe { __aeabi_memset(dst, n, 0) };
+    }
+
+    /// `memclr` for 4-byte alignment.
+    ///
+    /// # Safety
+    ///
+    /// Usual `memclr` requirements apply. Additionally, `dst` must be aligned to four bytes.
+    #[cfg(not(any(target_vendor = "apple", target_env = "msvc")))]
+    pub unsafe extern "aapcs" fn __aeabi_memclr4(dst: *mut u8, n: usize) {
+        debug_assert!(dst.addr() & 3 == 0);
+
+        // SAFETY: memclr preconditions apply, less strict alignment.
+        unsafe { __aeabi_memset4(dst, n, 0) };
+    }
+
+    /// `memclr` for 8-byte alignment.
+    ///
+    /// # Safety
+    ///
+    /// Usual `memclr` requirements apply. Additionally, `dst` must be aligned to eight bytes.
+    #[cfg(not(any(target_vendor = "apple", target_env = "msvc")))]
+    pub unsafe extern "aapcs" fn __aeabi_memclr8(dst: *mut u8, n: usize) {
+        debug_assert!(dst.addr() & 7 == 0);
+
+        // SAFETY: memclr preconditions apply, less strict alignment.
+        unsafe { __aeabi_memset4(dst, n, 0) };
+    }
+}
diff --git a/library/compiler-builtins/compiler-builtins/src/arm_linux.rs b/library/compiler-builtins/compiler-builtins/src/arm_linux.rs
new file mode 100644
index 00000000000..6ce67ba719c
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/arm_linux.rs
@@ -0,0 +1,290 @@
+use core::sync::atomic::{AtomicU32, Ordering};
+use core::{arch, mem};
+
+// Kernel-provided user-mode helper functions:
+// https://www.kernel.org/doc/Documentation/arm/kernel_user_helpers.txt
+unsafe fn __kuser_cmpxchg(oldval: u32, newval: u32, ptr: *mut u32) -> bool {
+    let f: extern "C" fn(u32, u32, *mut u32) -> u32 = mem::transmute(0xffff0fc0usize as *const ());
+    f(oldval, newval, ptr) == 0
+}
+
+unsafe fn __kuser_memory_barrier() {
+    let f: extern "C" fn() = mem::transmute(0xffff0fa0usize as *const ());
+    f();
+}
+
+// Word-align a pointer
+fn align_ptr<T>(ptr: *mut T) -> *mut u32 {
+    // This gives us a mask of 0 when T == u32 since the pointer is already
+    // supposed to be aligned, which avoids any masking in that case.
+    let ptr_mask = 3 & (4 - mem::size_of::<T>());
+    (ptr as usize & !ptr_mask) as *mut u32
+}
+
+// Calculate the shift and mask of a value inside an aligned word
+fn get_shift_mask<T>(ptr: *mut T) -> (u32, u32) {
+    // Mask to get the low byte/halfword/word
+    let mask = match mem::size_of::<T>() {
+        1 => 0xff,
+        2 => 0xffff,
+        4 => 0xffffffff,
+        _ => unreachable!(),
+    };
+
+    // If we are on big-endian then we need to adjust the shift accordingly
+    let endian_adjust = if cfg!(target_endian = "little") {
+        0
+    } else {
+        4 - mem::size_of::<T>() as u32
+    };
+
+    // Shift to get the desired element in the word
+    let ptr_mask = 3 & (4 - mem::size_of::<T>());
+    let shift = ((ptr as usize & ptr_mask) as u32 ^ endian_adjust) * 8;
+
+    (shift, mask)
+}
+
+// Extract a value from an aligned word
+fn extract_aligned(aligned: u32, shift: u32, mask: u32) -> u32 {
+    (aligned >> shift) & mask
+}
+
+// Insert a value into an aligned word
+fn insert_aligned(aligned: u32, val: u32, shift: u32, mask: u32) -> u32 {
+    (aligned & !(mask << shift)) | ((val & mask) << shift)
+}
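+
+// For example, a `*mut u8` that points at byte offset 2 within its word on a little-endian target
+// yields `shift = 16` and `mask = 0xff`, so `extract_aligned(w, 16, 0xff)` reads that byte and
+// `insert_aligned(w, v, 16, 0xff)` replaces it while leaving the other three bytes untouched.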
+
+/// Performs a relaxed atomic load of 4 bytes at `ptr`. Some of the bytes are allowed to be out of
+/// bounds as long as `size_of::<T>()` bytes are in bounds.
+///
+/// # Safety
+///
+/// - `ptr` must be 4-aligned.
+/// - `size_of::<T>()` must be at most 4.
+/// - if `size_of::<T>() == 1`, `ptr` or `ptr` offset by 1, 2 or 3 bytes must be valid for a relaxed
+///   atomic read of 1 byte.
+/// - if `size_of::<T>() == 2`, `ptr` or `ptr` offset by 2 bytes must be valid for a relaxed atomic
+///   read of 2 bytes.
+/// - if `size_of::<T>() == 4`, `ptr` must be valid for a relaxed atomic read of 4 bytes.
+unsafe fn atomic_load_aligned<T>(ptr: *mut u32) -> u32 {
+    if mem::size_of::<T>() == 4 {
+        // SAFETY: As `T` has a size of 4, the caller guarantees this is sound.
+        unsafe { AtomicU32::from_ptr(ptr).load(Ordering::Relaxed) }
+    } else {
+        // SAFETY:
+        // As all 4 bytes pointed to by `ptr` might not be dereferenceable due to being out of
+        // bounds when doing atomic operations on a `u8`/`i8`/`u16`/`i16`, inline ASM is used to
+        // avoid causing undefined behaviour. However, as `ptr` is 4-aligned and at least 1 byte of
+        // `ptr` is dereferenceable, the load won't cause a segfault as the page size is always
+        // larger than 4 bytes.
+        // The `ldr` instruction does not touch the stack or flags, or write to memory, so
+        // `nostack`, `preserves_flags` and `readonly` are sound. The caller guarantees that `ptr` is
+        // 4-aligned, as required by `ldr`.
+        unsafe {
+            let res: u32;
+            arch::asm!(
+                "ldr {res}, [{ptr}]",
+                ptr = in(reg) ptr,
+                res = lateout(reg) res,
+                options(nostack, preserves_flags, readonly)
+            );
+            res
+        }
+    }
+}
+
+// Generic atomic read-modify-write operation
+unsafe fn atomic_rmw<T, F: Fn(u32) -> u32, G: Fn(u32, u32) -> u32>(ptr: *mut T, f: F, g: G) -> u32 {
+    let aligned_ptr = align_ptr(ptr);
+    let (shift, mask) = get_shift_mask(ptr);
+
+    loop {
+        let curval_aligned = atomic_load_aligned::<T>(aligned_ptr);
+        let curval = extract_aligned(curval_aligned, shift, mask);
+        let newval = f(curval);
+        let newval_aligned = insert_aligned(curval_aligned, newval, shift, mask);
+        if __kuser_cmpxchg(curval_aligned, newval_aligned, aligned_ptr) {
+            return g(curval, newval);
+        }
+    }
+}
+
+// Generic atomic compare-exchange operation
+unsafe fn atomic_cmpxchg<T>(ptr: *mut T, oldval: u32, newval: u32) -> u32 {
+    let aligned_ptr = align_ptr(ptr);
+    let (shift, mask) = get_shift_mask(ptr);
+
+    loop {
+        let curval_aligned = atomic_load_aligned::<T>(aligned_ptr);
+        let curval = extract_aligned(curval_aligned, shift, mask);
+        if curval != oldval {
+            return curval;
+        }
+        let newval_aligned = insert_aligned(curval_aligned, newval, shift, mask);
+        if __kuser_cmpxchg(curval_aligned, newval_aligned, aligned_ptr) {
+            return oldval;
+        }
+    }
+}
+
+macro_rules! atomic_rmw {
+    ($name:ident, $ty:ty, $op:expr, $fetch:expr) => {
+        intrinsics! {
+            pub unsafe extern "C" fn $name(ptr: *mut $ty, val: $ty) -> $ty {
+                atomic_rmw(ptr, |x| $op(x as $ty, val) as u32, |old, new| $fetch(old, new)) as $ty
+            }
+        }
+    };
+
+    (@old $name:ident, $ty:ty, $op:expr) => {
+        atomic_rmw!($name, $ty, $op, |old, _| old);
+    };
+
+    (@new $name:ident, $ty:ty, $op:expr) => {
+        atomic_rmw!($name, $ty, $op, |_, new| new);
+    };
+}
+macro_rules! atomic_cmpxchg {
+    ($name:ident, $ty:ty) => {
+        intrinsics! {
+            pub unsafe extern "C" fn $name(ptr: *mut $ty, oldval: $ty, newval: $ty) -> $ty {
+                atomic_cmpxchg(ptr, oldval as u32, newval as u32) as $ty
+            }
+        }
+    };
+}
+
+atomic_rmw!(@old __sync_fetch_and_add_1, u8, |a: u8, b: u8| a.wrapping_add(b));
+atomic_rmw!(@old __sync_fetch_and_add_2, u16, |a: u16, b: u16| a
+    .wrapping_add(b));
+atomic_rmw!(@old __sync_fetch_and_add_4, u32, |a: u32, b: u32| a
+    .wrapping_add(b));
+
+atomic_rmw!(@new __sync_add_and_fetch_1, u8, |a: u8, b: u8| a.wrapping_add(b));
+atomic_rmw!(@new __sync_add_and_fetch_2, u16, |a: u16, b: u16| a
+    .wrapping_add(b));
+atomic_rmw!(@new __sync_add_and_fetch_4, u32, |a: u32, b: u32| a
+    .wrapping_add(b));
+
+atomic_rmw!(@old __sync_fetch_and_sub_1, u8, |a: u8, b: u8| a.wrapping_sub(b));
+atomic_rmw!(@old __sync_fetch_and_sub_2, u16, |a: u16, b: u16| a
+    .wrapping_sub(b));
+atomic_rmw!(@old __sync_fetch_and_sub_4, u32, |a: u32, b: u32| a
+    .wrapping_sub(b));
+
+atomic_rmw!(@new __sync_sub_and_fetch_1, u8, |a: u8, b: u8| a.wrapping_sub(b));
+atomic_rmw!(@new __sync_sub_and_fetch_2, u16, |a: u16, b: u16| a
+    .wrapping_sub(b));
+atomic_rmw!(@new __sync_sub_and_fetch_4, u32, |a: u32, b: u32| a
+    .wrapping_sub(b));
+
+atomic_rmw!(@old __sync_fetch_and_and_1, u8, |a: u8, b: u8| a & b);
+atomic_rmw!(@old __sync_fetch_and_and_2, u16, |a: u16, b: u16| a & b);
+atomic_rmw!(@old __sync_fetch_and_and_4, u32, |a: u32, b: u32| a & b);
+
+atomic_rmw!(@new __sync_and_and_fetch_1, u8, |a: u8, b: u8| a & b);
+atomic_rmw!(@new __sync_and_and_fetch_2, u16, |a: u16, b: u16| a & b);
+atomic_rmw!(@new __sync_and_and_fetch_4, u32, |a: u32, b: u32| a & b);
+
+atomic_rmw!(@old __sync_fetch_and_or_1, u8, |a: u8, b: u8| a | b);
+atomic_rmw!(@old __sync_fetch_and_or_2, u16, |a: u16, b: u16| a | b);
+atomic_rmw!(@old __sync_fetch_and_or_4, u32, |a: u32, b: u32| a | b);
+
+atomic_rmw!(@new __sync_or_and_fetch_1, u8, |a: u8, b: u8| a | b);
+atomic_rmw!(@new __sync_or_and_fetch_2, u16, |a: u16, b: u16| a | b);
+atomic_rmw!(@new __sync_or_and_fetch_4, u32, |a: u32, b: u32| a | b);
+
+atomic_rmw!(@old __sync_fetch_and_xor_1, u8, |a: u8, b: u8| a ^ b);
+atomic_rmw!(@old __sync_fetch_and_xor_2, u16, |a: u16, b: u16| a ^ b);
+atomic_rmw!(@old __sync_fetch_and_xor_4, u32, |a: u32, b: u32| a ^ b);
+
+atomic_rmw!(@new __sync_xor_and_fetch_1, u8, |a: u8, b: u8| a ^ b);
+atomic_rmw!(@new __sync_xor_and_fetch_2, u16, |a: u16, b: u16| a ^ b);
+atomic_rmw!(@new __sync_xor_and_fetch_4, u32, |a: u32, b: u32| a ^ b);
+
+atomic_rmw!(@old __sync_fetch_and_nand_1, u8, |a: u8, b: u8| !(a & b));
+atomic_rmw!(@old __sync_fetch_and_nand_2, u16, |a: u16, b: u16| !(a & b));
+atomic_rmw!(@old __sync_fetch_and_nand_4, u32, |a: u32, b: u32| !(a & b));
+
+atomic_rmw!(@new __sync_nand_and_fetch_1, u8, |a: u8, b: u8| !(a & b));
+atomic_rmw!(@new __sync_nand_and_fetch_2, u16, |a: u16, b: u16| !(a & b));
+atomic_rmw!(@new __sync_nand_and_fetch_4, u32, |a: u32, b: u32| !(a & b));
+
+atomic_rmw!(@old __sync_fetch_and_max_1, i8, |a: i8, b: i8| if a > b {
+    a
+} else {
+    b
+});
+atomic_rmw!(@old __sync_fetch_and_max_2, i16, |a: i16, b: i16| if a > b {
+    a
+} else {
+    b
+});
+atomic_rmw!(@old __sync_fetch_and_max_4, i32, |a: i32, b: i32| if a > b {
+    a
+} else {
+    b
+});
+
+atomic_rmw!(@old __sync_fetch_and_umax_1, u8, |a: u8, b: u8| if a > b {
+    a
+} else {
+    b
+});
+atomic_rmw!(@old __sync_fetch_and_umax_2, u16, |a: u16, b: u16| if a > b {
+    a
+} else {
+    b
+});
+atomic_rmw!(@old __sync_fetch_and_umax_4, u32, |a: u32, b: u32| if a > b {
+    a
+} else {
+    b
+});
+
+atomic_rmw!(@old __sync_fetch_and_min_1, i8, |a: i8, b: i8| if a < b {
+    a
+} else {
+    b
+});
+atomic_rmw!(@old __sync_fetch_and_min_2, i16, |a: i16, b: i16| if a < b {
+    a
+} else {
+    b
+});
+atomic_rmw!(@old __sync_fetch_and_min_4, i32, |a: i32, b: i32| if a < b {
+    a
+} else {
+    b
+});
+
+atomic_rmw!(@old __sync_fetch_and_umin_1, u8, |a: u8, b: u8| if a < b {
+    a
+} else {
+    b
+});
+atomic_rmw!(@old __sync_fetch_and_umin_2, u16, |a: u16, b: u16| if a < b {
+    a
+} else {
+    b
+});
+atomic_rmw!(@old __sync_fetch_and_umin_4, u32, |a: u32, b: u32| if a < b {
+    a
+} else {
+    b
+});
+
+atomic_rmw!(@old __sync_lock_test_and_set_1, u8, |_: u8, b: u8| b);
+atomic_rmw!(@old __sync_lock_test_and_set_2, u16, |_: u16, b: u16| b);
+atomic_rmw!(@old __sync_lock_test_and_set_4, u32, |_: u32, b: u32| b);
+
+atomic_cmpxchg!(__sync_val_compare_and_swap_1, u8);
+atomic_cmpxchg!(__sync_val_compare_and_swap_2, u16);
+atomic_cmpxchg!(__sync_val_compare_and_swap_4, u32);
+
+intrinsics! {
+    pub unsafe extern "C" fn __sync_synchronize() {
+        __kuser_memory_barrier();
+    }
+}
diff --git a/library/compiler-builtins/compiler-builtins/src/avr.rs b/library/compiler-builtins/compiler-builtins/src/avr.rs
new file mode 100644
index 00000000000..359a1d1acc1
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/avr.rs
@@ -0,0 +1,23 @@
+intrinsics! {
+    pub unsafe extern "C" fn abort() -> ! {
+        // On AVRs, an architecture that doesn't support traps, unreachable code
+        // paths get lowered into calls to `abort`:
+        //
+        // https://github.com/llvm/llvm-project/blob/cbe8f3ad7621e402b050e768f400ff0d19c3aedd/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp#L4462
+        //
+        // When control gets here, it means that either core::intrinsics::abort()
+        // was called or undefined behavior has occurred, so there's not that
+        // much we can do to recover - we can't `panic!()`, because for all we
+        // know the environment is gone now, so panicking might end up with us
+        // getting back to this very function.
+        //
+        // So let's do the next best thing, loop.
+        //
+        // Alternatively we could (try to) restart the program, but since
+        // undefined behavior is undefined, there's really no obligation for us
+        // to do anything here - for all we care, we could just set the chip on
+        // fire; but that'd be bad for the environment.
+
+        loop {}
+    }
+}
diff --git a/library/compiler-builtins/compiler-builtins/src/float/add.rs b/library/compiler-builtins/compiler-builtins/src/float/add.rs
new file mode 100644
index 00000000000..0426c9cc44f
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/float/add.rs
@@ -0,0 +1,209 @@
+use crate::float::Float;
+use crate::int::{CastInto, Int, MinInt};
+
+/// Returns `a + b`
+fn add<F: Float>(a: F, b: F) -> F
+where
+    u32: CastInto<F::Int>,
+    F::Int: CastInto<u32>,
+    i32: CastInto<F::Int>,
+    F::Int: CastInto<i32>,
+{
+    let one = F::Int::ONE;
+    let zero = F::Int::ZERO;
+
+    let bits = F::BITS.cast();
+    let significand_bits = F::SIG_BITS;
+    let max_exponent = F::EXP_SAT;
+
+    let implicit_bit = F::IMPLICIT_BIT;
+    let significand_mask = F::SIG_MASK;
+    let sign_bit = F::SIGN_MASK as F::Int;
+    let abs_mask = sign_bit - one;
+    let exponent_mask = F::EXP_MASK;
+    let inf_rep = exponent_mask;
+    let quiet_bit = implicit_bit >> 1;
+    let qnan_rep = exponent_mask | quiet_bit;
+
+    let mut a_rep = a.to_bits();
+    let mut b_rep = b.to_bits();
+    let a_abs = a_rep & abs_mask;
+    let b_abs = b_rep & abs_mask;
+
+    // Detect if a or b is zero, infinity, or NaN.
+    if a_abs.wrapping_sub(one) >= inf_rep - one || b_abs.wrapping_sub(one) >= inf_rep - one {
+        // NaN + anything = qNaN
+        if a_abs > inf_rep {
+            return F::from_bits(a_abs | quiet_bit);
+        }
+        // anything + NaN = qNaN
+        if b_abs > inf_rep {
+            return F::from_bits(b_abs | quiet_bit);
+        }
+
+        if a_abs == inf_rep {
+            // +/-infinity + -/+infinity = qNaN
+            if (a.to_bits() ^ b.to_bits()) == sign_bit {
+                return F::from_bits(qnan_rep);
+            } else {
+                // +/-infinity + anything remaining = +/- infinity
+                return a;
+            }
+        }
+
+        // anything remaining + +/-infinity = +/-infinity
+        if b_abs == inf_rep {
+            return b;
+        }
+
+        // zero + anything = anything
+        if a_abs == MinInt::ZERO {
+            // but we need to get the sign right for zero + zero
+            if b_abs == MinInt::ZERO {
+                return F::from_bits(a.to_bits() & b.to_bits());
+            } else {
+                return b;
+            }
+        }
+
+        // anything + zero = anything
+        if b_abs == MinInt::ZERO {
+            return a;
+        }
+    }
+
+    // Swap a and b if necessary so that a has the larger absolute value.
+    if b_abs > a_abs {
+        // Don't use mem::swap because it may generate references to memcpy in unoptimized code.
+        let tmp = a_rep;
+        a_rep = b_rep;
+        b_rep = tmp;
+    }
+
+    // Extract the exponent and significand from the (possibly swapped) a and b.
+    let mut a_exponent: i32 = ((a_rep & exponent_mask) >> significand_bits).cast();
+    let mut b_exponent: i32 = ((b_rep & exponent_mask) >> significand_bits).cast();
+    let mut a_significand = a_rep & significand_mask;
+    let mut b_significand = b_rep & significand_mask;
+
+    // Normalize any denormals, and adjust the exponent accordingly.
+    if a_exponent == 0 {
+        let (exponent, significand) = F::normalize(a_significand);
+        a_exponent = exponent;
+        a_significand = significand;
+    }
+    if b_exponent == 0 {
+        let (exponent, significand) = F::normalize(b_significand);
+        b_exponent = exponent;
+        b_significand = significand;
+    }
+
+    // The sign of the result is the sign of the larger operand, a.  If they
+    // have opposite signs, we are performing a subtraction; otherwise addition.
+    let result_sign = a_rep & sign_bit;
+    let subtraction = ((a_rep ^ b_rep) & sign_bit) != zero;
+
+    // Shift the significands to give us round, guard and sticky, and or in the
+    // implicit significand bit.  (If we fell through from the denormal path it
+    // was already set by normalize(), but setting it twice won't hurt
+    // anything.)
+    a_significand = (a_significand | implicit_bit) << 3;
+    b_significand = (b_significand | implicit_bit) << 3;
+
+    // Shift the significand of b by the difference in exponents, with a sticky
+    // bottom bit to get rounding correct.
+    let align = a_exponent.wrapping_sub(b_exponent).cast();
+    if align != MinInt::ZERO {
+        if align < bits {
+            let sticky =
+                F::Int::from_bool(b_significand << bits.wrapping_sub(align).cast() != MinInt::ZERO);
+            b_significand = (b_significand >> align.cast()) | sticky;
+        } else {
+            b_significand = one; // sticky; b is known to be non-zero.
+        }
+    }
+    if subtraction {
+        a_significand = a_significand.wrapping_sub(b_significand);
+        // If a == -b, return +zero.
+        if a_significand == MinInt::ZERO {
+            return F::from_bits(MinInt::ZERO);
+        }
+
+        // If partial cancellation occurred, we need to left-shift the result
+        // and adjust the exponent:
+        if a_significand < implicit_bit << 3 {
+            let shift =
+                a_significand.leading_zeros() as i32 - (implicit_bit << 3).leading_zeros() as i32;
+            a_significand <<= shift;
+            a_exponent -= shift;
+        }
+    } else {
+        // addition
+        a_significand += b_significand;
+
+        // If the addition carried up, we need to right-shift the result and
+        // adjust the exponent:
+        if a_significand & (implicit_bit << 4) != MinInt::ZERO {
+            let sticky = F::Int::from_bool(a_significand & one != MinInt::ZERO);
+            a_significand = (a_significand >> 1) | sticky;
+            a_exponent += 1;
+        }
+    }
+
+    // If we have overflowed the type, return +/- infinity:
+    if a_exponent >= max_exponent as i32 {
+        return F::from_bits(inf_rep | result_sign);
+    }
+
+    if a_exponent <= 0 {
+        // Result is denormal before rounding; the exponent is zero and we
+        // need to shift the significand.
+        let shift = (1 - a_exponent).cast();
+        let sticky =
+            F::Int::from_bool((a_significand << bits.wrapping_sub(shift).cast()) != MinInt::ZERO);
+        a_significand = (a_significand >> shift.cast()) | sticky;
+        a_exponent = 0;
+    }
+
+    // Low three bits are round, guard, and sticky.
+    let a_significand_i32: i32 = a_significand.cast();
+    let round_guard_sticky: i32 = a_significand_i32 & 0x7;
+
+    // Shift the significand into place, and mask off the implicit bit.
+    let mut result = (a_significand >> 3) & significand_mask;
+
+    // Insert the exponent and sign.
+    result |= a_exponent.cast() << significand_bits;
+    result |= result_sign;
+
+    // Final rounding.  The result may overflow to infinity, but that is the
+    // correct result in that case.
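+    // Illustrative cases: with round/guard/sticky = 0b101 the dropped tail exceeds half an
+    // ULP, so we unconditionally round up; with 0b100 it is exactly half, and adding
+    // `result & one` rounds up only when the mantissa is odd, i.e. ties round to even.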
+    if round_guard_sticky > 0x4 {
+        result += one;
+    }
+    if round_guard_sticky == 0x4 {
+        result += result & one;
+    }
+
+    F::from_bits(result)
+}
+
+intrinsics! {
+    #[aapcs_on_arm]
+    #[arm_aeabi_alias = __aeabi_fadd]
+    pub extern "C" fn __addsf3(a: f32, b: f32) -> f32 {
+        add(a, b)
+    }
+
+    #[aapcs_on_arm]
+    #[arm_aeabi_alias = __aeabi_dadd]
+    pub extern "C" fn __adddf3(a: f64, b: f64) -> f64 {
+        add(a, b)
+    }
+
+    #[ppc_alias = __addkf3]
+    #[cfg(f128_enabled)]
+    pub extern "C" fn __addtf3(a: f128, b: f128) -> f128 {
+        add(a, b)
+    }
+}
diff --git a/library/compiler-builtins/compiler-builtins/src/float/cmp.rs b/library/compiler-builtins/compiler-builtins/src/float/cmp.rs
new file mode 100644
index 00000000000..296952821cb
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/float/cmp.rs
@@ -0,0 +1,248 @@
+#![allow(unreachable_code)]
+
+use crate::float::Float;
+use crate::int::MinInt;
+
+// https://github.com/llvm/llvm-project/blob/1e6ba3cd2fe96be00b6ed6ba28b3d9f9271d784d/compiler-rt/lib/builtins/fp_compare_impl.inc#L22
+#[cfg(target_arch = "avr")]
+pub type CmpResult = i8;
+
+// https://github.com/llvm/llvm-project/blob/1e6ba3cd2fe96be00b6ed6ba28b3d9f9271d784d/compiler-rt/lib/builtins/fp_compare_impl.inc#L25
+#[cfg(not(target_arch = "avr"))]
+pub type CmpResult = i32;
+
+#[derive(Clone, Copy)]
+enum Result {
+    Less,
+    Equal,
+    Greater,
+    Unordered,
+}
+
+impl Result {
+    fn to_le_abi(self) -> CmpResult {
+        match self {
+            Result::Less => -1,
+            Result::Equal => 0,
+            Result::Greater => 1,
+            Result::Unordered => 1,
+        }
+    }
+
+    fn to_ge_abi(self) -> CmpResult {
+        match self {
+            Result::Less => -1,
+            Result::Equal => 0,
+            Result::Greater => 1,
+            Result::Unordered => -1,
+        }
+    }
+}
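+// The two ABIs differ only in how an unordered (NaN) comparison is reported: the "LE" ABI
+// returns a positive value, so `a <= b` lowered to `__lesf2(a, b) <= 0` is false for NaN
+// operands, while the "GE" ABI returns a negative value, so `a >= b` lowered to
+// `__gesf2(a, b) >= 0` is false as well (presumably mirroring the compiler-rt convention
+// linked above; the exact lowering is up to the compiler).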
+
+fn cmp<F: Float>(a: F, b: F) -> Result {
+    let one = F::Int::ONE;
+    let zero = F::Int::ZERO;
+    let szero = F::SignedInt::ZERO;
+
+    let sign_bit = F::SIGN_MASK as F::Int;
+    let abs_mask = sign_bit - one;
+    let exponent_mask = F::EXP_MASK;
+    let inf_rep = exponent_mask;
+
+    let a_rep = a.to_bits();
+    let b_rep = b.to_bits();
+    let a_abs = a_rep & abs_mask;
+    let b_abs = b_rep & abs_mask;
+
+    // If either a or b is NaN, they are unordered.
+    if a_abs > inf_rep || b_abs > inf_rep {
+        return Result::Unordered;
+    }
+
+    // If a and b are both zeros, they are equal.
+    if a_abs | b_abs == zero {
+        return Result::Equal;
+    }
+
+    let a_srep = a.to_bits_signed();
+    let b_srep = b.to_bits_signed();
+
+    // If at least one of a and b is positive, we get the same result comparing
+    // a and b as signed integers as we would with a floating-point compare.
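+    // For example: 1.0_f32 = 0x3F80_0000 and 2.0_f32 = 0x4000_0000 already order correctly
+    // as integers, whereas -1.0_f32 = 0xBF80_0000 is *less than* -2.0_f32 = 0xC000_0000 when
+    // compared as signed integers, which is why the all-negative case below flips the
+    // comparison.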
+    if a_srep & b_srep >= szero {
+        if a_srep < b_srep {
+            Result::Less
+        } else if a_srep == b_srep {
+            Result::Equal
+        } else {
+            Result::Greater
+        }
+    // Otherwise, both are negative, so we need to flip the sense of the
+    // comparison to get the correct result.  (This assumes a twos- or ones-
+    // complement integer representation; if integers are represented in a
+    // sign-magnitude representation, then this flip is incorrect).
+    } else if a_srep > b_srep {
+        Result::Less
+    } else if a_srep == b_srep {
+        Result::Equal
+    } else {
+        Result::Greater
+    }
+}
+
+fn unord<F: Float>(a: F, b: F) -> bool {
+    let one = F::Int::ONE;
+
+    let sign_bit = F::SIGN_MASK as F::Int;
+    let abs_mask = sign_bit - one;
+    let exponent_mask = F::EXP_MASK;
+    let inf_rep = exponent_mask;
+
+    let a_rep = a.to_bits();
+    let b_rep = b.to_bits();
+    let a_abs = a_rep & abs_mask;
+    let b_abs = b_rep & abs_mask;
+
+    a_abs > inf_rep || b_abs > inf_rep
+}
+
+intrinsics! {
+    pub extern "C" fn __lesf2(a: f32, b: f32) -> crate::float::cmp::CmpResult {
+        cmp(a, b).to_le_abi()
+    }
+
+    pub extern "C" fn __gesf2(a: f32, b: f32) -> crate::float::cmp::CmpResult {
+        cmp(a, b).to_ge_abi()
+    }
+
+    #[arm_aeabi_alias = __aeabi_fcmpun]
+    pub extern "C" fn __unordsf2(a: f32, b: f32) -> crate::float::cmp::CmpResult {
+        unord(a, b) as crate::float::cmp::CmpResult
+    }
+
+    pub extern "C" fn __eqsf2(a: f32, b: f32) -> crate::float::cmp::CmpResult {
+        cmp(a, b).to_le_abi()
+    }
+
+    pub extern "C" fn __ltsf2(a: f32, b: f32) -> crate::float::cmp::CmpResult {
+        cmp(a, b).to_le_abi()
+    }
+
+    pub extern "C" fn __nesf2(a: f32, b: f32) -> crate::float::cmp::CmpResult {
+        cmp(a, b).to_le_abi()
+    }
+
+    pub extern "C" fn __gtsf2(a: f32, b: f32) -> crate::float::cmp::CmpResult {
+        cmp(a, b).to_ge_abi()
+    }
+
+    pub extern "C" fn __ledf2(a: f64, b: f64) -> crate::float::cmp::CmpResult {
+        cmp(a, b).to_le_abi()
+    }
+
+    pub extern "C" fn __gedf2(a: f64, b: f64) -> crate::float::cmp::CmpResult {
+        cmp(a, b).to_ge_abi()
+    }
+
+    #[arm_aeabi_alias = __aeabi_dcmpun]
+    pub extern "C" fn __unorddf2(a: f64, b: f64) -> crate::float::cmp::CmpResult {
+        unord(a, b) as crate::float::cmp::CmpResult
+    }
+
+    pub extern "C" fn __eqdf2(a: f64, b: f64) -> crate::float::cmp::CmpResult {
+        cmp(a, b).to_le_abi()
+    }
+
+    pub extern "C" fn __ltdf2(a: f64, b: f64) -> crate::float::cmp::CmpResult {
+        cmp(a, b).to_le_abi()
+    }
+
+    pub extern "C" fn __nedf2(a: f64, b: f64) -> crate::float::cmp::CmpResult {
+        cmp(a, b).to_le_abi()
+    }
+
+    pub extern "C" fn __gtdf2(a: f64, b: f64) -> crate::float::cmp::CmpResult {
+        cmp(a, b).to_ge_abi()
+    }
+}
+
+#[cfg(f128_enabled)]
+intrinsics! {
+    #[ppc_alias = __lekf2]
+    pub extern "C" fn __letf2(a: f128, b: f128) -> crate::float::cmp::CmpResult {
+        cmp(a, b).to_le_abi()
+    }
+
+    #[ppc_alias = __gekf2]
+    pub extern "C" fn __getf2(a: f128, b: f128) -> crate::float::cmp::CmpResult {
+        cmp(a, b).to_ge_abi()
+    }
+
+    #[ppc_alias = __unordkf2]
+    pub extern "C" fn __unordtf2(a: f128, b: f128) -> crate::float::cmp::CmpResult {
+        unord(a, b) as crate::float::cmp::CmpResult
+    }
+
+    #[ppc_alias = __eqkf2]
+    pub extern "C" fn __eqtf2(a: f128, b: f128) -> crate::float::cmp::CmpResult {
+        cmp(a, b).to_le_abi()
+    }
+
+    #[ppc_alias = __ltkf2]
+    pub extern "C" fn __lttf2(a: f128, b: f128) -> crate::float::cmp::CmpResult {
+        cmp(a, b).to_le_abi()
+    }
+
+    #[ppc_alias = __nekf2]
+    pub extern "C" fn __netf2(a: f128, b: f128) -> crate::float::cmp::CmpResult {
+        cmp(a, b).to_le_abi()
+    }
+
+    #[ppc_alias = __gtkf2]
+    pub extern "C" fn __gttf2(a: f128, b: f128) -> crate::float::cmp::CmpResult {
+        cmp(a, b).to_ge_abi()
+    }
+}
+
+#[cfg(target_arch = "arm")]
+intrinsics! {
+    pub extern "aapcs" fn __aeabi_fcmple(a: f32, b: f32) -> i32 {
+        (__lesf2(a, b) <= 0) as i32
+    }
+
+    pub extern "aapcs" fn __aeabi_fcmpge(a: f32, b: f32) -> i32 {
+        (__gesf2(a, b) >= 0) as i32
+    }
+
+    pub extern "aapcs" fn __aeabi_fcmpeq(a: f32, b: f32) -> i32 {
+        (__eqsf2(a, b) == 0) as i32
+    }
+
+    pub extern "aapcs" fn __aeabi_fcmplt(a: f32, b: f32) -> i32 {
+        (__ltsf2(a, b) < 0) as i32
+    }
+
+    pub extern "aapcs" fn __aeabi_fcmpgt(a: f32, b: f32) -> i32 {
+        (__gtsf2(a, b) > 0) as i32
+    }
+
+    pub extern "aapcs" fn __aeabi_dcmple(a: f64, b: f64) -> i32 {
+        (__ledf2(a, b) <= 0) as i32
+    }
+
+    pub extern "aapcs" fn __aeabi_dcmpge(a: f64, b: f64) -> i32 {
+        (__gedf2(a, b) >= 0) as i32
+    }
+
+    pub extern "aapcs" fn __aeabi_dcmpeq(a: f64, b: f64) -> i32 {
+        (__eqdf2(a, b) == 0) as i32
+    }
+
+    pub extern "aapcs" fn __aeabi_dcmplt(a: f64, b: f64) -> i32 {
+        (__ltdf2(a, b) < 0) as i32
+    }
+
+    pub extern "aapcs" fn __aeabi_dcmpgt(a: f64, b: f64) -> i32 {
+        (__gtdf2(a, b) > 0) as i32
+    }
+}
diff --git a/library/compiler-builtins/compiler-builtins/src/float/conv.rs b/library/compiler-builtins/compiler-builtins/src/float/conv.rs
new file mode 100644
index 00000000000..f5427a11390
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/float/conv.rs
@@ -0,0 +1,489 @@
+use core::ops::Neg;
+
+use super::Float;
+use crate::int::{CastFrom, CastInto, Int, MinInt};
+
+/// Conversions from integers to floats.
+///
+/// The algorithm is explained here: <https://blog.m-ou.se/floats/>. It roughly does the following:
+/// - Calculate a base mantissa by shifting the integer into mantissa position. This gives us a
+///   mantissa _with the implicit bit set_!
+/// - Figure out if rounding needs to occur by classifying the bits that are to be truncated. Some
+///   patterns are used to simplify this. Adjust the mantissa with the result if needed.
+/// - Calculate the exponent based on the base-2 logarithm of `i` (leading zeros). Subtract one.
+/// - Shift the exponent and add the mantissa to create the final representation. Subtracting one
+///   from the exponent (above) accounts for the implicit bit being explicitly set in the mantissa.
+///
+/// # Terminology
+///
+/// - `i`: the original integer
+/// - `i_m`: the integer, shifted fully left (no leading zeros)
+/// - `n`: number of leading zeroes
+/// - `e`: the resulting exponent. Usually 1 is subtracted to offset the mantissa implicit bit.
+/// - `m_base`: the mantissa before adjusting for truncated bits. Implicit bit is usually set.
+/// - `adj`: the bits that will be truncated, possibly compressed in some way.
+/// - `m`: the resulting mantissa. Implicit bit is usually set.
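+///
+/// # Example
+///
+/// An illustrative trace of the steps above for `u32_to_f32_bits(3)`:
+///
+/// ```text
+/// i      = 3                              (n = 30 leading zeros)
+/// m_base = (i << n) >> EXP_BITS           = 0x00C0_0000  (implicit bit set, fraction 0.5)
+/// adj    = (i << n) << (SIG_BITS + 1)     = 0            (nothing dropped, no rounding)
+/// e      = (EXP_BIAS - 1 + 32 - n) - 1    = 127
+/// repr   = (e << SIG_BITS) + m            = 0x4040_0000  == 3.0f32.to_bits()
+/// ```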
+mod int_to_float {
+    use super::*;
+
+    /// Calculate the exponent from the number of leading zeros.
+    ///
+    /// Usually 1 is subtracted from this function's result, so that a mantissa with the implicit
+    /// bit set can be added back later.
+    fn exp<I: Int, F: Float<Int: CastFrom<u32>>>(n: u32) -> F::Int {
+        F::Int::cast_from(F::EXP_BIAS - 1 + I::BITS - n)
+    }
+
+    /// Adjust a mantissa with dropped bits to perform correct rounding.
+    ///
+    /// The dropped bits should be exactly the bits that get truncated (left-aligned), but they
+    /// can be combined or compressed in some way that simplifies operations.
+    fn m_adj<F: Float>(m_base: F::Int, dropped_bits: F::Int) -> F::Int {
+        // Branchlessly extract a `1` if rounding up should happen, 0 otherwise
+        // This accounts for rounding to even.
+        let adj = (dropped_bits - ((dropped_bits >> (F::BITS - 1)) & !m_base)) >> (F::BITS - 1);
+
+        // Add one when we need to round up. Break ties to even.
+        m_base + adj
+    }
+
+    /// Shift the exponent to its position and add the mantissa.
+    ///
+    /// If the mantissa has the implicit bit set, the exponent should be one less than its actual
+    /// value to cancel it out.
+    fn repr<F: Float>(e: F::Int, m: F::Int) -> F::Int {
+        // + rather than | so the mantissa can overflow into the exponent
+        (e << F::SIG_BITS) + m
+    }
+
+    /// Shift distance from a left-aligned integer to a smaller float.
+    fn shift_f_lt_i<I: Int, F: Float>() -> u32 {
+        (I::BITS - F::BITS) + F::EXP_BITS
+    }
+
+    /// Shift distance from an integer with `n` leading zeros to a smaller float.
+    fn shift_f_gt_i<I: Int, F: Float>(n: u32) -> u32 {
+        F::SIG_BITS - I::BITS + 1 + n
+    }
+
+    /// Perform a signed operation as unsigned, then add the sign back.
+    pub fn signed<I, F, Conv>(i: I, conv: Conv) -> F
+    where
+        F: Float,
+        I: Int,
+        F::Int: CastFrom<I>,
+        Conv: Fn(I::UnsignedInt) -> F::Int,
+    {
+        let sign_bit = F::Int::cast_from(i >> (I::BITS - 1)) << (F::BITS - 1);
+        F::from_bits(conv(i.unsigned_abs()) | sign_bit)
+    }
+
+    pub fn u32_to_f32_bits(i: u32) -> u32 {
+        if i == 0 {
+            return 0;
+        }
+        let n = i.leading_zeros();
+        // Mantissa with implicit bit set (significant bits)
+        let m_base = (i << n) >> f32::EXP_BITS;
+        // Bits that will be dropped (insignificant bits)
+        let adj = (i << n) << (f32::SIG_BITS + 1);
+        let m = m_adj::<f32>(m_base, adj);
+        let e = exp::<u32, f32>(n) - 1;
+        repr::<f32>(e, m)
+    }
+
+    pub fn u32_to_f64_bits(i: u32) -> u64 {
+        if i == 0 {
+            return 0;
+        }
+        let n = i.leading_zeros();
+        // Mantissa with implicit bit set
+        let m = (i as u64) << shift_f_gt_i::<u32, f64>(n);
+        let e = exp::<u32, f64>(n) - 1;
+        repr::<f64>(e, m)
+    }
+
+    #[cfg(f128_enabled)]
+    pub fn u32_to_f128_bits(i: u32) -> u128 {
+        if i == 0 {
+            return 0;
+        }
+        let n = i.leading_zeros();
+
+        // Shift into the mantissa position that is correct for the type, but keep it within
+        // the low 64 bits so we can avoid 128-bit math.
+        let m = (i as u64) << (shift_f_gt_i::<u32, f128>(n) - 64);
+        let e = exp::<u32, f128>(n) as u64 - 1;
+        // High 64 bits of f128 representation.
+        let h = (e << (f128::SIG_BITS - 64)) + m;
+
+        // Shift back to the high bits, the rest of the mantissa will always be 0.
+        (h as u128) << 64
+    }
+
+    pub fn u64_to_f32_bits(i: u64) -> u32 {
+        let n = i.leading_zeros();
+        let i_m = i.wrapping_shl(n);
+        // Mantissa with implicit bit set
+        let m_base: u32 = (i_m >> shift_f_lt_i::<u64, f32>()) as u32;
+        // The entire lower half of `i` will be truncated (masked portion), plus the
+        // next `EXP_BITS` bits.
+        let adj = ((i_m >> f32::EXP_BITS) | i_m & 0xFFFF) as u32;
+        let m = m_adj::<f32>(m_base, adj);
+        let e = if i == 0 { 0 } else { exp::<u64, f32>(n) - 1 };
+        repr::<f32>(e, m)
+    }
+
+    pub fn u64_to_f64_bits(i: u64) -> u64 {
+        if i == 0 {
+            return 0;
+        }
+        let n = i.leading_zeros();
+        // Mantissa with implicit bit set
+        let m_base = (i << n) >> f64::EXP_BITS;
+        let adj = (i << n) << (f64::SIG_BITS + 1);
+        let m = m_adj::<f64>(m_base, adj);
+        let e = exp::<u64, f64>(n) - 1;
+        repr::<f64>(e, m)
+    }
+
+    #[cfg(f128_enabled)]
+    pub fn u64_to_f128_bits(i: u64) -> u128 {
+        if i == 0 {
+            return 0;
+        }
+        let n = i.leading_zeros();
+        // Mantissa with implicit bit set
+        let m = (i as u128) << shift_f_gt_i::<u64, f128>(n);
+        let e = exp::<u64, f128>(n) - 1;
+        repr::<f128>(e, m)
+    }
+
+    pub fn u128_to_f32_bits(i: u128) -> u32 {
+        let n = i.leading_zeros();
+        let i_m = i.wrapping_shl(n); // Mantissa, shifted so the first bit is nonzero
+        let m_base: u32 = (i_m >> shift_f_lt_i::<u128, f32>()) as u32;
+
+        // Within the upper `F::BITS`, everything except for the significand
+        // gets truncated
+        let d1: u32 = (i_m >> (u128::BITS - f32::BITS - f32::SIG_BITS - 1)).cast();
+
+        // The entire rest of `i_m` gets truncated. Zero the upper `F::BITS` then just
+        // check if it is nonzero.
+        let d2: u32 = (i_m << f32::BITS >> f32::BITS != 0).into();
+        let adj = d1 | d2;
+
+        // Mantissa with implicit bit set
+        let m = m_adj::<f32>(m_base, adj);
+        let e = if i == 0 { 0 } else { exp::<u128, f32>(n) - 1 };
+        repr::<f32>(e, m)
+    }
+
+    pub fn u128_to_f64_bits(i: u128) -> u64 {
+        let n = i.leading_zeros();
+        let i_m = i.wrapping_shl(n);
+        // Mantissa with implicit bit set
+        let m_base: u64 = (i_m >> shift_f_lt_i::<u128, f64>()) as u64;
+        // The entire lower half of `i` will be truncated (masked portion), plus the
+        // next `EXP_BITS` bits.
+        let adj = ((i_m >> f64::EXP_BITS) | i_m & 0xFFFF_FFFF) as u64;
+        let m = m_adj::<f64>(m_base, adj);
+        let e = if i == 0 { 0 } else { exp::<u128, f64>(n) - 1 };
+        repr::<f64>(e, m)
+    }
+
+    #[cfg(f128_enabled)]
+    pub fn u128_to_f128_bits(i: u128) -> u128 {
+        if i == 0 {
+            return 0;
+        }
+        let n = i.leading_zeros();
+        // Mantissa with implicit bit set
+        let m_base = (i << n) >> f128::EXP_BITS;
+        let adj = (i << n) << (f128::SIG_BITS + 1);
+        let m = m_adj::<f128>(m_base, adj);
+        let e = exp::<u128, f128>(n) - 1;
+        repr::<f128>(e, m)
+    }
+}
+
+// Conversions from unsigned integers to floats.
+intrinsics! {
+    #[arm_aeabi_alias = __aeabi_ui2f]
+    pub extern "C" fn __floatunsisf(i: u32) -> f32 {
+        f32::from_bits(int_to_float::u32_to_f32_bits(i))
+    }
+
+    #[arm_aeabi_alias = __aeabi_ui2d]
+    pub extern "C" fn __floatunsidf(i: u32) -> f64 {
+        f64::from_bits(int_to_float::u32_to_f64_bits(i))
+    }
+
+    #[arm_aeabi_alias = __aeabi_ul2f]
+    pub extern "C" fn __floatundisf(i: u64) -> f32 {
+        f32::from_bits(int_to_float::u64_to_f32_bits(i))
+    }
+
+    #[arm_aeabi_alias = __aeabi_ul2d]
+    pub extern "C" fn __floatundidf(i: u64) -> f64 {
+        f64::from_bits(int_to_float::u64_to_f64_bits(i))
+    }
+
+    #[cfg_attr(target_os = "uefi", unadjusted_on_win64)]
+    pub extern "C" fn __floatuntisf(i: u128) -> f32 {
+        f32::from_bits(int_to_float::u128_to_f32_bits(i))
+    }
+
+    #[cfg_attr(target_os = "uefi", unadjusted_on_win64)]
+    pub extern "C" fn __floatuntidf(i: u128) -> f64 {
+        f64::from_bits(int_to_float::u128_to_f64_bits(i))
+    }
+
+    #[ppc_alias = __floatunsikf]
+    #[cfg(f128_enabled)]
+    pub extern "C" fn __floatunsitf(i: u32) -> f128 {
+        f128::from_bits(int_to_float::u32_to_f128_bits(i))
+    }
+
+    #[ppc_alias = __floatundikf]
+    #[cfg(f128_enabled)]
+    pub extern "C" fn __floatunditf(i: u64) -> f128 {
+        f128::from_bits(int_to_float::u64_to_f128_bits(i))
+    }
+
+    #[ppc_alias = __floatuntikf]
+    #[cfg(f128_enabled)]
+    pub extern "C" fn __floatuntitf(i: u128) -> f128 {
+        f128::from_bits(int_to_float::u128_to_f128_bits(i))
+    }
+}
+
+// Conversions from signed integers to floats.
+intrinsics! {
+    #[arm_aeabi_alias = __aeabi_i2f]
+    pub extern "C" fn __floatsisf(i: i32) -> f32 {
+        int_to_float::signed(i, int_to_float::u32_to_f32_bits)
+    }
+
+    #[arm_aeabi_alias = __aeabi_i2d]
+    pub extern "C" fn __floatsidf(i: i32) -> f64 {
+        int_to_float::signed(i, int_to_float::u32_to_f64_bits)
+    }
+
+    #[arm_aeabi_alias = __aeabi_l2f]
+    pub extern "C" fn __floatdisf(i: i64) -> f32 {
+        int_to_float::signed(i, int_to_float::u64_to_f32_bits)
+    }
+
+    #[arm_aeabi_alias = __aeabi_l2d]
+    pub extern "C" fn __floatdidf(i: i64) -> f64 {
+        int_to_float::signed(i, int_to_float::u64_to_f64_bits)
+    }
+
+    #[cfg_attr(target_os = "uefi", unadjusted_on_win64)]
+    pub extern "C" fn __floattisf(i: i128) -> f32 {
+        int_to_float::signed(i, int_to_float::u128_to_f32_bits)
+    }
+
+    #[cfg_attr(target_os = "uefi", unadjusted_on_win64)]
+    pub extern "C" fn __floattidf(i: i128) -> f64 {
+        int_to_float::signed(i, int_to_float::u128_to_f64_bits)
+    }
+
+    #[ppc_alias = __floatsikf]
+    #[cfg(f128_enabled)]
+    pub extern "C" fn __floatsitf(i: i32) -> f128 {
+        int_to_float::signed(i, int_to_float::u32_to_f128_bits)
+    }
+
+    #[ppc_alias = __floatdikf]
+    #[cfg(f128_enabled)]
+    pub extern "C" fn __floatditf(i: i64) -> f128 {
+        int_to_float::signed(i, int_to_float::u64_to_f128_bits)
+    }
+
+    #[ppc_alias = __floattikf]
+    #[cfg(f128_enabled)]
+    pub extern "C" fn __floattitf(i: i128) -> f128 {
+        int_to_float::signed(i, int_to_float::u128_to_f128_bits)
+    }
+}
+
+/// Generic float to unsigned int conversions.
+fn float_to_unsigned_int<F, U>(f: F) -> U
+where
+    F: Float,
+    U: Int<UnsignedInt = U>,
+    F::Int: CastInto<U>,
+    F::Int: CastFrom<u32>,
+    F::Int: CastInto<U::UnsignedInt>,
+    u32: CastFrom<F::Int>,
+{
+    float_to_int_inner::<F, U, _, _>(f.to_bits(), |i: U| i, || U::MAX)
+}
+
+/// Generic float to signed int conversions.
+fn float_to_signed_int<F, I>(f: F) -> I
+where
+    F: Float,
+    I: Int + Neg<Output = I>,
+    I::UnsignedInt: Int,
+    F::Int: CastInto<I::UnsignedInt>,
+    F::Int: CastFrom<u32>,
+    u32: CastFrom<F::Int>,
+{
+    float_to_int_inner::<F, I, _, _>(
+        f.to_bits() & !F::SIGN_MASK,
+        |i: I| if f.is_sign_negative() { -i } else { i },
+        || if f.is_sign_negative() { I::MIN } else { I::MAX },
+    )
+}
+
+/// Float to int conversions, generic for both signed and unsigned.
+///
+/// Parameters:
+/// - `fbits`: `abs(f)` bitcast to an integer.
+/// - `map_inbounds`: apply this transformation to integers that are within range (add the sign back).
+/// - `out_of_bounds`: return value when out of range for `I`.
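+///
+/// As an illustrative trace for `f32` -> `u32` with `f = 2.5` (`fbits = 0x4020_0000`): the
+/// biased exponent is `128`, the mantissa with the implicit bit set becomes `0xA000_0000`,
+/// the shift is `(127 + 32 - 1) - 128 = 30`, and `0xA000_0000 >> 30 == 2`, i.e. the value
+/// truncates toward zero.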
+fn float_to_int_inner<F, I, FnFoo, FnOob>(
+    fbits: F::Int,
+    map_inbounds: FnFoo,
+    out_of_bounds: FnOob,
+) -> I
+where
+    F: Float,
+    I: Int,
+    FnFoo: FnOnce(I) -> I,
+    FnOob: FnOnce() -> I,
+    I::UnsignedInt: Int,
+    F::Int: CastInto<I::UnsignedInt>,
+    F::Int: CastFrom<u32>,
+    u32: CastFrom<F::Int>,
+{
+    let int_max_exp = F::EXP_BIAS + I::MAX.ilog2() + 1;
+    let foobar = F::EXP_BIAS + I::UnsignedInt::BITS - 1;
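+    // `foobar` is the biased exponent at which the left-aligned mantissa `m` below needs no
+    // right shift (the value fills all of `I`'s bits); `s` measures how far below that the
+    // actual exponent is.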
+
+    if fbits < F::ONE.to_bits() {
+        // < 0 gets rounded to 0
+        I::ZERO
+    } else if fbits < F::Int::cast_from(int_max_exp) << F::SIG_BITS {
+        // >= 1, < integer max
+        let m_base = if I::UnsignedInt::BITS >= F::Int::BITS {
+            I::UnsignedInt::cast_from(fbits) << (I::BITS - F::SIG_BITS - 1)
+        } else {
+            I::UnsignedInt::cast_from(fbits >> (F::SIG_BITS - I::BITS + 1))
+        };
+
+        // Set the implicit 1-bit.
+        let m: I::UnsignedInt = (I::UnsignedInt::ONE << (I::BITS - 1)) | m_base;
+
+        // Shift based on the exponent and bias.
+        let s: u32 = (foobar) - u32::cast_from(fbits >> F::SIG_BITS);
+
+        let unsigned = m >> s;
+        map_inbounds(I::from_unsigned(unsigned))
+    } else if fbits <= F::EXP_MASK {
+        // >= max (incl. inf)
+        out_of_bounds()
+    } else {
+        I::ZERO
+    }
+}
+
+// Conversions from floats to unsigned integers.
+intrinsics! {
+    #[arm_aeabi_alias = __aeabi_f2uiz]
+    pub extern "C" fn __fixunssfsi(f: f32) -> u32 {
+        float_to_unsigned_int(f)
+    }
+
+    #[arm_aeabi_alias = __aeabi_f2ulz]
+    pub extern "C" fn __fixunssfdi(f: f32) -> u64 {
+        float_to_unsigned_int(f)
+    }
+
+    pub extern "C" fn __fixunssfti(f: f32) -> u128 {
+        float_to_unsigned_int(f)
+    }
+
+    #[arm_aeabi_alias = __aeabi_d2uiz]
+    pub extern "C" fn __fixunsdfsi(f: f64) -> u32 {
+        float_to_unsigned_int(f)
+    }
+
+    #[arm_aeabi_alias = __aeabi_d2ulz]
+    pub extern "C" fn __fixunsdfdi(f: f64) -> u64 {
+        float_to_unsigned_int(f)
+    }
+
+    pub extern "C" fn __fixunsdfti(f: f64) -> u128 {
+        float_to_unsigned_int(f)
+    }
+
+    #[ppc_alias = __fixunskfsi]
+    #[cfg(f128_enabled)]
+    pub extern "C" fn __fixunstfsi(f: f128) -> u32 {
+        float_to_unsigned_int(f)
+    }
+
+    #[ppc_alias = __fixunskfdi]
+    #[cfg(f128_enabled)]
+    pub extern "C" fn __fixunstfdi(f: f128) -> u64 {
+        float_to_unsigned_int(f)
+    }
+
+    #[ppc_alias = __fixunskfti]
+    #[cfg(f128_enabled)]
+    pub extern "C" fn __fixunstfti(f: f128) -> u128 {
+        float_to_unsigned_int(f)
+    }
+}
+
+// Conversions from floats to signed integers.
+intrinsics! {
+    #[arm_aeabi_alias = __aeabi_f2iz]
+    pub extern "C" fn __fixsfsi(f: f32) -> i32 {
+        float_to_signed_int(f)
+    }
+
+    #[arm_aeabi_alias = __aeabi_f2lz]
+    pub extern "C" fn __fixsfdi(f: f32) -> i64 {
+        float_to_signed_int(f)
+    }
+
+    pub extern "C" fn __fixsfti(f: f32) -> i128 {
+        float_to_signed_int(f)
+    }
+
+    #[arm_aeabi_alias = __aeabi_d2iz]
+    pub extern "C" fn __fixdfsi(f: f64) -> i32 {
+        float_to_signed_int(f)
+    }
+
+    #[arm_aeabi_alias = __aeabi_d2lz]
+    pub extern "C" fn __fixdfdi(f: f64) -> i64 {
+        float_to_signed_int(f)
+    }
+
+    pub extern "C" fn __fixdfti(f: f64) -> i128 {
+        float_to_signed_int(f)
+    }
+
+    #[ppc_alias = __fixkfsi]
+    #[cfg(f128_enabled)]
+    pub extern "C" fn __fixtfsi(f: f128) -> i32 {
+        float_to_signed_int(f)
+    }
+
+    #[ppc_alias = __fixkfdi]
+    #[cfg(f128_enabled)]
+    pub extern "C" fn __fixtfdi(f: f128) -> i64 {
+        float_to_signed_int(f)
+    }
+
+    #[ppc_alias = __fixkfti]
+    #[cfg(f128_enabled)]
+    pub extern "C" fn __fixtfti(f: f128) -> i128 {
+        float_to_signed_int(f)
+    }
+}
diff --git a/library/compiler-builtins/compiler-builtins/src/float/div.rs b/library/compiler-builtins/compiler-builtins/src/float/div.rs
new file mode 100644
index 00000000000..5df637c7e0f
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/float/div.rs
@@ -0,0 +1,635 @@
+//! Floating point division routines.
+//!
+//! This module documentation gives an overview of the method used. More documentation is inline.
+//!
+//! # Relevant notation
+//!
+//! - `m_a`: the mantissa of `a`, in base 2
+//! - `p_a`: the exponent of `a`, in base 2. I.e. `a = m_a * 2^p_a`
+//! - `uqN` (e.g. `uq1`): this refers to Q notation for fixed-point numbers. UQ1.31 is an unsigned
+//!   fixed-point number with 1 integer bit and 31 fractional bits. A `uqN` variable of type `uM`
+//!   will have N bits of integer and M-N bits of fraction.
+//! - `hw`: half width, i.e. for `f64` this will be a `u32`.
+//! - `x` is the best estimate of `1/m_b`
+//!
+//! # Method Overview
+//!
+//! Division routines must solve for `a / b`, which is `res = m_a*2^p_a / m_b*2^p_b`. The basic
+//! process is as follows:
+//!
+//! - Rearrange the exponent and significand to simplify the operations:
+//!   `res = (m_a / m_b) * 2^{p_a - p_b}`.
+//! - Check for early exits (infinity, zero, etc).
+//! - If `a` or `b` are subnormal, normalize by shifting the mantissa and adjusting the exponent.
+//! - Set the implicit bit so math is correct.
+//! - Shift mantissa significant digits (with implicit bit) fully left such that fixed-point UQ1
+//!   or UQ0 numbers can be used for mantissa math. These will have greater precision than the
+//!   actual mantissa, which is important for correct rounding.
+//! - Calculate the reciprocal of `m_b`, `x`.
+//! - Use the reciprocal to multiply rather than divide: `res = m_a * x_b * 2^{p_a - p_b}`.
+//! - Reapply rounding.
+//!
+//! # Reciprocal calculation
+//!
+//! Calculating the reciprocal is the most complicated part of this process. It uses the
+//! [Newton-Raphson method], which picks an initial estimation (of the reciprocal) and performs
+//! a number of iterations to increase its precision.
+//!
+//! In general, Newton's method takes the following form:
+//!
+//! ```text
+//! `x_n` is a guess or the result of a previous iteration. Increasing `n` converges to the
+//! desired result.
+//!
+//! The result approaches a zero of `f(x)` by applying a correction to the previous guess.
+//!
+//! x_{n+1} = x_n - f(x_n) / f'(x_n)
+//! ```
+//!
+//! Applying this to find the reciprocal:
+//!
+//! ```text
+//! 1 / x = b
+//!
+//! Rearrange so we can solve by finding a zero
+//! 0 = (1 / x) - b = f(x)
+//!
+//! f'(x) = -x^{-2}
+//!
+//! x_{n+1} = 2*x_n - b*x_n^2
+//! ```
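+//!
+//! As an illustrative sketch with plain decimal numbers: for `b = 1.5`, the initial estimate
+//! used below is `x_0 = 3/4 + 1/sqrt(2) - b/2 ~= 0.7071`, and iterating gives
+//!
+//! ```text
+//! x_1 = x_0 * (2 - b*x_0) ~= 0.6642
+//! x_2 = x_1 * (2 - b*x_1) ~= 0.66666        (1/b = 0.6666...)
+//! ```
+//!
+//! with the error roughly squaring at each step.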
+//!
+//! This is a process that can be repeated to calculate the reciprocal with enough precision to
+//! achieve a correctly rounded result for the overall division operation. The maximum required
+//! number of iterations is known since precision doubles with each iteration.
+//!
+//! # Half-width operations
+//!
+//! Calculating the reciprocal requires widening multiplication and performing arithmetic on the
+//! results, meaning that emulated integer arithmetic on `u128` (for `f64`) and `u256` (for `f128`)
+//! gets used instead of native math.
+//!
+//! To make this more efficient, all but the final operation can be computed using half-width
+//! integers. For example, rather than computing four iterations using 128-bit integers for `f64`,
+//! we can instead perform three iterations using native 64-bit integers and only one final
+//! iteration using the full 128 bits.
+//!
+//! This works because of precision doubling. Some leeway is allowed here because the fixed-point
+//! number has more bits than the final mantissa will.
+//!
+//! [Newton-Raphson method]: https://en.wikipedia.org/wiki/Newton%27s_method
+
+use core::mem::size_of;
+use core::ops;
+
+use super::HalfRep;
+use crate::float::Float;
+use crate::int::{CastFrom, CastInto, DInt, HInt, Int, MinInt};
+
+fn div<F: Float>(a: F, b: F) -> F
+where
+    F::Int: CastInto<i32>,
+    F::Int: From<HalfRep<F>>,
+    F::Int: From<u8>,
+    F::Int: HInt + DInt,
+    <F::Int as HInt>::D: ops::Shr<u32, Output = <F::Int as HInt>::D>,
+    F::Int: From<u32>,
+    u16: CastInto<F::Int>,
+    i32: CastInto<F::Int>,
+    u32: CastInto<F::Int>,
+    u128: CastInto<HalfRep<F>>,
+{
+    let one = F::Int::ONE;
+    let zero = F::Int::ZERO;
+    let one_hw = HalfRep::<F>::ONE;
+    let zero_hw = HalfRep::<F>::ZERO;
+    let hw = F::BITS / 2;
+    let lo_mask = F::Int::MAX >> hw;
+
+    let significand_bits = F::SIG_BITS;
+    // Saturated exponent, representing infinity
+    let exponent_sat: F::Int = F::EXP_SAT.cast();
+
+    let exponent_bias = F::EXP_BIAS;
+    let implicit_bit = F::IMPLICIT_BIT;
+    let significand_mask = F::SIG_MASK;
+    let sign_bit = F::SIGN_MASK;
+    let abs_mask = sign_bit - one;
+    let exponent_mask = F::EXP_MASK;
+    let inf_rep = exponent_mask;
+    let quiet_bit = implicit_bit >> 1;
+    let qnan_rep = exponent_mask | quiet_bit;
+    let (mut half_iterations, full_iterations) = get_iterations::<F>();
+    let recip_precision = reciprocal_precision::<F>();
+
+    if F::BITS == 128 {
+        // FIXME(tgross35): f128 seems to require one more half iteration than expected
+        half_iterations += 1;
+    }
+
+    let a_rep = a.to_bits();
+    let b_rep = b.to_bits();
+
+    // Exponent numeric representation, not accounting for bias
+    let a_exponent = (a_rep >> significand_bits) & exponent_sat;
+    let b_exponent = (b_rep >> significand_bits) & exponent_sat;
+    let quotient_sign = (a_rep ^ b_rep) & sign_bit;
+
+    let mut a_significand = a_rep & significand_mask;
+    let mut b_significand = b_rep & significand_mask;
+
+    // The exponent of our final result in its encoded form
+    let mut res_exponent: i32 =
+        i32::cast_from(a_exponent) - i32::cast_from(b_exponent) + (exponent_bias as i32);
+
+    // Detect if a or b is zero, denormal, infinity, or NaN.
+    if a_exponent.wrapping_sub(one) >= (exponent_sat - one)
+        || b_exponent.wrapping_sub(one) >= (exponent_sat - one)
+    {
+        let a_abs = a_rep & abs_mask;
+        let b_abs = b_rep & abs_mask;
+
+        // NaN / anything = qNaN
+        if a_abs > inf_rep {
+            return F::from_bits(a_rep | quiet_bit);
+        }
+
+        // anything / NaN = qNaN
+        if b_abs > inf_rep {
+            return F::from_bits(b_rep | quiet_bit);
+        }
+
+        if a_abs == inf_rep {
+            if b_abs == inf_rep {
+                // infinity / infinity = NaN
+                return F::from_bits(qnan_rep);
+            } else {
+                // infinity / anything else = +/- infinity
+                return F::from_bits(a_abs | quotient_sign);
+            }
+        }
+
+        // anything else / infinity = +/- 0
+        if b_abs == inf_rep {
+            return F::from_bits(quotient_sign);
+        }
+
+        if a_abs == zero {
+            if b_abs == zero {
+                // zero / zero = NaN
+                return F::from_bits(qnan_rep);
+            } else {
+                // zero / anything else = +/- zero
+                return F::from_bits(quotient_sign);
+            }
+        }
+
+        // anything else / zero = +/- infinity
+        if b_abs == zero {
+            return F::from_bits(inf_rep | quotient_sign);
+        }
+
+        // a is denormal. Renormalize it and set the scale to include the necessary exponent
+        // adjustment.
+        if a_abs < implicit_bit {
+            let (exponent, significand) = F::normalize(a_significand);
+            res_exponent += exponent;
+            a_significand = significand;
+        }
+
+        // b is denormal. Renormalize it and set the scale to include the necessary exponent
+        // adjustment.
+        if b_abs < implicit_bit {
+            let (exponent, significand) = F::normalize(b_significand);
+            res_exponent -= exponent;
+            b_significand = significand;
+        }
+    }
+
+    // Set the implicit significand bit. If we fell through from the
+    // denormal path it was already set by normalize( ), but setting it twice
+    // won't hurt anything.
+    a_significand |= implicit_bit;
+    b_significand |= implicit_bit;
+
+    // Transform to a fixed-point representation by shifting the significand to the high bits. We
+    // know this is in the range [1.0, 2.0] since the implicit bit is set to 1 above.
+    let b_uq1 = b_significand << (F::BITS - significand_bits - 1);
+
+    // Align the significand of b as a UQ1.(n-1) fixed-point number in the range
+    // [1.0, 2.0) and get a UQ0.n approximate reciprocal using a small minimax
+    // polynomial approximation: x0 = 3/4 + 1/sqrt(2) - b/2.
+    // The max error for this approximation is achieved at endpoints, so
+    //   abs(x0(b) - 1/b) <= abs(x0(1) - 1/1) = 3/4 - 1/sqrt(2) = 0.04289...,
+    // which is about 4.5 bits.
+    // The initial approximation is between x0(1.0) = 0.9571... and x0(2.0) = 0.4571...
+    //
+    // Then, refine the reciprocal estimate using a quadratically converging
+    // Newton-Raphson iteration:
+    //     x_{n+1} = x_n * (2 - x_n * b)
+    //
+    // Let b be the original divisor considered "in infinite precision" and
+    // obtained from IEEE754 representation of function argument (with the
+    // implicit bit set). Corresponds to rep_t-sized b_UQ1 represented in
+    // UQ1.(W-1).
+    //
+    // Let b_hw be an infinitely precise number obtained from the highest (HW-1)
+    // bits of divisor significand (with the implicit bit set). Corresponds to
+    // half_rep_t-sized b_UQ1_hw represented in UQ1.(HW-1) that is a **truncated**
+    // version of b_UQ1.
+    //
+    // Let e_n := x_n - 1/b_hw
+    //     E_n := x_n - 1/b
+    // abs(E_n) <= abs(e_n) + (1/b_hw - 1/b)
+    //           = abs(e_n) + (b - b_hw) / (b*b_hw)
+    //          <= abs(e_n) + 2 * 2^-HW
+    //
+    // rep_t-sized iterations may be slower than the corresponding half-width
+    // variant depending on the hardware and whether single/double/quad precision
+    // is selected.
+    //
+    // NB: Using half-width iterations increases computation errors due to
+    // rounding, so error estimations have to be computed taking the selected
+    // mode into account!
+    let mut x_uq0 = if half_iterations > 0 {
+        // Starting with (n-1) half-width iterations
+        let b_uq1_hw: HalfRep<F> = b_uq1.hi();
+
+        // C is (3/4 + 1/sqrt(2)) - 1 truncated to W0 fractional bits as UQ0.HW
+        // with W0 being either 16 or 32 and W0 <= HW.
+        // That is, C is the aforementioned 3/4 + 1/sqrt(2) constant (from which
+        // b/2 is subtracted to obtain x0) wrapped to [0, 1) range.
+        let c_hw = c_hw::<F>();
+
+        // Check that the top bit is set, i.e. value is within `[1, 2)`.
+        debug_assert!(b_uq1_hw & (one_hw << (HalfRep::<F>::BITS - 1)) > zero_hw);
+
+        // b >= 1, thus an upper bound for 3/4 + 1/sqrt(2) - b/2 is about 0.9572,
+        // so x0 fits to UQ0.HW without wrapping.
+        let mut x_uq0_hw: HalfRep<F> =
+            c_hw.wrapping_sub(b_uq1_hw /* exact b_hw/2 as UQ0.HW */);
+
+        // An e_0 error is comprised of errors due to
+        // * x0 being an inherently imprecise first approximation of 1/b_hw
+        // * C_hw being some (irrational) number **truncated** to W0 bits
+        // Please note that e_0 is calculated against the infinitely precise
+        // reciprocal of b_hw (that is, **truncated** version of b).
+        //
+        // e_0 <= 3/4 - 1/sqrt(2) + 2^-W0
+        //
+        // By construction, 1 <= b < 2
+        // f(x)  = x * (2 - b*x) = 2*x - b*x^2
+        // f'(x) = 2 * (1 - b*x)
+        //
+        // On the [0, 1] interval, f(0)   = 0,
+        // then it increases until  f(1/b) = 1 / b, maximum on (0, 1),
+        // then it decreases to     f(1)   = 2 - b
+        //
+        // Let g(x) = x - f(x) = b*x^2 - x.
+        // On (0, 1/b), g(x) < 0 <=> f(x) > x
+        // On (1/b, 1], g(x) > 0 <=> f(x) < x
+        //
+        // For half-width iterations, b_hw is used instead of b.
+        for _ in 0..half_iterations {
+            // corr_UQ1_hw can be **larger** than 2 - b_hw*x by at most 1*Ulp
+            // of corr_UQ1_hw.
+            // "0.0 - (...)" is equivalent to "2.0 - (...)" in UQ1.(HW-1).
+            // On the other hand, corr_UQ1_hw should not overflow from 2.0 to 0.0 provided
+            // no overflow occurred earlier: ((rep_t)x_UQ0_hw * b_UQ1_hw >> HW) is
+            // expected to be strictly positive because b_UQ1_hw has its highest bit set
+            // and x_UQ0_hw should be rather large (it converges to 1/2 < 1/b_hw <= 1).
+            //
+            // Now, we should multiply UQ0.HW and UQ1.(HW-1) numbers, naturally
+            // obtaining an UQ1.(HW-1) number and proving its highest bit could be
+            // considered to be 0 to be able to represent it in UQ0.HW.
+            // From the above analysis of f(x), if corr_UQ1_hw would be represented
+            // without any intermediate loss of precision (that is, in twice_rep_t)
+            // x_UQ0_hw could be at most [1.]000... if b_hw is exactly 1.0 and strictly
+            // less otherwise. On the other hand, to obtain [1.]000..., one would have to pass
+            // 1/b_hw == 1.0 to f(x), so this cannot occur at all without overflow (due
+            // to 1.0 being not representable as UQ0.HW).
+            // The fact that corr_UQ1_hw was virtually rounded up (due to the result of
+            // multiplication being **first** truncated, then negated - to improve
+            // error estimations) can increase x_UQ0_hw by up to 2*Ulp of x_UQ0_hw.
+            //
+            // Now, either no overflow occurred or x_UQ0_hw is 0 or 1 in its half_rep_t
+            // representation. In the latter case, x_UQ0_hw will be either 0 or 1 after
+            // any number of iterations, so just subtract 2 from the reciprocal
+            // approximation after last iteration.
+            //
+            // In infinite precision, with 0 <= eps1, eps2 <= U = 2^-HW:
+            // corr_UQ1_hw = 2 - (1/b_hw + e_n) * b_hw + 2*eps1
+            //             = 1 - e_n * b_hw + 2*eps1
+            // x_UQ0_hw = (1/b_hw + e_n) * (1 - e_n*b_hw + 2*eps1) - eps2
+            //          = 1/b_hw - e_n + 2*eps1/b_hw + e_n - e_n^2*b_hw + 2*e_n*eps1 - eps2
+            //          = 1/b_hw + 2*eps1/b_hw - e_n^2*b_hw + 2*e_n*eps1 - eps2
+            // e_{n+1} = -e_n^2*b_hw + 2*eps1/b_hw + 2*e_n*eps1 - eps2
+            //         = 2*e_n*eps1 - (e_n^2*b_hw + eps2) + 2*eps1/b_hw
+            //                        \------ >0 -------/   \-- >0 ---/
+            // abs(e_{n+1}) <= 2*abs(e_n)*U + max(2*e_n^2 + U, 2 * U)
+            x_uq0_hw = next_guess(x_uq0_hw, b_uq1_hw);
+        }
+
+        // For initial half-width iterations, U = 2^-HW
+        // Let  abs(e_n)     <= u_n * U,
+        // then abs(e_{n+1}) <= 2 * u_n * U^2 + max(2 * u_n^2 * U^2 + U, 2 * U)
+        // u_{n+1} <= 2 * u_n * U + max(2 * u_n^2 * U + 1, 2)
+        //
+        // Account for possible overflow (see above). For an overflow to occur for the
+        // first time, for "ideal" corr_UQ1_hw (that is, without intermediate
+        // truncation), the result of x_UQ0_hw * corr_UQ1_hw should be either maximum
+        // value representable in UQ0.HW or less by 1. This means that 1/b_hw has to
+        // be not below that value (see g(x) above), so it is safe to decrement just
+        // once after the final iteration. On the other hand, an effective value of
+        // divisor changes after this point (from b_hw to b), so adjust here.
+        x_uq0_hw = x_uq0_hw.wrapping_sub(one_hw);
+
+        // Error estimations for full-precision iterations are calculated just
+        // as above, but with U := 2^-W and taking extra decrementing into account.
+        // We need at least one such iteration.
+        //
+        // Simulating operations on a twice_rep_t to perform a single final full-width
+        // iteration. Using ad-hoc multiplication implementations to take advantage
+        // of particular structure of operands.
+        let blo: F::Int = b_uq1 & lo_mask;
+
+        // x_UQ0 = x_UQ0_hw * 2^HW - 1
+        // x_UQ0 * b_UQ1 = (x_UQ0_hw * 2^HW) * (b_UQ1_hw * 2^HW + blo) - b_UQ1
+        //
+        //   <--- higher half ---><--- lower half --->
+        //   [x_UQ0_hw * b_UQ1_hw]
+        // +            [  x_UQ0_hw *  blo  ]
+        // -                      [      b_UQ1       ]
+        // = [      result       ][.... discarded ...]
+        let corr_uq1: F::Int = (F::Int::from(x_uq0_hw) * F::Int::from(b_uq1_hw)
+            + ((F::Int::from(x_uq0_hw) * blo) >> hw))
+            .wrapping_sub(one)
+            .wrapping_neg(); // account for *possible* carry
+
+        let lo_corr: F::Int = corr_uq1 & lo_mask;
+        let hi_corr: F::Int = corr_uq1 >> hw;
+
+        // x_UQ0 * corr_UQ1 = (x_UQ0_hw * 2^HW) * (hi_corr * 2^HW + lo_corr) - corr_UQ1
+        let mut x_uq0: F::Int = ((F::Int::from(x_uq0_hw) * hi_corr) << 1)
+            .wrapping_add((F::Int::from(x_uq0_hw) * lo_corr) >> (hw - 1))
+            // 1 to account for the highest bit of corr_UQ1 can be 1
+            // 1 to account for possible carry
+            // Just like the case of half-width iterations but with possibility
+            // of overflowing by one extra Ulp of x_UQ0.
+            .wrapping_sub(F::Int::from(2u8));
+
+        x_uq0 -= one;
+        // ... and then traditional fixup by 2 should work
+
+        // On error estimation:
+        // abs(E_{N-1}) <=   (u_{N-1} + 2 /* due to conversion e_n -> E_n */) * 2^-HW
+        //                 + (2^-HW + 2^-W))
+        // abs(E_{N-1}) <= (u_{N-1} + 3.01) * 2^-HW
+        //
+        // Then like for the half-width iterations:
+        // With 0 <= eps1, eps2 < 2^-W
+        // E_N  = 4 * E_{N-1} * eps1 - (E_{N-1}^2 * b + 4 * eps2) + 4 * eps1 / b
+        // abs(E_N) <= 2^-W * [ 4 * abs(E_{N-1}) + max(2 * abs(E_{N-1})^2 * 2^W + 4, 8)) ]
+        // abs(E_N) <= 2^-W * [ 4 * (u_{N-1} + 3.01) * 2^-HW + max(4 + 2 * (u_{N-1} + 3.01)^2, 8) ]
+        x_uq0
+    } else {
+        // C is (3/4 + 1/sqrt(2)) - 1 truncated to 64 fractional bits as UQ0.n
+        let c: F::Int = F::Int::from(0x7504F333u32) << (F::BITS - 32);
+        let mut x_uq0: F::Int = c.wrapping_sub(b_uq1);
+
+        // E_0 <= 3/4 - 1/sqrt(2) + 2 * 2^-64
+        // x_uq0
+        for _ in 0..full_iterations {
+            x_uq0 = next_guess(x_uq0, b_uq1);
+        }
+
+        x_uq0
+    };
+
+    // Finally, account for possible overflow, as explained above.
+    x_uq0 = x_uq0.wrapping_sub(2.cast());
+
+    // Suppose 1/b - P * 2^-W < x < 1/b + P * 2^-W
+    x_uq0 -= recip_precision.cast();
+
+    // Now 1/b - (2*P) * 2^-W < x < 1/b
+    // FIXME Is x_UQ0 still >= 0.5?
+
+    let mut quotient_uq1: F::Int = x_uq0.widen_mul(a_significand << 1).hi();
+    // Now, a/b - 4*P * 2^-W < q < a/b for q=<quotient_UQ1:dummy> in UQ1.(SB+1+W).
+
+    // quotient_UQ1 is in [0.5, 2.0) as UQ1.(SB+1),
+    // adjust it to be in [1.0, 2.0) as UQ1.SB.
+    let mut residual_lo = if quotient_uq1 < (implicit_bit << 1) {
+        // Highest bit is 0, so just reinterpret quotient_UQ1 as UQ1.SB,
+        // effectively doubling its value as well as its error estimation.
+        let residual_lo = (a_significand << (significand_bits + 1))
+            .wrapping_sub(quotient_uq1.wrapping_mul(b_significand));
+        res_exponent -= 1;
+        a_significand <<= 1;
+        residual_lo
+    } else {
+        // Highest bit is 1 (the UQ1.(SB+1) value is in [1, 2)), convert it
+        // to UQ1.SB by right shifting by 1. Least significant bit is omitted.
+        quotient_uq1 >>= 1;
+        (a_significand << significand_bits).wrapping_sub(quotient_uq1.wrapping_mul(b_significand))
+    };
+
+    // drop mutability
+    let quotient = quotient_uq1;
+
+    // NB: residualLo is calculated above for the normal result case.
+    //     It is re-computed on denormal path that is expected to be not so
+    //     performance-sensitive.
+    //
+    // Now, q cannot be greater than a/b and can differ by at most 8*P * 2^-W + 2^-SB
+    // Each NextAfter() increments the floating point value by at least 2^-SB
+    // (more, if exponent was incremented).
+    // Different cases (<---> is of 2^-SB length, * = a/b that is shown as a midpoint):
+    //   q
+    //   |   | * |   |   |       |       |
+    //       <--->      2^t
+    //   |   |   |   |   |   *   |       |
+    //               q
+    // To require at most one NextAfter(), an error should be less than 1.5 * 2^-SB.
+    //   (8*P) * 2^-W + 2^-SB < 1.5 * 2^-SB
+    //   (8*P) * 2^-W         < 0.5 * 2^-SB
+    //   P < 2^(W-4-SB)
+    // Generally, for at most R NextAfter() to be enough,
+    //   P < (2*R - 1) * 2^(W-4-SB)
+    // For f32 (0+3): 10 < 32 (OK)
+    // For f32 (2+1): 32 < 74 < 32 * 3, so two NextAfter() are required
+    // For f64: 220 < 256 (OK)
+    // For f128: 4096 * 3 < 13922 < 4096 * 5 (three NextAfter() are required)
+    //
+    // If we have overflowed the exponent, return infinity
+    if res_exponent >= i32::cast_from(exponent_sat) {
+        return F::from_bits(inf_rep | quotient_sign);
+    }
+
+    // Now, quotient <= the correctly-rounded result
+    // and may need taking NextAfter() up to 3 times (see error estimates above)
+    // r = a - b * q
+    let mut abs_result = if res_exponent > 0 {
+        let mut ret = quotient & significand_mask;
+        ret |= F::Int::from(res_exponent as u32) << significand_bits;
+        residual_lo <<= 1;
+        ret
+    } else {
+        if ((significand_bits as i32) + res_exponent) < 0 {
+            return F::from_bits(quotient_sign);
+        }
+
+        let ret = quotient.wrapping_shr(u32::cast_from(res_exponent.wrapping_neg()) + 1);
+        residual_lo = a_significand
+            .wrapping_shl(significand_bits.wrapping_add(CastInto::<u32>::cast(res_exponent)))
+            .wrapping_sub(ret.wrapping_mul(b_significand) << 1);
+        ret
+    };
+
+    residual_lo += abs_result & one; // tie to even
+    // conditionally turns the below LT comparison into LTE
+    abs_result += u8::from(residual_lo > b_significand).into();
+
+    if F::BITS == 128 || (F::BITS == 32 && half_iterations > 0) {
+        // Do not round Infinity to NaN
+        abs_result +=
+            u8::from(abs_result < inf_rep && residual_lo > (2 + 1).cast() * b_significand).into();
+    }
+
+    if F::BITS == 128 {
+        abs_result +=
+            u8::from(abs_result < inf_rep && residual_lo > (4 + 1).cast() * b_significand).into();
+    }
+
+    F::from_bits(abs_result | quotient_sign)
+}
+
+/// Calculate the number of iterations required for a float type's precision.
+///
+/// This returns `(h, f)` where `h` is the number of iterations to be done using integers at half
+/// the float's bit width, and `f` is the number of iterations done using integers of the float's
+/// full width. This is further explained in the module documentation.
+///
+/// # Requirements
+///
+/// The initial estimate should have at least 8 bits of precision. If this is not true, results
+/// will be inaccurate.
+const fn get_iterations<F: Float>() -> (usize, usize) {
+    // Precision doubles with each iteration. Assume we start with 8 bits of precision.
+    let total_iterations = F::BITS.ilog2() as usize - 2;
+
+    if 2 * size_of::<F>() <= size_of::<*const ()>() {
+        // If widening multiplication will be efficient (uses word-sized integers), there is no
+        // reason to use half-sized iterations.
+        (0, total_iterations)
+    } else {
+        // Otherwise, do as many iterations as possible at half width.
+        (total_iterations - 1, 1)
+    }
+}
+
+/// `u_n` for different precisions (with N-1 half-width iterations).
+///
+/// W0 is the precision of C
+///   u_0 = (3/4 - 1/sqrt(2) + 2^-W0) * 2^HW
+///
+/// Estimated with bc:
+///
+/// ```text
+///   define half1(un) { return 2.0 * (un + un^2) / 2.0^hw + 1.0; }
+///   define half2(un) { return 2.0 * un / 2.0^hw + 2.0; }
+///   define full1(un) { return 4.0 * (un + 3.01) / 2.0^hw + 2.0 * (un + 3.01)^2 + 4.0; }
+///   define full2(un) { return 4.0 * (un + 3.01) / 2.0^hw + 8.0; }
+///
+///             | f32 (0 + 3) | f32 (2 + 1)  | f64 (3 + 1)  | f128 (4 + 1)
+/// u_0         | < 184224974 | < 2812.1     | < 184224974  | < 791240234244348797
+/// u_1         | < 15804007  | < 242.7      | < 15804007   | < 67877681371350440
+/// u_2         | < 116308    | < 2.81       | < 116308     | < 499533100252317
+/// u_3         | < 7.31      |              | < 7.31       | < 27054456580
+/// u_4         |             |              |              | < 80.4
+/// Final (U_N) | same as u_3 | < 72         | < 218        | < 13920
+/// ```
+///
+/// Add 2 to `U_N` due to final decrement.
+const fn reciprocal_precision<F: Float>() -> u16 {
+    let (half_iterations, full_iterations) = get_iterations::<F>();
+
+    if full_iterations < 1 {
+        panic!("Must have at least one full iteration");
+    }
+
+    // FIXME(tgross35): calculate this programmatically
+    if F::BITS == 32 && half_iterations == 2 && full_iterations == 1 {
+        74u16
+    } else if F::BITS == 32 && half_iterations == 0 && full_iterations == 3 {
+        10
+    } else if F::BITS == 64 && half_iterations == 3 && full_iterations == 1 {
+        220
+    } else if F::BITS == 128 && half_iterations == 4 && full_iterations == 1 {
+        13922
+    } else {
+        panic!("Invalid number of iterations")
+    }
+}
+
+/// The value of `C` adjusted to half width.
+///
+/// C is (3/4 + 1/sqrt(2)) - 1 truncated to W0 fractional bits as UQ0.HW with W0 being either
+/// 16 or 32 and W0 <= HW. That is, C is the aforementioned 3/4 + 1/sqrt(2) constant (from
+/// which b/2 is subtracted to obtain x0) wrapped to [0, 1) range.
+fn c_hw<F: Float>() -> HalfRep<F>
+where
+    F::Int: DInt,
+    u128: CastInto<HalfRep<F>>,
+{
+    const C_U128: u128 = 0x7504f333f9de6108b2fb1366eaa6a542;
+    const { C_U128 >> (u128::BITS - <HalfRep<F>>::BITS) }.cast()
+}
+
+/// Perform one iteration at any width to approach `1/b`, given previous guess `x`. Returns
+/// the next `x` as a UQ0 number.
+///
+/// This is the `x_{n+1} = 2*x_n - b*x_n^2` algorithm, implemented as `x_n * (2 - b*x_n)`. It
+/// uses widening multiplication to calculate the result with necessary precision.
+fn next_guess<I>(x_uq0: I, b_uq1: I) -> I
+where
+    I: Int + HInt,
+    <I as HInt>::D: ops::Shr<u32, Output = <I as HInt>::D>,
+{
+    // `corr = 2 - b*x_n`
+    //
+    // This looks like `0 - b*x_n`. However, this works: in `UQ1`, `0.0 - x = 2.0 - x`.
+    let corr_uq1: I = I::ZERO.wrapping_sub(x_uq0.widen_mul(b_uq1).hi());
+
+    // `x_n * corr = x_n * (2 - b*x_n)`
+    (x_uq0.widen_mul(corr_uq1) >> (I::BITS - 1)).lo()
+}
+
+intrinsics! {
+    #[arm_aeabi_alias = __aeabi_fdiv]
+    pub extern "C" fn __divsf3(a: f32, b: f32) -> f32 {
+        div(a, b)
+    }
+
+    #[arm_aeabi_alias = __aeabi_ddiv]
+    pub extern "C" fn __divdf3(a: f64, b: f64) -> f64 {
+        div(a, b)
+    }
+
+    #[ppc_alias = __divkf3]
+    #[cfg(f128_enabled)]
+    pub extern "C" fn __divtf3(a: f128, b: f128) -> f128 {
+        div(a, b)
+    }
+
+    #[cfg(target_arch = "arm")]
+    pub extern "C" fn __divsf3vfp(a: f32, b: f32) -> f32 {
+        a / b
+    }
+
+    #[cfg(target_arch = "arm")]
+    pub extern "C" fn __divdf3vfp(a: f64, b: f64) -> f64 {
+        a / b
+    }
+}
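
The `next_guess` helper above does one Newton-Raphson reciprocal step entirely in fixed point. The following is an editorial sketch, not part of the patch: the same `x_{n+1} = x_n * (2 - b*x_n)` update written with concrete `u32`/`u64` arithmetic, with the divisor and the initial guess chosen only for illustration.

```rust
// Standalone sketch of the reciprocal iteration used by `next_guess`:
// the estimate `x` is UQ0.32 and the divisor `b` is UQ1.31.
fn next_guess_u32(x_uq0: u32, b_uq1: u32) -> u32 {
    // corr = 2 - b*x: in UQ1 arithmetic, 0 - hi(b*x) wraps around to 2 - b*x.
    let corr_uq1 = 0u32.wrapping_sub(((x_uq0 as u64 * b_uq1 as u64) >> 32) as u32);
    // x * corr, shifted back into UQ0.32.
    ((x_uq0 as u64 * corr_uq1 as u64) >> 31) as u32
}

fn main() {
    let b_uq1 = 0xC000_0000u32; // b = 1.5 as UQ1.31
    let mut x = 0xA000_0000u32; // rough initial guess 0.625 as UQ0.32
    for i in 1..=3 {
        x = next_guess_u32(x, b_uq1);
        // Converges toward 1/1.5 = 0.666..., i.e. about 0xAAAA_AAAB in UQ0.32.
        println!("iteration {i}: x = {x:#010x} (~{})", x as f64 / 2f64.powi(32));
    }
}
```

Each step roughly doubles the number of correct bits, which is why `get_iterations` only needs `log2(BITS) - 2` steps starting from an 8-bit estimate.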
diff --git a/library/compiler-builtins/compiler-builtins/src/float/extend.rs b/library/compiler-builtins/compiler-builtins/src/float/extend.rs
new file mode 100644
index 00000000000..c4f1fe30e0e
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/float/extend.rs
@@ -0,0 +1,123 @@
+use crate::float::Float;
+use crate::int::{CastInto, Int, MinInt};
+
+/// Generic conversion from a narrower to a wider IEEE-754 floating-point type
+fn extend<F: Float, R: Float>(a: F) -> R
+where
+    F::Int: CastInto<u64>,
+    u64: CastInto<F::Int>,
+    u32: CastInto<R::Int>,
+    R::Int: CastInto<u32>,
+    R::Int: CastInto<u64>,
+    u64: CastInto<R::Int>,
+    F::Int: CastInto<R::Int>,
+{
+    let src_zero = F::Int::ZERO;
+    let src_one = F::Int::ONE;
+    let src_bits = F::BITS;
+    let src_sig_bits = F::SIG_BITS;
+    let src_exp_bias = F::EXP_BIAS;
+    let src_min_normal = F::IMPLICIT_BIT;
+    let src_infinity = F::EXP_MASK;
+    let src_sign_mask = F::SIGN_MASK;
+    let src_abs_mask = src_sign_mask - src_one;
+    let src_qnan = F::SIG_MASK;
+    let src_nan_code = src_qnan - src_one;
+
+    let dst_bits = R::BITS;
+    let dst_sig_bits = R::SIG_BITS;
+    let dst_inf_exp = R::EXP_SAT;
+    let dst_exp_bias = R::EXP_BIAS;
+    let dst_min_normal = R::IMPLICIT_BIT;
+
+    let sig_bits_delta = dst_sig_bits - src_sig_bits;
+    let exp_bias_delta = dst_exp_bias - src_exp_bias;
+    let a_abs = a.to_bits() & src_abs_mask;
+    let mut abs_result = R::Int::ZERO;
+
+    if a_abs.wrapping_sub(src_min_normal) < src_infinity.wrapping_sub(src_min_normal) {
+        // a is a normal number.
+        // Extend to the destination type by shifting the significand and
+        // exponent into the proper position and rebiasing the exponent.
+        let abs_dst: R::Int = a_abs.cast();
+        let bias_dst: R::Int = exp_bias_delta.cast();
+        abs_result = abs_dst.wrapping_shl(sig_bits_delta);
+        abs_result += bias_dst.wrapping_shl(dst_sig_bits);
+    } else if a_abs >= src_infinity {
+        // a is NaN or infinity.
+        // Conjure the result by beginning with infinity, then setting the qNaN
+        // bit (if needed) and right-aligning the rest of the trailing NaN
+        // payload field.
+        let qnan_dst: R::Int = (a_abs & src_qnan).cast();
+        let nan_code_dst: R::Int = (a_abs & src_nan_code).cast();
+        let inf_exp_dst: R::Int = dst_inf_exp.cast();
+        abs_result = inf_exp_dst.wrapping_shl(dst_sig_bits);
+        abs_result |= qnan_dst.wrapping_shl(sig_bits_delta);
+        abs_result |= nan_code_dst.wrapping_shl(sig_bits_delta);
+    } else if a_abs != src_zero {
+        // a is denormal.
+        // Renormalize the significand and clear the leading bit, then insert
+        // the correct adjusted exponent in the destination type.
+        let scale = a_abs.leading_zeros() - src_min_normal.leading_zeros();
+        let abs_dst: R::Int = a_abs.cast();
+        let bias_dst: R::Int = (exp_bias_delta - scale + 1).cast();
+        abs_result = abs_dst.wrapping_shl(sig_bits_delta + scale);
+        abs_result = (abs_result ^ dst_min_normal) | (bias_dst.wrapping_shl(dst_sig_bits));
+    }
+
+    let sign_result: R::Int = (a.to_bits() & src_sign_mask).cast();
+    R::from_bits(abs_result | (sign_result.wrapping_shl(dst_bits - src_bits)))
+}
+
+intrinsics! {
+    #[aapcs_on_arm]
+    #[arm_aeabi_alias = __aeabi_f2d]
+    pub extern "C" fn  __extendsfdf2(a: f32) -> f64 {
+        extend(a)
+    }
+}
+
+intrinsics! {
+    #[aapcs_on_arm]
+    #[apple_f16_arg_abi]
+    #[arm_aeabi_alias = __aeabi_h2f]
+    #[cfg(f16_enabled)]
+    pub extern "C" fn __extendhfsf2(a: f16) -> f32 {
+        extend(a)
+    }
+
+    #[aapcs_on_arm]
+    #[apple_f16_arg_abi]
+    #[cfg(f16_enabled)]
+    pub extern "C" fn __gnu_h2f_ieee(a: f16) -> f32 {
+        extend(a)
+    }
+
+    #[aapcs_on_arm]
+    #[apple_f16_arg_abi]
+    #[cfg(f16_enabled)]
+    pub extern "C" fn __extendhfdf2(a: f16) -> f64 {
+        extend(a)
+    }
+
+    #[aapcs_on_arm]
+    #[ppc_alias = __extendhfkf2]
+    #[cfg(all(f16_enabled, f128_enabled))]
+    pub extern "C" fn __extendhftf2(a: f16) -> f128 {
+        extend(a)
+    }
+
+    #[aapcs_on_arm]
+    #[ppc_alias = __extendsfkf2]
+    #[cfg(f128_enabled)]
+    pub extern "C" fn __extendsftf2(a: f32) -> f128 {
+        extend(a)
+    }
+
+    #[aapcs_on_arm]
+    #[ppc_alias = __extenddfkf2]
+    #[cfg(f128_enabled)]
+    pub extern "C" fn __extenddftf2(a: f64) -> f128 {
+        extend(a)
+    }
+}
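
For a normal input, `extend` amounts to shifting the significand into the wider field and rebiasing the exponent. The snippet below is an illustrative check, not part of the patch; it spells that out for one f32 -> f64 conversion using only primitive casts and the standard field widths.

```rust
// Sketch of the normal-number path of `extend` for f32 -> f64.
// (Denormals, zero, infinity, and NaN take the other branches above.)
fn main() {
    let x: f32 = 1.75;
    let bits = x.to_bits();
    let sig = (bits & 0x007F_FFFF) as u64;  // 23-bit significand field
    let exp = ((bits >> 23) & 0xFF) as u64; // biased exponent (bias 127)
    let sign = (bits >> 31) as u64;
    // Shift the significand into the f64 field and rebias 127 -> 1023.
    let wide = (sign << 63) | ((exp + 1023 - 127) << 52) | (sig << (52 - 23));
    assert_eq!(f64::from_bits(wide), x as f64);
    println!("{x} extends to {}", f64::from_bits(wide));
}
```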
diff --git a/library/compiler-builtins/compiler-builtins/src/float/mod.rs b/library/compiler-builtins/compiler-builtins/src/float/mod.rs
new file mode 100644
index 00000000000..4a379d0d357
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/float/mod.rs
@@ -0,0 +1,15 @@
+pub mod add;
+pub mod cmp;
+pub mod conv;
+pub mod div;
+pub mod extend;
+pub mod mul;
+pub mod pow;
+pub mod sub;
+pub(crate) mod traits;
+pub mod trunc;
+
+#[cfg(not(feature = "unstable-public-internals"))]
+pub(crate) use traits::{Float, HalfRep};
+#[cfg(feature = "unstable-public-internals")]
+pub use traits::{Float, HalfRep};
diff --git a/library/compiler-builtins/compiler-builtins/src/float/mul.rs b/library/compiler-builtins/compiler-builtins/src/float/mul.rs
new file mode 100644
index 00000000000..7f1f19d9bd7
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/float/mul.rs
@@ -0,0 +1,200 @@
+use crate::float::Float;
+use crate::int::{CastInto, DInt, HInt, Int, MinInt};
+
+fn mul<F: Float>(a: F, b: F) -> F
+where
+    u32: CastInto<F::Int>,
+    F::Int: CastInto<u32>,
+    i32: CastInto<F::Int>,
+    F::Int: CastInto<i32>,
+    F::Int: HInt,
+{
+    let one = F::Int::ONE;
+    let zero = F::Int::ZERO;
+
+    let bits = F::BITS;
+    let significand_bits = F::SIG_BITS;
+    let max_exponent = F::EXP_SAT;
+
+    let exponent_bias = F::EXP_BIAS;
+
+    let implicit_bit = F::IMPLICIT_BIT;
+    let significand_mask = F::SIG_MASK;
+    let sign_bit = F::SIGN_MASK;
+    let abs_mask = sign_bit - one;
+    let exponent_mask = F::EXP_MASK;
+    let inf_rep = exponent_mask;
+    let quiet_bit = implicit_bit >> 1;
+    let qnan_rep = exponent_mask | quiet_bit;
+    let exponent_bits = F::EXP_BITS;
+
+    let a_rep = a.to_bits();
+    let b_rep = b.to_bits();
+
+    let a_exponent = (a_rep >> significand_bits) & max_exponent.cast();
+    let b_exponent = (b_rep >> significand_bits) & max_exponent.cast();
+    let product_sign = (a_rep ^ b_rep) & sign_bit;
+
+    let mut a_significand = a_rep & significand_mask;
+    let mut b_significand = b_rep & significand_mask;
+    let mut scale = 0;
+
+    // Detect if a or b is zero, denormal, infinity, or NaN.
+    if a_exponent.wrapping_sub(one) >= (max_exponent - 1).cast()
+        || b_exponent.wrapping_sub(one) >= (max_exponent - 1).cast()
+    {
+        let a_abs = a_rep & abs_mask;
+        let b_abs = b_rep & abs_mask;
+
+        // NaN * anything = qNaN
+        if a_abs > inf_rep {
+            return F::from_bits(a_rep | quiet_bit);
+        }
+        // anything * NaN = qNaN
+        if b_abs > inf_rep {
+            return F::from_bits(b_rep | quiet_bit);
+        }
+
+        if a_abs == inf_rep {
+            if b_abs != zero {
+                // infinity * non-zero = +/- infinity
+                return F::from_bits(a_abs | product_sign);
+            } else {
+                // infinity * zero = NaN
+                return F::from_bits(qnan_rep);
+            }
+        }
+
+        if b_abs == inf_rep {
+            if a_abs != zero {
+                // infinity * non-zero = +/- infinity
+                return F::from_bits(b_abs | product_sign);
+            } else {
+                // infinity * zero = NaN
+                return F::from_bits(qnan_rep);
+            }
+        }
+
+        // zero * anything = +/- zero
+        if a_abs == zero {
+            return F::from_bits(product_sign);
+        }
+
+        // anything * zero = +/- zero
+        if b_abs == zero {
+            return F::from_bits(product_sign);
+        }
+
+        // One or both of a and b is denormal; the other (if applicable) is a
+        // normal number. Renormalize one or both of a and b, and set scale to
+        // include the necessary exponent adjustment.
+        if a_abs < implicit_bit {
+            let (exponent, significand) = F::normalize(a_significand);
+            scale += exponent;
+            a_significand = significand;
+        }
+
+        if b_abs < implicit_bit {
+            let (exponent, significand) = F::normalize(b_significand);
+            scale += exponent;
+            b_significand = significand;
+        }
+    }
+
+    // Or in the implicit significand bit. (If we fell through from the
+    // denormal path, it was already set by normalize(), but setting it twice
+    // won't hurt anything.)
+    a_significand |= implicit_bit;
+    b_significand |= implicit_bit;
+
+    // Get the significand of a*b.  Before multiplying the significands, shift
+    // one of them left to left-align it in the field.  Thus, the product will
+    // have (exponent_bits + 2) integral digits, all but two of which must be
+    // zero.  Normalizing this result is just a conditional left-shift by one
+    // and bumping the exponent accordingly.
+    let (mut product_low, mut product_high) = a_significand
+        .widen_mul(b_significand << exponent_bits)
+        .lo_hi();
+
+    let a_exponent_i32: i32 = a_exponent.cast();
+    let b_exponent_i32: i32 = b_exponent.cast();
+    let mut product_exponent: i32 = a_exponent_i32
+        .wrapping_add(b_exponent_i32)
+        .wrapping_add(scale)
+        .wrapping_sub(exponent_bias as i32);
+
+    // Normalize the significand, adjust exponent if needed.
+    if (product_high & implicit_bit) != zero {
+        product_exponent = product_exponent.wrapping_add(1);
+    } else {
+        product_high = (product_high << 1) | (product_low >> (bits - 1));
+        product_low <<= 1;
+    }
+
+    // If we have overflowed the type, return +/- infinity.
+    if product_exponent >= max_exponent as i32 {
+        return F::from_bits(inf_rep | product_sign);
+    }
+
+    if product_exponent <= 0 {
+        // Result is denormal before rounding
+        //
+        // If the result is so small that it just underflows to zero, return
+        // a zero of the appropriate sign.  Mathematically there is no need to
+        // handle this case separately, but we make it a special case to
+        // simplify the shift logic.
+        let shift = one.wrapping_sub(product_exponent.cast()).cast();
+        if shift >= bits {
+            return F::from_bits(product_sign);
+        }
+
+        // Otherwise, shift the significand of the result so that the round
+        // bit is the high bit of `product_low`.
+        // Ensure one of the non-highest bits in `product_low` is set if the shifted-out bits are
+        // not all zero, so that the result is correctly rounded below.
+        let sticky = product_low << (bits - shift) != zero;
+        product_low =
+            (product_high << (bits - shift)) | (product_low >> shift) | (sticky as u32).cast();
+        product_high >>= shift;
+    } else {
+        // Result is normal before rounding; insert the exponent.
+        product_high &= significand_mask;
+        product_high |= product_exponent.cast() << significand_bits;
+    }
+
+    // Insert the sign of the result:
+    product_high |= product_sign;
+
+    // Final rounding.  The final result may overflow to infinity, or underflow
+    // to zero, but those are the correct results in those cases.  We use the
+    // default IEEE-754 round-to-nearest, ties-to-even rounding mode.
+    if product_low > sign_bit {
+        product_high += one;
+    }
+
+    if product_low == sign_bit {
+        product_high += product_high & one;
+    }
+
+    F::from_bits(product_high)
+}
+
+intrinsics! {
+    #[aapcs_on_arm]
+    #[arm_aeabi_alias = __aeabi_fmul]
+    pub extern "C" fn __mulsf3(a: f32, b: f32) -> f32 {
+        mul(a, b)
+    }
+
+    #[aapcs_on_arm]
+    #[arm_aeabi_alias = __aeabi_dmul]
+    pub extern "C" fn __muldf3(a: f64, b: f64) -> f64 {
+        mul(a, b)
+    }
+
+    #[ppc_alias = __mulkf3]
+    #[cfg(f128_enabled)]
+    pub extern "C" fn __multf3(a: f128, b: f128) -> f128 {
+        mul(a, b)
+    }
+}
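
The final rounding step in `mul` treats the low half of the widened product as a round/sticky word: anything above half an ulp rounds up, and an exact half rounds to even. A minimal sketch of that decision, not part of the patch, using `u32` halves purely for illustration:

```rust
// Ties-to-even rounding on a (hi, lo) pair, mirroring the last lines of `mul`.
fn round_nearest_even(mut hi: u32, lo: u32) -> u32 {
    const HALF_ULP: u32 = 1 << 31; // the `sign_bit` constant plays this role in `mul`
    if lo > HALF_ULP {
        hi += 1;          // more than half an ulp was discarded: round up
    } else if lo == HALF_ULP {
        hi += hi & 1;     // exactly half: round to the even neighbor
    }
    hi
}

fn main() {
    assert_eq!(round_nearest_even(10, 0x8000_0001), 11); // above halfway
    assert_eq!(round_nearest_even(10, 0x8000_0000), 10); // tie, already even
    assert_eq!(round_nearest_even(11, 0x8000_0000), 12); // tie, round to even
    assert_eq!(round_nearest_even(10, 0x7FFF_FFFF), 10); // below halfway
}
```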
diff --git a/library/compiler-builtins/compiler-builtins/src/float/pow.rs b/library/compiler-builtins/compiler-builtins/src/float/pow.rs
new file mode 100644
index 00000000000..45a4ad9049d
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/float/pow.rs
@@ -0,0 +1,40 @@
+use crate::float::Float;
+use crate::int::Int;
+
+/// Returns `a` raised to the power `b`
+fn pow<F: Float>(a: F, b: i32) -> F {
+    let mut a = a;
+    let recip = b < 0;
+    let mut pow = Int::abs_diff(b, 0);
+    let mut mul = F::ONE;
+    loop {
+        if (pow & 1) != 0 {
+            mul *= a;
+        }
+        pow >>= 1;
+        if pow == 0 {
+            break;
+        }
+        a *= a;
+    }
+
+    if recip { F::ONE / mul } else { mul }
+}
+
+intrinsics! {
+    pub extern "C" fn __powisf2(a: f32, b: i32) -> f32 {
+        pow(a, b)
+    }
+
+    pub extern "C" fn __powidf2(a: f64, b: i32) -> f64 {
+        pow(a, b)
+    }
+
+    #[ppc_alias = __powikf2]
+    #[cfg(f128_enabled)]
+    // FIXME(f16_f128): MSVC cannot build these until `__divtf3` is available in nightly.
+    #[cfg(not(target_env = "msvc"))]
+    pub extern "C" fn __powitf2(a: f128, b: i32) -> f128 {
+        pow(a, b)
+    }
+}
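
`pow` is binary exponentiation (square-and-multiply) on the float, with a final reciprocal for negative exponents. Below is a self-contained sketch of the same loop, not part of the patch, specialized to `f64` for illustration:

```rust
// Square-and-multiply for an integer power, as in `pow` above.
fn pow_by_squaring(mut a: f64, b: i32) -> f64 {
    let recip = b < 0;
    let mut pow = b.unsigned_abs();
    let mut mul = 1.0;
    loop {
        if pow & 1 != 0 {
            mul *= a; // fold in the contribution of the current exponent bit
        }
        pow >>= 1;
        if pow == 0 {
            break;
        }
        a *= a; // square the base for the next bit
    }
    if recip { 1.0 / mul } else { mul }
}

fn main() {
    assert_eq!(pow_by_squaring(2.0, 10), 1024.0);
    assert_eq!(pow_by_squaring(2.0, -3), 0.125);
    assert_eq!(pow_by_squaring(3.0, 0), 1.0);
}
```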
diff --git a/library/compiler-builtins/compiler-builtins/src/float/sub.rs b/library/compiler-builtins/compiler-builtins/src/float/sub.rs
new file mode 100644
index 00000000000..a0fd9dff97f
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/float/sub.rs
@@ -0,0 +1,24 @@
+use crate::float::Float;
+
+intrinsics! {
+    #[arm_aeabi_alias = __aeabi_fsub]
+    pub extern "C" fn __subsf3(a: f32, b: f32) -> f32 {
+        crate::float::add::__addsf3(a, f32::from_bits(b.to_bits() ^ f32::SIGN_MASK))
+    }
+
+    #[arm_aeabi_alias = __aeabi_dsub]
+    pub extern "C" fn __subdf3(a: f64, b: f64) -> f64 {
+        crate::float::add::__adddf3(a, f64::from_bits(b.to_bits() ^ f64::SIGN_MASK))
+    }
+
+    #[ppc_alias = __subkf3]
+    #[cfg(f128_enabled)]
+    pub extern "C" fn __subtf3(a: f128, b: f128) -> f128 {
+        #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+        use crate::float::add::__addkf3 as __addtf3;
+        #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+        use crate::float::add::__addtf3;
+
+        __addtf3(a, f128::from_bits(b.to_bits() ^ f128::SIGN_MASK))
+    }
+}
diff --git a/library/compiler-builtins/compiler-builtins/src/float/traits.rs b/library/compiler-builtins/compiler-builtins/src/float/traits.rs
new file mode 100644
index 00000000000..8ccaa7bcbd7
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/float/traits.rs
@@ -0,0 +1,189 @@
+use core::ops;
+
+use crate::int::{DInt, Int, MinInt};
+
+/// Wrapper to extract the integer type half of the float's size
+pub type HalfRep<F> = <<F as Float>::Int as DInt>::H;
+
+/// Trait for some basic operations on floats
+#[allow(dead_code)]
+pub trait Float:
+    Copy
+    + core::fmt::Debug
+    + PartialEq
+    + PartialOrd
+    + ops::AddAssign
+    + ops::MulAssign
+    + ops::Add<Output = Self>
+    + ops::Sub<Output = Self>
+    + ops::Div<Output = Self>
+    + ops::Rem<Output = Self>
+{
+    /// A uint of the same width as the float
+    type Int: Int<OtherSign = Self::SignedInt, UnsignedInt = Self::Int>;
+
+    /// An int of the same width as the float
+    type SignedInt: Int + MinInt<OtherSign = Self::Int, UnsignedInt = Self::Int>;
+
+    /// An int capable of containing the exponent bits plus a sign bit. This is signed.
+    type ExpInt: Int;
+
+    const ZERO: Self;
+    const ONE: Self;
+
+    /// The bitwidth of the float type.
+    const BITS: u32;
+
+    /// The bitwidth of the significand.
+    const SIG_BITS: u32;
+
+    /// The bitwidth of the exponent.
+    const EXP_BITS: u32 = Self::BITS - Self::SIG_BITS - 1;
+
+    /// The saturated (maximum bitpattern) value of the exponent, i.e. the infinite
+    /// representation.
+    ///
+    /// This is in the rightmost position; use `EXP_MASK` for the shifted value.
+    const EXP_SAT: u32 = (1 << Self::EXP_BITS) - 1;
+
+    /// The exponent bias value.
+    const EXP_BIAS: u32 = Self::EXP_SAT >> 1;
+
+    /// A mask for the sign bit.
+    const SIGN_MASK: Self::Int;
+
+    /// A mask for the significand.
+    const SIG_MASK: Self::Int;
+
+    /// The implicit bit of the float format.
+    const IMPLICIT_BIT: Self::Int;
+
+    /// A mask for the exponent.
+    const EXP_MASK: Self::Int;
+
+    /// Returns `self` transmuted to `Self::Int`
+    fn to_bits(self) -> Self::Int;
+
+    /// Returns `self` transmuted to `Self::SignedInt`
+    fn to_bits_signed(self) -> Self::SignedInt;
+
+    /// Checks if two floats have the same bit representation. *Except* for NaNs! NaN can be
+    /// represented in multiple different ways. This method returns `true` if two NaNs are
+    /// compared.
+    fn eq_repr(self, rhs: Self) -> bool;
+
+    /// Returns true if the sign is negative
+    fn is_sign_negative(self) -> bool;
+
+    /// Returns the exponent, not adjusting for bias.
+    fn exp(self) -> Self::ExpInt;
+
+    /// Returns the significand with no implicit bit (or the "fractional" part)
+    fn frac(self) -> Self::Int;
+
+    /// Returns the significand with implicit bit
+    fn imp_frac(self) -> Self::Int;
+
+    /// Returns a `Self::Int` transmuted back to `Self`
+    fn from_bits(a: Self::Int) -> Self;
+
+    /// Constructs a `Self` from its parts. Inputs are treated as bits and shifted into position.
+    fn from_parts(negative: bool, exponent: Self::Int, significand: Self::Int) -> Self;
+
+    fn abs(self) -> Self {
+        let abs_mask = !Self::SIGN_MASK;
+        Self::from_bits(self.to_bits() & abs_mask)
+    }
+
+    /// Returns (normalized exponent, normalized significand)
+    fn normalize(significand: Self::Int) -> (i32, Self::Int);
+
+    /// Returns `true` if `self` is subnormal
+    fn is_subnormal(self) -> bool;
+}
+
+macro_rules! float_impl {
+    ($ty:ident, $ity:ident, $sity:ident, $expty:ident, $bits:expr, $significand_bits:expr) => {
+        impl Float for $ty {
+            type Int = $ity;
+            type SignedInt = $sity;
+            type ExpInt = $expty;
+
+            const ZERO: Self = 0.0;
+            const ONE: Self = 1.0;
+
+            const BITS: u32 = $bits;
+            const SIG_BITS: u32 = $significand_bits;
+
+            const SIGN_MASK: Self::Int = 1 << (Self::BITS - 1);
+            const SIG_MASK: Self::Int = (1 << Self::SIG_BITS) - 1;
+            const IMPLICIT_BIT: Self::Int = 1 << Self::SIG_BITS;
+            const EXP_MASK: Self::Int = !(Self::SIGN_MASK | Self::SIG_MASK);
+
+            fn to_bits(self) -> Self::Int {
+                self.to_bits()
+            }
+            fn to_bits_signed(self) -> Self::SignedInt {
+                self.to_bits() as Self::SignedInt
+            }
+            fn eq_repr(self, rhs: Self) -> bool {
+                #[cfg(feature = "mangled-names")]
+                fn is_nan(x: $ty) -> bool {
+                    // When using mangled-names, the "real" compiler-builtins might not have the
+                    // necessary builtin (__unordtf2) to test whether `f128` is NaN.
+                    // FIXME(f16_f128): Remove once the nightly toolchain has the __unordtf2 builtin
+                    // x is NaN if all the bits of the exponent are set and the significand is non-0
+                    x.to_bits() & $ty::EXP_MASK == $ty::EXP_MASK && x.to_bits() & $ty::SIG_MASK != 0
+                }
+                #[cfg(not(feature = "mangled-names"))]
+                fn is_nan(x: $ty) -> bool {
+                    x.is_nan()
+                }
+                if is_nan(self) && is_nan(rhs) {
+                    true
+                } else {
+                    self.to_bits() == rhs.to_bits()
+                }
+            }
+            fn is_sign_negative(self) -> bool {
+                self.is_sign_negative()
+            }
+            fn exp(self) -> Self::ExpInt {
+                ((self.to_bits() & Self::EXP_MASK) >> Self::SIG_BITS) as Self::ExpInt
+            }
+            fn frac(self) -> Self::Int {
+                self.to_bits() & Self::SIG_MASK
+            }
+            fn imp_frac(self) -> Self::Int {
+                self.frac() | Self::IMPLICIT_BIT
+            }
+            fn from_bits(a: Self::Int) -> Self {
+                Self::from_bits(a)
+            }
+            fn from_parts(negative: bool, exponent: Self::Int, significand: Self::Int) -> Self {
+                Self::from_bits(
+                    ((negative as Self::Int) << (Self::BITS - 1))
+                        | ((exponent << Self::SIG_BITS) & Self::EXP_MASK)
+                        | (significand & Self::SIG_MASK),
+                )
+            }
+            fn normalize(significand: Self::Int) -> (i32, Self::Int) {
+                let shift = significand.leading_zeros().wrapping_sub(Self::EXP_BITS);
+                (
+                    1i32.wrapping_sub(shift as i32),
+                    significand << shift as Self::Int,
+                )
+            }
+            fn is_subnormal(self) -> bool {
+                (self.to_bits() & Self::EXP_MASK) == Self::Int::ZERO
+            }
+        }
+    };
+}
+
+#[cfg(f16_enabled)]
+float_impl!(f16, u16, i16, i8, 16, 10);
+float_impl!(f32, u32, i32, i16, 32, 23);
+float_impl!(f64, u64, i64, i16, 64, 52);
+#[cfg(f128_enabled)]
+float_impl!(f128, u128, i128, i16, 128, 112);
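
All of the derived constants in the `Float` trait (`EXP_BITS`, `EXP_SAT`, `EXP_BIAS`, and the three masks) follow from just `BITS` and `SIG_BITS`. As a quick check that is not part of the patch, here is what those formulas produce for f32:

```rust
// Evaluate the Float trait's default formulas for the f32 parameters.
fn main() {
    const BITS: u32 = 32;
    const SIG_BITS: u32 = 23;
    const EXP_BITS: u32 = BITS - SIG_BITS - 1;     // 8
    const EXP_SAT: u32 = (1 << EXP_BITS) - 1;      // 255
    const EXP_BIAS: u32 = EXP_SAT >> 1;            // 127
    const SIGN_MASK: u32 = 1 << (BITS - 1);        // 0x8000_0000
    const SIG_MASK: u32 = (1 << SIG_BITS) - 1;     // 0x007F_FFFF
    const EXP_MASK: u32 = !(SIGN_MASK | SIG_MASK); // 0x7F80_0000
    assert_eq!(EXP_BIAS, 127);
    assert_eq!(EXP_MASK, 0x7F80_0000);
    println!("f32: exp bits {EXP_BITS}, bias {EXP_BIAS}, sat {EXP_SAT}");
}
```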
diff --git a/library/compiler-builtins/compiler-builtins/src/float/trunc.rs b/library/compiler-builtins/compiler-builtins/src/float/trunc.rs
new file mode 100644
index 00000000000..ca8a0f368b5
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/float/trunc.rs
@@ -0,0 +1,169 @@
+use crate::float::Float;
+use crate::int::{CastInto, Int, MinInt};
+
+fn trunc<F: Float, R: Float>(a: F) -> R
+where
+    F::Int: CastInto<u64>,
+    F::Int: CastInto<u32>,
+    u64: CastInto<F::Int>,
+    u32: CastInto<F::Int>,
+    R::Int: CastInto<u32>,
+    u32: CastInto<R::Int>,
+    F::Int: CastInto<R::Int>,
+{
+    let src_zero = F::Int::ZERO;
+    let src_one = F::Int::ONE;
+    let src_bits = F::BITS;
+    let src_exp_bias = F::EXP_BIAS;
+
+    let src_min_normal = F::IMPLICIT_BIT;
+    let src_sig_mask = F::SIG_MASK;
+    let src_infinity = F::EXP_MASK;
+    let src_sign_mask = F::SIGN_MASK;
+    let src_abs_mask = src_sign_mask - src_one;
+    let round_mask = (src_one << (F::SIG_BITS - R::SIG_BITS)) - src_one;
+    let halfway = src_one << (F::SIG_BITS - R::SIG_BITS - 1);
+    let src_qnan = src_one << (F::SIG_BITS - 1);
+    let src_nan_code = src_qnan - src_one;
+
+    let dst_zero = R::Int::ZERO;
+    let dst_one = R::Int::ONE;
+    let dst_bits = R::BITS;
+    let dst_inf_exp = R::EXP_SAT;
+    let dst_exp_bias = R::EXP_BIAS;
+
+    let underflow_exponent: F::Int = (src_exp_bias + 1 - dst_exp_bias).cast();
+    let overflow_exponent: F::Int = (src_exp_bias + dst_inf_exp - dst_exp_bias).cast();
+    let underflow: F::Int = underflow_exponent << F::SIG_BITS;
+    let overflow: F::Int = overflow_exponent << F::SIG_BITS;
+
+    let dst_qnan = R::Int::ONE << (R::SIG_BITS - 1);
+    let dst_nan_code = dst_qnan - dst_one;
+
+    let sig_bits_delta = F::SIG_BITS - R::SIG_BITS;
+    // Break a into a sign and representation of the absolute value.
+    let a_abs = a.to_bits() & src_abs_mask;
+    let sign = a.to_bits() & src_sign_mask;
+    let mut abs_result: R::Int;
+
+    if a_abs.wrapping_sub(underflow) < a_abs.wrapping_sub(overflow) {
+        // The exponent of a is within the range of normal numbers in the
+        // destination format.  We can convert by simply right-shifting with
+        // rounding and adjusting the exponent.
+        abs_result = (a_abs >> sig_bits_delta).cast();
+        // Cast before shifting to prevent overflow.
+        let bias_diff: R::Int = src_exp_bias.wrapping_sub(dst_exp_bias).cast();
+        let tmp = bias_diff << R::SIG_BITS;
+        abs_result = abs_result.wrapping_sub(tmp);
+
+        let round_bits = a_abs & round_mask;
+        if round_bits > halfway {
+            // Round to nearest.
+            abs_result += dst_one;
+        } else if round_bits == halfway {
+            // Tie to even.
+            abs_result += abs_result & dst_one;
+        };
+    } else if a_abs > src_infinity {
+        // a is NaN.
+        // Conjure the result by beginning with infinity, setting the qNaN
+        // bit and inserting the (truncated) trailing NaN field.
+        // Cast before shifting to prevent overflow.
+        let dst_inf_exp: R::Int = dst_inf_exp.cast();
+        abs_result = dst_inf_exp << R::SIG_BITS;
+        abs_result |= dst_qnan;
+        abs_result |= dst_nan_code & ((a_abs & src_nan_code) >> (F::SIG_BITS - R::SIG_BITS)).cast();
+    } else if a_abs >= overflow {
+        // a overflows to infinity.
+        // Cast before shifting to prevent overflow.
+        let dst_inf_exp: R::Int = dst_inf_exp.cast();
+        abs_result = dst_inf_exp << R::SIG_BITS;
+    } else {
+        // a underflows on conversion to the destination type or is an exact
+        // zero.  The result may be a denormal or zero.  Extract the exponent
+        // to get the shift amount for the denormalization.
+        let a_exp: u32 = (a_abs >> F::SIG_BITS).cast();
+        let shift = src_exp_bias - dst_exp_bias - a_exp + 1;
+
+        let significand = (a.to_bits() & src_sig_mask) | src_min_normal;
+
+        // Right shift by the denormalization amount with sticky.
+        if shift > F::SIG_BITS {
+            abs_result = dst_zero;
+        } else {
+            let sticky = if (significand << (src_bits - shift)) != src_zero {
+                src_one
+            } else {
+                src_zero
+            };
+            let denormalized_significand: F::Int = (significand >> shift) | sticky;
+            abs_result = (denormalized_significand >> (F::SIG_BITS - R::SIG_BITS)).cast();
+            let round_bits = denormalized_significand & round_mask;
+            // Round to nearest
+            if round_bits > halfway {
+                abs_result += dst_one;
+            }
+            // Ties to even
+            else if round_bits == halfway {
+                abs_result += abs_result & dst_one;
+            };
+        }
+    }
+
+    // Apply the signbit to the absolute value.
+    R::from_bits(abs_result | sign.wrapping_shr(src_bits - dst_bits).cast())
+}
+
+intrinsics! {
+    #[aapcs_on_arm]
+    #[arm_aeabi_alias = __aeabi_d2f]
+    pub extern "C" fn __truncdfsf2(a: f64) -> f32 {
+        trunc(a)
+    }
+}
+
+intrinsics! {
+    #[aapcs_on_arm]
+    #[apple_f16_ret_abi]
+    #[arm_aeabi_alias = __aeabi_f2h]
+    #[cfg(f16_enabled)]
+    pub extern "C" fn __truncsfhf2(a: f32) -> f16 {
+        trunc(a)
+    }
+
+    #[aapcs_on_arm]
+    #[apple_f16_ret_abi]
+    #[cfg(f16_enabled)]
+    pub extern "C" fn __gnu_f2h_ieee(a: f32) -> f16 {
+        trunc(a)
+    }
+
+    #[aapcs_on_arm]
+    #[apple_f16_ret_abi]
+    #[arm_aeabi_alias = __aeabi_d2h]
+    #[cfg(f16_enabled)]
+    pub extern "C" fn __truncdfhf2(a: f64) -> f16 {
+        trunc(a)
+    }
+
+    #[aapcs_on_arm]
+    #[ppc_alias = __trunckfhf2]
+    #[cfg(all(f16_enabled, f128_enabled))]
+    pub extern "C" fn __trunctfhf2(a: f128) -> f16 {
+        trunc(a)
+    }
+
+    #[aapcs_on_arm]
+    #[ppc_alias = __trunckfsf2]
+    #[cfg(f128_enabled)]
+    pub extern "C" fn __trunctfsf2(a: f128) -> f32 {
+        trunc(a)
+    }
+
+    #[aapcs_on_arm]
+    #[ppc_alias = __trunckfdf2]
+    #[cfg(f128_enabled)]
+    pub extern "C" fn __trunctfdf2(a: f128) -> f64 {
+        trunc(a)
+    }
+}
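
When narrowing, `trunc` keeps the top `R::SIG_BITS` of the significand and folds the dropped bits into a round/halfway decision against `round_mask` and `halfway`. The sketch below is not part of the patch; it only inspects the 29 dropped bits for a few f64 -> f32 conversions and relies on the compiler's own cast for the final value.

```rust
// Show the round/halfway decision `trunc` makes on the normal path.
fn main() {
    // 52 - 23 = 29 significand bits are dropped going from f64 to f32.
    let halfway: u64 = 1 << (52 - 23 - 1);
    let round_mask: u64 = (1 << (52 - 23)) - 1;

    for x in [1.0f64 + f64::EPSILON, 1.2345678901234567, 3.9999999999] {
        let round_bits = x.to_bits() & round_mask;
        let decision = if round_bits > halfway {
            "round up"
        } else if round_bits == halfway {
            "tie to even"
        } else {
            "round down"
        };
        println!("{x:.17} -> {} ({decision})", x as f32);
    }
}
```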
diff --git a/library/compiler-builtins/compiler-builtins/src/hexagon.rs b/library/compiler-builtins/compiler-builtins/src/hexagon.rs
new file mode 100644
index 00000000000..91cf91c3142
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/hexagon.rs
@@ -0,0 +1,55 @@
+#![cfg(not(feature = "no-asm"))]
+
+use core::arch::global_asm;
+
+global_asm!(include_str!("hexagon/func_macro.s"), options(raw));
+
+global_asm!(include_str!("hexagon/dfaddsub.s"), options(raw));
+
+global_asm!(include_str!("hexagon/dfdiv.s"), options(raw));
+
+global_asm!(include_str!("hexagon/dffma.s"), options(raw));
+
+global_asm!(include_str!("hexagon/dfminmax.s"), options(raw));
+
+global_asm!(include_str!("hexagon/dfmul.s"), options(raw));
+
+global_asm!(include_str!("hexagon/dfsqrt.s"), options(raw));
+
+global_asm!(include_str!("hexagon/divdi3.s"), options(raw));
+
+global_asm!(include_str!("hexagon/divsi3.s"), options(raw));
+
+global_asm!(include_str!("hexagon/fastmath2_dlib_asm.s"), options(raw));
+
+global_asm!(include_str!("hexagon/fastmath2_ldlib_asm.s"), options(raw));
+
+global_asm!(
+    include_str!("hexagon/memcpy_forward_vp4cp4n2.s"),
+    options(raw)
+);
+
+global_asm!(
+    include_str!("hexagon/memcpy_likely_aligned.s"),
+    options(raw)
+);
+
+global_asm!(include_str!("hexagon/moddi3.s"), options(raw));
+
+global_asm!(include_str!("hexagon/modsi3.s"), options(raw));
+
+global_asm!(include_str!("hexagon/sfdiv_opt.s"), options(raw));
+
+global_asm!(include_str!("hexagon/sfsqrt_opt.s"), options(raw));
+
+global_asm!(include_str!("hexagon/udivdi3.s"), options(raw));
+
+global_asm!(include_str!("hexagon/udivmoddi4.s"), options(raw));
+
+global_asm!(include_str!("hexagon/udivmodsi4.s"), options(raw));
+
+global_asm!(include_str!("hexagon/udivsi3.s"), options(raw));
+
+global_asm!(include_str!("hexagon/umoddi3.s"), options(raw));
+
+global_asm!(include_str!("hexagon/umodsi3.s"), options(raw));
diff --git a/library/compiler-builtins/compiler-builtins/src/hexagon/dfaddsub.s b/library/compiler-builtins/compiler-builtins/src/hexagon/dfaddsub.s
new file mode 100644
index 00000000000..1f59e460be6
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/hexagon/dfaddsub.s
@@ -0,0 +1,321 @@
+ .text
+ .global __hexagon_adddf3
+ .global __hexagon_subdf3
+ .type __hexagon_adddf3, @function
+ .type __hexagon_subdf3, @function
+
+.global __qdsp_adddf3 ; .set __qdsp_adddf3, __hexagon_adddf3
+.global __hexagon_fast_adddf3 ; .set __hexagon_fast_adddf3, __hexagon_adddf3
+.global __hexagon_fast2_adddf3 ; .set __hexagon_fast2_adddf3, __hexagon_adddf3
+.global __qdsp_subdf3 ; .set __qdsp_subdf3, __hexagon_subdf3
+.global __hexagon_fast_subdf3 ; .set __hexagon_fast_subdf3, __hexagon_subdf3
+.global __hexagon_fast2_subdf3 ; .set __hexagon_fast2_subdf3, __hexagon_subdf3
+
+ .p2align 5
+__hexagon_adddf3:
+ {
+  r4 = extractu(r1,#11,#20)
+  r5 = extractu(r3,#11,#20)
+  r13:12 = combine(##0x20000000,#0)
+ }
+ {
+  p3 = dfclass(r1:0,#2)
+  p3 = dfclass(r3:2,#2)
+  r9:8 = r13:12
+  p2 = cmp.gtu(r5,r4)
+ }
+ {
+  if (!p3) jump .Ladd_abnormal
+  if (p2) r1:0 = r3:2
+  if (p2) r3:2 = r1:0
+  if (p2) r5:4 = combine(r4,r5)
+ }
+ {
+  r13:12 = insert(r1:0,#52,#11 -2)
+  r9:8 = insert(r3:2,#52,#11 -2)
+  r15 = sub(r4,r5)
+  r7:6 = combine(#62,#1)
+ }
+
+
+
+
+
+.Ladd_continue:
+ {
+  r15 = min(r15,r7)
+
+  r11:10 = neg(r13:12)
+  p2 = cmp.gt(r1,#-1)
+  r14 = #0
+ }
+ {
+  if (!p2) r13:12 = r11:10
+  r11:10 = extractu(r9:8,r15:14)
+  r9:8 = ASR(r9:8,r15)
+
+
+
+
+  r15:14 = #0
+ }
+ {
+  p1 = cmp.eq(r11:10,r15:14)
+  if (!p1.new) r8 = or(r8,r6)
+  r5 = add(r4,#-1024 -60)
+  p3 = cmp.gt(r3,#-1)
+ }
+ {
+  r13:12 = add(r13:12,r9:8)
+  r11:10 = sub(r13:12,r9:8)
+  r7:6 = combine(#54,##2045)
+ }
+ {
+  p0 = cmp.gtu(r4,r7)
+  p0 = !cmp.gtu(r4,r6)
+  if (!p0.new) jump:nt .Ladd_ovf_unf
+  if (!p3) r13:12 = r11:10
+ }
+ {
+  r1:0 = convert_d2df(r13:12)
+  p0 = cmp.eq(r13,#0)
+  p0 = cmp.eq(r12,#0)
+  if (p0.new) jump:nt .Ladd_zero
+ }
+ {
+  r1 += asl(r5,#20)
+  jumpr r31
+ }
+ .falign
+__hexagon_subdf3:
+ {
+  r3 = togglebit(r3,#31)
+  jump __qdsp_adddf3
+ }
+
+
+ .falign
+.Ladd_zero:
+
+
+ {
+  r28 = USR
+  r1:0 = #0
+  r3 = #1
+ }
+ {
+  r28 = extractu(r28,#2,#22)
+  r3 = asl(r3,#31)
+ }
+ {
+  p0 = cmp.eq(r28,#2)
+  if (p0.new) r1 = xor(r1,r3)
+  jumpr r31
+ }
+ .falign
+.Ladd_ovf_unf:
+ {
+  r1:0 = convert_d2df(r13:12)
+  p0 = cmp.eq(r13,#0)
+  p0 = cmp.eq(r12,#0)
+  if (p0.new) jump:nt .Ladd_zero
+ }
+ {
+  r28 = extractu(r1,#11,#20)
+  r1 += asl(r5,#20)
+ }
+ {
+  r5 = add(r5,r28)
+  r3:2 = combine(##0x00100000,#0)
+ }
+ {
+  p0 = cmp.gt(r5,##1024 +1024 -2)
+  if (p0.new) jump:nt .Ladd_ovf
+ }
+ {
+  p0 = cmp.gt(r5,#0)
+  if (p0.new) jumpr:t r31
+  r28 = sub(#1,r5)
+ }
+ {
+  r3:2 = insert(r1:0,#52,#0)
+  r1:0 = r13:12
+ }
+ {
+  r3:2 = lsr(r3:2,r28)
+ }
+ {
+  r1:0 = insert(r3:2,#63,#0)
+  jumpr r31
+ }
+ .falign
+.Ladd_ovf:
+
+ {
+  r1:0 = r13:12
+  r28 = USR
+  r13:12 = combine(##0x7fefffff,#-1)
+ }
+ {
+  r5 = extractu(r28,#2,#22)
+  r28 = or(r28,#0x28)
+  r9:8 = combine(##0x7ff00000,#0)
+ }
+ {
+  USR = r28
+  r5 ^= lsr(r1,#31)
+  r28 = r5
+ }
+ {
+  p0 = !cmp.eq(r28,#1)
+  p0 = !cmp.eq(r5,#2)
+  if (p0.new) r13:12 = r9:8
+ }
+ {
+  r1:0 = insert(r13:12,#63,#0)
+ }
+ {
+  p0 = dfcmp.eq(r1:0,r1:0)
+  jumpr r31
+ }
+
+.Ladd_abnormal:
+ {
+  r13:12 = extractu(r1:0,#63,#0)
+  r9:8 = extractu(r3:2,#63,#0)
+ }
+ {
+  p3 = cmp.gtu(r13:12,r9:8)
+  if (!p3.new) r1:0 = r3:2
+  if (!p3.new) r3:2 = r1:0
+ }
+ {
+
+  p0 = dfclass(r1:0,#0x0f)
+  if (!p0.new) jump:nt .Linvalid_nan_add
+  if (!p3) r13:12 = r9:8
+  if (!p3) r9:8 = r13:12
+ }
+ {
+
+
+  p1 = dfclass(r1:0,#0x08)
+  if (p1.new) jump:nt .Linf_add
+ }
+ {
+  p2 = dfclass(r3:2,#0x01)
+  if (p2.new) jump:nt .LB_zero
+  r13:12 = #0
+ }
+
+ {
+  p0 = dfclass(r1:0,#4)
+  if (p0.new) jump:nt .Ladd_two_subnormal
+  r13:12 = combine(##0x20000000,#0)
+ }
+ {
+  r4 = extractu(r1,#11,#20)
+  r5 = #1
+
+  r9:8 = asl(r9:8,#11 -2)
+ }
+
+
+
+ {
+  r13:12 = insert(r1:0,#52,#11 -2)
+  r15 = sub(r4,r5)
+  r7:6 = combine(#62,#1)
+  jump .Ladd_continue
+ }
+
+.Ladd_two_subnormal:
+ {
+  r13:12 = extractu(r1:0,#63,#0)
+  r9:8 = extractu(r3:2,#63,#0)
+ }
+ {
+  r13:12 = neg(r13:12)
+  r9:8 = neg(r9:8)
+  p0 = cmp.gt(r1,#-1)
+  p1 = cmp.gt(r3,#-1)
+ }
+ {
+  if (p0) r13:12 = r1:0
+  if (p1) r9:8 = r3:2
+ }
+ {
+  r13:12 = add(r13:12,r9:8)
+ }
+ {
+  r9:8 = neg(r13:12)
+  p0 = cmp.gt(r13,#-1)
+  r3:2 = #0
+ }
+ {
+  if (!p0) r1:0 = r9:8
+  if (p0) r1:0 = r13:12
+  r3 = ##0x80000000
+ }
+ {
+  if (!p0) r1 = or(r1,r3)
+  p0 = dfcmp.eq(r1:0,r3:2)
+  if (p0.new) jump:nt .Lzero_plus_zero
+ }
+ {
+  jumpr r31
+ }
+
+.Linvalid_nan_add:
+ {
+  r28 = convert_df2sf(r1:0)
+  p0 = dfclass(r3:2,#0x0f)
+  if (p0.new) r3:2 = r1:0
+ }
+ {
+  r2 = convert_df2sf(r3:2)
+  r1:0 = #-1
+  jumpr r31
+ }
+ .falign
+.LB_zero:
+ {
+  p0 = dfcmp.eq(r13:12,r1:0)
+  if (!p0.new) jumpr:t r31
+ }
+
+
+
+
+.Lzero_plus_zero:
+ {
+  p0 = cmp.eq(r1:0,r3:2)
+  if (p0.new) jumpr:t r31
+ }
+ {
+  r28 = USR
+ }
+ {
+  r28 = extractu(r28,#2,#22)
+  r1:0 = #0
+ }
+ {
+  p0 = cmp.eq(r28,#2)
+  if (p0.new) r1 = ##0x80000000
+  jumpr r31
+ }
+.Linf_add:
+
+ {
+  p0 = !cmp.eq(r1,r3)
+  p0 = dfclass(r3:2,#8)
+  if (!p0.new) jumpr:t r31
+ }
+ {
+  r2 = ##0x7f800001
+ }
+ {
+  r1:0 = convert_sf2df(r2)
+  jumpr r31
+ }
+.size __hexagon_adddf3,.-__hexagon_adddf3
diff --git a/library/compiler-builtins/compiler-builtins/src/hexagon/dfdiv.s b/library/compiler-builtins/compiler-builtins/src/hexagon/dfdiv.s
new file mode 100644
index 00000000000..6d65dbfc4df
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/hexagon/dfdiv.s
@@ -0,0 +1,372 @@
+ .text
+ .global __hexagon_divdf3
+ .type __hexagon_divdf3,@function
+ .global __qdsp_divdf3 ; .set __qdsp_divdf3, __hexagon_divdf3
+        .global __hexagon_fast_divdf3 ; .set __hexagon_fast_divdf3, __hexagon_divdf3
+        .global __hexagon_fast2_divdf3 ; .set __hexagon_fast2_divdf3, __hexagon_divdf3
+ .p2align 5
+__hexagon_divdf3:
+ {
+  p2 = dfclass(r1:0,#0x02)
+  p2 = dfclass(r3:2,#0x02)
+  r13:12 = combine(r3,r1)
+  r28 = xor(r1,r3)
+ }
+ {
+  if (!p2) jump .Ldiv_abnormal
+  r7:6 = extractu(r3:2,#23,#52 -23)
+  r8 = ##0x3f800001
+ }
+ {
+  r9 = or(r8,r6)
+  r13 = extractu(r13,#11,#52 -32)
+  r12 = extractu(r12,#11,#52 -32)
+  p3 = cmp.gt(r28,#-1)
+ }
+
+
+.Ldenorm_continue:
+ {
+  r11,p0 = sfrecipa(r8,r9)
+  r10 = and(r8,#-2)
+  r28 = #1
+  r12 = sub(r12,r13)
+ }
+
+
+ {
+  r10 -= sfmpy(r11,r9):lib
+  r1 = insert(r28,#11 +1,#52 -32)
+  r13 = ##0x00800000 << 3
+ }
+ {
+  r11 += sfmpy(r11,r10):lib
+  r3 = insert(r28,#11 +1,#52 -32)
+  r10 = and(r8,#-2)
+ }
+ {
+  r10 -= sfmpy(r11,r9):lib
+  r5 = #-0x3ff +1
+  r4 = #0x3ff -1
+ }
+ {
+  r11 += sfmpy(r11,r10):lib
+  p1 = cmp.gt(r12,r5)
+  p1 = !cmp.gt(r12,r4)
+ }
+ {
+  r13 = insert(r11,#23,#3)
+  r5:4 = #0
+  r12 = add(r12,#-61)
+ }
+
+
+
+
+ {
+  r13 = add(r13,#((-3) << 3))
+ }
+ { r7:6 = mpyu(r13,r1); r1:0 = asl(r1:0,# ( 15 )); }; { r6 = # 0; r1:0 -= mpyu(r7,r2); r15:14 = mpyu(r7,r3); }; { r5:4 += ASL(r7:6, # ( 14 )); r1:0 -= asl(r15:14, # 32); }
+ { r7:6 = mpyu(r13,r1); r1:0 = asl(r1:0,# ( 15 )); }; { r6 = # 0; r1:0 -= mpyu(r7,r2); r15:14 = mpyu(r7,r3); }; { r5:4 += ASR(r7:6, # ( 1 )); r1:0 -= asl(r15:14, # 32); }
+ { r7:6 = mpyu(r13,r1); r1:0 = asl(r1:0,# ( 15 )); }; { r6 = # 0; r1:0 -= mpyu(r7,r2); r15:14 = mpyu(r7,r3); }; { r5:4 += ASR(r7:6, # ( 16 )); r1:0 -= asl(r15:14, # 32); }
+ { r7:6 = mpyu(r13,r1); r1:0 = asl(r1:0,# ( 15 )); }; { r6 = # 0; r1:0 -= mpyu(r7,r2); r15:14 = mpyu(r7,r3); }; { r5:4 += ASR(r7:6, # ( 31 )); r1:0 -= asl(r15:14, # 32); r7:6=# ( 0 ); }
+
+
+
+
+
+
+
+ {
+
+  r15:14 = sub(r1:0,r3:2)
+  p0 = cmp.gtu(r3:2,r1:0)
+
+  if (!p0.new) r6 = #2
+ }
+ {
+  r5:4 = add(r5:4,r7:6)
+  if (!p0) r1:0 = r15:14
+  r15:14 = #0
+ }
+ {
+  p0 = cmp.eq(r1:0,r15:14)
+  if (!p0.new) r4 = or(r4,r28)
+ }
+ {
+  r7:6 = neg(r5:4)
+ }
+ {
+  if (!p3) r5:4 = r7:6
+ }
+ {
+  r1:0 = convert_d2df(r5:4)
+  if (!p1) jump .Ldiv_ovf_unf
+ }
+ {
+  r1 += asl(r12,#52 -32)
+  jumpr r31
+ }
+
+.Ldiv_ovf_unf:
+ {
+  r1 += asl(r12,#52 -32)
+  r13 = extractu(r1,#11,#52 -32)
+ }
+ {
+  r7:6 = abs(r5:4)
+  r12 = add(r12,r13)
+ }
+ {
+  p0 = cmp.gt(r12,##0x3ff +0x3ff)
+  if (p0.new) jump:nt .Ldiv_ovf
+ }
+ {
+  p0 = cmp.gt(r12,#0)
+  if (p0.new) jump:nt .Lpossible_unf2
+ }
+ {
+  r13 = add(clb(r7:6),#-1)
+  r12 = sub(#7,r12)
+  r10 = USR
+  r11 = #63
+ }
+ {
+  r13 = min(r12,r11)
+  r11 = or(r10,#0x030)
+  r7:6 = asl(r7:6,r13)
+  r12 = #0
+ }
+ {
+  r15:14 = extractu(r7:6,r13:12)
+  r7:6 = lsr(r7:6,r13)
+  r3:2 = #1
+ }
+ {
+  p0 = cmp.gtu(r3:2,r15:14)
+  if (!p0.new) r6 = or(r2,r6)
+  r7 = setbit(r7,#52 -32+4)
+ }
+ {
+  r5:4 = neg(r7:6)
+  p0 = bitsclr(r6,#(1<<4)-1)
+  if (!p0.new) r10 = r11
+ }
+ {
+  USR = r10
+  if (p3) r5:4 = r7:6
+  r10 = #-0x3ff -(52 +4)
+ }
+ {
+  r1:0 = convert_d2df(r5:4)
+ }
+ {
+  r1 += asl(r10,#52 -32)
+  jumpr r31
+ }
+
+
+.Lpossible_unf2:
+
+
+ {
+  r3:2 = extractu(r1:0,#63,#0)
+  r15:14 = combine(##0x00100000,#0)
+  r10 = #0x7FFF
+ }
+ {
+  p0 = dfcmp.eq(r15:14,r3:2)
+  p0 = bitsset(r7,r10)
+ }
+
+
+
+
+
+
+ {
+  if (!p0) jumpr r31
+  r10 = USR
+ }
+
+ {
+  r10 = or(r10,#0x30)
+ }
+ {
+  USR = r10
+ }
+ {
+  p0 = dfcmp.eq(r1:0,r1:0)
+  jumpr r31
+ }
+
+.Ldiv_ovf:
+
+
+
+ {
+  r10 = USR
+  r3:2 = combine(##0x7fefffff,#-1)
+  r1 = mux(p3,#0,#-1)
+ }
+ {
+  r7:6 = combine(##0x7ff00000,#0)
+  r5 = extractu(r10,#2,#22)
+  r10 = or(r10,#0x28)
+ }
+ {
+  USR = r10
+  r5 ^= lsr(r1,#31)
+  r4 = r5
+ }
+ {
+  p0 = !cmp.eq(r4,#1)
+  p0 = !cmp.eq(r5,#2)
+  if (p0.new) r3:2 = r7:6
+  p0 = dfcmp.eq(r3:2,r3:2)
+ }
+ {
+  r1:0 = insert(r3:2,#63,#0)
+  jumpr r31
+ }
+
+
+
+
+
+
+
+.Ldiv_abnormal:
+ {
+  p0 = dfclass(r1:0,#0x0F)
+  p0 = dfclass(r3:2,#0x0F)
+  p3 = cmp.gt(r28,#-1)
+ }
+ {
+  p1 = dfclass(r1:0,#0x08)
+  p1 = dfclass(r3:2,#0x08)
+ }
+ {
+  p2 = dfclass(r1:0,#0x01)
+  p2 = dfclass(r3:2,#0x01)
+ }
+ {
+  if (!p0) jump .Ldiv_nan
+  if (p1) jump .Ldiv_invalid
+ }
+ {
+  if (p2) jump .Ldiv_invalid
+ }
+ {
+  p2 = dfclass(r1:0,#(0x0F ^ 0x01))
+  p2 = dfclass(r3:2,#(0x0F ^ 0x08))
+ }
+ {
+  p1 = dfclass(r1:0,#(0x0F ^ 0x08))
+  p1 = dfclass(r3:2,#(0x0F ^ 0x01))
+ }
+ {
+  if (!p2) jump .Ldiv_zero_result
+  if (!p1) jump .Ldiv_inf_result
+ }
+
+
+
+
+
+ {
+  p0 = dfclass(r1:0,#0x02)
+  p1 = dfclass(r3:2,#0x02)
+  r10 = ##0x00100000
+ }
+ {
+  r13:12 = combine(r3,r1)
+  r1 = insert(r10,#11 +1,#52 -32)
+  r3 = insert(r10,#11 +1,#52 -32)
+ }
+ {
+  if (p0) r1 = or(r1,r10)
+  if (p1) r3 = or(r3,r10)
+ }
+ {
+  r5 = add(clb(r1:0),#-11)
+  r4 = add(clb(r3:2),#-11)
+  r10 = #1
+ }
+ {
+  r12 = extractu(r12,#11,#52 -32)
+  r13 = extractu(r13,#11,#52 -32)
+ }
+ {
+  r1:0 = asl(r1:0,r5)
+  r3:2 = asl(r3:2,r4)
+  if (!p0) r12 = sub(r10,r5)
+  if (!p1) r13 = sub(r10,r4)
+ }
+ {
+  r7:6 = extractu(r3:2,#23,#52 -23)
+ }
+ {
+  r9 = or(r8,r6)
+  jump .Ldenorm_continue
+ }
+
+.Ldiv_zero_result:
+ {
+  r1 = xor(r1,r3)
+  r3:2 = #0
+ }
+ {
+  r1:0 = insert(r3:2,#63,#0)
+  jumpr r31
+ }
+.Ldiv_inf_result:
+ {
+  p2 = dfclass(r3:2,#0x01)
+  p2 = dfclass(r1:0,#(0x0F ^ 0x08))
+ }
+ {
+  r10 = USR
+  if (!p2) jump 1f
+  r1 = xor(r1,r3)
+ }
+ {
+  r10 = or(r10,#0x04)
+ }
+ {
+  USR = r10
+ }
+1:
+ {
+  r3:2 = combine(##0x7ff00000,#0)
+  p0 = dfcmp.uo(r3:2,r3:2)
+ }
+ {
+  r1:0 = insert(r3:2,#63,#0)
+  jumpr r31
+ }
+.Ldiv_nan:
+ {
+  p0 = dfclass(r1:0,#0x10)
+  p1 = dfclass(r3:2,#0x10)
+  if (!p0.new) r1:0 = r3:2
+  if (!p1.new) r3:2 = r1:0
+ }
+ {
+  r5 = convert_df2sf(r1:0)
+  r4 = convert_df2sf(r3:2)
+ }
+ {
+  r1:0 = #-1
+  jumpr r31
+ }
+
+.Ldiv_invalid:
+ {
+  r10 = ##0x7f800001
+ }
+ {
+  r1:0 = convert_sf2df(r10)
+  jumpr r31
+ }
+.size __hexagon_divdf3,.-__hexagon_divdf3
diff --git a/library/compiler-builtins/compiler-builtins/src/hexagon/dffma.s b/library/compiler-builtins/compiler-builtins/src/hexagon/dffma.s
new file mode 100644
index 00000000000..97d05eb1839
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/hexagon/dffma.s
@@ -0,0 +1,534 @@
+ .text
+ .global __hexagon_fmadf4
+        .type __hexagon_fmadf4,@function
+ .global __hexagon_fmadf5
+        .type __hexagon_fmadf5,@function
+ .global __qdsp_fmadf5 ; .set __qdsp_fmadf5, __hexagon_fmadf5
+ .p2align 5
+__hexagon_fmadf4:
+__hexagon_fmadf5:
+fma:
+ {
+  p0 = dfclass(r1:0,#2)
+  p0 = dfclass(r3:2,#2)
+  r13:12 = #0
+  r15:14 = #0
+ }
+ {
+  r13:12 = insert(r1:0,#52,#11 -3)
+  r15:14 = insert(r3:2,#52,#11 -3)
+  r7 = ##0x10000000
+  allocframe(#32)
+ }
+ {
+  r9:8 = mpyu(r12,r14)
+  if (!p0) jump .Lfma_abnormal_ab
+  r13 = or(r13,r7)
+  r15 = or(r15,r7)
+ }
+ {
+  p0 = dfclass(r5:4,#2)
+  if (!p0.new) jump:nt .Lfma_abnormal_c
+  r11:10 = combine(r7,#0)
+  r7:6 = combine(#0,r9)
+ }
+.Lfma_abnormal_c_restart:
+ {
+  r7:6 += mpyu(r14,r13)
+  r11:10 = insert(r5:4,#52,#11 -3)
+  memd(r29+#0) = r17:16
+  memd(r29+#8) = r19:18
+ }
+ {
+  r7:6 += mpyu(r12,r15)
+  r19:18 = neg(r11:10)
+  p0 = cmp.gt(r5,#-1)
+  r28 = xor(r1,r3)
+ }
+ {
+  r18 = extractu(r1,#11,#20)
+  r19 = extractu(r3,#11,#20)
+  r17:16 = combine(#0,r7)
+  if (!p0) r11:10 = r19:18
+ }
+ {
+  r17:16 += mpyu(r13,r15)
+  r9:8 = combine(r6,r8)
+  r18 = add(r18,r19)
+
+
+
+
+  r19 = extractu(r5,#11,#20)
+ }
+ {
+  r18 = add(r18,#-1023 +(4))
+  p3 = !cmp.gt(r28,#-1)
+  r7:6 = #0
+  r15:14 = #0
+ }
+ {
+  r7:6 = sub(r7:6,r9:8,p3):carry
+  p0 = !cmp.gt(r28,#-1)
+  p1 = cmp.gt(r19,r18)
+  if (p1.new) r19:18 = combine(r18,r19)
+ }
+ {
+  r15:14 = sub(r15:14,r17:16,p3):carry
+  if (p0) r9:8 = r7:6
+
+
+
+
+  r7:6 = #0
+  r19 = sub(r18,r19)
+ }
+ {
+  if (p0) r17:16 = r15:14
+  p0 = cmp.gt(r19,#63)
+  if (p1) r9:8 = r7:6
+  if (p1) r7:6 = r9:8
+ }
+
+
+
+
+
+
+
+ {
+  if (p1) r17:16 = r11:10
+  if (p1) r11:10 = r17:16
+  if (p0) r19 = add(r19,#-64)
+  r28 = #63
+ }
+ {
+
+  if (p0) r7:6 = r11:10
+  r28 = asr(r11,#31)
+  r13 = min(r19,r28)
+  r12 = #0
+ }
+
+
+
+
+
+
+ {
+  if (p0) r11:10 = combine(r28,r28)
+  r5:4 = extract(r7:6,r13:12)
+  r7:6 = lsr(r7:6,r13)
+  r12 = sub(#64,r13)
+ }
+ {
+  r15:14 = #0
+  r28 = #-2
+  r7:6 |= lsl(r11:10,r12)
+  r11:10 = asr(r11:10,r13)
+ }
+ {
+  p3 = cmp.gtu(r5:4,r15:14)
+  if (p3.new) r6 = and(r6,r28)
+
+
+
+  r15:14 = #1
+  r5:4 = #0
+ }
+ {
+  r9:8 = add(r7:6,r9:8,p3):carry
+ }
+ {
+  r17:16 = add(r11:10,r17:16,p3):carry
+  r28 = #62
+ }
+
+
+
+
+
+
+
+ {
+  r12 = add(clb(r17:16),#-2)
+  if (!cmp.eq(r12.new,r28)) jump:t 1f
+ }
+
+ {
+  r11:10 = extractu(r9:8,#62,#2)
+  r9:8 = asl(r9:8,#62)
+  r18 = add(r18,#-62)
+ }
+ {
+  r17:16 = insert(r11:10,#62,#0)
+ }
+ {
+  r12 = add(clb(r17:16),#-2)
+ }
+ .falign
+1:
+ {
+  r11:10 = asl(r17:16,r12)
+  r5:4 |= asl(r9:8,r12)
+  r13 = sub(#64,r12)
+  r18 = sub(r18,r12)
+ }
+ {
+  r11:10 |= lsr(r9:8,r13)
+  p2 = cmp.gtu(r15:14,r5:4)
+  r28 = #1023 +1023 -2
+ }
+ {
+  if (!p2) r10 = or(r10,r14)
+
+  p0 = !cmp.gt(r18,r28)
+  p0 = cmp.gt(r18,#1)
+  if (!p0.new) jump:nt .Lfma_ovf_unf
+ }
+ {
+
+  p0 = cmp.gtu(r15:14,r11:10)
+  r1:0 = convert_d2df(r11:10)
+  r18 = add(r18,#-1023 -60)
+  r17:16 = memd(r29+#0)
+ }
+ {
+  r1 += asl(r18,#20)
+  r19:18 = memd(r29+#8)
+  if (!p0) dealloc_return
+ }
+.Ladd_yields_zero:
+
+ {
+  r28 = USR
+  r1:0 = #0
+ }
+ {
+  r28 = extractu(r28,#2,#22)
+  r17:16 = memd(r29+#0)
+  r19:18 = memd(r29+#8)
+ }
+ {
+  p0 = cmp.eq(r28,#2)
+  if (p0.new) r1 = ##0x80000000
+  dealloc_return
+ }
+.Lfma_ovf_unf:
+ {
+  p0 = cmp.gtu(r15:14,r11:10)
+  if (p0.new) jump:nt .Ladd_yields_zero
+ }
+ {
+  r1:0 = convert_d2df(r11:10)
+  r18 = add(r18,#-1023 -60)
+  r28 = r18
+ }
+
+
+ {
+  r1 += asl(r18,#20)
+  r7 = extractu(r1,#11,#20)
+ }
+ {
+  r6 = add(r18,r7)
+  r17:16 = memd(r29+#0)
+  r19:18 = memd(r29+#8)
+  r9:8 = abs(r11:10)
+ }
+ {
+  p0 = cmp.gt(r6,##1023 +1023)
+  if (p0.new) jump:nt .Lfma_ovf
+ }
+ {
+  p0 = cmp.gt(r6,#0)
+  if (p0.new) jump:nt .Lpossible_unf0
+ }
+ {
+
+
+
+  r7 = add(clb(r9:8),#-2)
+  r6 = sub(#1+5,r28)
+  p3 = cmp.gt(r11,#-1)
+ }
+
+
+
+ {
+  r6 = add(r6,r7)
+  r9:8 = asl(r9:8,r7)
+  r1 = USR
+  r28 = #63
+ }
+ {
+  r7 = min(r6,r28)
+  r6 = #0
+  r0 = #0x0030
+ }
+ {
+  r3:2 = extractu(r9:8,r7:6)
+  r9:8 = asr(r9:8,r7)
+ }
+ {
+  p0 = cmp.gtu(r15:14,r3:2)
+  if (!p0.new) r8 = or(r8,r14)
+  r9 = setbit(r9,#20 +3)
+ }
+ {
+  r11:10 = neg(r9:8)
+  p1 = bitsclr(r8,#(1<<3)-1)
+  if (!p1.new) r1 = or(r1,r0)
+  r3:2 = #0
+ }
+ {
+  if (p3) r11:10 = r9:8
+  USR = r1
+  r28 = #-1023 -(52 +3)
+ }
+ {
+  r1:0 = convert_d2df(r11:10)
+ }
+ {
+  r1 += asl(r28,#20)
+  dealloc_return
+ }
+.Lpossible_unf0:
+ {
+  r28 = ##0x7fefffff
+  r9:8 = abs(r11:10)
+ }
+ {
+  p0 = cmp.eq(r0,#0)
+  p0 = bitsclr(r1,r28)
+  if (!p0.new) dealloc_return:t
+  r28 = #0x7fff
+ }
+ {
+  p0 = bitsset(r9,r28)
+  r3 = USR
+  r2 = #0x0030
+ }
+ {
+  if (p0) r3 = or(r3,r2)
+ }
+ {
+  USR = r3
+ }
+ {
+  p0 = dfcmp.eq(r1:0,r1:0)
+  dealloc_return
+ }
+.Lfma_ovf:
+ {
+  r28 = USR
+  r11:10 = combine(##0x7fefffff,#-1)
+  r1:0 = r11:10
+ }
+ {
+  r9:8 = combine(##0x7ff00000,#0)
+  r3 = extractu(r28,#2,#22)
+  r28 = or(r28,#0x28)
+ }
+ {
+  USR = r28
+  r3 ^= lsr(r1,#31)
+  r2 = r3
+ }
+ {
+  p0 = !cmp.eq(r2,#1)
+  p0 = !cmp.eq(r3,#2)
+ }
+ {
+  p0 = dfcmp.eq(r9:8,r9:8)
+  if (p0.new) r11:10 = r9:8
+ }
+ {
+  r1:0 = insert(r11:10,#63,#0)
+  dealloc_return
+ }
+.Lfma_abnormal_ab:
+ {
+  r9:8 = extractu(r1:0,#63,#0)
+  r11:10 = extractu(r3:2,#63,#0)
+  deallocframe
+ }
+ {
+  p3 = cmp.gtu(r9:8,r11:10)
+  if (!p3.new) r1:0 = r3:2
+  if (!p3.new) r3:2 = r1:0
+ }
+ {
+  p0 = dfclass(r1:0,#0x0f)
+  if (!p0.new) jump:nt .Lnan
+  if (!p3) r9:8 = r11:10
+  if (!p3) r11:10 = r9:8
+ }
+ {
+  p1 = dfclass(r1:0,#0x08)
+  p1 = dfclass(r3:2,#0x0e)
+ }
+ {
+  p0 = dfclass(r1:0,#0x08)
+  p0 = dfclass(r3:2,#0x01)
+ }
+ {
+  if (p1) jump .Lab_inf
+  p2 = dfclass(r3:2,#0x01)
+ }
+ {
+  if (p0) jump .Linvalid
+  if (p2) jump .Lab_true_zero
+  r28 = ##0x7c000000
+ }
+
+
+
+
+
+ {
+  p0 = bitsclr(r1,r28)
+  if (p0.new) jump:nt .Lfma_ab_tiny
+ }
+ {
+  r28 = add(clb(r11:10),#-11)
+ }
+ {
+  r11:10 = asl(r11:10,r28)
+ }
+ {
+  r3:2 = insert(r11:10,#63,#0)
+  r1 -= asl(r28,#20)
+ }
+ jump fma
+
+.Lfma_ab_tiny:
+ r9:8 = combine(##0x00100000,#0)
+ {
+  r1:0 = insert(r9:8,#63,#0)
+  r3:2 = insert(r9:8,#63,#0)
+ }
+ jump fma
+
+.Lab_inf:
+ {
+  r3:2 = lsr(r3:2,#63)
+  p0 = dfclass(r5:4,#0x10)
+ }
+ {
+  r1:0 ^= asl(r3:2,#63)
+  if (p0) jump .Lnan
+ }
+ {
+  p1 = dfclass(r5:4,#0x08)
+  if (p1.new) jump:nt .Lfma_inf_plus_inf
+ }
+
+ {
+  jumpr r31
+ }
+ .falign
+.Lfma_inf_plus_inf:
+ {
+  p0 = dfcmp.eq(r1:0,r5:4)
+  if (!p0.new) jump:nt .Linvalid
+ }
+ {
+  jumpr r31
+ }
+
+.Lnan:
+ {
+  p0 = dfclass(r3:2,#0x10)
+  p1 = dfclass(r5:4,#0x10)
+  if (!p0.new) r3:2 = r1:0
+  if (!p1.new) r5:4 = r1:0
+ }
+ {
+  r3 = convert_df2sf(r3:2)
+  r2 = convert_df2sf(r5:4)
+ }
+ {
+  r3 = convert_df2sf(r1:0)
+  r1:0 = #-1
+  jumpr r31
+ }
+
+.Linvalid:
+ {
+  r28 = ##0x7f800001
+ }
+ {
+  r1:0 = convert_sf2df(r28)
+  jumpr r31
+ }
+
+.Lab_true_zero:
+
+ {
+  p0 = dfclass(r5:4,#0x10)
+  if (p0.new) jump:nt .Lnan
+  if (p0.new) r1:0 = r5:4
+ }
+ {
+  p0 = dfcmp.eq(r3:2,r5:4)
+  r1 = lsr(r1,#31)
+ }
+ {
+  r3 ^= asl(r1,#31)
+  if (!p0) r1:0 = r5:4
+  if (!p0) jumpr r31
+ }
+
+ {
+  p0 = cmp.eq(r3:2,r5:4)
+  if (p0.new) jumpr:t r31
+  r1:0 = r3:2
+ }
+ {
+  r28 = USR
+ }
+ {
+  r28 = extractu(r28,#2,#22)
+  r1:0 = #0
+ }
+ {
+  p0 = cmp.eq(r28,#2)
+  if (p0.new) r1 = ##0x80000000
+  jumpr r31
+ }
+
+
+
+
+ .falign
+.Lfma_abnormal_c:
+
+
+ {
+  p0 = dfclass(r5:4,#0x10)
+  if (p0.new) jump:nt .Lnan
+  if (p0.new) r1:0 = r5:4
+  deallocframe
+ }
+ {
+  p0 = dfclass(r5:4,#0x08)
+  if (p0.new) r1:0 = r5:4
+  if (p0.new) jumpr:nt r31
+ }
+
+
+ {
+  p0 = dfclass(r5:4,#0x01)
+  if (p0.new) jump:nt __hexagon_muldf3
+  r28 = #1
+ }
+
+
+ {
+  allocframe(#32)
+  r11:10 = #0
+  r5 = insert(r28,#11,#20)
+  jump .Lfma_abnormal_c_restart
+ }
+.size fma,.-fma
diff --git a/library/compiler-builtins/compiler-builtins/src/hexagon/dfminmax.s b/library/compiler-builtins/compiler-builtins/src/hexagon/dfminmax.s
new file mode 100644
index 00000000000..953e773bf19
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/hexagon/dfminmax.s
@@ -0,0 +1,45 @@
+ .text
+ .global __hexagon_mindf3
+ .global __hexagon_maxdf3
+ .type __hexagon_mindf3,@function
+ .type __hexagon_maxdf3,@function
+ .global __qdsp_mindf3 ; .set __qdsp_mindf3, __hexagon_mindf3
+ .global __qdsp_maxdf3 ; .set __qdsp_maxdf3, __hexagon_maxdf3
+ .p2align 5
+__hexagon_mindf3:
+ {
+  p0 = dfclass(r1:0,#0x10)
+  p1 = dfcmp.gt(r1:0,r3:2)
+  r5:4 = r1:0
+ }
+ {
+  if (p0) r1:0 = r3:2
+  if (p1) r1:0 = r3:2
+  p2 = dfcmp.eq(r1:0,r3:2)
+  if (!p2.new) jumpr:t r31
+ }
+
+ {
+  r1:0 = or(r5:4,r3:2)
+  jumpr r31
+ }
+.size __hexagon_mindf3,.-__hexagon_mindf3
+ .falign
+__hexagon_maxdf3:
+ {
+  p0 = dfclass(r1:0,#0x10)
+  p1 = dfcmp.gt(r3:2,r1:0)
+  r5:4 = r1:0
+ }
+ {
+  if (p0) r1:0 = r3:2
+  if (p1) r1:0 = r3:2
+  p2 = dfcmp.eq(r1:0,r3:2)
+  if (!p2.new) jumpr:t r31
+ }
+
+ {
+  r1:0 = and(r5:4,r3:2)
+  jumpr r31
+ }
+.size __hexagon_maxdf3,.-__hexagon_maxdf3
diff --git a/library/compiler-builtins/compiler-builtins/src/hexagon/dfmul.s b/library/compiler-builtins/compiler-builtins/src/hexagon/dfmul.s
new file mode 100644
index 00000000000..32fc674f975
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/hexagon/dfmul.s
@@ -0,0 +1,309 @@
+ .text
+ .global __hexagon_muldf3
+ .type __hexagon_muldf3,@function
+ .global __qdsp_muldf3 ; .set __qdsp_muldf3, __hexagon_muldf3
+  .global __hexagon_fast_muldf3 ; .set __hexagon_fast_muldf3, __hexagon_muldf3
+  .global __hexagon_fast2_muldf3 ; .set __hexagon_fast2_muldf3, __hexagon_muldf3
+ .p2align 5
+__hexagon_muldf3:
+ {
+  p0 = dfclass(r1:0,#2)
+  p0 = dfclass(r3:2,#2)
+  r13:12 = combine(##0x40000000,#0)
+ }
+ {
+  r13:12 = insert(r1:0,#52,#11 -1)
+  r5:4 = asl(r3:2,#11 -1)
+  r28 = #-1024
+  r9:8 = #1
+ }
+ {
+  r7:6 = mpyu(r4,r13)
+  r5:4 = insert(r9:8,#2,#62)
+ }
+
+
+
+
+ {
+  r15:14 = mpyu(r12,r4)
+  r7:6 += mpyu(r12,r5)
+ }
+ {
+  r7:6 += lsr(r15:14,#32)
+  r11:10 = mpyu(r13,r5)
+  r5:4 = combine(##1024 +1024 -4,#0)
+ }
+ {
+  r11:10 += lsr(r7:6,#32)
+  if (!p0) jump .Lmul_abnormal
+  p1 = cmp.eq(r14,#0)
+  p1 = cmp.eq(r6,#0)
+ }
+ {
+  if (!p1) r10 = or(r10,r8)
+  r6 = extractu(r1,#11,#20)
+  r7 = extractu(r3,#11,#20)
+ }
+ {
+  r15:14 = neg(r11:10)
+  r6 += add(r28,r7)
+  r28 = xor(r1,r3)
+ }
+ {
+  if (!p2.new) r11:10 = r15:14
+  p2 = cmp.gt(r28,#-1)
+  p0 = !cmp.gt(r6,r5)
+  p0 = cmp.gt(r6,r4)
+  if (!p0.new) jump:nt .Lmul_ovf_unf
+ }
+ {
+  r1:0 = convert_d2df(r11:10)
+  r6 = add(r6,#-1024 -58)
+ }
+ {
+  r1 += asl(r6,#20)
+  jumpr r31
+ }
+
+ .falign
+.Lpossible_unf1:
+ {
+  p0 = cmp.eq(r0,#0)
+  p0 = bitsclr(r1,r4)
+  if (!p0.new) jumpr:t r31
+  r5 = #0x7fff
+ }
+ {
+  p0 = bitsset(r13,r5)
+  r4 = USR
+  r5 = #0x030
+ }
+ {
+  if (p0) r4 = or(r4,r5)
+ }
+ {
+  USR = r4
+ }
+ {
+  p0 = dfcmp.eq(r1:0,r1:0)
+  jumpr r31
+ }
+ .falign
+.Lmul_ovf_unf:
+ {
+  r1:0 = convert_d2df(r11:10)
+  r13:12 = abs(r11:10)
+  r7 = add(r6,#-1024 -58)
+ }
+ {
+  r1 += asl(r7,#20)
+  r7 = extractu(r1,#11,#20)
+  r4 = ##0x7FEFFFFF
+ }
+ {
+  r7 += add(r6,##-1024 -58)
+
+  r5 = #0
+ }
+ {
+  p0 = cmp.gt(r7,##1024 +1024 -2)
+  if (p0.new) jump:nt .Lmul_ovf
+ }
+ {
+  p0 = cmp.gt(r7,#0)
+  if (p0.new) jump:nt .Lpossible_unf1
+  r5 = sub(r6,r5)
+  r28 = #63
+ }
+ {
+  r4 = #0
+  r5 = sub(#5,r5)
+ }
+ {
+  p3 = cmp.gt(r11,#-1)
+  r5 = min(r5,r28)
+  r11:10 = r13:12
+ }
+ {
+  r28 = USR
+  r15:14 = extractu(r11:10,r5:4)
+ }
+ {
+  r11:10 = asr(r11:10,r5)
+  r4 = #0x0030
+  r1 = insert(r9,#11,#20)
+ }
+ {
+  p0 = cmp.gtu(r9:8,r15:14)
+  if (!p0.new) r10 = or(r10,r8)
+  r11 = setbit(r11,#20 +3)
+ }
+ {
+  r15:14 = neg(r11:10)
+  p1 = bitsclr(r10,#0x7)
+  if (!p1.new) r28 = or(r4,r28)
+ }
+ {
+  if (!p3) r11:10 = r15:14
+  USR = r28
+ }
+ {
+  r1:0 = convert_d2df(r11:10)
+  p0 = dfcmp.eq(r1:0,r1:0)
+ }
+ {
+  r1 = insert(r9,#11 -1,#20 +1)
+  jumpr r31
+ }
+ .falign
+.Lmul_ovf:
+
+ {
+  r28 = USR
+  r13:12 = combine(##0x7fefffff,#-1)
+  r1:0 = r11:10
+ }
+ {
+  r14 = extractu(r28,#2,#22)
+  r28 = or(r28,#0x28)
+  r5:4 = combine(##0x7ff00000,#0)
+ }
+ {
+  USR = r28
+  r14 ^= lsr(r1,#31)
+  r28 = r14
+ }
+ {
+  p0 = !cmp.eq(r28,#1)
+  p0 = !cmp.eq(r14,#2)
+  if (p0.new) r13:12 = r5:4
+  p0 = dfcmp.eq(r1:0,r1:0)
+ }
+ {
+  r1:0 = insert(r13:12,#63,#0)
+  jumpr r31
+ }
+
+.Lmul_abnormal:
+ {
+  r13:12 = extractu(r1:0,#63,#0)
+  r5:4 = extractu(r3:2,#63,#0)
+ }
+ {
+  p3 = cmp.gtu(r13:12,r5:4)
+  if (!p3.new) r1:0 = r3:2
+  if (!p3.new) r3:2 = r1:0
+ }
+ {
+
+  p0 = dfclass(r1:0,#0x0f)
+  if (!p0.new) jump:nt .Linvalid_nan
+  if (!p3) r13:12 = r5:4
+  if (!p3) r5:4 = r13:12
+ }
+ {
+
+  p1 = dfclass(r1:0,#0x08)
+  p1 = dfclass(r3:2,#0x0e)
+ }
+ {
+
+
+  p0 = dfclass(r1:0,#0x08)
+  p0 = dfclass(r3:2,#0x01)
+ }
+ {
+  if (p1) jump .Ltrue_inf
+  p2 = dfclass(r3:2,#0x01)
+ }
+ {
+  if (p0) jump .Linvalid_zeroinf
+  if (p2) jump .Ltrue_zero
+  r28 = ##0x7c000000
+ }
+
+
+
+
+
+ {
+  p0 = bitsclr(r1,r28)
+  if (p0.new) jump:nt .Lmul_tiny
+ }
+ {
+  r28 = cl0(r5:4)
+ }
+ {
+  r28 = add(r28,#-11)
+ }
+ {
+  r5:4 = asl(r5:4,r28)
+ }
+ {
+  r3:2 = insert(r5:4,#63,#0)
+  r1 -= asl(r28,#20)
+ }
+ jump __hexagon_muldf3
+.Lmul_tiny:
+ {
+  r28 = USR
+  r1:0 = xor(r1:0,r3:2)
+ }
+ {
+  r28 = or(r28,#0x30)
+  r1:0 = insert(r9:8,#63,#0)
+  r5 = extractu(r28,#2,#22)
+ }
+ {
+  USR = r28
+  p0 = cmp.gt(r5,#1)
+  if (!p0.new) r0 = #0
+  r5 ^= lsr(r1,#31)
+ }
+ {
+  p0 = cmp.eq(r5,#3)
+  if (!p0.new) r0 = #0
+  jumpr r31
+ }
+.Linvalid_zeroinf:
+ {
+  r28 = USR
+ }
+ {
+  r1:0 = #-1
+  r28 = or(r28,#2)
+ }
+ {
+  USR = r28
+ }
+ {
+  p0 = dfcmp.uo(r1:0,r1:0)
+  jumpr r31
+ }
+.Linvalid_nan:
+ {
+  p0 = dfclass(r3:2,#0x0f)
+  r28 = convert_df2sf(r1:0)
+  if (p0.new) r3:2 = r1:0
+ }
+ {
+  r2 = convert_df2sf(r3:2)
+  r1:0 = #-1
+  jumpr r31
+ }
+ .falign
+.Ltrue_zero:
+ {
+  r1:0 = r3:2
+  r3:2 = r1:0
+ }
+.Ltrue_inf:
+ {
+  r3 = extract(r3,#1,#31)
+ }
+ {
+  r1 ^= asl(r3,#31)
+  jumpr r31
+ }
+.size __hexagon_muldf3,.-__hexagon_muldf3
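The first packets of __hexagon_muldf3 build the 128-bit significand product out of four 32x32 mpyu partial products accumulated with 32-bit shifts. A minimal standalone Rust sketch of that partial-product scheme (names and layout are mine):

/// 64x64 -> 128 bit multiply assembled from four 32x32 -> 64 bit multiplies,
/// the same scheme the mpyu packets above use on the two significands.
fn mul_64x64_to_128(a: u64, b: u64) -> u128 {
    let (a_lo, a_hi) = (a as u32 as u64, a >> 32);
    let (b_lo, b_hi) = (b as u32 as u64, b >> 32);

    let lo_lo = a_lo * b_lo; // bits   0..64
    let lo_hi = a_lo * b_hi; // bits  32..96
    let hi_lo = a_hi * b_lo; // bits  32..96
    let hi_hi = a_hi * b_hi; // bits  64..128

    (lo_lo as u128)
        + ((lo_hi as u128) << 32)
        + ((hi_lo as u128) << 32)
        + ((hi_hi as u128) << 64)
}

fn main() {
    let (a, b) = (0x0123_4567_89ab_cdef_u64, 0xfedc_ba98_7654_3210_u64);
    assert_eq!(mul_64x64_to_128(a, b), a as u128 * b as u128);
}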
diff --git a/library/compiler-builtins/compiler-builtins/src/hexagon/dfsqrt.s b/library/compiler-builtins/compiler-builtins/src/hexagon/dfsqrt.s
new file mode 100644
index 00000000000..14f584a1133
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/hexagon/dfsqrt.s
@@ -0,0 +1,277 @@
+ .text
+ .global __hexagon_sqrtdf2
+ .type __hexagon_sqrtdf2,@function
+ .global __hexagon_sqrt
+ .type __hexagon_sqrt,@function
+ .global __qdsp_sqrtdf2 ; .set __qdsp_sqrtdf2, __hexagon_sqrtdf2; .type __qdsp_sqrtdf2,@function
+ .global __qdsp_sqrt ; .set __qdsp_sqrt, __hexagon_sqrt; .type __qdsp_sqrt,@function
+ .global __hexagon_fast_sqrtdf2 ; .set __hexagon_fast_sqrtdf2, __hexagon_sqrtdf2; .type __hexagon_fast_sqrtdf2,@function
+ .global __hexagon_fast_sqrt ; .set __hexagon_fast_sqrt, __hexagon_sqrt; .type __hexagon_fast_sqrt,@function
+ .global __hexagon_fast2_sqrtdf2 ; .set __hexagon_fast2_sqrtdf2, __hexagon_sqrtdf2; .type __hexagon_fast2_sqrtdf2,@function
+ .global __hexagon_fast2_sqrt ; .set __hexagon_fast2_sqrt, __hexagon_sqrt; .type __hexagon_fast2_sqrt,@function
+ .type sqrt,@function
+ .p2align 5
+__hexagon_sqrtdf2:
+__hexagon_sqrt:
+ {
+  r15:14 = extractu(r1:0,#23 +1,#52 -23)
+  r28 = extractu(r1,#11,#52 -32)
+  r5:4 = combine(##0x3f000004,#1)
+ }
+ {
+  p2 = dfclass(r1:0,#0x02)
+  p2 = cmp.gt(r1,#-1)
+  if (!p2.new) jump:nt .Lsqrt_abnormal
+  r9 = or(r5,r14)
+ }
+
+.Ldenormal_restart:
+ {
+  r11:10 = r1:0
+  r7,p0 = sfinvsqrta(r9)
+  r5 = and(r5,#-16)
+  r3:2 = #0
+ }
+ {
+  r3 += sfmpy(r7,r9):lib
+  r2 += sfmpy(r7,r5):lib
+  r6 = r5
+
+
+  r9 = and(r28,#1)
+ }
+ {
+  r6 -= sfmpy(r3,r2):lib
+  r11 = insert(r4,#11 +1,#52 -32)
+  p1 = cmp.gtu(r9,#0)
+ }
+ {
+  r3 += sfmpy(r3,r6):lib
+  r2 += sfmpy(r2,r6):lib
+  r6 = r5
+  r9 = mux(p1,#8,#9)
+ }
+ {
+  r6 -= sfmpy(r3,r2):lib
+  r11:10 = asl(r11:10,r9)
+  r9 = mux(p1,#3,#2)
+ }
+ {
+  r2 += sfmpy(r2,r6):lib
+
+  r15:14 = asl(r11:10,r9)
+ }
+ {
+  r2 = and(r2,##0x007fffff)
+ }
+ {
+  r2 = add(r2,##0x00800000 - 3)
+  r9 = mux(p1,#7,#8)
+ }
+ {
+  r8 = asl(r2,r9)
+  r9 = mux(p1,#15-(1+1),#15-(1+0))
+ }
+ {
+  r13:12 = mpyu(r8,r15)
+ }
+ {
+  r1:0 = asl(r11:10,#15)
+  r15:14 = mpyu(r13,r13)
+  p1 = cmp.eq(r0,r0)
+ }
+ {
+  r1:0 -= asl(r15:14,#15)
+  r15:14 = mpyu(r13,r12)
+  p2 = cmp.eq(r0,r0)
+ }
+ {
+  r1:0 -= lsr(r15:14,#16)
+  p3 = cmp.eq(r0,r0)
+ }
+ {
+  r1:0 = mpyu(r1,r8)
+ }
+ {
+  r13:12 += lsr(r1:0,r9)
+  r9 = add(r9,#16)
+  r1:0 = asl(r11:10,#31)
+ }
+
+ {
+  r15:14 = mpyu(r13,r13)
+  r1:0 -= mpyu(r13,r12)
+ }
+ {
+  r1:0 -= asl(r15:14,#31)
+  r15:14 = mpyu(r12,r12)
+ }
+ {
+  r1:0 -= lsr(r15:14,#33)
+ }
+ {
+  r1:0 = mpyu(r1,r8)
+ }
+ {
+  r13:12 += lsr(r1:0,r9)
+  r9 = add(r9,#16)
+  r1:0 = asl(r11:10,#47)
+ }
+
+ {
+  r15:14 = mpyu(r13,r13)
+ }
+ {
+  r1:0 -= asl(r15:14,#47)
+  r15:14 = mpyu(r13,r12)
+ }
+ {
+  r1:0 -= asl(r15:14,#16)
+  r15:14 = mpyu(r12,r12)
+ }
+ {
+  r1:0 -= lsr(r15:14,#17)
+ }
+ {
+  r1:0 = mpyu(r1,r8)
+ }
+ {
+  r13:12 += lsr(r1:0,r9)
+ }
+ {
+  r3:2 = mpyu(r13,r12)
+  r5:4 = mpyu(r12,r12)
+  r15:14 = #0
+  r1:0 = #0
+ }
+ {
+  r3:2 += lsr(r5:4,#33)
+  r5:4 += asl(r3:2,#33)
+  p1 = cmp.eq(r0,r0)
+ }
+ {
+  r7:6 = mpyu(r13,r13)
+  r1:0 = sub(r1:0,r5:4,p1):carry
+  r9:8 = #1
+ }
+ {
+  r7:6 += lsr(r3:2,#31)
+  r9:8 += asl(r13:12,#1)
+ }
+
+
+
+
+
+ {
+  r15:14 = sub(r11:10,r7:6,p1):carry
+  r5:4 = sub(r1:0,r9:8,p2):carry
+
+
+
+
+  r7:6 = #1
+  r11:10 = #0
+ }
+ {
+  r3:2 = sub(r15:14,r11:10,p2):carry
+  r7:6 = add(r13:12,r7:6)
+  r28 = add(r28,#-0x3ff)
+ }
+ {
+
+  if (p2) r13:12 = r7:6
+  if (p2) r1:0 = r5:4
+  if (p2) r15:14 = r3:2
+ }
+ {
+  r5:4 = sub(r1:0,r9:8,p3):carry
+  r7:6 = #1
+  r28 = asr(r28,#1)
+ }
+ {
+  r3:2 = sub(r15:14,r11:10,p3):carry
+  r7:6 = add(r13:12,r7:6)
+ }
+ {
+  if (p3) r13:12 = r7:6
+  if (p3) r1:0 = r5:4
+
+
+
+
+
+  r2 = #1
+ }
+ {
+  p0 = cmp.eq(r1:0,r11:10)
+  if (!p0.new) r12 = or(r12,r2)
+  r3 = cl0(r13:12)
+  r28 = add(r28,#-63)
+ }
+
+
+
+ {
+  r1:0 = convert_ud2df(r13:12)
+  r28 = add(r28,r3)
+ }
+ {
+  r1 += asl(r28,#52 -32)
+  jumpr r31
+ }
+.Lsqrt_abnormal:
+ {
+  p0 = dfclass(r1:0,#0x01)
+  if (p0.new) jumpr:t r31
+ }
+ {
+  p0 = dfclass(r1:0,#0x10)
+  if (p0.new) jump:nt .Lsqrt_nan
+ }
+ {
+  p0 = cmp.gt(r1,#-1)
+  if (!p0.new) jump:nt .Lsqrt_invalid_neg
+  if (!p0.new) r28 = ##0x7F800001
+ }
+ {
+  p0 = dfclass(r1:0,#0x08)
+  if (p0.new) jumpr:nt r31
+ }
+
+
+ {
+  r1:0 = extractu(r1:0,#52,#0)
+ }
+ {
+  r28 = add(clb(r1:0),#-11)
+ }
+ {
+  r1:0 = asl(r1:0,r28)
+  r28 = sub(#1,r28)
+ }
+ {
+  r1 = insert(r28,#1,#52 -32)
+ }
+ {
+  r3:2 = extractu(r1:0,#23 +1,#52 -23)
+  r5 = ##0x3f000004
+ }
+ {
+  r9 = or(r5,r2)
+  r5 = and(r5,#-16)
+  jump .Ldenormal_restart
+ }
+.Lsqrt_nan:
+ {
+  r28 = convert_df2sf(r1:0)
+  r1:0 = #-1
+  jumpr r31
+ }
+.Lsqrt_invalid_neg:
+ {
+  r1:0 = convert_sf2df(r28)
+  jumpr r31
+ }
+.size __hexagon_sqrt,.-__hexagon_sqrt
+.size __hexagon_sqrtdf2,.-__hexagon_sqrtdf2
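__hexagon_sqrtdf2 seeds with sfinvsqrta and refines the reciprocal-square-root estimate with a few multiply/subtract steps before the long integer fixup. The sketch below shows the textbook Newton-Raphson iteration this kind of refinement is based on; it is an illustration of the technique, not a line-by-line translation of the code above (function name is mine):

/// Newton-Raphson refinement of y ~ 1/sqrt(x): each step roughly doubles
/// the number of correct bits. The real routine finishes with an integer
/// fixup pass instead of trusting the float iteration alone.
fn refine_inv_sqrt(x: f64, mut y: f64, steps: u32) -> f64 {
    for _ in 0..steps {
        y = y * (1.5 - 0.5 * x * y * y);
    }
    y
}

fn main() {
    let x = 2.0_f64;
    // deliberately poor initial estimate of 1/sqrt(2) ~ 0.7071
    let y = refine_inv_sqrt(x, 0.7, 4);
    assert!((x.sqrt() - x * y).abs() < 1e-12); // sqrt(x) = x * (1/sqrt(x))
}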
diff --git a/library/compiler-builtins/compiler-builtins/src/hexagon/divdi3.s b/library/compiler-builtins/compiler-builtins/src/hexagon/divdi3.s
new file mode 100644
index 00000000000..0fee6e70f06
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/hexagon/divdi3.s
@@ -0,0 +1,64 @@
+
+FUNCTION_BEGIN __hexagon_divdi3
+ {
+  p2 = tstbit(r1,#31)
+  p3 = tstbit(r3,#31)
+ }
+ {
+  r1:0 = abs(r1:0)
+  r3:2 = abs(r3:2)
+ }
+ {
+  r6 = cl0(r1:0)
+  r7 = cl0(r3:2)
+  r5:4 = r3:2
+  r3:2 = r1:0
+ }
+ {
+  p3 = xor(p2,p3)
+  r10 = sub(r7,r6)
+  r1:0 = #0
+  r15:14 = #1
+ }
+ {
+  r11 = add(r10,#1)
+  r13:12 = lsl(r5:4,r10)
+  r15:14 = lsl(r15:14,r10)
+ }
+ {
+  p0 = cmp.gtu(r5:4,r3:2)
+  loop0(1f,r11)
+ }
+ {
+  if (p0) jump .hexagon_divdi3_return
+ }
+ .falign
+1:
+ {
+  p0 = cmp.gtu(r13:12,r3:2)
+ }
+ {
+  r7:6 = sub(r3:2, r13:12)
+  r9:8 = add(r1:0, r15:14)
+ }
+ {
+  r1:0 = vmux(p0, r1:0, r9:8)
+  r3:2 = vmux(p0, r3:2, r7:6)
+ }
+ {
+  r15:14 = lsr(r15:14, #1)
+  r13:12 = lsr(r13:12, #1)
+ }:endloop0
+
+.hexagon_divdi3_return:
+ {
+  r3:2 = neg(r1:0)
+ }
+ {
+  r1:0 = vmux(p3,r3:2,r1:0)
+  jumpr r31
+ }
+FUNCTION_END __hexagon_divdi3
+
+  .globl __qdsp_divdi3
+  .set __qdsp_divdi3, __hexagon_divdi3
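__hexagon_divdi3 is a restoring shift-and-subtract divider: the divisor is aligned with the dividend using the cl0 difference, each loop iteration subtracts it when it fits and records a quotient bit, and the sign is fixed up at the end from the XOR of the operand signs. A standalone Rust sketch of the same loop (unsigned case only, names are mine):

/// Restoring division: align the divisor with the dividend's MSB, then walk
/// the bit positions, subtracting where possible and setting quotient bits.
fn udiv_rem(mut num: u64, den: u64) -> (u64, u64) {
    assert!(den != 0);
    if den > num {
        return (0, num);
    }
    let shift = den.leading_zeros() - num.leading_zeros();
    let mut d = den << shift;    // aligned divisor
    let mut bit = 1u64 << shift; // quotient bit for this position
    let mut quo = 0u64;
    for _ in 0..=shift {
        if d <= num {
            num -= d;
            quo |= bit;
        }
        d >>= 1;
        bit >>= 1;
    }
    (quo, num)
}

fn main() {
    assert_eq!(udiv_rem(1_000_003, 97), (1_000_003 / 97, 1_000_003 % 97));
}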
diff --git a/library/compiler-builtins/compiler-builtins/src/hexagon/divsi3.s b/library/compiler-builtins/compiler-builtins/src/hexagon/divsi3.s
new file mode 100644
index 00000000000..fc957a43146
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/hexagon/divsi3.s
@@ -0,0 +1,53 @@
+
+FUNCTION_BEGIN __hexagon_divsi3
+ {
+  p0 = cmp.ge(r0,#0)
+  p1 = cmp.ge(r1,#0)
+  r1 = abs(r0)
+  r2 = abs(r1)
+ }
+ {
+  r3 = cl0(r1)
+  r4 = cl0(r2)
+  r5 = sub(r1,r2)
+  p2 = cmp.gtu(r2,r1)
+ }
+ {
+  r0 = #0
+  p1 = xor(p0,p1)
+  p0 = cmp.gtu(r2,r5)
+  if (p2) jumpr r31
+ }
+
+ {
+  r0 = mux(p1,#-1,#1)
+  if (p0) jumpr r31
+  r4 = sub(r4,r3)
+  r3 = #1
+ }
+ {
+  r0 = #0
+  r3:2 = vlslw(r3:2,r4)
+  loop0(1f,r4)
+ }
+ .falign
+1:
+ {
+  p0 = cmp.gtu(r2,r1)
+  if (!p0.new) r1 = sub(r1,r2)
+  if (!p0.new) r0 = add(r0,r3)
+  r3:2 = vlsrw(r3:2,#1)
+ }:endloop0
+ {
+  p0 = cmp.gtu(r2,r1)
+  if (!p0.new) r0 = add(r0,r3)
+  if (!p1) jumpr r31
+ }
+ {
+  r0 = neg(r0)
+  jumpr r31
+ }
+FUNCTION_END __hexagon_divsi3
+
+  .globl __qdsp_divsi3
+  .set __qdsp_divsi3, __hexagon_divsi3
diff --git a/library/compiler-builtins/compiler-builtins/src/hexagon/fastmath2_dlib_asm.s b/library/compiler-builtins/compiler-builtins/src/hexagon/fastmath2_dlib_asm.s
new file mode 100644
index 00000000000..e77b7db0332
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/hexagon/fastmath2_dlib_asm.s
@@ -0,0 +1,266 @@
+        .text
+        .global __hexagon_fast2_dadd_asm
+        .type __hexagon_fast2_dadd_asm, @function
+__hexagon_fast2_dadd_asm:
+        .falign
+      {
+        R7:6 = VABSDIFFH(R1:0, R3:2)
+        R9 = #62
+        R4 = SXTH(R0)
+        R5 = SXTH(R2)
+      } {
+        R6 = SXTH(R6)
+        P0 = CMP.GT(R4, R5);
+        if ( P0.new) R8 = add(R4, #1)
+        if (!P0.new) R8 = add(R5, #1)
+      } {
+        if ( P0) R4 = #1
+        if (!P0) R5 = #1
+        R0.L = #0
+        R6 = MIN(R6, R9)
+      } {
+        if (!P0) R4 = add(R6, #1)
+        if ( P0) R5 = add(R6, #1)
+        R2.L = #0
+        R11:10 = #0
+      } {
+        R1:0 = ASR(R1:0, R4)
+        R3:2 = ASR(R3:2, R5)
+      } {
+        R1:0 = add(R1:0, R3:2)
+        R10.L = #0x8001
+      } {
+        R4 = clb(R1:0)
+        R9 = #58
+      } {
+        R4 = add(R4, #-1)
+        p0 = cmp.gt(R4, R9)
+      } {
+        R1:0 = ASL(R1:0, R4)
+        R8 = SUB(R8, R4)
+        if(p0) jump .Ldenorma
+      } {
+        R0 = insert(R8, #16, #0)
+        jumpr r31
+      }
+.Ldenorma:
+      {
+        R1:0 = R11:10
+        jumpr r31
+      }
+        .text
+        .global __hexagon_fast2_dsub_asm
+        .type __hexagon_fast2_dsub_asm, @function
+__hexagon_fast2_dsub_asm:
+        .falign
+      {
+        R7:6 = VABSDIFFH(R1:0, R3:2)
+        R9 = #62
+        R4 = SXTH(R0)
+        R5 = SXTH(R2)
+      } {
+        R6 = SXTH(R6)
+        P0 = CMP.GT(R4, R5);
+        if ( P0.new) R8 = add(R4, #1)
+        if (!P0.new) R8 = add(R5, #1)
+      } {
+        if ( P0) R4 = #1
+        if (!P0) R5 = #1
+        R0.L = #0
+        R6 = MIN(R6, R9)
+      } {
+        if (!P0) R4 = add(R6, #1)
+        if ( P0) R5 = add(R6, #1)
+        R2.L = #0
+        R11:10 = #0
+      } {
+        R1:0 = ASR(R1:0, R4)
+        R3:2 = ASR(R3:2, R5)
+      } {
+        R1:0 = sub(R1:0, R3:2)
+        R10.L = #0x8001
+      } {
+        R4 = clb(R1:0)
+        R9 = #58
+      } {
+        R4 = add(R4, #-1)
+        p0 = cmp.gt(R4, R9)
+      } {
+        R1:0 = ASL(R1:0, R4)
+        R8 = SUB(R8, R4)
+        if(p0) jump .Ldenorm
+      } {
+        R0 = insert(R8, #16, #0)
+        jumpr r31
+      }
+.Ldenorm:
+      {
+        R1:0 = R11:10
+        jumpr r31
+      }
+        .text
+        .global __hexagon_fast2_dmpy_asm
+        .type __hexagon_fast2_dmpy_asm, @function
+__hexagon_fast2_dmpy_asm:
+        .falign
+      {
+        R13= lsr(R2, #16)
+        R5 = sxth(R2)
+        R4 = sxth(R0)
+        R12= lsr(R0, #16)
+      }
+      {
+        R11:10 = mpy(R1, R3)
+        R7:6 = mpy(R1, R13)
+        R0.L = #0x0
+        R15:14 = #0
+      }
+      {
+        R11:10 = add(R11:10, R11:10)
+        R7:6 += mpy(R3, R12)
+        R2.L = #0x0
+        R15.H = #0x8000
+      }
+      {
+        R7:6 = asr(R7:6, #15)
+        R12.L = #0x8001
+        p1 = cmp.eq(R1:0, R3:2)
+      }
+      {
+        R7:6 = add(R7:6, R11:10)
+        R8 = add(R4, R5)
+        p2 = cmp.eq(R1:0, R15:14)
+      }
+      {
+        R9 = clb(R7:6)
+        R3:2 = abs(R7:6)
+        R11 = #58
+      }
+      {
+        p1 = and(p1, p2)
+        R8 = sub(R8, R9)
+        R9 = add(R9, #-1)
+ p0 = cmp.gt(R9, R11)
+      }
+      {
+        R8 = add(R8, #1)
+        R1:0 = asl(R7:6, R9)
+        if(p1) jump .Lsat
+      }
+      {
+        R0 = insert(R8,#16, #0)
+        if(!p0) jumpr r31
+      }
+      {
+        R0 = insert(R12,#16, #0)
+        jumpr r31
+      }
+.Lsat:
+      {
+        R1:0 = #-1
+      }
+      {
+        R1:0 = lsr(R1:0, #1)
+      }
+      {
+        R0 = insert(R8,#16, #0)
+        jumpr r31
+      }
+        .text
+        .global __hexagon_fast2_qd2f_asm
+        .type __hexagon_fast2_qd2f_asm, @function
+__hexagon_fast2_qd2f_asm:
+      .falign
+     {
+       R3 = abs(R1):sat
+       R4 = sxth(R0)
+       R5 = #0x40
+       R6.L = #0xffc0
+     }
+     {
+       R0 = extractu(R3, #8, #0)
+       p2 = cmp.gt(R4, #126)
+       p3 = cmp.ge(R4, #-126)
+       R6.H = #0x7fff
+     }
+     {
+       p1 = cmp.eq(R0,#0x40)
+       if(p1.new) R5 = #0
+       R4 = add(R4, #126)
+       if(!p3) jump .Lmin
+     }
+     {
+       p0 = bitsset(R3, R6)
+       R0.L = #0x0000
+       R2 = add(R3, R5)
+       R7 = lsr(R6, #8)
+     }
+     {
+       if(p0) R4 = add(R4, #1)
+       if(p0) R3 = #0
+       R2 = lsr(R2, #7)
+       R0.H = #0x8000
+     }
+     {
+       R0 = and(R0, R1)
+       R6 &= asl(R4, #23)
+       if(!p0) R3 = and(R2, R7)
+       if(p2) jump .Lmax
+     }
+     {
+       R0 += add(R6, R3)
+       jumpr r31
+     }
+.Lmax:
+     {
+       R0.L = #0xffff;
+     }
+     {
+       R0.H = #0x7f7f;
+       jumpr r31
+     }
+.Lmin:
+     {
+       R0 = #0x0
+       jumpr r31
+     }
+        .text
+        .global __hexagon_fast2_f2qd_asm
+        .type __hexagon_fast2_f2qd_asm, @function
+__hexagon_fast2_f2qd_asm:
+
+
+
+
+
+
+
+        .falign
+  {
+       R1 = asl(R0, #7)
+       p0 = tstbit(R0, #31)
+       R5:4 = #0
+       R3 = add(R0,R0)
+  }
+  {
+       R1 = setbit(R1, #30)
+       R0= extractu(R0,#8,#23)
+       R4.L = #0x8001
+       p1 = cmp.eq(R3, #0)
+  }
+  {
+       R1= extractu(R1, #31, #0)
+       R0= add(R0, #-126)
+       R2 = #0
+       if(p1) jump .Lminqd
+  }
+  {
+       R0 = zxth(R0)
+       if(p0) R1= sub(R2, R1)
+       jumpr r31
+  }
+.Lminqd:
+  {
+       R1:0 = R5:4
+       jumpr r31
+  }
diff --git a/library/compiler-builtins/compiler-builtins/src/hexagon/fastmath2_ldlib_asm.s b/library/compiler-builtins/compiler-builtins/src/hexagon/fastmath2_ldlib_asm.s
new file mode 100644
index 00000000000..3251057d78c
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/hexagon/fastmath2_ldlib_asm.s
@@ -0,0 +1,187 @@
+        .text
+        .global __hexagon_fast2ldadd_asm
+        .type __hexagon_fast2ldadd_asm, @function
+__hexagon_fast2ldadd_asm:
+        .falign
+      {
+        R4 = memw(r29+#8)
+        R5 = memw(r29+#24)
+        r7 = r0
+      }
+      {
+        R6 = sub(R4, R5):sat
+        P0 = CMP.GT(R4, R5);
+        if ( P0.new) R8 = add(R4, #1)
+        if (!P0.new) R8 = add(R5, #1)
+      } {
+        R6 = abs(R6):sat
+        if ( P0) R4 = #1
+        if (!P0) R5 = #1
+        R9 = #62
+      } {
+        R6 = MIN(R6, R9)
+        R1:0 = memd(r29+#0)
+        R3:2 = memd(r29+#16)
+      } {
+        if (!P0) R4 = add(R6, #1)
+        if ( P0) R5 = add(R6, #1)
+      } {
+        R1:0 = ASR(R1:0, R4)
+        R3:2 = ASR(R3:2, R5)
+      } {
+        R1:0 = add(R1:0, R3:2)
+        R3:2 = #0
+      } {
+        R4 = clb(R1:0)
+        R9.L =#0x0001
+      } {
+        R8 -= add(R4, #-1)
+        R4 = add(R4, #-1)
+        p0 = cmp.gt(R4, #58)
+        R9.H =#0x8000
+      } {
+        if(!p0)memw(r7+#8) = R8
+        R1:0 = ASL(R1:0, R4)
+        if(p0) jump .Ldenorma1
+      } {
+        memd(r7+#0) = R1:0
+        jumpr r31
+      }
+.Ldenorma1:
+        memd(r7+#0) = R3:2
+      {
+        memw(r7+#8) = R9
+        jumpr r31
+      }
+        .text
+        .global __hexagon_fast2ldsub_asm
+        .type __hexagon_fast2ldsub_asm, @function
+__hexagon_fast2ldsub_asm:
+        .falign
+      {
+        R4 = memw(r29+#8)
+        R5 = memw(r29+#24)
+        r7 = r0
+      }
+      {
+        R6 = sub(R4, R5):sat
+        P0 = CMP.GT(R4, R5);
+        if ( P0.new) R8 = add(R4, #1)
+        if (!P0.new) R8 = add(R5, #1)
+      } {
+        R6 = abs(R6):sat
+        if ( P0) R4 = #1
+        if (!P0) R5 = #1
+        R9 = #62
+      } {
+        R6 = min(R6, R9)
+        R1:0 = memd(r29+#0)
+        R3:2 = memd(r29+#16)
+      } {
+        if (!P0) R4 = add(R6, #1)
+        if ( P0) R5 = add(R6, #1)
+      } {
+        R1:0 = ASR(R1:0, R4)
+        R3:2 = ASR(R3:2, R5)
+      } {
+        R1:0 = sub(R1:0, R3:2)
+        R3:2 = #0
+      } {
+        R4 = clb(R1:0)
+        R9.L =#0x0001
+      } {
+        R8 -= add(R4, #-1)
+        R4 = add(R4, #-1)
+        p0 = cmp.gt(R4, #58)
+        R9.H =#0x8000
+      } {
+        if(!p0)memw(r7+#8) = R8
+        R1:0 = asl(R1:0, R4)
+        if(p0) jump .Ldenorma_s
+      } {
+        memd(r7+#0) = R1:0
+        jumpr r31
+      }
+.Ldenorma_s:
+        memd(r7+#0) = R3:2
+      {
+        memw(r7+#8) = R9
+        jumpr r31
+      }
+        .text
+        .global __hexagon_fast2ldmpy_asm
+        .type __hexagon_fast2ldmpy_asm, @function
+__hexagon_fast2ldmpy_asm:
+        .falign
+      {
+        R15:14 = memd(r29+#0)
+        R3:2 = memd(r29+#16)
+        R13:12 = #0
+      }
+      {
+        R8= extractu(R2, #31, #1)
+        R9= extractu(R14, #31, #1)
+        R13.H = #0x8000
+      }
+      {
+        R11:10 = mpy(R15, R3)
+        R7:6 = mpy(R15, R8)
+        R4 = memw(r29+#8)
+        R5 = memw(r29+#24)
+      }
+      {
+        R11:10 = add(R11:10, R11:10)
+        R7:6 += mpy(R3, R9)
+      }
+      {
+        R7:6 = asr(R7:6, #30)
+        R8.L = #0x0001
+        p1 = cmp.eq(R15:14, R3:2)
+      }
+      {
+        R7:6 = add(R7:6, R11:10)
+        R4= add(R4, R5)
+        p2 = cmp.eq(R3:2, R13:12)
+      }
+      {
+        R9 = clb(R7:6)
+        R8.H = #0x8000
+        p1 = and(p1, p2)
+      }
+      {
+        R4-= add(R9, #-1)
+        R9 = add(R9, #-1)
+        if(p1) jump .Lsat1
+      }
+      {
+        R7:6 = asl(R7:6, R9)
+        memw(R0+#8) = R4
+ p0 = cmp.gt(R9, #58)
+        if(p0.new) jump:NT .Ldenorm1
+      }
+      {
+        memd(R0+#0) = R7:6
+        jumpr r31
+      }
+.Lsat1:
+      {
+        R13:12 = #0
+        R4+= add(R9, #1)
+      }
+      {
+        R13.H = #0x4000
+        memw(R0+#8) = R4
+      }
+      {
+        memd(R0+#0) = R13:12
+        jumpr r31
+      }
+.Ldenorm1:
+      {
+        memw(R0+#8) = R8
+        R15:14 = #0
+      }
+      {
+        memd(R0+#0) = R15:14
+        jumpr r31
+      }
diff --git a/library/compiler-builtins/compiler-builtins/src/hexagon/func_macro.s b/library/compiler-builtins/compiler-builtins/src/hexagon/func_macro.s
new file mode 100644
index 00000000000..9a1e11aebcb
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/hexagon/func_macro.s
@@ -0,0 +1,12 @@
+ .macro FUNCTION_BEGIN name
+ .text
+ .p2align 5
+ .globl \name
+ .type \name, @function
+\name:
+ .endm
+
+ .macro FUNCTION_END name
+ .size \name, . - \name
+ .endm
+
diff --git a/library/compiler-builtins/compiler-builtins/src/hexagon/memcpy_forward_vp4cp4n2.s b/library/compiler-builtins/compiler-builtins/src/hexagon/memcpy_forward_vp4cp4n2.s
new file mode 100644
index 00000000000..89f69010aa4
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/hexagon/memcpy_forward_vp4cp4n2.s
@@ -0,0 +1,91 @@
+  .text
+
+
+
+
+
+
+  .globl hexagon_memcpy_forward_vp4cp4n2
+  .balign 32
+  .type hexagon_memcpy_forward_vp4cp4n2,@function
+hexagon_memcpy_forward_vp4cp4n2:
+
+
+
+
+  {
+    r3 = sub(##4096, r1)
+    r5 = lsr(r2, #3)
+  }
+  {
+
+
+    r3 = extractu(r3, #10, #2)
+    r4 = extractu(r3, #7, #5)
+  }
+  {
+    r3 = minu(r2, r3)
+    r4 = minu(r5, r4)
+  }
+  {
+    r4 = or(r4, ##2105344)
+    p0 = cmp.eq(r3, #0)
+    if (p0.new) jump:nt .Lskipprolog
+  }
+    l2fetch(r1, r4)
+  {
+    loop0(.Lprolog, r3)
+    r2 = sub(r2, r3)
+  }
+  .falign
+.Lprolog:
+  {
+    r4 = memw(r1++#4)
+    memw(r0++#4) = r4.new
+  } :endloop0
+.Lskipprolog:
+  {
+
+    r3 = lsr(r2, #10)
+    if (cmp.eq(r3.new, #0)) jump:nt .Lskipmain
+  }
+  {
+    loop1(.Lout, r3)
+    r2 = extractu(r2, #10, #0)
+    r3 = ##2105472
+  }
+
+  .falign
+.Lout:
+
+    l2fetch(r1, r3)
+    loop0(.Lpage, #512)
+  .falign
+.Lpage:
+    r5:4 = memd(r1++#8)
+  {
+    memw(r0++#8) = r4
+    memw(r0+#4) = r5
+  } :endloop0:endloop1
+.Lskipmain:
+  {
+    r3 = ##2105344
+    r4 = lsr(r2, #3)
+    p0 = cmp.eq(r2, #0)
+    if (p0.new) jumpr:nt r31
+  }
+  {
+    r3 = or(r3, r4)
+    loop0(.Lepilog, r2)
+  }
+    l2fetch(r1, r3)
+  .falign
+.Lepilog:
+  {
+    r4 = memw(r1++#4)
+    memw(r0++#4) = r4.new
+  } :endloop0
+
+    jumpr r31
+
+.size hexagon_memcpy_forward_vp4cp4n2, . - hexagon_memcpy_forward_vp4cp4n2
diff --git a/library/compiler-builtins/compiler-builtins/src/hexagon/memcpy_likely_aligned.s b/library/compiler-builtins/compiler-builtins/src/hexagon/memcpy_likely_aligned.s
new file mode 100644
index 00000000000..7e9b62f6a79
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/hexagon/memcpy_likely_aligned.s
@@ -0,0 +1,42 @@
+
+FUNCTION_BEGIN __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes
+ {
+  p0 = bitsclr(r1,#7)
+  p0 = bitsclr(r0,#7)
+  if (p0.new) r5:4 = memd(r1)
+  r3 = #-3
+ }
+ {
+  if (!p0) jump .Lmemcpy_call
+  if (p0) memd(r0++#8) = r5:4
+  if (p0) r5:4 = memd(r1+#8)
+  r3 += lsr(r2,#3)
+ }
+ {
+  memd(r0++#8) = r5:4
+  r5:4 = memd(r1+#16)
+  r1 = add(r1,#24)
+  loop0(1f,r3)
+ }
+ .falign
+1:
+ {
+  memd(r0++#8) = r5:4
+  r5:4 = memd(r1++#8)
+ }:endloop0
+ {
+  memd(r0) = r5:4
+  r0 -= add(r2,#-8)
+  jumpr r31
+ }
+FUNCTION_END __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes
+
+.Lmemcpy_call:
+
+ jump memcpy@PLT
+
+
+
+
+  .globl __qdsp_memcpy_likely_aligned_min32bytes_mult8bytes
+  .set __qdsp_memcpy_likely_aligned_min32bytes_mult8bytes, __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes
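The likely-aligned memcpy checks that both pointers are 8-byte multiples, copies in 8-byte units, and otherwise tail-calls plain memcpy. A safe-Rust sketch of that fast-path/fallback split over slices (not the real pointer-based signature; names are mine):

/// Fast path when both buffers are 8-byte aligned and the length is a
/// multiple of 8; otherwise fall back to a plain copy, like the tail call
/// to memcpy in the assembly above.
fn copy_likely_aligned(dst: &mut [u8], src: &[u8]) {
    assert_eq!(dst.len(), src.len());
    let aligned = dst.as_ptr() as usize % 8 == 0
        && src.as_ptr() as usize % 8 == 0
        && dst.len() % 8 == 0;
    if aligned {
        for (d, s) in dst.chunks_exact_mut(8).zip(src.chunks_exact(8)) {
            d.copy_from_slice(s); // one 8-byte move per iteration
        }
    } else {
        dst.copy_from_slice(src); // generic fallback
    }
}

fn main() {
    let src: Vec<u8> = (0u8..64).collect();
    let mut dst = vec![0u8; 64];
    copy_likely_aligned(&mut dst, &src);
    assert_eq!(dst, src);
}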
diff --git a/library/compiler-builtins/compiler-builtins/src/hexagon/moddi3.s b/library/compiler-builtins/compiler-builtins/src/hexagon/moddi3.s
new file mode 100644
index 00000000000..53ea6d52a58
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/hexagon/moddi3.s
@@ -0,0 +1,63 @@
+
+
+FUNCTION_BEGIN __hexagon_moddi3
+ {
+  p3 = tstbit(r1,#31)
+ }
+ {
+  r1:0 = abs(r1:0)
+  r3:2 = abs(r3:2)
+ }
+ {
+  r6 = cl0(r1:0)
+  r7 = cl0(r3:2)
+  r5:4 = r3:2
+  r3:2 = r1:0
+ }
+ {
+  r10 = sub(r7,r6)
+  r1:0 = #0
+  r15:14 = #1
+ }
+ {
+  r11 = add(r10,#1)
+  r13:12 = lsl(r5:4,r10)
+  r15:14 = lsl(r15:14,r10)
+ }
+ {
+  p0 = cmp.gtu(r5:4,r3:2)
+  loop0(1f,r11)
+ }
+ {
+  if (p0) jump .hexagon_moddi3_return
+ }
+ .falign
+1:
+ {
+  p0 = cmp.gtu(r13:12,r3:2)
+ }
+ {
+  r7:6 = sub(r3:2, r13:12)
+  r9:8 = add(r1:0, r15:14)
+ }
+ {
+  r1:0 = vmux(p0, r1:0, r9:8)
+  r3:2 = vmux(p0, r3:2, r7:6)
+ }
+ {
+  r15:14 = lsr(r15:14, #1)
+  r13:12 = lsr(r13:12, #1)
+ }:endloop0
+
+.hexagon_moddi3_return:
+ {
+  r1:0 = neg(r3:2)
+ }
+ {
+  r1:0 = vmux(p3,r1:0,r3:2)
+  jumpr r31
+ }
+FUNCTION_END __hexagon_moddi3
+
+  .globl __qdsp_moddi3
+  .set __qdsp_moddi3, __hexagon_moddi3
diff --git a/library/compiler-builtins/compiler-builtins/src/hexagon/modsi3.s b/library/compiler-builtins/compiler-builtins/src/hexagon/modsi3.s
new file mode 100644
index 00000000000..c4ae7e59edc
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/hexagon/modsi3.s
@@ -0,0 +1,44 @@
+
+
+FUNCTION_BEGIN __hexagon_modsi3
+ {
+  p2 = cmp.ge(r0,#0)
+  r2 = abs(r0)
+  r1 = abs(r1)
+ }
+ {
+  r3 = cl0(r2)
+  r4 = cl0(r1)
+  p0 = cmp.gtu(r1,r2)
+ }
+ {
+  r3 = sub(r4,r3)
+  if (p0) jumpr r31
+ }
+ {
+  p1 = cmp.eq(r3,#0)
+  loop0(1f,r3)
+  r0 = r2
+  r2 = lsl(r1,r3)
+ }
+ .falign
+1:
+ {
+  p0 = cmp.gtu(r2,r0)
+  if (!p0.new) r0 = sub(r0,r2)
+  r2 = lsr(r2,#1)
+  if (p1) r1 = #0
+ }:endloop0
+ {
+  p0 = cmp.gtu(r2,r0)
+  if (!p0.new) r0 = sub(r0,r1)
+  if (p2) jumpr r31
+ }
+ {
+  r0 = neg(r0)
+  jumpr r31
+ }
+FUNCTION_END __hexagon_modsi3
+
+  .globl __qdsp_modsi3
+  .set __qdsp_modsi3, __hexagon_modsi3
diff --git a/library/compiler-builtins/compiler-builtins/src/hexagon/sfdiv_opt.s b/library/compiler-builtins/compiler-builtins/src/hexagon/sfdiv_opt.s
new file mode 100644
index 00000000000..26c91f15cbb
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/hexagon/sfdiv_opt.s
@@ -0,0 +1,42 @@
+
+FUNCTION_BEGIN __hexagon_divsf3
+  {
+    r2,p0 = sfrecipa(r0,r1)
+    r4 = sffixupd(r0,r1)
+    r3 = ##0x3f800000
+  }
+  {
+    r5 = sffixupn(r0,r1)
+    r3 -= sfmpy(r4,r2):lib
+    r6 = ##0x80000000
+    r7 = r3
+  }
+  {
+    r2 += sfmpy(r3,r2):lib
+    r3 = r7
+    r6 = r5
+    r0 = and(r6,r5)
+  }
+  {
+    r3 -= sfmpy(r4,r2):lib
+    r0 += sfmpy(r5,r2):lib
+  }
+  {
+    r2 += sfmpy(r3,r2):lib
+    r6 -= sfmpy(r0,r4):lib
+  }
+  {
+    r0 += sfmpy(r6,r2):lib
+  }
+  {
+    r5 -= sfmpy(r0,r4):lib
+  }
+  {
+    r0 += sfmpy(r5,r2,p0):scale
+    jumpr r31
+  }
+FUNCTION_END __hexagon_divsf3
+
+.global __qdsp_divsf3 ; .set __qdsp_divsf3, __hexagon_divsf3
+.global __hexagon_fast_divsf3 ; .set __hexagon_fast_divsf3, __hexagon_divsf3
+.global __hexagon_fast2_divsf3 ; .set __hexagon_fast2_divsf3, __hexagon_divsf3
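__hexagon_divsf3 refines the sfrecipa reciprocal estimate with Newton-Raphson steps of the form r <- r + r*(1 - d*r) and finally multiplies by the numerator with a scaling fixup. A rough sketch of that refinement, ignoring the fixup and scale step (names are mine):

/// Newton-Raphson reciprocal refinement: each step roughly doubles the
/// accuracy of r ~ 1/d; the quotient is then n * r.
fn approx_div(n: f32, d: f32, mut r: f32, steps: u32) -> f32 {
    for _ in 0..steps {
        r += r * (1.0 - d * r); // one Newton step for 1/d
    }
    n * r
}

fn main() {
    // deliberately rough initial reciprocal estimate for d = 3.0
    let q = approx_div(10.0, 3.0, 0.3, 3);
    assert!((q - 10.0 / 3.0).abs() < 1e-5);
}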
diff --git a/library/compiler-builtins/compiler-builtins/src/hexagon/sfsqrt_opt.s b/library/compiler-builtins/compiler-builtins/src/hexagon/sfsqrt_opt.s
new file mode 100644
index 00000000000..c90af179754
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/hexagon/sfsqrt_opt.s
@@ -0,0 +1,49 @@
+FUNCTION_BEGIN __hexagon_sqrtf
+  {
+    r3,p0 = sfinvsqrta(r0)
+    r5 = sffixupr(r0)
+    r4 = ##0x3f000000
+    r1:0 = combine(#0,#0)
+  }
+  {
+    r0 += sfmpy(r3,r5):lib
+    r1 += sfmpy(r3,r4):lib
+    r2 = r4
+    r3 = r5
+  }
+  {
+    r2 -= sfmpy(r0,r1):lib
+    p1 = sfclass(r5,#1)
+
+  }
+  {
+    r0 += sfmpy(r0,r2):lib
+    r1 += sfmpy(r1,r2):lib
+    r2 = r4
+    r3 = r5
+  }
+  {
+    r2 -= sfmpy(r0,r1):lib
+    r3 -= sfmpy(r0,r0):lib
+  }
+  {
+    r0 += sfmpy(r1,r3):lib
+    r1 += sfmpy(r1,r2):lib
+    r2 = r4
+    r3 = r5
+  }
+  {
+
+    r3 -= sfmpy(r0,r0):lib
+    if (p1) r0 = or(r0,r5)
+  }
+  {
+    r0 += sfmpy(r1,r3,p0):scale
+    jumpr r31
+  }
+
+FUNCTION_END __hexagon_sqrtf
+
+.global __qdsp_sqrtf ; .set __qdsp_sqrtf, __hexagon_sqrtf
+.global __hexagon_fast_sqrtf ; .set __hexagon_fast_sqrtf, __hexagon_sqrtf
+.global __hexagon_fast2_sqrtf ; .set __hexagon_fast2_sqrtf, __hexagon_sqrtf
diff --git a/library/compiler-builtins/compiler-builtins/src/hexagon/udivdi3.s b/library/compiler-builtins/compiler-builtins/src/hexagon/udivdi3.s
new file mode 100644
index 00000000000..f0fffc23df0
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/hexagon/udivdi3.s
@@ -0,0 +1,50 @@
+
+
+FUNCTION_BEGIN __hexagon_udivdi3
+ {
+  r6 = cl0(r1:0)
+  r7 = cl0(r3:2)
+  r5:4 = r3:2
+  r3:2 = r1:0
+ }
+ {
+  r10 = sub(r7,r6)
+  r1:0 = #0
+  r15:14 = #1
+ }
+ {
+  r11 = add(r10,#1)
+  r13:12 = lsl(r5:4,r10)
+  r15:14 = lsl(r15:14,r10)
+ }
+ {
+  p0 = cmp.gtu(r5:4,r3:2)
+  loop0(1f,r11)
+ }
+ {
+  if (p0) jumpr r31
+ }
+ .falign
+1:
+ {
+  p0 = cmp.gtu(r13:12,r3:2)
+ }
+ {
+  r7:6 = sub(r3:2, r13:12)
+  r9:8 = add(r1:0, r15:14)
+ }
+ {
+  r1:0 = vmux(p0, r1:0, r9:8)
+  r3:2 = vmux(p0, r3:2, r7:6)
+ }
+ {
+  r15:14 = lsr(r15:14, #1)
+  r13:12 = lsr(r13:12, #1)
+ }:endloop0
+ {
+  jumpr r31
+ }
+FUNCTION_END __hexagon_udivdi3
+
+  .globl __qdsp_udivdi3
+  .set __qdsp_udivdi3, __hexagon_udivdi3
diff --git a/library/compiler-builtins/compiler-builtins/src/hexagon/udivmoddi4.s b/library/compiler-builtins/compiler-builtins/src/hexagon/udivmoddi4.s
new file mode 100644
index 00000000000..cbfb3987dd2
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/hexagon/udivmoddi4.s
@@ -0,0 +1,50 @@
+
+
+FUNCTION_BEGIN __hexagon_udivmoddi4
+ {
+  r6 = cl0(r1:0)
+  r7 = cl0(r3:2)
+  r5:4 = r3:2
+  r3:2 = r1:0
+ }
+ {
+  r10 = sub(r7,r6)
+  r1:0 = #0
+  r15:14 = #1
+ }
+ {
+  r11 = add(r10,#1)
+  r13:12 = lsl(r5:4,r10)
+  r15:14 = lsl(r15:14,r10)
+ }
+ {
+  p0 = cmp.gtu(r5:4,r3:2)
+  loop0(1f,r11)
+ }
+ {
+  if (p0) jumpr r31
+ }
+ .falign
+1:
+ {
+  p0 = cmp.gtu(r13:12,r3:2)
+ }
+ {
+  r7:6 = sub(r3:2, r13:12)
+  r9:8 = add(r1:0, r15:14)
+ }
+ {
+  r1:0 = vmux(p0, r1:0, r9:8)
+  r3:2 = vmux(p0, r3:2, r7:6)
+ }
+ {
+  r15:14 = lsr(r15:14, #1)
+  r13:12 = lsr(r13:12, #1)
+ }:endloop0
+ {
+  jumpr r31
+ }
+FUNCTION_END __hexagon_udivmoddi4
+
+  .globl __qdsp_udivmoddi4
+  .set __qdsp_udivmoddi4, __hexagon_udivmoddi4
diff --git a/library/compiler-builtins/compiler-builtins/src/hexagon/udivmodsi4.s b/library/compiler-builtins/compiler-builtins/src/hexagon/udivmodsi4.s
new file mode 100644
index 00000000000..83489c51431
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/hexagon/udivmodsi4.s
@@ -0,0 +1,39 @@
+
+
+FUNCTION_BEGIN __hexagon_udivmodsi4
+ {
+  r2 = cl0(r0)
+  r3 = cl0(r1)
+  r5:4 = combine(#1,#0)
+  p0 = cmp.gtu(r1,r0)
+ }
+ {
+  r6 = sub(r3,r2)
+  r4 = r1
+  r1:0 = combine(r0,r4)
+  if (p0) jumpr r31
+ }
+ {
+  r3:2 = vlslw(r5:4,r6)
+  loop0(1f,r6)
+  p0 = cmp.eq(r6,#0)
+  if (p0.new) r4 = #0
+ }
+ .falign
+1:
+ {
+  p0 = cmp.gtu(r2,r1)
+  if (!p0.new) r1 = sub(r1,r2)
+  if (!p0.new) r0 = add(r0,r3)
+  r3:2 = vlsrw(r3:2,#1)
+ }:endloop0
+ {
+  p0 = cmp.gtu(r2,r1)
+  if (!p0.new) r1 = sub(r1,r4)
+  if (!p0.new) r0 = add(r0,r3)
+  jumpr r31
+ }
+FUNCTION_END __hexagon_udivmodsi4
+
+  .globl __qdsp_udivmodsi4
+  .set __qdsp_udivmodsi4, __hexagon_udivmodsi4
diff --git a/library/compiler-builtins/compiler-builtins/src/hexagon/udivsi3.s b/library/compiler-builtins/compiler-builtins/src/hexagon/udivsi3.s
new file mode 100644
index 00000000000..e0b94aa9982
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/hexagon/udivsi3.s
@@ -0,0 +1,36 @@
+
+
+FUNCTION_BEGIN __hexagon_udivsi3
+ {
+  r2 = cl0(r0)
+  r3 = cl0(r1)
+  r5:4 = combine(#1,#0)
+  p0 = cmp.gtu(r1,r0)
+ }
+ {
+  r6 = sub(r3,r2)
+  r4 = r1
+  r1:0 = combine(r0,r4)
+  if (p0) jumpr r31
+ }
+ {
+  r3:2 = vlslw(r5:4,r6)
+  loop0(1f,r6)
+ }
+ .falign
+1:
+ {
+  p0 = cmp.gtu(r2,r1)
+  if (!p0.new) r1 = sub(r1,r2)
+  if (!p0.new) r0 = add(r0,r3)
+  r3:2 = vlsrw(r3:2,#1)
+ }:endloop0
+ {
+  p0 = cmp.gtu(r2,r1)
+  if (!p0.new) r0 = add(r0,r3)
+  jumpr r31
+ }
+FUNCTION_END __hexagon_udivsi3
+
+  .globl __qdsp_udivsi3
+  .set __qdsp_udivsi3, __hexagon_udivsi3
diff --git a/library/compiler-builtins/compiler-builtins/src/hexagon/umoddi3.s b/library/compiler-builtins/compiler-builtins/src/hexagon/umoddi3.s
new file mode 100644
index 00000000000..c76011c3e7a
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/hexagon/umoddi3.s
@@ -0,0 +1,53 @@
+
+
+FUNCTION_BEGIN __hexagon_umoddi3
+ {
+  r6 = cl0(r1:0)
+  r7 = cl0(r3:2)
+  r5:4 = r3:2
+  r3:2 = r1:0
+ }
+ {
+  r10 = sub(r7,r6)
+  r1:0 = #0
+  r15:14 = #1
+ }
+ {
+  r11 = add(r10,#1)
+  r13:12 = lsl(r5:4,r10)
+  r15:14 = lsl(r15:14,r10)
+ }
+ {
+  p0 = cmp.gtu(r5:4,r3:2)
+  loop0(1f,r11)
+ }
+ {
+  if (p0) jump .hexagon_umoddi3_return
+ }
+ .falign
+1:
+ {
+  p0 = cmp.gtu(r13:12,r3:2)
+ }
+ {
+  r7:6 = sub(r3:2, r13:12)
+  r9:8 = add(r1:0, r15:14)
+ }
+ {
+  r1:0 = vmux(p0, r1:0, r9:8)
+  r3:2 = vmux(p0, r3:2, r7:6)
+ }
+ {
+  r15:14 = lsr(r15:14, #1)
+  r13:12 = lsr(r13:12, #1)
+ }:endloop0
+
+.hexagon_umoddi3_return:
+ {
+  r1:0 = r3:2
+  jumpr r31
+ }
+FUNCTION_END __hexagon_umoddi3
+
+  .globl __qdsp_umoddi3
+  .set __qdsp_umoddi3, __hexagon_umoddi3
diff --git a/library/compiler-builtins/compiler-builtins/src/hexagon/umodsi3.s b/library/compiler-builtins/compiler-builtins/src/hexagon/umodsi3.s
new file mode 100644
index 00000000000..1b592a7c561
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/hexagon/umodsi3.s
@@ -0,0 +1,34 @@
+
+
+FUNCTION_BEGIN __hexagon_umodsi3
+ {
+  r2 = cl0(r0)
+  r3 = cl0(r1)
+  p0 = cmp.gtu(r1,r0)
+ }
+ {
+  r2 = sub(r3,r2)
+  if (p0) jumpr r31
+ }
+ {
+  loop0(1f,r2)
+  p1 = cmp.eq(r2,#0)
+  r2 = lsl(r1,r2)
+ }
+ .falign
+1:
+ {
+  p0 = cmp.gtu(r2,r0)
+  if (!p0.new) r0 = sub(r0,r2)
+  r2 = lsr(r2,#1)
+  if (p1) r1 = #0
+ }:endloop0
+ {
+  p0 = cmp.gtu(r2,r0)
+  if (!p0.new) r0 = sub(r0,r1)
+  jumpr r31
+ }
+FUNCTION_END __hexagon_umodsi3
+
+  .globl __qdsp_umodsi3
+  .set __qdsp_umodsi3, __hexagon_umodsi3
diff --git a/library/compiler-builtins/compiler-builtins/src/int/addsub.rs b/library/compiler-builtins/compiler-builtins/src/int/addsub.rs
new file mode 100644
index 00000000000..1f84e8eb1e1
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/int/addsub.rs
@@ -0,0 +1,104 @@
+use crate::int::{DInt, Int, MinInt};
+
+trait UAddSub: DInt + Int {
+    fn uadd(self, other: Self) -> Self {
+        let (lo, carry) = self.lo().overflowing_add(other.lo());
+        let hi = self.hi().wrapping_add(other.hi());
+        let carry = if carry { Self::H::ONE } else { Self::H::ZERO };
+        Self::from_lo_hi(lo, hi.wrapping_add(carry))
+    }
+    fn uadd_one(self) -> Self {
+        let (lo, carry) = self.lo().overflowing_add(Self::H::ONE);
+        let carry = if carry { Self::H::ONE } else { Self::H::ZERO };
+        Self::from_lo_hi(lo, self.hi().wrapping_add(carry))
+    }
+    fn usub(self, other: Self) -> Self {
+        let uneg = (!other).uadd_one();
+        self.uadd(uneg)
+    }
+}
+
+impl UAddSub for u128 {}
+
+trait AddSub: Int
+where
+    <Self as MinInt>::UnsignedInt: UAddSub,
+{
+    fn add(self, other: Self) -> Self {
+        Self::from_unsigned(self.unsigned().uadd(other.unsigned()))
+    }
+    fn sub(self, other: Self) -> Self {
+        Self::from_unsigned(self.unsigned().usub(other.unsigned()))
+    }
+}
+
+impl AddSub for u128 {}
+impl AddSub for i128 {}
+
+trait Addo: AddSub
+where
+    <Self as MinInt>::UnsignedInt: UAddSub,
+{
+    fn addo(self, other: Self) -> (Self, bool) {
+        let sum = AddSub::add(self, other);
+        (sum, (other < Self::ZERO) != (sum < self))
+    }
+}
+
+impl Addo for i128 {}
+impl Addo for u128 {}
+
+trait Subo: AddSub
+where
+    <Self as MinInt>::UnsignedInt: UAddSub,
+{
+    fn subo(self, other: Self) -> (Self, bool) {
+        let sum = AddSub::sub(self, other);
+        (sum, (other < Self::ZERO) != (self < sum))
+    }
+}
+
+impl Subo for i128 {}
+impl Subo for u128 {}
+
+intrinsics! {
+    pub extern "C" fn __rust_i128_add(a: i128, b: i128) -> i128 {
+        AddSub::add(a,b)
+    }
+
+    pub extern "C" fn __rust_i128_addo(a: i128, b: i128, oflow: &mut i32) -> i128 {
+        let (add, o) = a.addo(b);
+        *oflow = o.into();
+        add
+    }
+
+    pub extern "C" fn __rust_u128_add(a: u128, b: u128) -> u128 {
+        AddSub::add(a,b)
+    }
+
+    pub extern "C" fn __rust_u128_addo(a: u128, b: u128, oflow: &mut i32) -> u128 {
+        let (add, o) = a.addo(b);
+        *oflow = o.into();
+        add
+    }
+
+    pub extern "C" fn __rust_i128_sub(a: i128, b: i128) -> i128 {
+        AddSub::sub(a,b)
+    }
+
+    pub extern "C" fn __rust_i128_subo(a: i128, b: i128, oflow: &mut i32) -> i128 {
+        let (sub, o) = a.subo(b);
+        *oflow = o.into();
+        sub
+    }
+
+    pub extern "C" fn __rust_u128_sub(a: u128, b: u128) -> u128 {
+        AddSub::sub(a,b)
+    }
+
+    pub extern "C" fn __rust_u128_subo(a: u128, b: u128, oflow: &mut i32) -> u128 {
+        let (sub, o) = a.subo(b);
+        *oflow = o.into();
+        sub
+    }
+}
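`UAddSub::uadd` splits the operands into half-width words, adds the low halves, and propagates the carry into the high halves; the overflowing variants then derive the flag from a sign comparison. A standalone sketch of the carry propagation for a u128 built from two u64 halves (the real trait does this generically through `DInt`; function name is mine):

/// Add two u128 values via their 64-bit halves with an explicit carry,
/// the same shape as `uadd` above.
fn add_via_halves(a: u128, b: u128) -> u128 {
    let (a_lo, a_hi) = (a as u64, (a >> 64) as u64);
    let (b_lo, b_hi) = (b as u64, (b >> 64) as u64);

    let (lo, carry) = a_lo.overflowing_add(b_lo);
    let hi = a_hi.wrapping_add(b_hi).wrapping_add(carry as u64);

    ((hi as u128) << 64) | lo as u128
}

fn main() {
    let a = u128::MAX - 5;
    let b = 123;
    assert_eq!(add_via_halves(a, b), a.wrapping_add(b));
}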
diff --git a/library/compiler-builtins/compiler-builtins/src/int/big.rs b/library/compiler-builtins/compiler-builtins/src/int/big.rs
new file mode 100644
index 00000000000..1402efb8ed4
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/int/big.rs
@@ -0,0 +1,295 @@
+//! Integers used for wide operations, larger than `u128`.
+
+#![allow(unused)]
+
+use core::{fmt, ops};
+
+use crate::int::{DInt, HInt, Int, MinInt};
+
+const WORD_LO_MASK: u64 = 0x00000000ffffffff;
+const WORD_HI_MASK: u64 = 0xffffffff00000000;
+const WORD_FULL_MASK: u64 = 0xffffffffffffffff;
+const U128_LO_MASK: u128 = u64::MAX as u128;
+const U128_HI_MASK: u128 = (u64::MAX as u128) << 64;
+
+/// A 256-bit unsigned integer represented as 4 64-bit limbs.
+///
+/// Each limb is a native-endian number, but the array is little-limb-endian.
+#[allow(non_camel_case_types)]
+#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
+pub struct u256(pub [u64; 4]);
+
+impl u256 {
+    pub const MAX: Self = Self([u64::MAX, u64::MAX, u64::MAX, u64::MAX]);
+
+    /// Reinterpret as a signed integer
+    pub fn signed(self) -> i256 {
+        i256(self.0)
+    }
+}
+
+/// A 256-bit signed integer represented as 4 64-bit limbs.
+///
+/// Each limb is a native-endian number, but the array is little-limb-endian.
+#[allow(non_camel_case_types)]
+#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
+pub struct i256(pub [u64; 4]);
+
+impl i256 {
+    /// Reinterpret as an unsigned integer
+    pub fn unsigned(self) -> u256 {
+        u256(self.0)
+    }
+}
+
+impl MinInt for u256 {
+    type OtherSign = i256;
+
+    type UnsignedInt = u256;
+
+    const SIGNED: bool = false;
+    const BITS: u32 = 256;
+    const ZERO: Self = Self([0u64; 4]);
+    const ONE: Self = Self([1, 0, 0, 0]);
+    const MIN: Self = Self([0u64; 4]);
+    const MAX: Self = Self([u64::MAX; 4]);
+}
+
+impl MinInt for i256 {
+    type OtherSign = u256;
+
+    type UnsignedInt = u256;
+
+    const SIGNED: bool = false;
+    const BITS: u32 = 256;
+    const ZERO: Self = Self([0u64; 4]);
+    const ONE: Self = Self([1, 0, 0, 0]);
+    const MIN: Self = Self([0, 0, 0, 1 << 63]);
+    const MAX: Self = Self([u64::MAX, u64::MAX, u64::MAX, u64::MAX >> 1]);
+}
+
+macro_rules! impl_common {
+    ($ty:ty) => {
+        impl ops::BitOr for $ty {
+            type Output = Self;
+
+            fn bitor(mut self, rhs: Self) -> Self::Output {
+                self.0[0] |= rhs.0[0];
+                self.0[1] |= rhs.0[1];
+                self.0[2] |= rhs.0[2];
+                self.0[3] |= rhs.0[3];
+                self
+            }
+        }
+
+        impl ops::Not for $ty {
+            type Output = Self;
+
+            fn not(self) -> Self::Output {
+                Self([!self.0[0], !self.0[1], !self.0[2], !self.0[3]])
+            }
+        }
+
+        impl ops::Shl<u32> for $ty {
+            type Output = Self;
+
+            fn shl(self, rhs: u32) -> Self::Output {
+                unimplemented!("only used to meet trait bounds")
+            }
+        }
+    };
+}
+
+impl_common!(i256);
+impl_common!(u256);
+
+impl ops::Shr<u32> for u256 {
+    type Output = Self;
+
+    fn shr(self, rhs: u32) -> Self::Output {
+        assert!(rhs < Self::BITS, "attempted to shift right with overflow");
+
+        if rhs == 0 {
+            return self;
+        }
+
+        let mut ret = self;
+        let byte_shift = rhs / 64;
+        let bit_shift = rhs % 64;
+
+        for idx in 0..4 {
+            let base_idx = idx + byte_shift as usize;
+
+            let Some(base) = ret.0.get(base_idx) else {
+                ret.0[idx] = 0;
+                continue;
+            };
+
+            let mut new_val = base >> bit_shift;
+
+            if let Some(new) = ret.0.get(base_idx + 1) {
+                new_val |= new.overflowing_shl(64 - bit_shift).0;
+            }
+
+            ret.0[idx] = new_val;
+        }
+
+        ret
+    }
+}
+
+macro_rules! word {
+    (1, $val:expr) => {
+        (($val >> (32 * 3)) & Self::from(WORD_LO_MASK)) as u64
+    };
+    (2, $val:expr) => {
+        (($val >> (32 * 2)) & Self::from(WORD_LO_MASK)) as u64
+    };
+    (3, $val:expr) => {
+        (($val >> (32 * 1)) & Self::from(WORD_LO_MASK)) as u64
+    };
+    (4, $val:expr) => {
+        (($val >> (32 * 0)) & Self::from(WORD_LO_MASK)) as u64
+    };
+}
+
+impl HInt for u128 {
+    type D = u256;
+
+    fn widen(self) -> Self::D {
+        let w0 = self & u128::from(u64::MAX);
+        let w1 = (self >> u64::BITS) & u128::from(u64::MAX);
+        u256([w0 as u64, w1 as u64, 0, 0])
+    }
+
+    fn zero_widen(self) -> Self::D {
+        self.widen()
+    }
+
+    fn zero_widen_mul(self, rhs: Self) -> Self::D {
+        let product11: u64 = word!(1, self) * word!(1, rhs);
+        let product12: u64 = word!(1, self) * word!(2, rhs);
+        let product13: u64 = word!(1, self) * word!(3, rhs);
+        let product14: u64 = word!(1, self) * word!(4, rhs);
+        let product21: u64 = word!(2, self) * word!(1, rhs);
+        let product22: u64 = word!(2, self) * word!(2, rhs);
+        let product23: u64 = word!(2, self) * word!(3, rhs);
+        let product24: u64 = word!(2, self) * word!(4, rhs);
+        let product31: u64 = word!(3, self) * word!(1, rhs);
+        let product32: u64 = word!(3, self) * word!(2, rhs);
+        let product33: u64 = word!(3, self) * word!(3, rhs);
+        let product34: u64 = word!(3, self) * word!(4, rhs);
+        let product41: u64 = word!(4, self) * word!(1, rhs);
+        let product42: u64 = word!(4, self) * word!(2, rhs);
+        let product43: u64 = word!(4, self) * word!(3, rhs);
+        let product44: u64 = word!(4, self) * word!(4, rhs);
+
+        let sum0: u128 = u128::from(product44);
+        let sum1: u128 = u128::from(product34) + u128::from(product43);
+        let sum2: u128 = u128::from(product24) + u128::from(product33) + u128::from(product42);
+        let sum3: u128 = u128::from(product14)
+            + u128::from(product23)
+            + u128::from(product32)
+            + u128::from(product41);
+        let sum4: u128 = u128::from(product13) + u128::from(product22) + u128::from(product31);
+        let sum5: u128 = u128::from(product12) + u128::from(product21);
+        let sum6: u128 = u128::from(product11);
+
+        let r0: u128 =
+            (sum0 & u128::from(WORD_FULL_MASK)) + ((sum1 & u128::from(WORD_LO_MASK)) << 32);
+        let r1: u128 = (sum0 >> 64)
+            + ((sum1 >> 32) & u128::from(WORD_FULL_MASK))
+            + (sum2 & u128::from(WORD_FULL_MASK))
+            + ((sum3 << 32) & u128::from(WORD_HI_MASK));
+
+        let (lo, carry) = r0.overflowing_add(r1 << 64);
+        let hi = (r1 >> 64)
+            + (sum1 >> 96)
+            + (sum2 >> 64)
+            + (sum3 >> 32)
+            + sum4
+            + (sum5 << 32)
+            + (sum6 << 64)
+            + u128::from(carry);
+
+        u256([
+            (lo & U128_LO_MASK) as u64,
+            ((lo >> 64) & U128_LO_MASK) as u64,
+            (hi & U128_LO_MASK) as u64,
+            ((hi >> 64) & U128_LO_MASK) as u64,
+        ])
+    }
+
+    fn widen_mul(self, rhs: Self) -> Self::D {
+        self.zero_widen_mul(rhs)
+    }
+
+    fn widen_hi(self) -> Self::D {
+        self.widen() << <Self as MinInt>::BITS
+    }
+}
+
+impl HInt for i128 {
+    type D = i256;
+
+    fn widen(self) -> Self::D {
+        let mut ret = self.unsigned().zero_widen().signed();
+        if self.is_negative() {
+            ret.0[2] = u64::MAX;
+            ret.0[3] = u64::MAX;
+        }
+        ret
+    }
+
+    fn zero_widen(self) -> Self::D {
+        self.unsigned().zero_widen().signed()
+    }
+
+    fn zero_widen_mul(self, rhs: Self) -> Self::D {
+        self.unsigned().zero_widen_mul(rhs.unsigned()).signed()
+    }
+
+    fn widen_mul(self, rhs: Self) -> Self::D {
+        unimplemented!("signed i128 widening multiply is not used")
+    }
+
+    fn widen_hi(self) -> Self::D {
+        self.widen() << <Self as MinInt>::BITS
+    }
+}
+
+impl DInt for u256 {
+    type H = u128;
+
+    fn lo(self) -> Self::H {
+        let mut tmp = [0u8; 16];
+        tmp[..8].copy_from_slice(&self.0[0].to_le_bytes());
+        tmp[8..].copy_from_slice(&self.0[1].to_le_bytes());
+        u128::from_le_bytes(tmp)
+    }
+
+    fn hi(self) -> Self::H {
+        let mut tmp = [0u8; 16];
+        tmp[..8].copy_from_slice(&self.0[2].to_le_bytes());
+        tmp[8..].copy_from_slice(&self.0[3].to_le_bytes());
+        u128::from_le_bytes(tmp)
+    }
+}
+
+impl DInt for i256 {
+    type H = i128;
+
+    fn lo(self) -> Self::H {
+        let mut tmp = [0u8; 16];
+        tmp[..8].copy_from_slice(&self.0[0].to_le_bytes());
+        tmp[8..].copy_from_slice(&self.0[1].to_le_bytes());
+        i128::from_le_bytes(tmp)
+    }
+
+    fn hi(self) -> Self::H {
+        let mut tmp = [0u8; 16];
+        tmp[..8].copy_from_slice(&self.0[2].to_le_bytes());
+        tmp[8..].copy_from_slice(&self.0[3].to_le_bytes());
+        i128::from_le_bytes(tmp)
+    }
+}
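`u256` stores four u64 limbs, least-significant first, so `widen` drops a u128 into the two low limbs and `lo`/`hi` stitch them back together. A standalone illustration of that limb layout with a plain array (not the crate's `u256` type; names are mine):

/// Widen a u128 into four little-endian u64 limbs, as `HInt::widen` does.
fn widen_u128(x: u128) -> [u64; 4] {
    [x as u64, (x >> 64) as u64, 0, 0]
}

/// Reassemble the low u128 from the two low limbs, as `DInt::lo` does.
fn lo_u128(limbs: [u64; 4]) -> u128 {
    (limbs[0] as u128) | ((limbs[1] as u128) << 64)
}

fn main() {
    let x = 0x1122_3344_5566_7788_99aa_bbcc_ddee_ff00_u128;
    let limbs = widen_u128(x);
    assert_eq!(limbs[0], 0x99aa_bbcc_ddee_ff00); // low limb holds the low 64 bits
    assert_eq!(limbs[1], 0x1122_3344_5566_7788);
    assert_eq!(lo_u128(limbs), x);               // round trip through the limbs
}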
diff --git a/library/compiler-builtins/compiler-builtins/src/int/bswap.rs b/library/compiler-builtins/compiler-builtins/src/int/bswap.rs
new file mode 100644
index 00000000000..3ede08882dc
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/int/bswap.rs
@@ -0,0 +1,19 @@
+intrinsics! {
+    #[maybe_use_optimized_c_shim]
+    /// Swaps bytes in 32-bit number
+    pub extern "C" fn __bswapsi2(x: u32) -> u32 {
+        x.swap_bytes()
+    }
+
+    #[maybe_use_optimized_c_shim]
+    /// Swaps bytes in 64-bit number
+    pub extern "C" fn __bswapdi2(x: u64) -> u64 {
+        x.swap_bytes()
+    }
+
+    #[maybe_use_optimized_c_shim]
+    /// Swaps bytes in 128-bit number
+    pub extern "C" fn __bswapti2(x: u128) -> u128 {
+        x.swap_bytes()
+    }
+}
diff --git a/library/compiler-builtins/compiler-builtins/src/int/leading_zeros.rs b/library/compiler-builtins/compiler-builtins/src/int/leading_zeros.rs
new file mode 100644
index 00000000000..112f4d03613
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/int/leading_zeros.rs
@@ -0,0 +1,164 @@
+// Note: these functions happen to produce the correct `usize::leading_zeros(0)` value
+// without an explicit zero check. Zero is probably common enough that it could warrant
+// adding a zero check at the beginning, but `__clzsi2` has a precondition that `x != 0`.
+// Compilers will insert the check for zero in cases where it is needed.
+
+#[cfg(feature = "unstable-public-internals")]
+pub use implementation::{leading_zeros_default, leading_zeros_riscv};
+#[cfg(not(feature = "unstable-public-internals"))]
+pub(crate) use implementation::{leading_zeros_default, leading_zeros_riscv};
+
+mod implementation {
+    use crate::int::{CastInto, Int};
+
+    /// Returns the number of leading binary zeros in `x`.
+    #[allow(dead_code)]
+    pub fn leading_zeros_default<T: Int + CastInto<usize>>(x: T) -> usize {
+        // The basic idea is to test if the higher bits of `x` are zero and bisect the number
+        // of leading zeros. It is possible for all branches of the bisection to use the same
+        // code path by conditionally shifting the higher parts down to let the next bisection
+        // step work on the higher or lower parts of `x`. Instead of starting with `z == 0`
+        // and adding to the number of zeros, it is slightly faster to start with
+        // `z == usize::MAX.count_ones()` and subtract from the potential number of zeros,
+        // because it simplifies the final bisection step.
+        let mut x = x;
+        // the number of potential leading zeros
+        let mut z = T::BITS as usize;
+        // a temporary
+        let mut t: T;
+
+        const { assert!(T::BITS <= 64) };
+        if T::BITS >= 64 {
+            t = x >> 32;
+            if t != T::ZERO {
+                z -= 32;
+                x = t;
+            }
+        }
+        if T::BITS >= 32 {
+            t = x >> 16;
+            if t != T::ZERO {
+                z -= 16;
+                x = t;
+            }
+        }
+        const { assert!(T::BITS >= 16) };
+        t = x >> 8;
+        if t != T::ZERO {
+            z -= 8;
+            x = t;
+        }
+        t = x >> 4;
+        if t != T::ZERO {
+            z -= 4;
+            x = t;
+        }
+        t = x >> 2;
+        if t != T::ZERO {
+            z -= 2;
+            x = t;
+        }
+        // the last two bisections are combined into one conditional
+        t = x >> 1;
+        if t != T::ZERO { z - 2 } else { z - x.cast() }
+
+        // We could potentially save a few cycles by using the LUT trick from
+        // "https://embeddedgurus.com/state-space/2014/09/
+        // fast-deterministic-and-portable-counting-leading-zeros/".
+        // However, 256 bytes for a LUT is too large for embedded use cases. We could remove
+        // the last 3 bisections and use this 16-byte LUT for the rest of the work:
+        //const LUT: [u8; 16] = [0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4];
+        //z -= LUT[x] as usize;
+        //z
+        // However, it ends up generating about the same number of instructions. When benchmarked
+        // on x86_64, it is slightly faster to use the LUT, but this is probably because of OOO
+        // execution effects. Changing to using a LUT and branching is risky for smaller cores.
+    }
+
+    // The above method does not compile well on RISC-V (because of the lack of predicated
+    // instructions), producing code with many branches or using an excessively long
+    // branchless solution. This method takes advantage of the set-if-less-than instruction on
+    // RISC-V that allows `(x >= power-of-two) as usize` to be branchless.
+
+    /// Returns the number of leading binary zeros in `x`.
+    #[allow(dead_code)]
+    pub fn leading_zeros_riscv<T: Int + CastInto<usize>>(x: T) -> usize {
+        let mut x = x;
+        // the number of potential leading zeros
+        let mut z = T::BITS;
+        // a temporary
+        let mut t: u32;
+
+        // RISC-V does not have a set-if-greater-than-or-equal instruction and
+        // `(x >= power-of-two) as usize` will get compiled into two instructions, but this is
+        // still the most optimal method. A conditional set can only be turned into a single
+        // immediate instruction if `x` is compared with an immediate `imm` (that can fit into
+        // 12 bits) like `x < imm` but not `imm < x` (because the immediate is always on the
+        // right). If we try to save an instruction by using `x < imm` for each bisection, we
+        // have to shift `x` left and compare with powers of two approaching `usize::MAX + 1`,
+        // but the immediate will never fit into 12 bits and never save an instruction.
+        const { assert!(T::BITS <= 64) };
+        if T::BITS >= 64 {
+            // If the upper 32 bits of `x` are not all 0, `t` is set to `1 << 5`, otherwise
+            // `t` is set to 0.
+            t = ((x >= (T::ONE << 32)) as u32) << 5;
+            // If `t` was set to `1 << 5`, then the upper 32 bits are shifted down for the
+            // next step to process.
+            x >>= t;
+            // If `t` was set to `1 << 5`, then we subtract 32 from the number of potential
+            // leading zeros
+            z -= t;
+        }
+        if T::BITS >= 32 {
+            t = ((x >= (T::ONE << 16)) as u32) << 4;
+            x >>= t;
+            z -= t;
+        }
+        const { assert!(T::BITS >= 16) };
+        t = ((x >= (T::ONE << 8)) as u32) << 3;
+        x >>= t;
+        z -= t;
+        t = ((x >= (T::ONE << 4)) as u32) << 2;
+        x >>= t;
+        z -= t;
+        t = ((x >= (T::ONE << 2)) as u32) << 1;
+        x >>= t;
+        z -= t;
+        t = (x >= (T::ONE << 1)) as u32;
+        x >>= t;
+        z -= t;
+        // All bits except the LSB are guaranteed to be zero for this final bisection step.
+        // If `x != 0` then `x == 1` and subtracts one potential zero from `z`.
+        z as usize - x.cast()
+    }
+}
+
+intrinsics! {
+    /// Returns the number of leading binary zeros in `x`
+    pub extern "C" fn __clzsi2(x: u32) -> usize {
+        if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) {
+            leading_zeros_riscv(x)
+        } else {
+            leading_zeros_default(x)
+        }
+    }
+
+    /// Returns the number of leading binary zeros in `x`
+    pub extern "C" fn __clzdi2(x: u64) -> usize {
+        if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) {
+            leading_zeros_riscv(x)
+        } else {
+            leading_zeros_default(x)
+        }
+    }
+
+    /// Returns the number of leading binary zeros in `x`
+    pub extern "C" fn __clzti2(x: u128) -> usize {
+        let hi = (x >> 64) as u64;
+        if hi == 0 {
+            64 + __clzdi2(x as u64)
+        } else {
+            __clzdi2(hi)
+        }
+    }
+}
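A standalone version of the bisection in `leading_zeros_default`, specialized to u32 and checked against the standard library, may make the shift-and-subtract steps easier to follow (illustrative only; the crate version is generic over `Int`, and the function name here is mine):

/// Count leading zeros of a u32 by bisection: test whether the upper half
/// of the remaining bits is non-zero, and if so shift it down and subtract
/// from the running count. Like the functions above, it happens to return
/// 32 for x == 0 without an explicit check.
fn clz32(mut x: u32) -> u32 {
    let mut z = 32;
    let mut t = x >> 16;
    if t != 0 { z -= 16; x = t; }
    t = x >> 8;
    if t != 0 { z -= 8; x = t; }
    t = x >> 4;
    if t != 0 { z -= 4; x = t; }
    t = x >> 2;
    if t != 0 { z -= 2; x = t; }
    // last two bisections combined into one conditional, as in the crate code
    if x >> 1 != 0 { z - 2 } else { z - x }
}

fn main() {
    for x in [1u32, 7, 0x80, 0xffff, 0x0001_0000, u32::MAX] {
        assert_eq!(clz32(x), x.leading_zeros());
    }
}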
diff --git a/library/compiler-builtins/compiler-builtins/src/int/mod.rs b/library/compiler-builtins/compiler-builtins/src/int/mod.rs
new file mode 100644
index 00000000000..518ccb23f80
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/int/mod.rs
@@ -0,0 +1,18 @@
+mod specialized_div_rem;
+
+pub mod addsub;
+mod big;
+pub mod bswap;
+pub mod leading_zeros;
+pub mod mul;
+pub mod sdiv;
+pub mod shift;
+pub mod trailing_zeros;
+mod traits;
+pub mod udiv;
+
+pub use big::{i256, u256};
+#[cfg(not(feature = "unstable-public-internals"))]
+pub(crate) use traits::{CastFrom, CastInto, DInt, HInt, Int, MinInt};
+#[cfg(feature = "unstable-public-internals")]
+pub use traits::{CastFrom, CastInto, DInt, HInt, Int, MinInt};
diff --git a/library/compiler-builtins/compiler-builtins/src/int/mul.rs b/library/compiler-builtins/compiler-builtins/src/int/mul.rs
new file mode 100644
index 00000000000..040c69342d1
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/int/mul.rs
@@ -0,0 +1,142 @@
+use crate::int::{DInt, HInt, Int};
+
+trait Mul: DInt + Int
+where
+    Self::H: DInt,
+{
+    fn mul(self, rhs: Self) -> Self {
+        // In order to prevent infinite recursion, we cannot use `widen_mul` here:
+        //self.lo().widen_mul(rhs.lo())
+        //    .wrapping_add(self.lo().wrapping_mul(rhs.hi()).widen_hi())
+        //    .wrapping_add(self.hi().wrapping_mul(rhs.lo()).widen_hi())
+
+        let lhs_lo = self.lo();
+        let rhs_lo = rhs.lo();
+        // construct the widening multiplication using only `Self::H` sized multiplications
+        let tmp_0 = lhs_lo.lo().zero_widen_mul(rhs_lo.lo());
+        let tmp_1 = lhs_lo.lo().zero_widen_mul(rhs_lo.hi());
+        let tmp_2 = lhs_lo.hi().zero_widen_mul(rhs_lo.lo());
+        let tmp_3 = lhs_lo.hi().zero_widen_mul(rhs_lo.hi());
+        // sum up all widening partials
+        let mul = Self::from_lo_hi(tmp_0, tmp_3)
+            .wrapping_add(tmp_1.zero_widen() << (Self::BITS / 4))
+            .wrapping_add(tmp_2.zero_widen() << (Self::BITS / 4));
+        // add the higher partials
+        mul.wrapping_add(lhs_lo.wrapping_mul(rhs.hi()).widen_hi())
+            .wrapping_add(self.hi().wrapping_mul(rhs_lo).widen_hi())
+    }
+}
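+
+// A small worked example of the decomposition above (illustrative values only): for `u64`,
+// `Self::BITS / 4 == 16`, so `0x0000_0003_0000_0002u64.mul(0x0000_0005_0000_0004)` builds the
+// exact `lo * lo` product `2 * 4 == 8` out of 16x16 partials and then adds the wrapped cross
+// terms `(2 * 5 + 3 * 4) << 32`, giving `0x0000_0016_0000_0008`, which equals the
+// `wrapping_mul` of the two operands.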
+
+impl Mul for u64 {}
+impl Mul for i128 {}
+
+pub(crate) trait UMulo: DInt + Int {
+    fn mulo(self, rhs: Self) -> (Self, bool) {
+        match (self.hi().is_zero(), rhs.hi().is_zero()) {
+            // overflow is guaranteed
+            (false, false) => (self.wrapping_mul(rhs), true),
+            (true, false) => {
+                let mul_lo = self.lo().widen_mul(rhs.lo());
+                let mul_hi = self.lo().widen_mul(rhs.hi());
+                let (mul, o) = mul_lo.overflowing_add(mul_hi.lo().widen_hi());
+                (mul, o || !mul_hi.hi().is_zero())
+            }
+            (false, true) => {
+                let mul_lo = rhs.lo().widen_mul(self.lo());
+                let mul_hi = rhs.lo().widen_mul(self.hi());
+                let (mul, o) = mul_lo.overflowing_add(mul_hi.lo().widen_hi());
+                (mul, o || !mul_hi.hi().is_zero())
+            }
+            // overflow is guaranteed not to happen, so use a smaller widening multiplication
+            (true, true) => (self.lo().widen_mul(rhs.lo()), false),
+        }
+    }
+}
+
+impl UMulo for u32 {}
+impl UMulo for u64 {}
+impl UMulo for u128 {}
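+
+// Illustrative checks of the case split above (example values only): for `u32` (16-bit halves),
+// `3u32.mulo(0x0002_0001)` hits the `(true, false)` arm and returns `(0x0006_0003, false)`,
+// while `0x0001_0000u32.mulo(0x0001_0000)` hits `(false, false)` and returns `(0, true)`,
+// since `2^16 * 2^16` wraps to zero.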
+
+macro_rules! impl_signed_mulo {
+    ($fn:ident, $iD:ident, $uD:ident) => {
+        fn $fn(lhs: $iD, rhs: $iD) -> ($iD, bool) {
+            let mut lhs = lhs;
+            let mut rhs = rhs;
+            // the test against `mul_neg` below fails without this early return
+            if lhs == 0 || rhs == 0 {
+                return (0, false);
+            }
+
+            let lhs_neg = lhs < 0;
+            let rhs_neg = rhs < 0;
+            if lhs_neg {
+                lhs = lhs.wrapping_neg();
+            }
+            if rhs_neg {
+                rhs = rhs.wrapping_neg();
+            }
+            let mul_neg = lhs_neg != rhs_neg;
+
+            let (mul, o) = (lhs as $uD).mulo(rhs as $uD);
+            let mut mul = mul as $iD;
+
+            if mul_neg {
+                mul = mul.wrapping_neg();
+            }
+            if (mul < 0) != mul_neg {
+                // this one check happens to catch all edge cases related to `$iD::MIN`
+                (mul, true)
+            } else {
+                (mul, o)
+            }
+        }
+    };
+}
+
+impl_signed_mulo!(i32_overflowing_mul, i32, u32);
+impl_signed_mulo!(i64_overflowing_mul, i64, u64);
+impl_signed_mulo!(i128_overflowing_mul, i128, u128);
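+
+// Example behaviour of the generated helpers (illustrative values only): `i32_overflowing_mul(-6, 7)`
+// returns `(-42, false)`, while `i32_overflowing_mul(i32::MIN, -1)` returns `(i32::MIN, true)`,
+// because the magnitude `2^31` does not fit in a positive `i32`, so the final sign check flags
+// the overflow.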
+
+intrinsics! {
+    #[maybe_use_optimized_c_shim]
+    #[arm_aeabi_alias = __aeabi_lmul]
+    #[cfg(any(not(any(target_arch = "riscv32", target_arch = "riscv64")), target_feature = "m"))]
+    pub extern "C" fn __muldi3(a: u64, b: u64) -> u64 {
+        a.mul(b)
+    }
+
+    pub extern "C" fn __multi3(a: i128, b: i128) -> i128 {
+        a.mul(b)
+    }
+
+    pub extern "C" fn __mulosi4(a: i32, b: i32, oflow: &mut i32) -> i32 {
+        let (mul, o) = i32_overflowing_mul(a, b);
+        *oflow = o as i32;
+        mul
+    }
+
+    pub extern "C" fn __mulodi4(a: i64, b: i64, oflow: &mut i32) -> i64 {
+        let (mul, o) = i64_overflowing_mul(a, b);
+        *oflow = o as i32;
+        mul
+    }
+
+    #[unadjusted_on_win64]
+    pub extern "C" fn __muloti4(a: i128, b: i128, oflow: &mut i32) -> i128 {
+        let (mul, o) = i128_overflowing_mul(a, b);
+        *oflow = o as i32;
+        mul
+    }
+
+    pub extern "C" fn __rust_i128_mulo(a: i128, b: i128, oflow: &mut i32) -> i128 {
+        let (mul, o) = i128_overflowing_mul(a, b);
+        *oflow = o.into();
+        mul
+    }
+
+    pub extern "C" fn __rust_u128_mulo(a: u128, b: u128, oflow: &mut i32) -> u128 {
+        let (mul, o) = a.mulo(b);
+        *oflow = o.into();
+        mul
+    }
+}
diff --git a/library/compiler-builtins/compiler-builtins/src/int/sdiv.rs b/library/compiler-builtins/compiler-builtins/src/int/sdiv.rs
new file mode 100644
index 00000000000..6a9029de7f2
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/int/sdiv.rs
@@ -0,0 +1,205 @@
+use crate::int::udiv::*;
+
+macro_rules! sdivmod {
+    (
+        $unsigned_fn:ident, // name of the unsigned division function
+        $signed_fn:ident, // name of the signed division function
+        $uX:ident, // unsigned integer type for the inputs and outputs of `$unsigned_fn`
+        $iX:ident, // signed integer type for the inputs and outputs of `$signed_fn`
+        $($attr:tt),* // attributes
+    ) => {
+        intrinsics! {
+            $(
+                #[$attr]
+            )*
+            /// Returns `n / d` and sets `*rem = n % d`
+            pub extern "C" fn $signed_fn(a: $iX, b: $iX, rem: &mut $iX) -> $iX {
+                let a_neg = a < 0;
+                let b_neg = b < 0;
+                let mut a = a;
+                let mut b = b;
+
+                if a_neg {
+                    a = a.wrapping_neg();
+                }
+                if b_neg {
+                    b = b.wrapping_neg();
+                }
+
+                let mut r = *rem as $uX;
+                let t = $unsigned_fn(a as $uX, b as $uX, Some(&mut r)) as $iX;
+                let mut r = r as $iX;
+
+                if a_neg {
+                    r = r.wrapping_neg();
+                }
+                *rem = r;
+                if a_neg != b_neg {
+                    t.wrapping_neg()
+                } else {
+                    t
+                }
+            }
+        }
+    }
+}
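+
+// For example (illustrative values), the `i32` instantiation below follows Rust's truncating
+// division semantics: `__divmodsi4(-7, 2, &mut r)` returns `-3` and sets `r = -1`, i.e. the
+// remainder takes the sign of the dividend.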
+
+macro_rules! sdiv {
+    (
+        $unsigned_fn:ident, // name of the unsigned division function
+        $signed_fn:ident, // name of the signed division function
+        $uX:ident, // unsigned integer type for the inputs and outputs of `$unsigned_fn`
+        $iX:ident, // signed integer type for the inputs and outputs of `$signed_fn`
+        $($attr:tt),* // attributes
+    ) => {
+        intrinsics! {
+            $(
+                #[$attr]
+            )*
+            /// Returns `n / d`
+            pub extern "C" fn $signed_fn(a: $iX, b: $iX) -> $iX {
+                let a_neg = a < 0;
+                let b_neg = b < 0;
+                let mut a = a;
+                let mut b = b;
+                if a_neg {
+                    a = a.wrapping_neg();
+                }
+                if b_neg {
+                    b = b.wrapping_neg();
+                }
+                let t = $unsigned_fn(a as $uX, b as $uX) as $iX;
+                if a_neg != b_neg {
+                    t.wrapping_neg()
+                } else {
+                    t
+                }
+            }
+        }
+    }
+}
+
+macro_rules! smod {
+    (
+        $unsigned_fn:ident, // name of the unsigned division function
+        $signed_fn:ident, // name of the signed division function
+        $uX:ident, // unsigned integer type for the inputs and outputs of `$unsigned_fn`
+        $iX:ident, // signed integer type for the inputs and outputs of `$signed_fn`
+        $($attr:tt),* // attributes
+    ) => {
+        intrinsics! {
+            $(
+                #[$attr]
+            )*
+            /// Returns `n % d`
+            pub extern "C" fn $signed_fn(a: $iX, b: $iX) -> $iX {
+                let a_neg = a < 0;
+                let b_neg = b < 0;
+                let mut a = a;
+                let mut b = b;
+                if a_neg {
+                    a = a.wrapping_neg();
+                }
+                if b_neg {
+                    b = b.wrapping_neg();
+                }
+                let r = $unsigned_fn(a as $uX, b as $uX) as $iX;
+                if a_neg {
+                    r.wrapping_neg()
+                } else {
+                    r
+                }
+            }
+        }
+    }
+}
+
+#[cfg(not(target_arch = "avr"))]
+sdivmod!(
+    __udivmodsi4,
+    __divmodsi4,
+    u32,
+    i32,
+    maybe_use_optimized_c_shim
+);
+
+#[cfg(target_arch = "avr")]
+intrinsics! {
+    /// Returns `a / b` and `a % b` packed together.
+    ///
+    /// Ideally we'd use `-> (u32, u32)` or some kind of a packed struct, but
+    /// both force a stack allocation, while our result has to be in R18:R26.
+    pub extern "C" fn __divmodsi4(a: i32, b: i32) -> u64 {
+        let a_neg = a < 0;
+        let b_neg = b < 0;
+        let mut a = a;
+        let mut b = b;
+
+        if a_neg {
+            a = a.wrapping_neg();
+        }
+        if b_neg {
+            b = b.wrapping_neg();
+        }
+
+        let tr = __udivmodsi4(a as u32, b as u32);
+        let mut t = tr as u32 as i32;
+        let mut r = (tr >> 32) as u32 as i32;
+
+        if a_neg {
+            r = r.wrapping_neg();
+        }
+        if a_neg != b_neg {
+            t = t.wrapping_neg();
+        }
+
+        ((r as u32 as u64) << 32) | (t as u32 as u64)
+    }
+}
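+
+// Given the packing performed above, the AVR `__divmodsi4` returns the quotient in the low 32
+// bits and the remainder in the high 32 bits; e.g. (illustrative values) `__divmodsi4(-7, 2)`
+// yields quotient `-3` and remainder `-1`, packed as `0xFFFF_FFFF_FFFF_FFFD`.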
+
+// The `#[arm_aeabi_alias = __aeabi_idiv]` attribute cannot be made to work with `intrinsics!` in macros
+intrinsics! {
+    #[maybe_use_optimized_c_shim]
+    #[arm_aeabi_alias = __aeabi_idiv]
+    /// Returns `n / d`
+    pub extern "C" fn __divsi3(a: i32, b: i32) -> i32 {
+        let a_neg = a < 0;
+        let b_neg = b < 0;
+        let mut a = a;
+        let mut b = b;
+        if a_neg {
+            a = a.wrapping_neg();
+        }
+        if b_neg {
+            b = b.wrapping_neg();
+        }
+        let t = __udivsi3(a as u32, b as u32) as i32;
+        if a_neg != b_neg {
+            t.wrapping_neg()
+        } else {
+            t
+        }
+    }
+}
+smod!(__umodsi3, __modsi3, u32, i32, maybe_use_optimized_c_shim);
+
+sdivmod!(
+    __udivmoddi4,
+    __divmoddi4,
+    u64,
+    i64,
+    maybe_use_optimized_c_shim
+);
+sdiv!(__udivdi3, __divdi3, u64, i64, maybe_use_optimized_c_shim);
+smod!(__umoddi3, __moddi3, u64, i64, maybe_use_optimized_c_shim);
+
+// LLVM does not currently have a `__divmodti4` function, but GCC does
+sdivmod!(
+    __udivmodti4,
+    __divmodti4,
+    u128,
+    i128,
+    maybe_use_optimized_c_shim
+);
+sdiv!(__udivti3, __divti3, u128, i128,);
+smod!(__umodti3, __modti3, u128, i128,);
diff --git a/library/compiler-builtins/compiler-builtins/src/int/shift.rs b/library/compiler-builtins/compiler-builtins/src/int/shift.rs
new file mode 100644
index 00000000000..a85c1b33d67
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/int/shift.rs
@@ -0,0 +1,116 @@
+use crate::int::{DInt, HInt, Int, MinInt};
+
+trait Ashl: DInt {
+    /// Returns `a << b`, requires `b < Self::BITS`
+    fn ashl(self, shl: u32) -> Self {
+        let n_h = Self::H::BITS;
+        if shl & n_h != 0 {
+            // we only need `self.lo()` because `self.hi()` will be shifted out entirely
+            self.lo().wrapping_shl(shl - n_h).widen_hi()
+        } else if shl == 0 {
+            self
+        } else {
+            Self::from_lo_hi(
+                self.lo().wrapping_shl(shl),
+                self.lo().logical_shr(n_h.wrapping_sub(shl)) | self.hi().wrapping_shl(shl),
+            )
+        }
+    }
+}
+
+impl Ashl for u32 {}
+impl Ashl for u64 {}
+impl Ashl for u128 {}
+
+trait Ashr: DInt {
+    /// Returns arithmetic `a >> b`, requires `b < Self::BITS`
+    fn ashr(self, shr: u32) -> Self {
+        let n_h = Self::H::BITS;
+        if shr & n_h != 0 {
+            Self::from_lo_hi(
+                self.hi().wrapping_shr(shr - n_h),
+                // smear the sign bit
+                self.hi().wrapping_shr(n_h - 1),
+            )
+        } else if shr == 0 {
+            self
+        } else {
+            Self::from_lo_hi(
+                self.lo().logical_shr(shr) | self.hi().wrapping_shl(n_h.wrapping_sub(shr)),
+                self.hi().wrapping_shr(shr),
+            )
+        }
+    }
+}
+
+impl Ashr for i32 {}
+impl Ashr for i64 {}
+impl Ashr for i128 {}
+
+trait Lshr: DInt {
+    /// Returns logical `a >> b`, requires `b < Self::BITS`
+    fn lshr(self, shr: u32) -> Self {
+        let n_h = Self::H::BITS;
+        if shr & n_h != 0 {
+            self.hi().logical_shr(shr - n_h).zero_widen()
+        } else if shr == 0 {
+            self
+        } else {
+            Self::from_lo_hi(
+                self.lo().logical_shr(shr) | self.hi().wrapping_shl(n_h.wrapping_sub(shr)),
+                self.hi().logical_shr(shr),
+            )
+        }
+    }
+}
+
+impl Lshr for u32 {}
+impl Lshr for u64 {}
+impl Lshr for u128 {}
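+
+// Two illustrative cases of the half-word split above (example values only), for `u64` where
+// `n_h == 32`: `0xFFu64.ashl(40)` takes the `shl & n_h != 0` branch and yields
+// `0x0000_FF00_0000_0000`, while `0x0000_0001_0000_00FFu64.ashl(8)` takes the general branch
+// and yields `0x0000_0100_0000_FF00`; both match the plain `<<` operator.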
+
+intrinsics! {
+    #[maybe_use_optimized_c_shim]
+    pub extern "C" fn __ashlsi3(a: u32, b: u32) -> u32 {
+        a.ashl(b)
+    }
+
+    #[maybe_use_optimized_c_shim]
+    #[arm_aeabi_alias = __aeabi_llsl]
+    pub extern "C" fn __ashldi3(a: u64, b: core::ffi::c_uint) -> u64 {
+        a.ashl(b as u32)
+    }
+
+    pub extern "C" fn __ashlti3(a: u128, b: u32) -> u128 {
+        a.ashl(b)
+    }
+
+    #[maybe_use_optimized_c_shim]
+    pub extern "C" fn __ashrsi3(a: i32, b: u32) -> i32 {
+        a.ashr(b)
+    }
+
+    #[maybe_use_optimized_c_shim]
+    #[arm_aeabi_alias = __aeabi_lasr]
+    pub extern "C" fn __ashrdi3(a: i64, b: core::ffi::c_uint) -> i64 {
+        a.ashr(b as u32)
+    }
+
+    pub extern "C" fn __ashrti3(a: i128, b: u32) -> i128 {
+        a.ashr(b)
+    }
+
+    #[maybe_use_optimized_c_shim]
+    pub extern "C" fn __lshrsi3(a: u32, b: u32) -> u32 {
+        a.lshr(b)
+    }
+
+    #[maybe_use_optimized_c_shim]
+    #[arm_aeabi_alias = __aeabi_llsr]
+    pub extern "C" fn __lshrdi3(a: u64, b: core::ffi::c_uint) -> u64 {
+        a.lshr(b as u32)
+    }
+
+    pub extern "C" fn __lshrti3(a: u128, b: u32) -> u128 {
+        a.lshr(b)
+    }
+}
diff --git a/library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/asymmetric.rs b/library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/asymmetric.rs
new file mode 100644
index 00000000000..56ce188a373
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/asymmetric.rs
@@ -0,0 +1,69 @@
+/// Creates an unsigned division function optimized for dividing integers with the same
+/// bitwidth as the largest operand in an asymmetrically sized division. For example, x86-64 has an
+/// assembly instruction that can divide a 128 bit integer by a 64 bit integer if the quotient fits
+/// in 64 bits. The 128 bit version of this algorithm would use that fast hardware division to
+/// construct a full 128 bit by 128 bit division.
+#[allow(unused_macros)]
+macro_rules! impl_asymmetric {
+    (
+        $fn:ident, // name of the unsigned division function
+        $zero_div_fn:ident, // function called when division by zero is attempted
+        $half_division:ident, // function for division of a $uX by a $uX
+        $asymmetric_division:ident, // function for division of a $uD by a $uX
+        $n_h:expr, // the number of bits in a $iH or $uH
+        $uH:ident, // unsigned integer with half the bit width of $uX
+        $uX:ident, // unsigned integer with half the bit width of $uD
+        $uD:ident // unsigned integer type for the inputs and outputs of `$fn`
+    ) => {
+        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
+        /// tuple.
+        pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) {
+            let n: u32 = $n_h * 2;
+
+            let duo_lo = duo as $uX;
+            let duo_hi = (duo >> n) as $uX;
+            let div_lo = div as $uX;
+            let div_hi = (div >> n) as $uX;
+            if div_hi == 0 {
+                if div_lo == 0 {
+                    $zero_div_fn()
+                }
+                if duo_hi < div_lo {
+                    // `$uD` by `$uX` division with a quotient that will fit into a `$uX`
+                    let (quo, rem) = unsafe { $asymmetric_division(duo, div_lo) };
+                    return (quo as $uD, rem as $uD);
+                } else {
+                    // Short division using the $uD by $uX division
+                    let (quo_hi, rem_hi) = $half_division(duo_hi, div_lo);
+                    let tmp = unsafe {
+                        $asymmetric_division((duo_lo as $uD) | ((rem_hi as $uD) << n), div_lo)
+                    };
+                    return ((tmp.0 as $uD) | ((quo_hi as $uD) << n), tmp.1 as $uD);
+                }
+            }
+
+            // This has been adapted from
+            // https://www.codeproject.com/tips/785014/uint-division-modulus which was in turn
+            // adapted from Hacker's Delight. This is similar to the two possibility algorithm
+            // in that it uses only more significant parts of `duo` and `div` to divide a large
+            // integer with a smaller division instruction.
+            let div_lz = div_hi.leading_zeros();
+            let div_extra = n - div_lz;
+            let div_sig_n = (div >> div_extra) as $uX;
+            let tmp = unsafe { $asymmetric_division(duo >> 1, div_sig_n) };
+
+            let mut quo = tmp.0 >> ((n - 1) - div_lz);
+            if quo != 0 {
+                quo -= 1;
+            }
+
+            // Note that this is a full `$uD` multiplication being used here
+            let mut rem = duo - (quo as $uD).wrapping_mul(div);
+            if div <= rem {
+                quo += 1;
+                rem -= div;
+            }
+            return (quo as $uD, rem);
+        }
+    };
+}
diff --git a/library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/binary_long.rs b/library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/binary_long.rs
new file mode 100644
index 00000000000..2c61a45e06e
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/binary_long.rs
@@ -0,0 +1,552 @@
+/// Creates an unsigned division function that uses binary long division, designed for
+/// computer architectures without division instructions. These functions have good performance for
+/// microarchitectures with large branch miss penalties and architectures without the ability to
+/// predicate instructions. For architectures with predicated instructions, one of the algorithms
+/// described in the documentation of these functions probably has higher performance, and a custom
+/// assembly routine should be used instead.
+#[allow(unused_macros)]
+macro_rules! impl_binary_long {
+    (
+        $fn:ident, // name of the unsigned division function
+        $zero_div_fn:ident, // function called when division by zero is attempted
+        $normalization_shift:ident, // function for finding the normalization shift
+        $n:tt, // the number of bits in a $iX or $uX
+        $uX:ident, // unsigned integer type for the inputs and outputs of `$fn`
+        $iX:ident // signed integer type with same bitwidth as `$uX`
+        $(, $fun_attr:meta)* // attributes for the function
+    ) => {
+        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
+        /// tuple.
+        $(
+            #[$fun_attr]
+        )*
+        pub fn $fn(duo: $uX, div: $uX) -> ($uX, $uX) {
+            let mut duo = duo;
+            // handle edge cases before calling `$normalization_shift`
+            if div == 0 {
+                $zero_div_fn()
+            }
+            if duo < div {
+                return (0, duo);
+            }
+
+            // There are many variations of binary division algorithm that could be used. This
+            // documentation gives a tour of different methods so that future readers wanting to
+            // optimize further do not have to painstakingly derive them. The SWAR variation is
+            // especially hard to understand without reading the less convoluted methods first.
+
+            // You may notice that a `duo < div_original` check is included in many of these
+            // algorithms. A critical optimization that many algorithms miss is handling of
+            // quotients that will turn out to have many trailing zeros or many leading zeros. This
+            // happens in cases of exact or close-to-exact divisions, divisions by power of two, and
+            // in cases where the quotient is small. The `duo < div_original` check handles these
+            // cases of early returns and ends up replacing other kinds of mundane checks that
+            // normally terminate a binary division algorithm.
+            //
+            // Something you may see in other algorithms that is not special-cased here is checks
+            // for division by powers of two. The `duo < div_original` check handles this case and
+            // more, however it can be checked up front before the bisection using the
+            // `((div > 0) && ((div & (div - 1)) == 0))` trick. This is not special-cased because
+            // compilers should handle most cases where divisions by power of two occur, and we do
+            // not want to add on a few cycles for every division operation just to save a few
+            // cycles rarely.
+
+            // The following example is the most straightforward translation from the way binary
+            // long division is typically visualized:
+            // Dividing 178u8 (0b10110010) by 6u8 (0b110). `div` is shifted left by 5, according to
+            // the result from `$normalization_shift(duo, div, false)`.
+            //
+            // Step 0: `sub` is negative, so there is not full normalization, so no `quo` bit is set
+            // and `duo` is kept unchanged.
+            // duo:10110010, div_shifted:11000000, sub:11110010, quo:00000000, shl:5
+            //
+            // Step 1: `sub` is positive, set a `quo` bit and update `duo` for next step.
+            // duo:10110010, div_shifted:01100000, sub:01010010, quo:00010000, shl:4
+            //
+            // Step 2: Continue based on `sub`. The `quo` bits start accumulating.
+            // duo:01010010, div_shifted:00110000, sub:00100010, quo:00011000, shl:3
+            // duo:00100010, div_shifted:00011000, sub:00001010, quo:00011100, shl:2
+            // duo:00001010, div_shifted:00001100, sub:11111110, quo:00011100, shl:1
+            // duo:00001010, div_shifted:00000110, sub:00000100, quo:00011100, shl:0
+            // The `duo < div_original` check terminates the algorithm with the correct quotient of
+            // 29u8 and remainder of 4u8
+            /*
+            let div_original = div;
+            let mut shl = $normalization_shift(duo, div, false);
+            let mut quo = 0;
+            loop {
+                let div_shifted = div << shl;
+                let sub = duo.wrapping_sub(div_shifted);
+                // it is recommended to use `println!`s like this if functionality is unclear
+                /*
+                println!("duo:{:08b}, div_shifted:{:08b}, sub:{:08b}, quo:{:08b}, shl:{}",
+                    duo,
+                    div_shifted,
+                    sub,
+                    quo,
+                    shl
+                );
+                */
+                if 0 <= (sub as $iX) {
+                    duo = sub;
+                    quo += 1 << shl;
+                    if duo < div_original {
+                        // this branch is optional
+                        return (quo, duo)
+                    }
+                }
+                if shl == 0 {
+                    return (quo, duo)
+                }
+                shl -= 1;
+            }
+            */
+
+            // This restoring binary long division algorithm reduces the number of operations
+            // overall via:
+            // - `pow` can be shifted right instead of recalculating from `shl`
+            // - starting `div` shifted left and shifting it right for each step instead of
+            //   recalculating from `shl`
+            // - The `duo < div_original` branch is used to terminate the algorithm instead of the
+            //   `shl == 0` branch. This check is strong enough to prevent set bits of `pow` and
+            //   `div` from being shifted off the end. This check also only occurs on half of steps
+            //   on average, since it is behind the `(sub as $iX) >= 0` branch.
+            // - `shl` is now not needed by any aspect of the loop and thus only 3 variables are
+            //   being updated between steps
+            //
+            // There are many variations of this algorithm, but this encompasses the largest number
+            // of architectures and does not rely on carry flags, add-with-carry, or SWAR
+            // complications to be decently fast.
+            /*
+            let div_original = div;
+            let shl = $normalization_shift(duo, div, false);
+            let mut div: $uX = div << shl;
+            let mut pow: $uX = 1 << shl;
+            let mut quo: $uX = 0;
+            loop {
+                let sub = duo.wrapping_sub(div);
+                if 0 <= (sub as $iX) {
+                    duo = sub;
+                    quo |= pow;
+                    if duo < div_original {
+                        return (quo, duo)
+                    }
+                }
+                div >>= 1;
+                pow >>= 1;
+            }
+            */
+
+            // If the architecture has flags and predicated arithmetic instructions, it is possible
+            // to do binary long division without branching and in only 3 or 4 instructions. This is
+            // a variation of a 3 instruction central loop from
+            // http://www.chiark.greenend.org.uk/~theom/riscos/docs/ultimate/a252div.txt.
+            //
+            // What allows doing division in only 3 instructions is realizing that instead of
+            // keeping `duo` in place and shifting `div` right to align bits, `div` can be kept in
+            // place and `duo` can be shifted left. This means `div` does not have to be updated,
+            // but causes edge case problems and makes `duo < div_original` tests harder. Some
+            // architectures have an option to shift an argument in an arithmetic operation, which
+            // means `duo` can be shifted left and subtracted from in one instruction. The other two
+            // instructions are updating `quo` and undoing the subtraction if it turns out things
+            // were not normalized.
+
+            /*
+            // Perform one binary long division step on the already normalized arguments before
+            // the main loop. Note that this does a full normalization since the central loop needs
+            // `duo.leading_zeros()` to be at least 1 more than `div.leading_zeros()`. The original
+            // variation only did normalization to the nearest 4 steps, but this makes handling edge
+            // cases much harder. We do a full normalization and perform a binary long division
+            // step. In the edge case where the msbs of `duo` and `div` are set, it clears the msb
+            // of `duo`, then the edge case handler shifts `div` right and does another long
+            // division step to always ensure `duo.leading_zeros() + 1 >= div.leading_zeros()`.
+            let div_original = div;
+            let mut shl = $normalization_shift(duo, div, true);
+            let mut div: $uX = (div << shl);
+            let mut quo: $uX = 1;
+            duo = duo.wrapping_sub(div);
+            if duo < div_original {
+                return (1 << shl, duo);
+            }
+            let div_neg: $uX;
+            if (div as $iX) < 0 {
+                // A very ugly edge case where the most significant bit of `div` is set (after
+                // shifting to match `duo` when its most significant bit is at the sign bit), which
+                // leads to the sign bit of `div_neg` being cut off and carries not happening when
+                // they should. This branch performs a long division step that keeps `duo` in place
+                // and shifts `div` down.
+                div >>= 1;
+                div_neg = div.wrapping_neg();
+                let (sub, carry) = duo.overflowing_add(div_neg);
+                duo = sub;
+                quo = quo.wrapping_add(quo).wrapping_add(carry as $uX);
+                if !carry {
+                    duo = duo.wrapping_add(div);
+                }
+                shl -= 1;
+            } else {
+                div_neg = div.wrapping_neg();
+            }
+            // The add-with-carry that updates `quo` needs to have the carry set when a normalized
+            // subtract happens. Using `duo.wrapping_shl(1).overflowing_sub(div)` to do the
+            // subtraction generates a carry when an unnormalized subtract happens, which is the
+            // opposite of what we want. Instead, we use
+            // `duo.wrapping_shl(1).overflowing_add(div_neg)`, where `div_neg` is negative `div`.
+            let mut i = shl;
+            loop {
+                if i == 0 {
+                    break;
+                }
+                i -= 1;
+                // `ADDS duo, div, duo, LSL #1`
+                // (add `div` to `duo << 1` and set flags)
+                let (sub, carry) = duo.wrapping_shl(1).overflowing_add(div_neg);
+                duo = sub;
+                // `ADC quo, quo, quo`
+                // (add with carry). Effectively shifts `quo` left by 1 and sets the least
+                // significant bit to the carry.
+                quo = quo.wrapping_add(quo).wrapping_add(carry as $uX);
+                // `ADDCC duo, duo, div`
+                // (add if carry clear). Undoes the subtraction if no carry was generated.
+                if !carry {
+                    duo = duo.wrapping_add(div);
+                }
+            }
+            return (quo, duo >> shl);
+            */
+
+            // This is the SWAR (SIMD within a register) restoring division algorithm.
+            // This combines several ideas of the above algorithms:
+            //  - If `duo` is shifted left instead of shifting `div` right like in the 3 instruction
+            //    restoring division algorithm, some architectures can do the shifting and
+            //    subtraction step in one instruction.
+            //  - `quo` can be constructed by adding powers-of-two to it or shifting it left by one
+            //    and adding one.
+            //  - Every time `duo` is shifted left, there is another unused 0 bit shifted into the
+            //    LSB, so what if we use those bits to store `quo`?
+            // Through a complex setup, it is possible to manage `duo` and `quo` in the same
+            // register, and perform one step with 2 or 3 instructions. The only major downsides are
+            // that there is significant setup (it only saves instructions if `shl` is
+            // approximately more than 4), `duo < div_original` checks are impractical once SWAR is
+            // initiated, and the number of division steps taken has to be exact (we cannot do more
+            // division steps than `shl`, because it introduces edge cases where quotient bits in
+            // `duo` start to collide with the real part of `div`).
+            /*
+            // first step. The quotient bit is stored in `quo` for now
+            let div_original = div;
+            let mut shl = $normalization_shift(duo, div, true);
+            let mut div: $uX = (div << shl);
+            duo = duo.wrapping_sub(div);
+            let mut quo: $uX = 1 << shl;
+            if duo < div_original {
+                return (quo, duo);
+            }
+
+            let mask: $uX;
+            if (div as $iX) < 0 {
+                // deal with same edge case as the 3 instruction restoring division algorithm, but
+                // the quotient bit from this step also has to be stored in `quo`
+                div >>= 1;
+                shl -= 1;
+                let tmp = 1 << shl;
+                mask = tmp - 1;
+                let sub = duo.wrapping_sub(div);
+                if (sub as $iX) >= 0 {
+                    // restore
+                    duo = sub;
+                    quo |= tmp;
+                }
+                if duo < div_original {
+                    return (quo, duo);
+                }
+            } else {
+                mask = quo - 1;
+            }
+            // There is now room for quotient bits in `duo`.
+
+            // Note that `div` is already shifted left and has `shl` unset bits. We subtract 1 from
+            // `div` and end up with the subset of `shl` bits all being set. This subset acts
+            // just like a two's complement negative one. The subset of `div` containing the divisor
+            // had 1 subtracted from it, but a carry will always be generated from the `shl` subset
+            // as long as the quotient stays positive.
+            //
+            // When the modified `div` is subtracted from `duo.wrapping_shl(1)`, the `shl` subset
+            // adds a quotient bit to the least significant bit.
+            // For example, 89 (0b01011001) divided by 3 (0b11):
+            //
+            // shl:4, div:0b00110000
+            // first step:
+            //       duo:0b01011001
+            // + div_neg:0b11010000
+            // ____________________
+            //           0b00101001
+            // quo is set to 0b00010000 and mask is set to 0b00001111 for later
+            //
+            // 1 is subtracted from `div`. I will differentiate the `shl` part of `div` and the
+            // quotient part of `duo` with `^` chars.
+            //     div:0b00110000
+            //               ^^^^
+            //   +     0b11111111
+            //   ________________
+            //         0b00101111
+            //               ^^^^
+            // div_neg:0b11010001
+            //
+            // first SWAR step:
+            //  duo_shl1:0b01010010
+            //                    ^
+            // + div_neg:0b11010001
+            // ____________________
+            //           0b00100011
+            //                    ^
+            // second:
+            //  duo_shl1:0b01000110
+            //                   ^^
+            // + div_neg:0b11010001
+            // ____________________
+            //           0b00010111
+            //                   ^^
+            // third:
+            //  duo_shl1:0b00101110
+            //                  ^^^
+            // + div_neg:0b11010001
+            // ____________________
+            //           0b11111111
+            //                  ^^^
+            // 3 steps resulted in the quotient with 3 set bits as expected, but currently the real
+            // part of `duo` is negative and the third step was an unnormalized step. The restore
+            // branch then restores `duo`. Note that the restore branch does not shift `duo` left.
+            //
+            //   duo:0b11111111
+            //              ^^^
+            // + div:0b00101111
+            //             ^^^^
+            // ________________
+            //       0b00101110
+            //              ^^^
+            // `duo` is now back in the `duo_shl1` state it was at in the third step, with an
+            // unset quotient bit.
+            //
+            // final step (`shl` was 4, so exactly 4 steps must be taken)
+            //  duo_shl1:0b01011100
+            //                 ^^^^
+            // + div_neg:0b11010001
+            // ____________________
+            //           0b00101101
+            //                 ^^^^
+            // The quotient includes the `^` bits added with the `quo` bits from the beginning that
+            // contained the first step and potential edge case step,
+            // `quo:0b00010000 + (duo:0b00101101 & mask:0b00001111) == 0b00011101 == 29u8`.
+            // The remainder is the bits remaining in `duo` that are not part of the quotient bits,
+            // `duo:0b00101101 >> shl == 0b0010 == 2u8`.
+            let div: $uX = div.wrapping_sub(1);
+            let mut i = shl;
+            loop {
+                if i == 0 {
+                    break;
+                }
+                i -= 1;
+                duo = duo.wrapping_shl(1).wrapping_sub(div);
+                if (duo as $iX) < 0 {
+                    // restore
+                    duo = duo.wrapping_add(div);
+                }
+            }
+            // unpack the results of SWAR
+            return ((duo & mask) | quo, duo >> shl);
+            */
+
+            // The problem with the conditional restoring SWAR algorithm above is that, in practice,
+            // it requires assembly code to bring out its full unrolled potential (It seems that
+            // LLVM can't use unrolled conditionals optimally and ends up erasing all the benefit
+            // that my algorithm intends). On architectures without predicated instructions, the code
+            // gen is especially bad. We need a default software division algorithm that is
+            // guaranteed to get decent code gen for the central loop.
+
+            // For non-SWAR algorithms, there is a way to do binary long division without
+            // predication or even branching. This involves creating a mask from the sign bit and
+            // performing different kinds of steps using that.
+            /*
+            let shl = $normalization_shift(duo, div, true);
+            let mut div: $uX = div << shl;
+            let mut pow: $uX = 1 << shl;
+            let mut quo: $uX = 0;
+            loop {
+                let sub = duo.wrapping_sub(div);
+                let sign_mask = !((sub as $iX).wrapping_shr($n - 1) as $uX);
+                duo -= div & sign_mask;
+                quo |= pow & sign_mask;
+                div >>= 1;
+                pow >>= 1;
+                if pow == 0 {
+                    break;
+                }
+            }
+            return (quo, duo);
+            */
+            // However, it requires about 4 extra operations (smearing the sign bit, negating the
+            // mask, and applying the mask twice) on top of the operations done by the actual
+            // algorithm. With SWAR, however, just 2 extra operations are needed, making it
+            // practical and even optimal for some architectures.
+
+            // What we do is use custom assembly for predicated architectures that need software
+            // division, and for the default algorithm use a mask based restoring SWAR algorithm
+            // without conditionals or branches. On almost all architectures, this Rust code is
+            // guaranteed to compile down to 5 assembly instructions or less for each step, and LLVM
+            // will unroll it in a decent way.
+
+            // standard opening for SWAR algorithm with first step and edge case handling
+            let div_original = div;
+            let mut shl = $normalization_shift(duo, div, true);
+            let mut div: $uX = (div << shl);
+            duo = duo.wrapping_sub(div);
+            let mut quo: $uX = 1 << shl;
+            if duo < div_original {
+                return (quo, duo);
+            }
+            let mask: $uX;
+            if (div as $iX) < 0 {
+                div >>= 1;
+                shl -= 1;
+                let tmp = 1 << shl;
+                mask = tmp - 1;
+                let sub = duo.wrapping_sub(div);
+                if (sub as $iX) >= 0 {
+                    duo = sub;
+                    quo |= tmp;
+                }
+                if duo < div_original {
+                    return (quo, duo);
+                }
+            } else {
+                mask = quo - 1;
+            }
+
+            // central loop
+            div = div.wrapping_sub(1);
+            let mut i = shl;
+            loop {
+                if i == 0 {
+                    break;
+                }
+                i -= 1;
+                // shift left 1 and subtract
+                duo = duo.wrapping_shl(1).wrapping_sub(div);
+                // create mask
+                let mask = (duo as $iX).wrapping_shr($n - 1) as $uX;
+                // restore
+                duo = duo.wrapping_add(div & mask);
+            }
+            // unpack
+            return ((duo & mask) | quo, duo >> shl);
+
+            // miscellaneous binary long division algorithms that might be better for specific
+            // architectures
+
+            // Another kind of long division uses an interesting fact that `div` and `pow` can be
+            // negated when `duo` is negative to perform a "negated" division step that works in
+            // place of any normalization mechanism. This is a non-restoring division algorithm that
+            // is very similar to the non-restoring division algorithms that can be found on the
+            // internet, except there is only one test for `duo < 0`. The subtraction from `quo` can
+            // be viewed as shifting the least significant set bit right (e.g. if we enter a series
+            // of negated binary long division steps starting with `quo == 0b1011_0000` and
+            // `pow == 0b0000_1000`, `quo` will progress like this: 0b1010_1000, 0b1010_0100,
+            // 0b1010_0010, 0b1010_0001).
+            /*
+            let div_original = div;
+            let shl = $normalization_shift(duo, div, true);
+            let mut div: $uX = (div << shl);
+            let mut pow: $uX = 1 << shl;
+            let mut quo: $uX = pow;
+            duo = duo.wrapping_sub(div);
+            if duo < div_original {
+                return (quo, duo);
+            }
+            div >>= 1;
+            pow >>= 1;
+            loop {
+                if (duo as $iX) < 0 {
+                    // Negated binary long division step.
+                    duo = duo.wrapping_add(div);
+                    quo = quo.wrapping_sub(pow);
+                } else {
+                    // Normal long division step.
+                    if duo < div_original {
+                        return (quo, duo)
+                    }
+                    duo = duo.wrapping_sub(div);
+                    quo = quo.wrapping_add(pow);
+                }
+                pow >>= 1;
+                div >>= 1;
+            }
+            */
+
+            // This is the Nonrestoring SWAR algorithm, combining the nonrestoring algorithm with
+            // SWAR techniques so that the only difference between steps is the negation of `div`.
+            // If there was an architecture with an instruction that negated inputs to an adder
+            // based on conditionals, and in place shifting (or a three input addition operation
+            // that can have `duo` as two of the inputs to effectively shift it left by 1), then a
+            // single instruction central loop is possible. Microarchitectures often have inputs to
+            // their ALU that can invert the arguments and carry in of adders, but the architectures
+            // unfortunately do not have an instruction to dynamically invert this input based on
+            // conditionals.
+            /*
+            // SWAR opening
+            let div_original = div;
+            let mut shl = $normalization_shift(duo, div, true);
+            let mut div: $uX = (div << shl);
+            duo = duo.wrapping_sub(div);
+            let mut quo: $uX = 1 << shl;
+            if duo < div_original {
+                return (quo, duo);
+            }
+            let mask: $uX;
+            if (div as $iX) < 0 {
+                div >>= 1;
+                shl -= 1;
+                let tmp = 1 << shl;
+                let sub = duo.wrapping_sub(div);
+                if (sub as $iX) >= 0 {
+                    // restore
+                    duo = sub;
+                    quo |= tmp;
+                }
+                if duo < div_original {
+                    return (quo, duo);
+                }
+                mask = tmp - 1;
+            } else {
+                mask = quo - 1;
+            }
+
+            // central loop
+            let div: $uX = div.wrapping_sub(1);
+            let mut i = shl;
+            loop {
+                if i == 0 {
+                    break;
+                }
+                i -= 1;
+                // note: the `wrapping_shl(1)` can be factored out, but would require another
+                // restoring division step to prevent `(duo as $iX)` from overflowing
+                if (duo as $iX) < 0 {
+                    // Negated binary long division step.
+                    duo = duo.wrapping_shl(1).wrapping_add(div);
+                } else {
+                    // Normal long division step.
+                    duo = duo.wrapping_shl(1).wrapping_sub(div);
+                }
+            }
+            if (duo as $iX) < 0 {
+                // Restore. This was not needed in the original nonrestoring algorithm because of
+                // the `duo < div_original` checks.
+                duo = duo.wrapping_add(div);
+            }
+            // unpack
+            return ((duo & mask) | quo, duo >> shl);
+            */
+        }
+    };
+}
diff --git a/library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/delegate.rs b/library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/delegate.rs
new file mode 100644
index 00000000000..f5c6e50239a
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/delegate.rs
@@ -0,0 +1,317 @@
+/// Creates an unsigned division function that uses a combination of hardware division and
+/// binary long division to divide integers larger than what hardware division by itself can do. This
+/// function is intended for microarchitectures that have division hardware, but whose
+/// multiplication hardware is not fast enough for `impl_trifecta` to be faster.
+#[allow(unused_macros)]
+macro_rules! impl_delegate {
+    (
+        $fn:ident, // name of the unsigned division function
+        $zero_div_fn:ident, // function called when division by zero is attempted
+        $half_normalization_shift:ident, // function for finding the normalization shift of $uX
+        $half_division:ident, // function for division of a $uX by a $uX
+        $n_h:expr, // the number of bits in $iH or $uH
+        $uH:ident, // unsigned integer with half the bit width of $uX
+        $uX:ident, // unsigned integer with half the bit width of $uD.
+        $uD:ident, // unsigned integer type for the inputs and outputs of `$fn`
+        $iD:ident // signed integer type with the same bitwidth as `$uD`
+    ) => {
+        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
+        /// tuple.
+        pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) {
+            // The two possibility algorithm, undersubtracting long division algorithm, or any kind
+            // of reciprocal based algorithm will not be fastest, because they involve large
+            // multiplications that we assume to not be fast enough relative to the divisions to
+            // outweigh setup times.
+
+            // the number of bits in a $uX
+            let n = $n_h * 2;
+
+            let duo_lo = duo as $uX;
+            let duo_hi = (duo >> n) as $uX;
+            let div_lo = div as $uX;
+            let div_hi = (div >> n) as $uX;
+
+            match (div_lo == 0, div_hi == 0, duo_hi == 0) {
+                (true, true, _) => $zero_div_fn(),
+                (_, false, true) => {
+                    // `duo` < `div`
+                    return (0, duo);
+                }
+                (false, true, true) => {
+                    // delegate to smaller division
+                    let tmp = $half_division(duo_lo, div_lo);
+                    return (tmp.0 as $uD, tmp.1 as $uD);
+                }
+                (false, true, false) => {
+                    if duo_hi < div_lo {
+                        // `quo_hi` will always be 0. This performs a binary long division algorithm
+                        // to zero `duo_hi` followed by a half division.
+
+                        // We can calculate the normalization shift using only `$uX` size functions.
+                        // If we calculated the normalization shift using
+                        // `$half_normalization_shift(duo_hi, div_lo, false)`, it would break the
+                        // assumption the function has that the first argument is more than the
+                        // second argument. If the arguments are switched, the assumption holds true
+                        // since `duo_hi < div_lo`.
+                        let norm_shift = $half_normalization_shift(div_lo, duo_hi, false);
+                        let shl = if norm_shift == 0 {
+                            // Consider what happens if the msbs of `duo_hi` and `div_lo` align with
+                            // no shifting. The normalization shift will always return
+                            // `norm_shift == 0` regardless of whether it is fully normalized,
+                            // because `duo_hi < div_lo`. In that edge case, `n - norm_shift` would
+                            // result in shift overflow down the line. For the edge case, because
+                            // both `duo_hi < div_lo` and we are comparing all the significant bits
+                            // of `duo_hi` and `div`, we can make `shl = n - 1`.
+                            n - 1
+                        } else {
+                            // We also cannot just use `shl = n - norm_shift - 1` in the general
+                            // case, because when we are not in the edge case comparing all the
+                            // significant bits, then the full `duo < div` may not be true and thus
+                            // breaks the division algorithm.
+                            n - norm_shift
+                        };
+
+                        // The 3 variable restoring division algorithm (see binary_long.rs) is ideal
+                        // for this task, since `pow` and `quo` can be `$uX` and the delegation
+                        // check is simple.
+                        let mut div: $uD = div << shl;
+                        let mut pow_lo: $uX = 1 << shl;
+                        let mut quo_lo: $uX = 0;
+                        let mut duo = duo;
+                        loop {
+                            let sub = duo.wrapping_sub(div);
+                            if 0 <= (sub as $iD) {
+                                duo = sub;
+                                quo_lo |= pow_lo;
+                                let duo_hi = (duo >> n) as $uX;
+                                if duo_hi == 0 {
+                                    // Delegate to get the rest of the quotient. Note that the
+                                    // `div_lo` here is the original unshifted `div`.
+                                    let tmp = $half_division(duo as $uX, div_lo);
+                                    return ((quo_lo | tmp.0) as $uD, tmp.1 as $uD);
+                                }
+                            }
+                            div >>= 1;
+                            pow_lo >>= 1;
+                        }
+                    } else if duo_hi == div_lo {
+                        // `quo_hi == 1`. This branch is cheap and helps with edge cases.
+                        let tmp = $half_division(duo as $uX, div as $uX);
+                        return ((1 << n) | (tmp.0 as $uD), tmp.1 as $uD);
+                    } else {
+                        // `div_lo < duo_hi`
+                        // `rem_hi == 0`
+                        if (div_lo >> $n_h) == 0 {
+                            // Short division of $uD by a $uH, using $uX by $uX division
+                            let div_0 = div_lo as $uH as $uX;
+                            let (quo_hi, rem_3) = $half_division(duo_hi, div_0);
+
+                            let duo_mid = ((duo >> $n_h) as $uH as $uX) | (rem_3 << $n_h);
+                            let (quo_1, rem_2) = $half_division(duo_mid, div_0);
+
+                            let duo_lo = (duo as $uH as $uX) | (rem_2 << $n_h);
+                            let (quo_0, rem_1) = $half_division(duo_lo, div_0);
+
+                            return (
+                                (quo_0 as $uD) | ((quo_1 as $uD) << $n_h) | ((quo_hi as $uD) << n),
+                                rem_1 as $uD,
+                            );
+                        }
+
+                        // This is basically a short division composed of a half division for the hi
+                        // part, specialized 3 variable binary long division in the middle, and
+                        // another half division for the lo part.
+                        let duo_lo = duo as $uX;
+                        let tmp = $half_division(duo_hi, div_lo);
+                        let quo_hi = tmp.0;
+                        let mut duo = (duo_lo as $uD) | ((tmp.1 as $uD) << n);
+                        // This check is required to avoid breaking the long division below.
+                        if duo < div {
+                            return ((quo_hi as $uD) << n, duo);
+                        }
+
+                        // The half division handled all shift alignments down to `n`, so this
+                        // division can continue with a shift of `n - 1`.
+                        let mut div: $uD = div << (n - 1);
+                        let mut pow_lo: $uX = 1 << (n - 1);
+                        let mut quo_lo: $uX = 0;
+                        loop {
+                            let sub = duo.wrapping_sub(div);
+                            if 0 <= (sub as $iD) {
+                                duo = sub;
+                                quo_lo |= pow_lo;
+                                let duo_hi = (duo >> n) as $uX;
+                                if duo_hi == 0 {
+                                    // Delegate to get the rest of the quotient. Note that the
+                                    // `div_lo` here is the original unshifted `div`.
+                                    let tmp = $half_division(duo as $uX, div_lo);
+                                    return (
+                                        (tmp.0) as $uD | (quo_lo as $uD) | ((quo_hi as $uD) << n),
+                                        tmp.1 as $uD,
+                                    );
+                                }
+                            }
+                            div >>= 1;
+                            pow_lo >>= 1;
+                        }
+                    }
+                }
+                (_, false, false) => {
+                    // Full $uD by $uD binary long division. `quo_hi` will always be 0.
+                    if duo < div {
+                        return (0, duo);
+                    }
+                    let div_original = div;
+                    let shl = $half_normalization_shift(duo_hi, div_hi, false);
+                    let mut duo = duo;
+                    let mut div: $uD = div << shl;
+                    let mut pow_lo: $uX = 1 << shl;
+                    let mut quo_lo: $uX = 0;
+                    loop {
+                        let sub = duo.wrapping_sub(div);
+                        if 0 <= (sub as $iD) {
+                            duo = sub;
+                            quo_lo |= pow_lo;
+                            if duo < div_original {
+                                return (quo_lo as $uD, duo);
+                            }
+                        }
+                        div >>= 1;
+                        pow_lo >>= 1;
+                    }
+                }
+            }
+        }
+    };
+}
+
+/// Returns `n / d` and sets `*rem = n % d`.
+///
+/// This specialization exists because:
+///  - The LLVM backend for 32-bit SPARC cannot compile functions that return `(u128, u128)`,
+///    so we have to use an old fashioned `&mut u128` argument to return the remainder.
+///  - 64-bit SPARC does not have u64 * u64 => u128 widening multiplication, which makes the
+///    delegate algorithm strategy the only reasonably fast way to perform `u128` division.
+// used on SPARC
+#[allow(dead_code)]
+pub fn u128_divide_sparc(duo: u128, div: u128, rem: &mut u128) -> u128 {
+    use super::*;
+    let duo_lo = duo as u64;
+    let duo_hi = (duo >> 64) as u64;
+    let div_lo = div as u64;
+    let div_hi = (div >> 64) as u64;
+
+    match (div_lo == 0, div_hi == 0, duo_hi == 0) {
+        (true, true, _) => zero_div_fn(),
+        (_, false, true) => {
+            *rem = duo;
+            return 0;
+        }
+        (false, true, true) => {
+            let tmp = u64_by_u64_div_rem(duo_lo, div_lo);
+            *rem = tmp.1 as u128;
+            return tmp.0 as u128;
+        }
+        (false, true, false) => {
+            if duo_hi < div_lo {
+                let norm_shift = u64_normalization_shift(div_lo, duo_hi, false);
+                let shl = if norm_shift == 0 {
+                    64 - 1
+                } else {
+                    64 - norm_shift
+                };
+
+                let mut div: u128 = div << shl;
+                let mut pow_lo: u64 = 1 << shl;
+                let mut quo_lo: u64 = 0;
+                let mut duo = duo;
+                loop {
+                    let sub = duo.wrapping_sub(div);
+                    if 0 <= (sub as i128) {
+                        duo = sub;
+                        quo_lo |= pow_lo;
+                        let duo_hi = (duo >> 64) as u64;
+                        if duo_hi == 0 {
+                            let tmp = u64_by_u64_div_rem(duo as u64, div_lo);
+                            *rem = tmp.1 as u128;
+                            return (quo_lo | tmp.0) as u128;
+                        }
+                    }
+                    div >>= 1;
+                    pow_lo >>= 1;
+                }
+            } else if duo_hi == div_lo {
+                let tmp = u64_by_u64_div_rem(duo as u64, div as u64);
+                *rem = tmp.1 as u128;
+                return (1 << 64) | (tmp.0 as u128);
+            } else {
+                if (div_lo >> 32) == 0 {
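+                    // `div` fits in 32 bits, so the full quotient can be produced with three
+                    // chained divisions by the 32 bit value (schoolbook short division,
+                    // mirroring the short division branch of `impl_trifecta!`).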
+                    let div_0 = div_lo as u32 as u64;
+                    let (quo_hi, rem_3) = u64_by_u64_div_rem(duo_hi, div_0);
+
+                    let duo_mid = ((duo >> 32) as u32 as u64) | (rem_3 << 32);
+                    let (quo_1, rem_2) = u64_by_u64_div_rem(duo_mid, div_0);
+
+                    let duo_lo = (duo as u32 as u64) | (rem_2 << 32);
+                    let (quo_0, rem_1) = u64_by_u64_div_rem(duo_lo, div_0);
+
+                    *rem = rem_1 as u128;
+                    return (quo_0 as u128) | ((quo_1 as u128) << 32) | ((quo_hi as u128) << 64);
+                }
+
+                let duo_lo = duo as u64;
+                let tmp = u64_by_u64_div_rem(duo_hi, div_lo);
+                let quo_hi = tmp.0;
+                let mut duo = (duo_lo as u128) | ((tmp.1 as u128) << 64);
+                if duo < div {
+                    *rem = duo;
+                    return (quo_hi as u128) << 64;
+                }
+
+                let mut div: u128 = div << (64 - 1);
+                let mut pow_lo: u64 = 1 << (64 - 1);
+                let mut quo_lo: u64 = 0;
+                loop {
+                    let sub = duo.wrapping_sub(div);
+                    if 0 <= (sub as i128) {
+                        duo = sub;
+                        quo_lo |= pow_lo;
+                        let duo_hi = (duo >> 64) as u64;
+                        if duo_hi == 0 {
+                            let tmp = u64_by_u64_div_rem(duo as u64, div_lo);
+                            *rem = tmp.1 as u128;
+                            return (tmp.0) as u128 | (quo_lo as u128) | ((quo_hi as u128) << 64);
+                        }
+                    }
+                    div >>= 1;
+                    pow_lo >>= 1;
+                }
+            }
+        }
+        (_, false, false) => {
+            if duo < div {
+                *rem = duo;
+                return 0;
+            }
+            let div_original = div;
+            let shl = u64_normalization_shift(duo_hi, div_hi, false);
+            let mut duo = duo;
+            let mut div: u128 = div << shl;
+            let mut pow_lo: u64 = 1 << shl;
+            let mut quo_lo: u64 = 0;
+            loop {
+                let sub = duo.wrapping_sub(div);
+                if 0 <= (sub as i128) {
+                    duo = sub;
+                    quo_lo |= pow_lo;
+                    if duo < div_original {
+                        *rem = duo;
+                        return quo_lo as u128;
+                    }
+                }
+                div >>= 1;
+                pow_lo >>= 1;
+            }
+        }
+    }
+}
diff --git a/library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/mod.rs b/library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/mod.rs
new file mode 100644
index 00000000000..43f466e75ba
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/mod.rs
@@ -0,0 +1,320 @@
+// TODO: when `unsafe_block_in_unsafe_fn` is stabilized, remove this
+#![allow(unused_unsafe)]
+// The functions are complex with many branches, and explicit
+// `return`s make it clear where function exit points are
+#![allow(clippy::needless_return)]
+#![allow(clippy::comparison_chain)]
+// Clippy is confused by the complex configuration
+#![allow(clippy::if_same_then_else)]
+#![allow(clippy::needless_bool)]
+
+//! This `specialized_div_rem` module is originally from version 1.0.0 of the
+//! `specialized-div-rem` crate. Note that `for` loops with ranges are not used in this
+//! module, since unoptimized compilation may generate references to `memcpy`.
+//!
+//! The purpose of these macros is to easily change both the division algorithm used
+//! for a given integer size and the half division used by that algorithm. The way
+//! functions call each other is also constructed such that linkers will find the chain of
+//! software and hardware divisions needed for every size of signed and unsigned division.
+//! For example, most target compilations do the following:
+//!
+//!  - Many 128 bit division functions like `u128::wrapping_div` use
+//!    `std::intrinsics::unchecked_div`, which gets replaced by `__udivti3` because there
+//!    is not a 128 bit by 128 bit hardware division function in most architectures.
+//!    `__udivti3` uses `u128_div_rem` (this extra level of function calls exists because
+//!    `__umodti3` and `__udivmodti4` also exist, and `specialized_div_rem` supplies just
+//!    one function to calculate both the quotient and remainder). If configuration flags
+//!    enable it, `impl_trifecta!` defines `u128_div_rem` to use the trifecta algorithm,
+//!    which requires the half sized division `u64_by_u64_div_rem`. If the architecture
+//!    supplies a 64 bit hardware division instruction, `u64_by_u64_div_rem` will be
+//!    reduced to those instructions. Note that we do not specify the half size division
+//!    directly to be `__udivdi3`, because then a hardware division instruction would never
+//!    be used where one is available.
+//!  - If the architecture does not supply a 64 bit hardware division instruction, u64
+//!    divisions will use functions such as `__udivdi3`. This will call `u64_div_rem`
+//!    which is defined by `impl_delegate!`. The half division for this algorithm is
+//!    `u32_by_u32_div_rem` which in turn becomes hardware division instructions or more
+//!    software division algorithms.
+//!  - If the architecture does not supply a 32 bit hardware instruction, linkers will
+//!    look for `__udivsi3`. `impl_binary_long!` is used, but this algorithm uses no half
+//!    division, so the chain of calls ends here.
+//!
+//! On some architectures like x86_64, an asymmetrically sized division is supplied, in
+//! which 128 bit numbers can be divided by 64 bit numbers. `impl_asymmetric!` is used to
+//! extend the 128 by 64 bit division to a full 128 by 128 bit division.
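+//!
+//! As a purely illustrative sketch of the chain described above (for one possible
+//! configuration): a software `u128` division on a 32-bit target without any hardware
+//! division resolves roughly as `__udivti3` -> `u128_div_rem` (delegate) ->
+//! `u64_by_u64_div_rem` -> `__udivdi3` -> `u64_div_rem` (delegate) ->
+//! `u32_by_u32_div_rem` -> `__udivsi3` -> `u32_div_rem` (binary long division).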
+
+// `allow(dead_code)` is used in various places, because the configuration code would otherwise be
+// ridiculously complex
+
+#[macro_use]
+mod norm_shift;
+
+#[macro_use]
+mod binary_long;
+
+#[macro_use]
+mod delegate;
+
+// used on SPARC
+#[allow(unused_imports)]
+#[cfg(not(feature = "unstable-public-internals"))]
+pub(crate) use self::delegate::u128_divide_sparc;
+#[cfg(feature = "unstable-public-internals")]
+pub use self::delegate::u128_divide_sparc;
+
+#[macro_use]
+mod trifecta;
+
+#[macro_use]
+mod asymmetric;
+
+/// The behavior of all divisions by zero is controlled by this function. This function should be
+/// impossible to reach by Rust users, unless `compiler-builtins` public division functions or
+/// `core/std::unchecked_div/rem` are directly used without a zero check in front.
+fn zero_div_fn() -> ! {
+    // Calling the intrinsic directly, to avoid the `assert_unsafe_precondition` that cannot be used
+    // here because it involves non-`inline` functions
+    // (https://github.com/rust-lang/compiler-builtins/issues/491).
+    unsafe { core::intrinsics::unreachable() }
+}
+
+const USE_LZ: bool = {
+    if cfg!(target_arch = "arm") {
+        if cfg!(target_feature = "thumb-mode") {
+            // ARM thumb targets have CLZ instructions if the instruction set of ARMv6T2 is
+            // supported. This is needed to successfully differentiate between targets like
+            // `thumbv8m.base` and `thumbv8m.main`.
+            cfg!(target_feature = "v6t2")
+        } else {
+            // Regular ARM targets have CLZ instructions if the ARMv5TE instruction set is
+            // supported. Technically, ARMv5T was the first to have CLZ, but the "v5t" target
+            // feature does not seem to work.
+            cfg!(target_feature = "v5te")
+        }
+    } else if cfg!(any(target_arch = "sparc", target_arch = "sparc64")) {
+        // LZD or LZCNT on SPARC only exists for the VIS 3 extension and later.
+        cfg!(target_feature = "vis3")
+    } else if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) {
+        // The 'Zbb' Basic Bit-Manipulation extension on RISC-V
+        // determines if a CLZ assembly instruction exists
+        cfg!(target_feature = "zbb")
+    } else {
+        // All other common targets Rust supports should have CLZ instructions
+        true
+    }
+};
+
+impl_normalization_shift!(
+    u32_normalization_shift,
+    USE_LZ,
+    32,
+    u32,
+    i32,
+    allow(dead_code)
+);
+impl_normalization_shift!(
+    u64_normalization_shift,
+    USE_LZ,
+    64,
+    u64,
+    i64,
+    allow(dead_code)
+);
+
+/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
+/// `checked_div` and `checked_rem` are used to avoid bringing in panic function
+/// dependencies.
+#[inline]
+fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) {
+    if let Some(quo) = duo.checked_div(div) {
+        if let Some(rem) = duo.checked_rem(div) {
+            return (quo, rem);
+        }
+    }
+    zero_div_fn()
+}
+
+// Whether `trifecta` or `delegate` is faster for 128 bit division depends on the speed at which a
+// microarchitecture can multiply and divide. We decide to be optimistic and assume `trifecta` is
+// faster if the target pointer width is at least 64. Note that this
+// implementation is additionally included on WebAssembly despite the typical
+// pointer width there being 32 because it's typically run on a 64-bit machine
+// that has access to faster 64-bit operations.
+#[cfg(all(
+    any(
+        target_family = "wasm",
+        not(any(target_pointer_width = "16", target_pointer_width = "32")),
+    ),
+    not(all(not(feature = "no-asm"), target_arch = "x86_64")),
+    not(any(target_arch = "sparc", target_arch = "sparc64"))
+))]
+impl_trifecta!(
+    u128_div_rem,
+    zero_div_fn,
+    u64_by_u64_div_rem,
+    32,
+    u32,
+    u64,
+    u128
+);
+
+// If the pointer width is less than 64 and this isn't wasm, then the target
+// architecture almost certainly does not have the fast 64 to 128 bit widening
+// multiplication needed for `trifecta` to be faster.
+#[cfg(all(
+    not(any(
+        target_family = "wasm",
+        not(any(target_pointer_width = "16", target_pointer_width = "32")),
+    )),
+    not(all(not(feature = "no-asm"), target_arch = "x86_64")),
+    not(any(target_arch = "sparc", target_arch = "sparc64"))
+))]
+impl_delegate!(
+    u128_div_rem,
+    zero_div_fn,
+    u64_normalization_shift,
+    u64_by_u64_div_rem,
+    32,
+    u32,
+    u64,
+    u128,
+    i128
+);
+
+/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
+///
+/// # Safety
+///
+/// If the quotient does not fit in a `u64`, a floating point exception occurs.
+/// If `div == 0`, then a division by zero exception occurs.
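+///
+/// (Given `div != 0`, the quotient fits in a `u64` exactly when `(duo >> 64) < div as u128`,
+/// so callers can check this condition up front.)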
+#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
+#[inline]
+unsafe fn u128_by_u64_div_rem(duo: u128, div: u64) -> (u64, u64) {
+    let duo_lo = duo as u64;
+    let duo_hi = (duo >> 64) as u64;
+    let quo: u64;
+    let rem: u64;
+    unsafe {
+        // divides the combined registers rdx:rax (`duo` is split into two 64 bit parts to do this)
+        // by `div`. The quotient is stored in rax and the remainder in rdx.
+        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
+        core::arch::asm!(
+            "div {0}",
+            in(reg) div,
+            inlateout("rax") duo_lo => quo,
+            inlateout("rdx") duo_hi => rem,
+            options(att_syntax, pure, nomem, nostack)
+        );
+    }
+    (quo, rem)
+}
+
+// use `asymmetric` instead of `trifecta` on x86_64
+#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
+impl_asymmetric!(
+    u128_div_rem,
+    zero_div_fn,
+    u64_by_u64_div_rem,
+    u128_by_u64_div_rem,
+    32,
+    u32,
+    u64,
+    u128
+);
+
+/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
+/// `checked_div` and `checked_rem` are used to avoid bringing in panic function
+/// dependencies.
+#[inline]
+#[allow(dead_code)]
+fn u32_by_u32_div_rem(duo: u32, div: u32) -> (u32, u32) {
+    if let Some(quo) = duo.checked_div(div) {
+        if let Some(rem) = duo.checked_rem(div) {
+            return (quo, rem);
+        }
+    }
+    zero_div_fn()
+}
+
+// When not on x86 and the pointer width is not 64, use `delegate` since the division size is larger
+// than register size.
+#[cfg(all(
+    not(all(not(feature = "no-asm"), target_arch = "x86")),
+    not(target_pointer_width = "64")
+))]
+impl_delegate!(
+    u64_div_rem,
+    zero_div_fn,
+    u32_normalization_shift,
+    u32_by_u32_div_rem,
+    16,
+    u16,
+    u32,
+    u64,
+    i64
+);
+
+// When not on x86 and the pointer width is 64, use `binary_long`.
+#[cfg(all(
+    not(all(not(feature = "no-asm"), target_arch = "x86")),
+    target_pointer_width = "64"
+))]
+impl_binary_long!(
+    u64_div_rem,
+    zero_div_fn,
+    u64_normalization_shift,
+    64,
+    u64,
+    i64
+);
+
+/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
+///
+/// # Safety
+///
+/// If the quotient does not fit in a `u32`, a floating point exception occurs.
+/// If `div == 0`, then a division by zero exception occurs.
+#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
+#[inline]
+unsafe fn u64_by_u32_div_rem(duo: u64, div: u32) -> (u32, u32) {
+    let duo_lo = duo as u32;
+    let duo_hi = (duo >> 32) as u32;
+    let quo: u32;
+    let rem: u32;
+    unsafe {
+        // divides the combined registers edx:eax (`duo` is split into two 32 bit parts to do this)
+        // by `div`. The quotient is stored in eax and the remainder in edx.
+        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
+        core::arch::asm!(
+            "div {0}",
+            in(reg) div,
+            inlateout("rax") duo_lo => quo,
+            inlateout("rdx") duo_hi => rem,
+            options(att_syntax, pure, nomem, nostack)
+        );
+    }
+    (quo, rem)
+}
+
+// use `asymmetric` instead of `delegate` on x86
+#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
+impl_asymmetric!(
+    u64_div_rem,
+    zero_div_fn,
+    u32_by_u32_div_rem,
+    u64_by_u32_div_rem,
+    16,
+    u16,
+    u32,
+    u64
+);
+
+// 32 bits is the smallest division used by `compiler-builtins`, so we end with binary long division
+impl_binary_long!(
+    u32_div_rem,
+    zero_div_fn,
+    u32_normalization_shift,
+    32,
+    u32,
+    i32,
+    allow(dead_code)
+);
diff --git a/library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/norm_shift.rs b/library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/norm_shift.rs
new file mode 100644
index 00000000000..61b67b6bc3d
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/norm_shift.rs
@@ -0,0 +1,106 @@
+/// Creates a function used by some division algorithms to compute the "normalization shift".
+#[allow(unused_macros)]
+macro_rules! impl_normalization_shift {
+    (
+        $name:ident, // name of the normalization shift function
+        // boolean for if `$uX::leading_zeros` should be used (if an architecture does not have a
+        // hardware instruction for `usize::leading_zeros`, then this should be `true`)
+        $use_lz:ident,
+        $n:tt, // the number of bits in a $iX or $uX
+        $uX:ident, // unsigned integer type for the inputs of `$name`
+        $iX:ident, // signed integer type for the inputs of `$name`
+        $($unsigned_attr:meta),* // attributes for the function
+    ) => {
+        /// Finds the shift left that the divisor `div` would need to be normalized for a binary
+        /// long division step with the dividend `duo`. NOTE: This function assumes that these edge
+        /// cases have been handled before reaching it:
+        /// `
+        /// if div == 0 {
+        ///     panic!("attempt to divide by zero")
+        /// }
+        /// if duo < div {
+        ///     return (0, duo)
+        /// }
+        /// `
+        ///
+        /// Normalization is defined as (where `shl` is the output of this function):
+        /// `
+        /// if duo.leading_zeros() != (div << shl).leading_zeros() {
+        ///     // If the most significant bits of `duo` and `div << shl` are not in the same place,
+        ///     // then `div << shl` has one more leading zero than `duo`.
+        ///     assert_eq!(duo.leading_zeros() + 1, (div << shl).leading_zeros());
+        ///     // Also, `2*(div << shl)` is more than `duo` (otherwise the first division step
+        ///     // would not be able to clear the msb of `duo`)
+        ///     assert!(duo < (div << (shl + 1)));
+        /// }
+        /// if full_normalization {
+        ///     // Some algorithms do not need "full" normalization, which means that `duo` is
+        ///     // larger than `div << shl` when the most significant bits are aligned.
+        ///     assert!((div << shl) <= duo);
+        /// }
+        /// `
+        ///
+        /// Note: If the software bisection algorithm is being used in this function, it happens
+        /// that full normalization always occurs, so be careful that new algorithms are not
+        /// invisibly depending on this invariant when `full_normalization` is set to `false`.
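+        ///
+        /// As a small illustrative example (8-bit operands are used only for brevity; this
+        /// module generates the function for larger widths): with `duo = 0b1101_0000` and
+        /// `div = 0b0000_0101`, `div.leading_zeros() - duo.leading_zeros() == 5`, and
+        /// `div << 5 == 0b1010_0000 <= duo`, so `shl == 5` even when `full_normalization`
+        /// is required.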
+        $(
+            #[$unsigned_attr]
+        )*
+        fn $name(duo: $uX, div: $uX, full_normalization: bool) -> usize {
+            // We have to find the leading zeros of `div` to know where its msb (most significant
+            // set bit) is to even begin binary long division. It is also good to know where the msb
+            // of `duo` is so that useful work can be started instead of shifting `div` for all
+            // possible quotients (many division steps are wasted if `duo.leading_zeros()` is large
+            // and `div` starts out being shifted all the way to the msb). Aligning the msbs of
+            // `div` and `duo` could be done by shifting `div` left by
+            // `div.leading_zeros() - duo.leading_zeros()`, but some CPUs without division hardware
+            // also do not have single instructions for calculating `leading_zeros`. Instead of
+            // software doing two bisections to find the two `leading_zeros`, we do one bisection to
+            // find `div.leading_zeros() - duo.leading_zeros()` without actually knowing either of
+            // the leading zeros values.
+
+            let mut shl: usize;
+            if $use_lz {
+                shl = (div.leading_zeros() - duo.leading_zeros()) as usize;
+                if full_normalization {
+                    if duo < (div << shl) {
+                        // when the msb of `duo` and `div` are aligned, the resulting `div` may be
+                        // larger than `duo`, so we decrease the shift by 1.
+                        shl -= 1;
+                    }
+                }
+            } else {
+                let mut test = duo;
+                shl = 0usize;
+                let mut lvl = $n >> 1;
+                loop {
+                    let tmp = test >> lvl;
+                    // It happens that a final `duo < (div << shl)` check is not needed, because the
+                    // `div <= tmp` check ensures that the msb of `test` never passes the msb of
+                    // `div`, and any set bits shifted off the end of `test` would still keep
+                    // `div <= tmp` true.
+                    if div <= tmp {
+                        test = tmp;
+                        shl += lvl;
+                    }
+                    // narrow down bisection
+                    lvl >>= 1;
+                    if lvl == 0 {
+                        break
+                    }
+                }
+            }
+            // tests the invariants that should hold before beginning binary long division
+            /*
+            if full_normalization {
+                assert!((div << shl) <= duo);
+            }
+            if duo.leading_zeros() != (div << shl).leading_zeros() {
+                assert_eq!(duo.leading_zeros() + 1, (div << shl).leading_zeros());
+                assert!(duo < (div << (shl + 1)));
+            }
+            */
+            shl
+        }
+    }
+}
diff --git a/library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/trifecta.rs b/library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/trifecta.rs
new file mode 100644
index 00000000000..7e104053b8b
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/int/specialized_div_rem/trifecta.rs
@@ -0,0 +1,386 @@
+/// Creates an unsigned division function optimized for division of integers with bitwidths
+/// larger than the largest hardware integer division supported. These functions use large radix
+/// division algorithms that require both fast division and very fast widening multiplication on the
+/// target microarchitecture. Otherwise, `impl_delegate` should be used instead.
+#[allow(unused_macros)]
+macro_rules! impl_trifecta {
+    (
+        $fn:ident, // name of the unsigned division function
+        $zero_div_fn:ident, // function called when division by zero is attempted
+        $half_division:ident, // function for division of a $uX by a $uX
+        $n_h:expr, // the number of bits in $iH or $uH
+        $uH:ident, // unsigned integer with half the bit width of $uX
+        $uX:ident, // unsigned integer with half the bit width of $uD
+        $uD:ident // unsigned integer type for the inputs and outputs of `$fn`
+    ) => {
+        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
+        /// tuple.
+        pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) {
+            // This is called the trifecta algorithm because it uses three main algorithms: short
+            // division for small divisors, the two possibility algorithm for large divisors, and an
+            // undersubtracting long division algorithm for intermediate cases.
+
+            // This replicates `carrying_mul` (rust-lang rfc #2417). LLVM correctly optimizes this
+            // to use a widening multiply to 128 bits on the relevant architectures.
+            fn carrying_mul(lhs: $uX, rhs: $uX) -> ($uX, $uX) {
+                let tmp = (lhs as $uD).wrapping_mul(rhs as $uD);
+                (tmp as $uX, (tmp >> ($n_h * 2)) as $uX)
+            }
+            fn carrying_mul_add(lhs: $uX, mul: $uX, add: $uX) -> ($uX, $uX) {
+                let tmp = (lhs as $uD)
+                    .wrapping_mul(mul as $uD)
+                    .wrapping_add(add as $uD);
+                (tmp as $uX, (tmp >> ($n_h * 2)) as $uX)
+            }
+
+            // the number of bits in a $uX
+            let n = $n_h * 2;
+
+            if div == 0 {
+                $zero_div_fn()
+            }
+
+            // Trying to use a normalization shift function will cause inelegancies in the code and
+            // inefficiencies for architectures with a native count leading zeros instruction. The
+            // undersubtracting algorithm needs both values (keeping the original `div_lz` but
+            // updating `duo_lz` multiple times), so we assume hardware support for fast
+            // `leading_zeros` calculation.
+            let div_lz = div.leading_zeros();
+            let mut duo_lz = duo.leading_zeros();
+
+            // the possible ranges of `duo` and `div` at this point:
+            // `0 <= duo < 2^n_d`
+            // `1 <= div < 2^n_d`
+
+            // quotient is 0 or 1 branch
+            if div_lz <= duo_lz {
+                // The quotient cannot be more than 1. The highest set bit of `duo` needs to be at
+                // least one place higher than `div` for the quotient to be more than 1.
+                if duo >= div {
+                    return (1, duo - div);
+                } else {
+                    return (0, duo);
+                }
+            }
+
+            // `_sb` is the number of significant bits (from the ones place to the highest set bit)
+            // `{2, 2^div_sb} <= duo < 2^n_d`
+            // `1 <= div < {2^duo_sb, 2^(n_d - 1)}`
+            // smaller division branch
+            if duo_lz >= n {
+                // `duo < 2^n` so it will fit in a $uX. `div` will also fit in a $uX (because of the
+                // `div_lz <= duo_lz` branch) so no numerical error.
+                let (quo, rem) = $half_division(duo as $uX, div as $uX);
+                return (quo as $uD, rem as $uD);
+            }
+
+            // `{2^n, 2^div_sb} <= duo < 2^n_d`
+            // `1 <= div < {2^duo_sb, 2^(n_d - 1)}`
+            // short division branch
+            if div_lz >= (n + $n_h) {
+                // `1 <= div < {2^duo_sb, 2^n_h}`
+
+                // It is barely possible to improve the performance of this by calculating the
+                // reciprocal and removing one `$half_division`, but only if the CPU can do fast
+                // multiplications in parallel. Other reciprocal based methods can remove two
+                // `$half_division`s, but have multiplications that cannot be done in parallel and
+                // reduce performance. I have decided to use this trivial short division method and
+                // rely on the CPU having quick divisions.
+
+                let duo_hi = (duo >> n) as $uX;
+                let div_0 = div as $uH as $uX;
+                let (quo_hi, rem_3) = $half_division(duo_hi, div_0);
+
+                let duo_mid = ((duo >> $n_h) as $uH as $uX) | (rem_3 << $n_h);
+                let (quo_1, rem_2) = $half_division(duo_mid, div_0);
+
+                let duo_lo = (duo as $uH as $uX) | (rem_2 << $n_h);
+                let (quo_0, rem_1) = $half_division(duo_lo, div_0);
+
+                return (
+                    (quo_0 as $uD) | ((quo_1 as $uD) << $n_h) | ((quo_hi as $uD) << n),
+                    rem_1 as $uD,
+                );
+            }
+
+            // relative leading significant bits, cannot overflow because of above branches
+            let lz_diff = div_lz - duo_lz;
+
+            // `{2^n, 2^div_sb} <= duo < 2^n_d`
+            // `2^n_h <= div < {2^duo_sb, 2^(n_d - 1)}`
+            // `mul` or `mul - 1` branch
+            if lz_diff < $n_h {
+                // Two possibility division algorithm
+
+                // The most significant bits of `duo` and `div` are within `$n_h` bits of each
+                // other. If we take the `n` most significant bits of `duo` and divide them by the
+                // corresponding bits in `div`, it produces a quotient value `quo`. It happens that
+                // `quo` or `quo - 1` will always be the correct quotient for the whole number. In
+                // other words, the bits less significant than the `n` most significant bits of
+                // `duo` and `div` can only influence the quotient to be one of two values.
+                // Because there are only two possibilities, there only needs to be one `$uH` sized
+                // division, a `$uH` by `$uD` multiplication, and only one branch with a few simple
+                // operations.
+                //
+                // Proof that the true quotient can only be `quo` or `quo - 1`.
+                // All `/` operators here are floored divisions.
+                //
+                // `shift` is the number of bits not in the higher `n` significant bits of `duo`.
+                // (definitions)
+                // 0. shift = n - duo_lz
+                // 1. duo_sig_n == duo / 2^shift
+                // 2. div_sig_n == div / 2^shift
+                // 3. quo == duo_sig_n / div_sig_n
+                //
+                //
+                // We are trying to find the true quotient, `true_quo`.
+                // 4. true_quo = duo / div. (definition)
+                //
+                // This is true because of the bits that are cut off during the bit shift.
+                // 5. duo_sig_n * 2^shift <= duo < (duo_sig_n + 1) * 2^shift.
+                // 6. div_sig_n * 2^shift <= div < (div_sig_n + 1) * 2^shift.
+                //
+                // Dividing each bound of (5) by each bound of (6) gives 4 possibilities for what
+                // `true_quo == duo / div` is bounded by:
+                // (duo_sig_n * 2^shift) / (div_sig_n * 2^shift)
+                // (duo_sig_n * 2^shift) / ((div_sig_n + 1) * 2^shift)
+                // ((duo_sig_n + 1) * 2^shift) / (div_sig_n * 2^shift)
+                // ((duo_sig_n + 1) * 2^shift) / ((div_sig_n + 1) * 2^shift)
+                //
+                // Simplifying each of these four:
+                // duo_sig_n / div_sig_n
+                // duo_sig_n / (div_sig_n + 1)
+                // (duo_sig_n + 1) / div_sig_n
+                // (duo_sig_n + 1) / (div_sig_n + 1)
+                //
+                // Taking the smallest and the largest of these as the low and high bounds
+                // and replacing `duo / div` with `true_quo`:
+                // 7. duo_sig_n / (div_sig_n + 1) <= true_quo < (duo_sig_n + 1) / div_sig_n
+                //
+                // The `lz_diff < n_h` conditional on this branch makes sure that `div_sig_n` is at
+                // least `2^n_h`, and the `div_lz <= duo_lz` branch makes sure that the highest bit
+                // of `div_sig_n` is not the `2^(n - 1)` bit.
+                // 8. `2^(n - 1) <= duo_sig_n < 2^n`
+                // 9. `2^n_h <= div_sig_n < 2^(n - 1)`
+                //
+                // We want to prove that either
+                // `(duo_sig_n + 1) / div_sig_n == duo_sig_n / (div_sig_n + 1)` or that
+                // `(duo_sig_n + 1) / div_sig_n == duo_sig_n / (div_sig_n + 1) + 1`.
+                //
+                // We also want to prove that `quo` is one of these:
+                // `duo_sig_n / div_sig_n == duo_sig_n / (div_sig_n + 1)` or
+                // `duo_sig_n / div_sig_n == (duo_sig_n + 1) / div_sig_n`.
+                //
+                // When 1 is added to the numerator of `duo_sig_n / div_sig_n` to produce
+                // `(duo_sig_n + 1) / div_sig_n`, it is not possible that the value increases by
+                // more than 1 with floored integer arithmetic and `div_sig_n != 0`. Consider
+                // `x/y + 1 < (x + 1)/y` <=> `x/y + 1 < x/y + 1/y` <=> `1 < 1/y` <=> `y < 1`.
+                // `div_sig_n` is a nonzero integer. Thus,
+                // 10. `duo_sig_n / div_sig_n == (duo_sig_n + 1) / div_sig_n` or
+                //     `(duo_sig_n / div_sig_n) + 1 == (duo_sig_n + 1) / div_sig_n`.
+                //
+                // When 1 is added to the denominator of `duo_sig_n / div_sig_n` to produce
+                // `duo_sig_n / (div_sig_n + 1)`, it is not possible that the value decreases by
+                // more than 1 with the bounds (8) and (9). Consider `x/y - 1 <= x/(y + 1)` <=>
+                // `(x - y)/y < x/(y + 1)` <=> `(y + 1)*(x - y) < x*y` <=> `x*y - y*y + x - y < x*y`
+                // <=> `x < y*y + y`. The smallest value of `div_sig_n` is `2^n_h` and the largest
+                // value of `duo_sig_n` is `2^n - 1`. Substituting reveals `2^n - 1 < 2^n + 2^n_h`.
+                // Thus,
+                // 11. `duo_sig_n / div_sig_n == duo_sig_n / (div_sig_n + 1)` or
+                //     `(duo_sig_n / div_sig_n) - 1 == duo_sig_n / (div_sig_n + 1)`
+                //
+                // Combining both (10) and (11), we know that
+                // `quo - 1 <= duo_sig_n / (div_sig_n + 1) <= true_quo
+                // < (duo_sig_n + 1) / div_sig_n <= quo + 1` and therefore:
+                // 12. quo - 1 <= true_quo < quo + 1
+                //
+                // In a lot of division algorithms using smaller divisions to construct a larger
+                // division, we often encounter a situation where the approximate `quo` value
+                // calculated from a smaller division is multiple increments away from the true
+                // `quo` value. In those algorithms, multiple correction steps have to be applied.
+                // Those correction steps may need more multiplications to test `duo - (quo*div)`
+                // again. Because of the fact that our `quo` can only be one of two values, we can
+                // see if `duo - (quo*div)` overflows. If it did overflow, then we know that we have
+                // the larger of the two values (since the true quotient is unique, and any larger
+                // quotient will cause `duo - (quo*div)` to be negative). Also because there is only
+                // one correction needed, we can calculate the remainder `duo - (true_quo*div) ==
+                // duo - ((quo - 1)*div) == duo - (quo*div - div) == duo + div - quo*div`.
+                // If `duo - (quo*div)` did not overflow, then we have the correct answer.
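+                // As a small illustrative example (using an 8-bit `$uD` with 4-bit halves only
+                // for brevity): for `duo = 210` and `div = 107`, `shift == 4`, `duo_sig_n == 13`,
+                // `div_sig_n == 6`, and `quo == 13 / 6 == 2`. Since `2 * 107 == 214 > 210`, the
+                // subtraction would overflow, so the true quotient is `quo - 1 == 1` and the
+                // remainder is `210 + 107 - 214 == 103`.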
+                let shift = n - duo_lz;
+                let duo_sig_n = (duo >> shift) as $uX;
+                let div_sig_n = (div >> shift) as $uX;
+                let quo = $half_division(duo_sig_n, div_sig_n).0;
+
+                // When `quo` is the larger of its two candidate values, `quo * div` can overflow
+                // `$uD`, so this is a manual `carrying_mul_add` with overflow checking.
+                let div_lo = div as $uX;
+                let div_hi = (div >> n) as $uX;
+                let (tmp_lo, carry) = carrying_mul(quo, div_lo);
+                let (tmp_hi, overflow) = carrying_mul_add(quo, div_hi, carry);
+                let tmp = (tmp_lo as $uD) | ((tmp_hi as $uD) << n);
+                if (overflow != 0) || (duo < tmp) {
+                    return (
+                        (quo - 1) as $uD,
+                        // Both the addition and subtraction can overflow, but when combined end up
+                        // as a correct positive number.
+                        duo.wrapping_add(div).wrapping_sub(tmp),
+                    );
+                } else {
+                    return (quo as $uD, duo - tmp);
+                }
+            }
+
+            // Undersubtracting long division algorithm.
+            // Instead of clearing a minimum of 1 bit from `duo` per iteration via binary long
+            // division, `n_h - 1` bits are cleared per iteration with this algorithm. It is a more
+            // complicated version of regular long division. Most integer division algorithms tend
+            // to guess a part of the quotient, and may have a larger quotient than the true
+            // quotient (which when multiplied by `div` will "oversubtract" the original dividend).
+            // They then check if the quotient was in fact too large and then have to correct it.
+            // This long division algorithm has been carefully constructed to always underguess the
+            // quotient by slim margins. This allows different subalgorithms to be blindly jumped to
+            // without needing an extra correction step.
+            //
+            // The only problem is that this subalgorithm will not work for many ranges of `duo` and
+            // `div`. Fortunately, the short division, two possibility algorithm, and other simple
+            // cases happen to exactly fill these gaps.
+            //
+            // For an example, consider the division of 76543210 by 213 and assume that `n_h` is
+            // equal to two decimal digits (note: we are working with base 10 here for readability).
+            // The first `sig_n_h` part of the divisor (21) is taken and is incremented by 1 to
+            // prevent oversubtraction. We also record the number of extra places not a part of
+            // the `sig_n` or `sig_n_h` parts.
+            //
+            // sig_n_h == 2 digits, sig_n == 4 digits
+            //
+            // vvvv     <- `duo_sig_n`
+            // 76543210
+            //     ^^^^ <- extra places in duo, `duo_extra == 4`
+            //
+            // vv  <- `div_sig_n_h`
+            // 213
+            //   ^ <- extra places in div, `div_extra == 1`
+            //
+            // The difference in extra places, `duo_extra - div_extra == extra_shl == 3`, is used
+            // for shifting partial sums in the long division.
+            //
+            // In the first step, the first `sig_n` part of duo (7654) is divided by
+            // `div_sig_n_h_add_1` (22), which results in a partial quotient of 347. This is
+            // multiplied by the whole divisor to make 73911, which is shifted left by `extra_shl`
+            // and subtracted from duo. The partial quotient is also shifted left by `extra_shl` to
+            // be added to `quo`.
+            //
+            //    347
+            //  ________
+            // |76543210
+            // -73911
+            //   2632210
+            //
+            // Variables dependent on duo have to be updated:
+            //
+            // vvvv    <- `duo_sig_n == 2632`
+            // 2632210
+            //     ^^^ <- `duo_extra == 3`
+            //
+            // `extra_shl == 2`
+            //
+            // Two more steps are taken after this and then duo fits into `n` bits, and then a final
+            // normal long division step is made. The partial quotients are all progressively added
+            // to each other in the actual algorithm, but here I have left them all in a tower that
+            // can be added together to produce the quotient, 359357.
+            //
+            //        14
+            //       443
+            //     119
+            //    347
+            //  ________
+            // |76543210
+            // -73911
+            //   2632210
+            //  -25347
+            //     97510
+            //    -94359
+            //      3151
+            //     -2982
+            //       169 <- the remainder
+
+            let mut duo = duo;
+            let mut quo: $uD = 0;
+
+            // The number of lesser significant bits not a part of `div_sig_n_h`
+            let div_extra = (n + $n_h) - div_lz;
+
+            // The most significant `n_h` bits of div
+            let div_sig_n_h = (div >> div_extra) as $uH;
+
+            // This needs to be a `$uX` in case of overflow from the increment
+            let div_sig_n_h_add1 = (div_sig_n_h as $uX) + 1;
+
+            // `{2^n, 2^(div_sb + n_h)} <= duo < 2^n_d`
+            // `2^n_h <= div < {2^(duo_sb - n_h), 2^n}`
+            loop {
+                // The number of lesser significant bits not a part of `duo_sig_n`
+                let duo_extra = n - duo_lz;
+
+                // The most significant `n` bits of `duo`
+                let duo_sig_n = (duo >> duo_extra) as $uX;
+
+                // the two possibility algorithm requires that the difference between msbs is less
+                // than `n_h`, so the comparison is `<=` here.
+                if div_extra <= duo_extra {
+                    // Undersubtracting long division step
+                    let quo_part = $half_division(duo_sig_n, div_sig_n_h_add1).0 as $uD;
+                    let extra_shl = duo_extra - div_extra;
+
+                    // Addition to the quotient.
+                    quo += (quo_part << extra_shl);
+
+                    // Subtraction from `duo`. At least `n_h - 1` bits are cleared from `duo` here.
+                    duo -= (div.wrapping_mul(quo_part) << extra_shl);
+                } else {
+                    // Two possibility algorithm
+                    let shift = n - duo_lz;
+                    let duo_sig_n = (duo >> shift) as $uX;
+                    let div_sig_n = (div >> shift) as $uX;
+                    let quo_part = $half_division(duo_sig_n, div_sig_n).0;
+                    let div_lo = div as $uX;
+                    let div_hi = (div >> n) as $uX;
+
+                    let (tmp_lo, carry) = carrying_mul(quo_part, div_lo);
+                    // The undersubtracting long division algorithm has already run once, so
+                    // overflow beyond `$uD` bits is not possible here
+                    let (tmp_hi, _) = carrying_mul_add(quo_part, div_hi, carry);
+                    let tmp = (tmp_lo as $uD) | ((tmp_hi as $uD) << n);
+
+                    if duo < tmp {
+                        return (
+                            quo + ((quo_part - 1) as $uD),
+                            duo.wrapping_add(div).wrapping_sub(tmp),
+                        );
+                    } else {
+                        return (quo + (quo_part as $uD), duo - tmp);
+                    }
+                }
+
+                duo_lz = duo.leading_zeros();
+
+                if div_lz <= duo_lz {
+                    // quotient can have 0 or 1 added to it
+                    if div <= duo {
+                        return (quo + 1, duo - div);
+                    } else {
+                        return (quo, duo);
+                    }
+                }
+
+                // This can only happen if `div_sb < n` (because of previous "quo = 0 or 1"
+                // branches), but it is not worth it to unroll further.
+                if n <= duo_lz {
+                    // simple division and addition
+                    let tmp = $half_division(duo as $uX, div as $uX);
+                    return (quo + (tmp.0 as $uD), tmp.1 as $uD);
+                }
+            }
+        }
+    };
+}
diff --git a/library/compiler-builtins/compiler-builtins/src/int/trailing_zeros.rs b/library/compiler-builtins/compiler-builtins/src/int/trailing_zeros.rs
new file mode 100644
index 00000000000..c45d6b1cfe8
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/int/trailing_zeros.rs
@@ -0,0 +1,69 @@
+#[cfg(feature = "unstable-public-internals")]
+pub use implementation::trailing_zeros;
+#[cfg(not(feature = "unstable-public-internals"))]
+pub(crate) use implementation::trailing_zeros;
+
+mod implementation {
+    use crate::int::{CastInto, Int};
+
+    /// Returns number of trailing binary zeros in `x`.
+    #[allow(dead_code)]
+    pub fn trailing_zeros<T: Int + CastInto<u32> + CastInto<u16> + CastInto<u8>>(x: T) -> usize {
+        let mut x = x;
+        let mut r: u32 = 0;
+        let mut t: u32;
+
+        const { assert!(T::BITS <= 64) };
+        if T::BITS >= 64 {
+            r += ((CastInto::<u32>::cast(x) == 0) as u32) << 5; // if the low 32 bits of x are zero, add 32
+            x >>= r; // shift out the low 32 zero bits, if present
+        }
+
+        if T::BITS >= 32 {
+            t = ((CastInto::<u16>::cast(x) == 0) as u32) << 4; // if the low 16 bits of x are zero, t = 16, else t = 0
+            r += t;
+            x >>= t; // x = [0 - 0xFFFF] + higher garbage bits
+        }
+
+        const { assert!(T::BITS >= 16) };
+        t = ((CastInto::<u8>::cast(x) == 0) as u32) << 3;
+        x >>= t; // x = [0 - 0xFF] + higher garbage bits
+        r += t;
+
+        let mut x: u8 = x.cast();
+
+        t = (((x & 0x0F) == 0) as u32) << 2;
+        x >>= t; // x = [0 - 0xF] + higher garbage bits
+        r += t;
+
+        t = (((x & 0x3) == 0) as u32) << 1;
+        x >>= t; // x = [0 - 0x3] + higher garbage bits
+        r += t;
+
+        x &= 3;
+
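+        // Only the low two bits of `x` remain: if bit 0 is set, nothing more is added; if
+        // `x == 2`, one more trailing zero is added; if `x == 0`, two are added (so an input
+        // of zero reports `T::BITS` trailing zeros).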
+        r as usize + ((2 - (x >> 1) as usize) & (((x & 1) == 0) as usize).wrapping_neg())
+    }
+}
+
+intrinsics! {
+    /// Returns the number of trailing binary zeros in `x` (32 bit version).
+    pub extern "C" fn __ctzsi2(x: u32) -> usize {
+        trailing_zeros(x)
+    }
+
+    /// Returns the number of trailing binary zeros in `x` (64 bit version).
+    pub extern "C" fn __ctzdi2(x: u64) -> usize {
+        trailing_zeros(x)
+    }
+
+    /// Returns the number of trailing binary zeros in `x` (128 bit version).
+    pub extern "C" fn __ctzti2(x: u128) -> usize {
+        let lo = x as u64;
+        if lo == 0 {
+            64 + __ctzdi2((x >> 64) as u64)
+        } else {
+            __ctzdi2(lo)
+        }
+    }
+}
diff --git a/library/compiler-builtins/compiler-builtins/src/int/traits.rs b/library/compiler-builtins/compiler-builtins/src/int/traits.rs
new file mode 100644
index 00000000000..152cb2eee2e
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/int/traits.rs
@@ -0,0 +1,411 @@
+use core::ops;
+
+/// Minimal integer implementations needed on all integer types, including wide integers.
+#[allow(dead_code)]
+pub trait MinInt:
+    Copy
+    + core::fmt::Debug
+    + ops::BitOr<Output = Self>
+    + ops::Not<Output = Self>
+    + ops::Shl<u32, Output = Self>
+{
+    /// Type with the same width but other signedness
+    type OtherSign: MinInt;
+    /// Unsigned version of Self
+    type UnsignedInt: MinInt;
+
+    /// If `Self` is a signed integer
+    const SIGNED: bool;
+
+    /// The bitwidth of the int type
+    const BITS: u32;
+
+    const ZERO: Self;
+    const ONE: Self;
+    const MIN: Self;
+    const MAX: Self;
+}
+
+/// Trait for some basic operations on integers
+#[allow(dead_code)]
+pub trait Int:
+    MinInt
+    + PartialEq
+    + PartialOrd
+    + ops::AddAssign
+    + ops::SubAssign
+    + ops::BitAndAssign
+    + ops::BitOrAssign
+    + ops::BitXorAssign
+    + ops::ShlAssign<i32>
+    + ops::ShrAssign<u32>
+    + ops::Add<Output = Self>
+    + ops::Sub<Output = Self>
+    + ops::Mul<Output = Self>
+    + ops::Div<Output = Self>
+    + ops::Shr<u32, Output = Self>
+    + ops::BitXor<Output = Self>
+    + ops::BitAnd<Output = Self>
+{
+    /// LUT used for maximizing the space covered and minimizing the computational cost of fuzzing
+    /// in `builtins-test`. For example, Self = u128 produces [0,1,2,7,8,15,16,31,32,63,64,95,96,
+    /// 111,112,119,120,125,126,127].
+    const FUZZ_LENGTHS: [u8; 20] = make_fuzz_lengths(<Self as MinInt>::BITS);
+
+    /// The number of entries of `FUZZ_LENGTHS` actually used. The maximum is 20 for u128.
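+    /// For example, `u32` gives `log2 == 5` below and therefore `FUZZ_NUM == 12`.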
+    const FUZZ_NUM: usize = {
+        let log2 = (<Self as MinInt>::BITS - 1).count_ones() as usize;
+        if log2 == 3 {
+            // case for u8
+            6
+        } else {
+            // 3 entries on each extreme, 2 in the middle, and 4 for each scale of intermediate
+            // boundaries.
+            8 + (4 * (log2 - 4))
+        }
+    };
+
+    fn unsigned(self) -> Self::UnsignedInt;
+    fn from_unsigned(unsigned: Self::UnsignedInt) -> Self;
+    fn unsigned_abs(self) -> Self::UnsignedInt;
+
+    fn from_bool(b: bool) -> Self;
+
+    /// Prevents the need for excessive conversions between signed and unsigned
+    fn logical_shr(self, other: u32) -> Self;
+
+    /// Absolute difference between two integers.
+    fn abs_diff(self, other: Self) -> Self::UnsignedInt;
+
+    // copied from primitive integers, but put in a trait
+    fn is_zero(self) -> bool;
+    fn wrapping_neg(self) -> Self;
+    fn wrapping_add(self, other: Self) -> Self;
+    fn wrapping_mul(self, other: Self) -> Self;
+    fn wrapping_sub(self, other: Self) -> Self;
+    fn wrapping_shl(self, other: u32) -> Self;
+    fn wrapping_shr(self, other: u32) -> Self;
+    fn rotate_left(self, other: u32) -> Self;
+    fn overflowing_add(self, other: Self) -> (Self, bool);
+    fn leading_zeros(self) -> u32;
+    fn ilog2(self) -> u32;
+}
+
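+// As a worked example (computed by hand, not part of the original comments):
+// `make_fuzz_lengths(32)` fills its first `FUZZ_NUM == 12` entries with
+// `[0, 1, 2, 7, 8, 15, 16, 23, 24, 29, 30, 31]`; the remaining entries stay zero and are unused.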
+pub(crate) const fn make_fuzz_lengths(bits: u32) -> [u8; 20] {
+    let mut v = [0u8; 20];
+    v[0] = 0;
+    v[1] = 1;
+    v[2] = 2; // important for parity and the iX::MIN case when reversed
+    let mut i = 3;
+
+    // No need for any more until the byte boundary, because there should be no algorithms
+    // that are sensitive to anything not next to byte boundaries after 2. We also scale
+    // in powers of two, which is important to prevent u128 corner tests from getting too
+    // big.
+    let mut l = 8;
+    loop {
+        if l >= ((bits / 2) as u8) {
+            break;
+        }
+        // get both sides of the byte boundary
+        v[i] = l - 1;
+        i += 1;
+        v[i] = l;
+        i += 1;
+        l *= 2;
+    }
+
+    if bits != 8 {
+        // add the lower side of the middle boundary
+        v[i] = ((bits / 2) - 1) as u8;
+        i += 1;
+    }
+
+    // We do not want to jump directly from the Self::BITS/2 boundary to the Self::BITS
+    // boundary because of algorithms that split the high part up. We reverse the scaling
+    // as we go to Self::BITS.
+    let mid = i;
+    let mut j = 1;
+    loop {
+        v[i] = (bits as u8) - (v[mid - j]) - 1;
+        if j == mid {
+            break;
+        }
+        i += 1;
+        j += 1;
+    }
+    v
+}
+
+macro_rules! int_impl_common {
+    ($ty:ty) => {
+        fn from_bool(b: bool) -> Self {
+            b as $ty
+        }
+
+        fn logical_shr(self, other: u32) -> Self {
+            Self::from_unsigned(self.unsigned().wrapping_shr(other))
+        }
+
+        fn is_zero(self) -> bool {
+            self == Self::ZERO
+        }
+
+        fn wrapping_neg(self) -> Self {
+            <Self>::wrapping_neg(self)
+        }
+
+        fn wrapping_add(self, other: Self) -> Self {
+            <Self>::wrapping_add(self, other)
+        }
+
+        fn wrapping_mul(self, other: Self) -> Self {
+            <Self>::wrapping_mul(self, other)
+        }
+        fn wrapping_sub(self, other: Self) -> Self {
+            <Self>::wrapping_sub(self, other)
+        }
+
+        fn wrapping_shl(self, other: u32) -> Self {
+            <Self>::wrapping_shl(self, other)
+        }
+
+        fn wrapping_shr(self, other: u32) -> Self {
+            <Self>::wrapping_shr(self, other)
+        }
+
+        fn rotate_left(self, other: u32) -> Self {
+            <Self>::rotate_left(self, other)
+        }
+
+        fn overflowing_add(self, other: Self) -> (Self, bool) {
+            <Self>::overflowing_add(self, other)
+        }
+
+        fn leading_zeros(self) -> u32 {
+            <Self>::leading_zeros(self)
+        }
+
+        fn ilog2(self) -> u32 {
+            <Self>::ilog2(self)
+        }
+    };
+}
+
+macro_rules! int_impl {
+    ($ity:ty, $uty:ty) => {
+        impl MinInt for $uty {
+            type OtherSign = $ity;
+            type UnsignedInt = $uty;
+
+            const BITS: u32 = <Self as MinInt>::ZERO.count_zeros();
+            const SIGNED: bool = Self::MIN != Self::ZERO;
+
+            const ZERO: Self = 0;
+            const ONE: Self = 1;
+            const MIN: Self = <Self>::MIN;
+            const MAX: Self = <Self>::MAX;
+        }
+
+        impl Int for $uty {
+            fn unsigned(self) -> $uty {
+                self
+            }
+
+            // It makes writing macros easier if this is implemented for both signed and unsigned
+            #[allow(clippy::wrong_self_convention)]
+            fn from_unsigned(me: $uty) -> Self {
+                me
+            }
+
+            fn unsigned_abs(self) -> Self {
+                self
+            }
+
+            fn abs_diff(self, other: Self) -> Self {
+                self.abs_diff(other)
+            }
+
+            int_impl_common!($uty);
+        }
+
+        impl MinInt for $ity {
+            type OtherSign = $uty;
+            type UnsignedInt = $uty;
+
+            const BITS: u32 = <Self as MinInt>::ZERO.count_zeros();
+            const SIGNED: bool = Self::MIN != Self::ZERO;
+
+            const ZERO: Self = 0;
+            const ONE: Self = 1;
+            const MIN: Self = <Self>::MIN;
+            const MAX: Self = <Self>::MAX;
+        }
+
+        impl Int for $ity {
+            fn unsigned(self) -> $uty {
+                self as $uty
+            }
+
+            fn from_unsigned(me: $uty) -> Self {
+                me as $ity
+            }
+
+            fn unsigned_abs(self) -> Self::UnsignedInt {
+                self.unsigned_abs()
+            }
+
+            fn abs_diff(self, other: Self) -> $uty {
+                self.abs_diff(other)
+            }
+
+            int_impl_common!($ity);
+        }
+    };
+}
+
+int_impl!(isize, usize);
+int_impl!(i8, u8);
+int_impl!(i16, u16);
+int_impl!(i32, u32);
+int_impl!(i64, u64);
+int_impl!(i128, u128);
+
+/// Trait for integers twice the bit width of another integer. This is implemented for all
+/// primitives except for `u8`, because there is not a smaller primitive.
+pub trait DInt: MinInt {
+    /// Integer that is half the bit width of the integer this trait is implemented for
+    type H: HInt<D = Self>;
+
+    /// Returns the low half of `self`
+    fn lo(self) -> Self::H;
+    /// Returns the high half of `self`
+    fn hi(self) -> Self::H;
+    /// Returns the low and high halves of `self` as a tuple
+    fn lo_hi(self) -> (Self::H, Self::H) {
+        (self.lo(), self.hi())
+    }
+    /// Constructs an integer using lower and higher half parts
+    fn from_lo_hi(lo: Self::H, hi: Self::H) -> Self {
+        lo.zero_widen() | hi.widen_hi()
+    }
+}
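+
+// For example (illustrative only): `0x1234_5678_u32.lo_hi() == (0x5678_u16, 0x1234_u16)`, and
+// `u32::from_lo_hi(0x5678, 0x1234)` reconstructs `0x1234_5678`.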
+
+/// Trait for integers half the bit width of another integer. This is implemented for all
+/// primitives except for `u128`, because there is not a larger primitive.
+pub trait HInt: Int {
+    /// Integer that is double the bit width of the integer this trait is implemented for
+    type D: DInt<H = Self> + MinInt;
+
+    // NB: some of the below methods could have default implementations (e.g. `widen_hi`), but for
+    // unknown reasons this can cause infinite recursion when optimizations are disabled. See
+    // <https://github.com/rust-lang/compiler-builtins/pull/707> for context.
+
+    /// Widens (using default extension) the integer to have double bit width
+    fn widen(self) -> Self::D;
+    /// Widens (zero extension only) the integer to have double bit width. This is needed to get
+    /// around problems with associated type bounds (such as `Int<OtherSign: DInt>`) being unstable
+    fn zero_widen(self) -> Self::D;
+    /// Widens the integer to have double bit width and shifts the integer into the higher bits
+    fn widen_hi(self) -> Self::D;
+    /// Widening multiplication with zero widening. This cannot overflow.
+    fn zero_widen_mul(self, rhs: Self) -> Self::D;
+    /// Widening multiplication. This cannot overflow.
+    fn widen_mul(self, rhs: Self) -> Self::D;
+}
+
+macro_rules! impl_d_int {
+    ($($X:ident $D:ident),*) => {
+        $(
+            impl DInt for $D {
+                type H = $X;
+
+                fn lo(self) -> Self::H {
+                    self as $X
+                }
+                fn hi(self) -> Self::H {
+                    (self >> <$X as MinInt>::BITS) as $X
+                }
+            }
+        )*
+    };
+}
+
+macro_rules! impl_h_int {
+    ($($H:ident $uH:ident $X:ident),*) => {
+        $(
+            impl HInt for $H {
+                type D = $X;
+
+                fn widen(self) -> Self::D {
+                    self as $X
+                }
+                fn zero_widen(self) -> Self::D {
+                    (self as $uH) as $X
+                }
+                fn zero_widen_mul(self, rhs: Self) -> Self::D {
+                    self.zero_widen().wrapping_mul(rhs.zero_widen())
+                }
+                fn widen_mul(self, rhs: Self) -> Self::D {
+                    self.widen().wrapping_mul(rhs.widen())
+                }
+                fn widen_hi(self) -> Self::D {
+                    (self as $X) << <Self as MinInt>::BITS
+                }
+            }
+        )*
+    };
+}
+
+impl_d_int!(u8 u16, u16 u32, u32 u64, u64 u128, i8 i16, i16 i32, i32 i64, i64 i128);
+impl_h_int!(
+    u8 u8 u16,
+    u16 u16 u32,
+    u32 u32 u64,
+    u64 u64 u128,
+    i8 u8 i16,
+    i16 u16 i32,
+    i32 u32 i64,
+    i64 u64 i128
+);
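+
+// Illustrative sketch (not part of the original patch): a few assertions showing the half/double
+// width semantics defined above. The module name and values are examples only.
+#[cfg(test)]
+mod widen_sketch {
+    use super::{DInt, HInt};
+
+    #[test]
+    fn lo_hi_and_widen() {
+        // `lo`/`hi` split a value at the half-width boundary.
+        assert_eq!(0xAABB_CCDD_u32.lo_hi(), (0xCCDD_u16, 0xAABB_u16));
+        // `widen_hi` places the value in the upper half of the doubled type.
+        assert_eq!(0x00FF_u16.widen_hi(), 0x00FF_0000_u32);
+        // `zero_widen_mul` cannot overflow: the full product fits in the doubled width.
+        assert_eq!(0xFF_u8.zero_widen_mul(0xFF_u8), 0xFE01_u16);
+        // `from_lo_hi` reassembles the halves.
+        assert_eq!(u32::from_lo_hi(0xCCDD, 0xAABB), 0xAABB_CCDD);
+    }
+}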
+
+/// Trait to express (possibly lossy) casting of integers
+pub trait CastInto<T: Copy>: Copy {
+    fn cast(self) -> T;
+}
+
+pub trait CastFrom<T: Copy>: Copy {
+    fn cast_from(value: T) -> Self;
+}
+
+impl<T: Copy, U: CastInto<T> + Copy> CastFrom<U> for T {
+    fn cast_from(value: U) -> Self {
+        value.cast()
+    }
+}
+
+macro_rules! cast_into {
+    ($ty:ty) => {
+        cast_into!($ty; usize, isize, u8, i8, u16, i16, u32, i32, u64, i64, u128, i128);
+    };
+    ($ty:ty; $($into:ty),*) => {$(
+        impl CastInto<$into> for $ty {
+            fn cast(self) -> $into {
+                self as $into
+            }
+        }
+    )*};
+}
+
+cast_into!(usize);
+cast_into!(isize);
+cast_into!(u8);
+cast_into!(i8);
+cast_into!(u16);
+cast_into!(i16);
+cast_into!(u32);
+cast_into!(i32);
+cast_into!(u64);
+cast_into!(i64);
+cast_into!(u128);
+cast_into!(i128);
diff --git a/library/compiler-builtins/compiler-builtins/src/int/udiv.rs b/library/compiler-builtins/compiler-builtins/src/int/udiv.rs
new file mode 100644
index 00000000000..b9dee63c4cc
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/int/udiv.rs
@@ -0,0 +1,199 @@
+#[cfg(not(feature = "unstable-public-internals"))]
+pub(crate) use crate::int::specialized_div_rem::*;
+#[cfg(feature = "unstable-public-internals")]
+pub use crate::int::specialized_div_rem::*;
+
+intrinsics! {
+    #[maybe_use_optimized_c_shim]
+    #[arm_aeabi_alias = __aeabi_uidiv]
+    /// Returns `n / d`
+    pub extern "C" fn __udivsi3(n: u32, d: u32) -> u32 {
+        u32_div_rem(n, d).0
+    }
+
+    #[maybe_use_optimized_c_shim]
+    /// Returns `n % d`
+    pub extern "C" fn __umodsi3(n: u32, d: u32) -> u32 {
+        u32_div_rem(n, d).1
+    }
+}
+
+#[cfg(not(target_arch = "avr"))]
+intrinsics! {
+    #[maybe_use_optimized_c_shim]
+    /// Returns `n / d` and sets `*rem = n % d`
+    pub extern "C" fn __udivmodsi4(n: u32, d: u32, rem: Option<&mut u32>) -> u32 {
+        let quo_rem = u32_div_rem(n, d);
+        if let Some(rem) = rem {
+            *rem = quo_rem.1;
+        }
+        quo_rem.0
+    }
+}
+
+#[cfg(target_arch = "avr")]
+intrinsics! {
+    /// Returns `n / d` and `n % d` packed together.
+    ///
+    /// Ideally we'd use `-> (u32, u32)` or some kind of a packed struct, but
+    /// both force a stack allocation, while our result has to be in R18:R26.
+    pub extern "C" fn __udivmodsi4(n: u32, d: u32) -> u64 {
+        let (div, rem) = u32_div_rem(n, d);
+
+        ((rem as u64) << 32) | (div as u64)
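+        // The caller recovers the quotient from the low 32 bits and the remainder from the
+        // high 32 bits of the returned `u64`.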
+    }
+
+    #[unsafe(naked)]
+    pub unsafe extern "C" fn __udivmodqi4() {
+        // compute unsigned 8-bit `n / d` and `n % d`.
+        //
+        // Note: GCC implements a [non-standard calling convention](https://gcc.gnu.org/wiki/avr-gcc#Exceptions_to_the_Calling_Convention) for this function.
+        // Inputs:
+        //     R24: dividend
+        //     R22: divisor
+        // Outputs:
+        //     R24: quotient  (dividend / divisor)
+        //     R25: remainder (dividend % divisor)
+        // Clobbers:
+        //     R23: loop counter
+        core::arch::naked_asm!(
+            // This assembly routine implements the [long division](https://en.wikipedia.org/wiki/Division_algorithm#Long_division) algorithm.
+            // Bits shift out of the dividend and into the quotient, so R24 is used for both.
+            "clr R25",      // remainder = 0
+
+            "ldi R23, 8",   // for each bit
+            "1:",
+            "lsl R24",      //     shift the dividend MSb
+            "rol R25",      //     into the remainder LSb
+
+            "cp  R25, R22", //     if remainder >= divisor
+            "brlo 2f",
+            "sub R25, R22", //         remainder -= divisor
+            "sbr R24, 1",   //         quotient |= 1
+            "2:",
+
+            "dec R23",      // end loop
+            "brne 1b",
+            "ret",
+        );
+    }
+
+    #[unsafe(naked)]
+    pub unsafe extern "C" fn __udivmodhi4() {
+        // compute unsigned 16-bit `n / d` and `n % d`.
+        //
+        // Note: GCC implements a [non-standard calling convention](https://gcc.gnu.org/wiki/avr-gcc#Exceptions_to_the_Calling_Convention) for this function.
+        // Inputs:
+        //     R24: dividend [low]
+        //     R25: dividend [high]
+        //     R22: divisor [low]
+        //     R23: divisor [high]
+        // Outputs:
+        //     R22: quotient [low]  (dividend / divisor)
+        //     R23: quotient [high]
+        //     R24: remainder [low] (dividend % divisor)
+        //     R25: remainder [high]
+        // Clobbers:
+        //     R21: loop counter
+        //     R26: divisor [low]
+        //     R27: divisor [high]
+        core::arch::naked_asm!(
+            // This assembly routine implements the [long division](https://en.wikipedia.org/wiki/Division_algorithm#Long_division) algorithm.
+            // Bits shift out of the dividend and into the quotient, so R24+R25 are used for both.
+            "mov R26, R22",     // move divisor to make room for quotient
+            "mov R27, R23",
+            "mov R22, R24",     // move dividend to output location (becomes quotient)
+            "mov R23, R25",
+            "clr R24",          // remainder = 0
+            "clr R25",
+
+            "ldi R21, 16",      // for each bit
+            "1:",
+            "lsl R22",          //     shift the dividend MSb
+            "rol R23",
+            "rol R24",          //     into the remainder LSb
+            "rol R25",
+
+            "cp  R24, R26",     //     if remainder >= divisor
+            "cpc R25, R27",
+            "brlo 2f",
+            "sub R24, R26",     //         remainder -= divisor
+            "sbc R25, R27",
+            "sbr R22, 1",       //         quotient |= 1
+            "2:",
+
+            "dec R21",          // end loop
+            "brne 1b",
+            "ret",
+        );
+    }
+
+}
+
+intrinsics! {
+    #[maybe_use_optimized_c_shim]
+    /// Returns `n / d`
+    pub extern "C" fn __udivdi3(n: u64, d: u64) -> u64 {
+        u64_div_rem(n, d).0
+    }
+
+    #[maybe_use_optimized_c_shim]
+    /// Returns `n % d`
+    pub extern "C" fn __umoddi3(n: u64, d: u64) -> u64 {
+        u64_div_rem(n, d).1
+    }
+
+    #[maybe_use_optimized_c_shim]
+    /// Returns `n / d` and sets `*rem = n % d`
+    pub extern "C" fn __udivmoddi4(n: u64, d: u64, rem: Option<&mut u64>) -> u64 {
+        let quo_rem = u64_div_rem(n, d);
+        if let Some(rem) = rem {
+            *rem = quo_rem.1;
+        }
+        quo_rem.0
+    }
+
+    // Note: we use block configuration and not `if cfg!(...)`, because we need to entirely disable
+    // the existence of `u128_div_rem` to get 32-bit SPARC to compile, see `u128_divide_sparc` docs.
+
+    /// Returns `n / d`
+    pub extern "C" fn __udivti3(n: u128, d: u128) -> u128 {
+        #[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))] {
+            u128_div_rem(n, d).0
+        }
+        #[cfg(any(target_arch = "sparc", target_arch = "sparc64"))] {
+            u128_divide_sparc(n, d, &mut 0)
+        }
+    }
+
+    /// Returns `n % d`
+    pub extern "C" fn __umodti3(n: u128, d: u128) -> u128 {
+        #[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))] {
+            u128_div_rem(n, d).1
+        }
+        #[cfg(any(target_arch = "sparc", target_arch = "sparc64"))] {
+            let mut rem = 0;
+            u128_divide_sparc(n, d, &mut rem);
+            rem
+        }
+    }
+
+    /// Returns `n / d` and sets `*rem = n % d`
+    pub extern "C" fn __udivmodti4(n: u128, d: u128, rem: Option<&mut u128>) -> u128 {
+        #[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))] {
+            let quo_rem = u128_div_rem(n, d);
+            if let Some(rem) = rem {
+                *rem = quo_rem.1;
+            }
+            quo_rem.0
+        }
+        #[cfg(any(target_arch = "sparc", target_arch = "sparc64"))] {
+            let mut tmp = 0;
+            let quo = u128_divide_sparc(n, d, &mut tmp);
+            if let Some(rem) = rem {
+                *rem = tmp;
+            }
+            quo
+        }
+    }
+}
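+
+// Reference sketch (not part of the original patch): the restoring long-division loop that the
+// AVR `__udivmodqi4` assembly above implements, written out in Rust and checked against the
+// primitive operators. Names here are illustrative only.
+#[cfg(test)]
+mod long_division_sketch {
+    fn u8_div_rem(mut dividend: u8, divisor: u8) -> (u8, u8) {
+        let mut remainder: u8 = 0;
+        for _ in 0..8 {
+            // Shift the dividend MSb into the remainder LSb; the freed low bit of `dividend`
+            // doubles as the next quotient bit, just like R24 in the assembly.
+            let msb = dividend >> 7;
+            dividend <<= 1;
+            remainder = (remainder << 1) | msb;
+            if remainder >= divisor {
+                remainder -= divisor;
+                dividend |= 1;
+            }
+        }
+        (dividend, remainder)
+    }
+
+    #[test]
+    fn matches_primitive_ops() {
+        for (n, d) in [(0u8, 1u8), (7, 3), (200, 7), (255, 255)] {
+            assert_eq!(u8_div_rem(n, d), (n / d, n % d));
+        }
+    }
+}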
diff --git a/library/compiler-builtins/compiler-builtins/src/lib.miri.rs b/library/compiler-builtins/compiler-builtins/src/lib.miri.rs
new file mode 100644
index 00000000000..17288058e5e
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/lib.miri.rs
@@ -0,0 +1,5 @@
+//! Grep bootstrap for `MIRI_REPLACE_LIBRS_IF_NOT_TEST` to learn what this is about.
+#![no_std]
+#![feature(rustc_private)]
+extern crate compiler_builtins as real;
+pub use real::*;
diff --git a/library/compiler-builtins/compiler-builtins/src/lib.rs b/library/compiler-builtins/compiler-builtins/src/lib.rs
new file mode 100644
index 00000000000..6a6b28067e8
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/lib.rs
@@ -0,0 +1,84 @@
+#![cfg_attr(feature = "compiler-builtins", compiler_builtins)]
+#![cfg_attr(all(target_family = "wasm"), feature(wasm_numeric_instr))]
+#![feature(abi_unadjusted)]
+#![feature(asm_experimental_arch)]
+#![feature(cfg_target_has_atomic)]
+#![feature(compiler_builtins)]
+#![feature(core_intrinsics)]
+#![feature(linkage)]
+#![feature(naked_functions)]
+#![feature(repr_simd)]
+#![cfg_attr(f16_enabled, feature(f16))]
+#![cfg_attr(f128_enabled, feature(f128))]
+#![no_builtins]
+#![no_std]
+#![allow(unused_features)]
+#![allow(internal_features)]
+// We use `u128` in a whole bunch of places where we currently agree with the
+// compiler on ABIs and such, so we should be "good enough" for now; changes
+// to the `u128` ABI will be reflected here.
+#![allow(improper_ctypes, improper_ctypes_definitions)]
+// `mem::swap` cannot be used because it may generate references to memcpy in unoptimized code.
+#![allow(clippy::manual_swap)]
+// Support compiling on both stage0 and stage1 which may differ in supported stable features.
+#![allow(stable_features)]
+// By default, disallow this as it is forbidden in edition 2024. There is a lot of unsafe code to
+// be migrated, however, so exceptions exist.
+#![warn(unsafe_op_in_unsafe_fn)]
+
+// We disable #[no_mangle] for tests so that we can verify the test results
+// against the native compiler-rt implementations of the builtins.
+
+// NOTE cfg(all(feature = "c", ..)) indicates that compiler-rt provides an arch-optimized
+// implementation of that intrinsic and we'll prefer to use that
+
+// NOTE(aapcs, aeabi, arm) ARM targets use intrinsics named __aeabi_* instead of the intrinsics
+// that follow "x86 naming convention" (e.g. addsf3). Those aeabi intrinsics must adhere to the
+// AAPCS calling convention (`extern "aapcs"`) because that's how LLVM will call them.
+
+#[cfg(test)]
+extern crate core;
+
+#[macro_use]
+mod macros;
+
+pub mod float;
+pub mod int;
+pub mod math;
+pub mod mem;
+
+// `libm` expects its `support` module to be available in the crate root.
+use math::libm_math::support;
+
+#[cfg(target_arch = "arm")]
+pub mod arm;
+
+#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))]
+pub mod aarch64;
+
+#[cfg(all(target_arch = "aarch64", target_os = "linux", not(feature = "no-asm"),))]
+pub mod aarch64_linux;
+
+#[cfg(all(
+    kernel_user_helpers,
+    any(target_os = "linux", target_os = "android"),
+    target_arch = "arm"
+))]
+pub mod arm_linux;
+
+#[cfg(target_arch = "avr")]
+pub mod avr;
+
+#[cfg(target_arch = "hexagon")]
+pub mod hexagon;
+
+#[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
+pub mod riscv;
+
+#[cfg(target_arch = "x86")]
+pub mod x86;
+
+#[cfg(target_arch = "x86_64")]
+pub mod x86_64;
+
+pub mod probestack;
diff --git a/library/compiler-builtins/compiler-builtins/src/macros.rs b/library/compiler-builtins/compiler-builtins/src/macros.rs
new file mode 100644
index 00000000000..22e0dd27f2f
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/macros.rs
@@ -0,0 +1,486 @@
+//! Macros shared throughout the compiler-builtins implementation
+
+/// The "main macro" used for defining intrinsics.
+///
+/// The compiler-builtins library is super platform-specific with tons of crazy
+/// little tweaks for various platforms. As a result it *could* involve a lot of
+/// #[cfg] and macro soup, but the intention is that this macro alleviates a lot
+/// of that complexity. Ideally this macro has all the weird ABI things
+/// platforms need and elsewhere in this library it just looks like normal Rust
+/// code.
+///
+/// All intrinsic functions are marked with #[linkage = "weak"] when
+/// `not(windows) and not(target_vendor = "apple")`.
+/// The `weak` linkage attribute is used so that these functions can be replaced
+/// by another implementation at link time. This is particularly useful for mixed
+/// Rust/C++ binaries that want to use the C++ intrinsics; otherwise, linking against
+/// the Rust stdlib would replace the ones provided by the compiler-rt library.
+///
+/// This macro is structured to be invoked with a bunch of functions that look
+/// like:
+/// ```ignore
+///     intrinsics! {
+///         pub extern "C" fn foo(a: i32) -> u32 {
+///             // ...
+///         }
+///
+///         #[nonstandard_attribute]
+///         pub extern "C" fn bar(a: i32) -> u32 {
+///             // ...
+///         }
+///     }
+/// ```
+///
+/// Each function is defined in a manner that looks like a normal Rust function.
+/// The macro then accepts a few nonstandard attributes that can decorate
+/// various functions. Each of the attributes is documented below with what it
+/// can do, and each of them slightly tweaks how further expansion happens.
+///
+/// A quick overview of attributes supported right now are:
+///
+/// * `maybe_use_optimized_c_shim` - indicates that the Rust implementation is
+///   ignored if an optimized C version was compiled.
+/// * `aapcs_on_arm` - forces the ABI of the function to be `"aapcs"` on ARM and
+///   the specified ABI everywhere else.
+/// * `unadjusted_on_win64` - like `aapcs_on_arm` this switches to the
+///   `"unadjusted"` abi on Win64 and the specified abi elsewhere.
+/// * `arm_aeabi_alias` - handles the "aliasing" of various intrinsics on ARM from
+///   their otherwise typical names to the `__aeabi_*`-prefixed ones.
+/// * `ppc_alias` - changes the name of the symbol on PowerPC platforms without
+///   changing any other behavior. This is mostly for `f128`, which is `tf` on
+///   most platforms but `kf` on PowerPC.
+macro_rules! intrinsics {
+    () => ();
+
+    // Support cfg_attr:
+    (
+        #[cfg_attr($e:meta, $($attr:tt)*)]
+        $(#[$($attrs:tt)*])*
+        pub extern $abi:tt fn $name:ident( $($argname:ident: $ty:ty),* ) $(-> $ret:ty)? {
+            $($body:tt)*
+        }
+        $($rest:tt)*
+    ) => (
+        #[cfg($e)]
+        intrinsics! {
+            #[$($attr)*]
+            $(#[$($attrs)*])*
+            pub extern $abi fn $name($($argname: $ty),*) $(-> $ret)? {
+                $($body)*
+            }
+        }
+
+        #[cfg(not($e))]
+        intrinsics! {
+            $(#[$($attrs)*])*
+            pub extern $abi fn $name($($argname: $ty),*) $(-> $ret)? {
+                $($body)*
+            }
+        }
+
+        intrinsics!($($rest)*);
+    );
+    // Same as above but for unsafe.
+    (
+        #[cfg_attr($e:meta, $($attr:tt)*)]
+        $(#[$($attrs:tt)*])*
+        pub unsafe extern $abi:tt fn $name:ident( $($argname:ident: $ty:ty),* ) $(-> $ret:ty)? {
+            $($body:tt)*
+        }
+        $($rest:tt)*
+    ) => (
+        #[cfg($e)]
+        intrinsics! {
+            #[$($attr)*]
+            $(#[$($attrs)*])*
+            pub unsafe extern $abi fn $name($($argname: $ty),*) $(-> $ret)? {
+                $($body)*
+            }
+        }
+
+        #[cfg(not($e))]
+        intrinsics! {
+            $(#[$($attrs)*])*
+            pub unsafe extern $abi fn $name($($argname: $ty),*) $(-> $ret)? {
+                $($body)*
+            }
+        }
+
+        intrinsics!($($rest)*);
+    );
+
+    // Right now there's a bunch of architecture-optimized intrinsics in the
+    // stock compiler-rt implementation. Not all of these have been ported over
+    // to Rust yet so when the `c` feature of this crate is enabled we fall back
+    // to the architecture-specific versions which should be more optimized. The
+    // purpose of this macro is to easily allow specifying this.
+    //
+    // The `#[maybe_use_optimized_c_shim]` attribute indicates that this
+    // intrinsic may have an optimized C version. In these situations the build
+    // script, if the C code is enabled and compiled, will emit a cfg directive
+    // to get passed to rustc for our compilation. If that cfg is set we skip the
+    // Rust implementation and forward to the C symbol instead; if it is not set
+    // then we compile in the Rust implementation below.
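+    // For example (illustration only): when the build script compiles an optimized C
+    // `__udivsi3`, it sets the `__udivsi3 = "optimized-c"` cfg and the shim branch below
+    // forwards to that C symbol.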
+    (
+        #[maybe_use_optimized_c_shim]
+        $(#[$($attr:tt)*])*
+        pub $(unsafe $(@ $empty:tt)? )? extern $abi:tt fn $name:ident( $($argname:ident:  $ty:ty),* ) $(-> $ret:ty)? {
+            $($body:tt)*
+        }
+
+        $($rest:tt)*
+    ) => (
+        #[cfg($name = "optimized-c")]
+        pub $(unsafe $($empty)? )? extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? {
+            extern $abi {
+                fn $name($($argname: $ty),*) $(-> $ret)?;
+            }
+            unsafe {
+                $name($($argname),*)
+            }
+        }
+
+        #[cfg(not($name = "optimized-c"))]
+        intrinsics! {
+            $(#[$($attr)*])*
+            pub $(unsafe $($empty)? )? extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? {
+                $($body)*
+            }
+        }
+
+        intrinsics!($($rest)*);
+    );
+
+    // We recognize the `#[aapcs_on_arm]` attribute here and generate the
+    // same intrinsic but force it to have the `"aapcs"` calling convention on
+    // ARM and `"C"` elsewhere.
+    (
+        #[aapcs_on_arm]
+        $(#[$($attr:tt)*])*
+        pub extern $abi:tt fn $name:ident( $($argname:ident:  $ty:ty),* ) $(-> $ret:ty)? {
+            $($body:tt)*
+        }
+
+        $($rest:tt)*
+    ) => (
+        #[cfg(target_arch = "arm")]
+        intrinsics! {
+            $(#[$($attr)*])*
+            pub extern "aapcs" fn $name( $($argname: $ty),* ) $(-> $ret)? {
+                $($body)*
+            }
+        }
+
+        #[cfg(not(target_arch = "arm"))]
+        intrinsics! {
+            $(#[$($attr)*])*
+            pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? {
+                $($body)*
+            }
+        }
+
+        intrinsics!($($rest)*);
+    );
+
+    // Like aapcs above we recognize an attribute for the "unadjusted" abi on
+    // win64 for some methods.
+    (
+        #[unadjusted_on_win64]
+        $(#[$($attr:tt)*])*
+        pub extern $abi:tt fn $name:ident( $($argname:ident:  $ty:ty),* ) $(-> $ret:ty)? {
+            $($body:tt)*
+        }
+
+        $($rest:tt)*
+    ) => (
+        #[cfg(all(any(windows, target_os = "cygwin", all(target_os = "uefi", target_arch = "x86_64")), target_pointer_width = "64"))]
+        intrinsics! {
+            $(#[$($attr)*])*
+            pub extern "unadjusted" fn $name( $($argname: $ty),* ) $(-> $ret)? {
+                $($body)*
+            }
+        }
+
+        #[cfg(not(all(any(windows, target_os = "cygwin", all(target_os = "uefi", target_arch = "x86_64")), target_pointer_width = "64")))]
+        intrinsics! {
+            $(#[$($attr)*])*
+            pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? {
+                $($body)*
+            }
+        }
+
+        intrinsics!($($rest)*);
+    );
+
+    // `arm_aeabi_alias` would conflict with `f16_apple_{arg,ret}_abi` if not handled here. Avoid
+    // macro ambiguity by combining them in a single `#[]`.
+    (
+        #[apple_f16_arg_abi]
+        #[arm_aeabi_alias = $alias:ident]
+        $($t:tt)*
+    ) => {
+        intrinsics! {
+            #[apple_f16_arg_abi, arm_aeabi_alias = $alias]
+            $($t)*
+        }
+    };
+    (
+        #[apple_f16_ret_abi]
+        #[arm_aeabi_alias = $alias:ident]
+        $($t:tt)*
+    ) => {
+        intrinsics! {
+            #[apple_f16_ret_abi, arm_aeabi_alias = $alias]
+            $($t)*
+        }
+    };
+
+    // On x86 (32-bit and 64-bit) Apple platforms, `f16` is passed and returned like a `u16` unless
+    // the builtin involves `f128`.
+    (
+        // `arm_aeabi_alias` would conflict if not handled here. Avoid macro ambiguity by combining
+        // in a single `#[]`.
+        #[apple_f16_arg_abi $(, arm_aeabi_alias = $alias:ident)?]
+        $(#[$($attr:tt)*])*
+        pub extern $abi:tt fn $name:ident( $($argname:ident:  $ty:ty),* ) $(-> $ret:ty)? {
+            $($body:tt)*
+        }
+
+        $($rest:tt)*
+    ) => (
+        #[cfg(all(target_vendor = "apple", any(target_arch = "x86", target_arch = "x86_64")))]
+        $(#[$($attr)*])*
+        pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? {
+            $($body)*
+        }
+
+        #[cfg(all(target_vendor = "apple", any(target_arch = "x86", target_arch = "x86_64"), not(feature = "mangled-names")))]
+        mod $name {
+            #[unsafe(no_mangle)]
+            #[cfg_attr(not(any(all(windows, target_env = "gnu"), target_os = "cygwin")), linkage = "weak")]
+            $(#[$($attr)*])*
+            extern $abi fn $name( $($argname: u16),* ) $(-> $ret)? {
+                super::$name($(f16::from_bits($argname)),*)
+            }
+        }
+
+        #[cfg(not(all(target_vendor = "apple", any(target_arch = "x86", target_arch = "x86_64"))))]
+        intrinsics! {
+            $(#[arm_aeabi_alias = $alias])?
+            $(#[$($attr)*])*
+            pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? {
+                $($body)*
+            }
+        }
+
+        intrinsics!($($rest)*);
+    );
+    (
+        #[apple_f16_ret_abi $(, arm_aeabi_alias = $alias:ident)?]
+        $(#[$($attr:tt)*])*
+        pub extern $abi:tt fn $name:ident( $($argname:ident:  $ty:ty),* ) $(-> $ret:ty)? {
+            $($body:tt)*
+        }
+
+        $($rest:tt)*
+    ) => (
+        #[cfg(all(target_vendor = "apple", any(target_arch = "x86", target_arch = "x86_64")))]
+        $(#[$($attr)*])*
+        pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? {
+            $($body)*
+        }
+
+        #[cfg(all(target_vendor = "apple", any(target_arch = "x86", target_arch = "x86_64"), not(feature = "mangled-names")))]
+        mod $name {
+            #[unsafe(no_mangle)]
+            #[cfg_attr(not(any(all(windows, target_env = "gnu"), target_os = "cygwin")), linkage = "weak")]
+            $(#[$($attr)*])*
+            extern $abi fn $name( $($argname: $ty),* ) -> u16 {
+                super::$name($($argname),*).to_bits()
+            }
+        }
+
+        #[cfg(not(all(target_vendor = "apple", any(target_arch = "x86", target_arch = "x86_64"))))]
+        intrinsics! {
+            $(#[arm_aeabi_alias = $alias])?
+            $(#[$($attr)*])*
+            pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? {
+                $($body)*
+            }
+        }
+
+        intrinsics!($($rest)*);
+    );
+
+    // A bunch of intrinsics on ARM are aliased in the standard compiler-rt
+    // build under `__aeabi_*` aliases, and LLVM will call these instead of the
+    // original function. The aliasing here is used to generate these symbols in
+    // the object file.
+    (
+        #[arm_aeabi_alias = $alias:ident]
+        $(#[$($attr:tt)*])*
+        pub extern $abi:tt fn $name:ident( $($argname:ident:  $ty:ty),* ) $(-> $ret:ty)? {
+            $($body:tt)*
+        }
+
+        $($rest:tt)*
+    ) => (
+        #[cfg(target_arch = "arm")]
+        $(#[$($attr)*])*
+        pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? {
+            $($body)*
+        }
+
+        #[cfg(all(target_arch = "arm", not(feature = "mangled-names")))]
+        mod $name {
+            #[unsafe(no_mangle)]
+            #[cfg_attr(not(any(all(windows, target_env = "gnu"), target_os = "cygwin")), linkage = "weak")]
+            $(#[$($attr)*])*
+            extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? {
+                super::$name($($argname),*)
+            }
+        }
+
+        #[cfg(all(target_arch = "arm", not(feature = "mangled-names")))]
+        mod $alias {
+            #[unsafe(no_mangle)]
+            #[cfg_attr(not(any(all(windows, target_env = "gnu"), target_os = "cygwin")), linkage = "weak")]
+            $(#[$($attr)*])*
+            extern "aapcs" fn $alias( $($argname: $ty),* ) $(-> $ret)? {
+                super::$name($($argname),*)
+            }
+        }
+
+        #[cfg(not(target_arch = "arm"))]
+        intrinsics! {
+            $(#[$($attr)*])*
+            pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? {
+                $($body)*
+            }
+        }
+
+        intrinsics!($($rest)*);
+    );
+
+    // PowerPC usually uses `kf` rather than `tf` for `f128`. This is just an easy
+    // way to add an alias on those targets.
+    (
+        #[ppc_alias = $alias:ident]
+        $(#[$($attr:tt)*])*
+        pub extern $abi:tt fn $name:ident( $($argname:ident:  $ty:ty),* ) $(-> $ret:ty)? {
+            $($body:tt)*
+        }
+
+        $($rest:tt)*
+    ) => (
+        #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+        intrinsics! {
+            $(#[$($attr)*])*
+            pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? {
+                $($body)*
+            }
+        }
+
+        #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+        intrinsics! {
+            $(#[$($attr)*])*
+            pub extern $abi fn $alias( $($argname: $ty),* ) $(-> $ret)? {
+                $($body)*
+            }
+        }
+
+        intrinsics!($($rest)*);
+    );
+
+    // C mem* functions are only generated when the "mem" feature is enabled.
+    (
+        #[mem_builtin]
+        $(#[$($attr:tt)*])*
+        pub unsafe extern $abi:tt fn $name:ident( $($argname:ident:  $ty:ty),* ) $(-> $ret:ty)? {
+            $($body:tt)*
+        }
+
+        $($rest:tt)*
+    ) => (
+        $(#[$($attr)*])*
+        pub unsafe extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? {
+            $($body)*
+        }
+
+        #[cfg(all(feature = "mem", not(feature = "mangled-names")))]
+        mod $name {
+            $(#[$($attr)*])*
+            #[unsafe(no_mangle)]
+            #[cfg_attr(not(any(all(windows, target_env = "gnu"), target_os = "cygwin")), linkage = "weak")]
+            unsafe extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? {
+                super::$name($($argname),*)
+            }
+        }
+
+        intrinsics!($($rest)*);
+    );
+
+    // Naked functions are special: we can't generate wrappers for them since
+    // they use a custom calling convention.
+    (
+        #[unsafe(naked)]
+        $(#[$($attr:tt)*])*
+        pub unsafe extern $abi:tt fn $name:ident( $($argname:ident:  $ty:ty),* ) $(-> $ret:ty)? {
+            $($body:tt)*
+        }
+
+        $($rest:tt)*
+    ) => (
+        // `#[naked]` definitions are referenced by other places, so we can't use `cfg` like the others
+        pub mod $name {
+            #[unsafe(naked)]
+            $(#[$($attr)*])*
+            #[cfg_attr(not(feature = "mangled-names"), no_mangle)]
+            #[cfg_attr(not(any(all(windows, target_env = "gnu"), target_os = "cygwin")), linkage = "weak")]
+            pub unsafe extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? {
+                $($body)*
+            }
+        }
+
+        intrinsics!($($rest)*);
+    );
+
+    // This is the final catch-all rule. At this point we generate an
+    // intrinsic with a conditional `#[no_mangle]` directive to avoid
+    // interfering with duplicate symbols and whatnot during testing.
+    //
+    // The implementation is placed in a separate module, to take advantage
+    // of the fact that rustc partitions functions into code generation
+    // units based on the module they are defined in. As a result we will have
+    // a separate object file for each intrinsic. For further details see the
+    // corresponding PR in rustc: https://github.com/rust-lang/rust/pull/70846
+    //
+    // After the intrinsic is defined we just continue with the rest of the
+    // input we were given.
+    (
+        $(#[$($attr:tt)*])*
+        pub $(unsafe $(@ $empty:tt)?)? extern $abi:tt fn $name:ident( $($argname:ident:  $ty:ty),* ) $(-> $ret:ty)? {
+            $($body:tt)*
+        }
+
+        $($rest:tt)*
+    ) => (
+        $(#[$($attr)*])*
+        pub $(unsafe $($empty)?)? extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? {
+            $($body)*
+        }
+
+        #[cfg(not(feature = "mangled-names"))]
+        mod $name {
+            $(#[$($attr)*])*
+            #[unsafe(no_mangle)]
+            #[cfg_attr(not(any(all(windows, target_env = "gnu"), target_os = "cygwin")), linkage = "weak")]
+            $(unsafe $($empty)?)? extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? {
+                // SAFETY: same preconditions.
+                $(unsafe $($empty)?)? { super::$name($($argname),*) }
+            }
+        }
+
+        intrinsics!($($rest)*);
+    );
+}
diff --git a/library/compiler-builtins/compiler-builtins/src/math/mod.rs b/library/compiler-builtins/compiler-builtins/src/math/mod.rs
new file mode 100644
index 00000000000..62d72967410
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/math/mod.rs
@@ -0,0 +1,199 @@
+#[rustfmt::skip]
+#[allow(dead_code)]
+#[allow(unused_imports)]
+#[allow(clippy::all)]
+#[path = "../../../libm/src/math/mod.rs"]
+pub(crate) mod libm_math;
+
+macro_rules! libm_intrinsics {
+    ($(fn $fun:ident($($iid:ident : $ity:ty),+) -> $oty:ty;)+) => {
+        intrinsics! {
+            $(
+                pub extern "C" fn $fun($($iid: $ity),+) -> $oty {
+                    $crate::math::libm_math::$fun($($iid),+)
+                }
+            )+
+        }
+    }
+}
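+
+// For example (illustration only): `libm_intrinsics! { fn sqrt(x: f64) -> f64; }` expands to an
+// `intrinsics!` item whose body forwards to `crate::math::libm_math::sqrt(x)`.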
+
+/// This set of functions is well tested in `libm` and known to provide similar performance to
+/// system `libm`, as well as the same or better accuracy.
+pub mod full_availability {
+    #[cfg(f16_enabled)]
+    libm_intrinsics! {
+        fn ceilf16(x: f16) -> f16;
+        fn copysignf16(x: f16, y: f16) -> f16;
+        fn fabsf16(x: f16) -> f16;
+        fn fdimf16(x: f16, y: f16) -> f16;
+        fn floorf16(x: f16) -> f16;
+        fn fmaxf16(x: f16, y: f16) -> f16;
+        fn fmaximumf16(x: f16, y: f16) -> f16;
+        fn fminf16(x: f16, y: f16) -> f16;
+        fn fminimumf16(x: f16, y: f16) -> f16;
+        fn fmodf16(x: f16, y: f16) -> f16;
+        fn rintf16(x: f16) -> f16;
+        fn roundevenf16(x: f16) -> f16;
+        fn roundf16(x: f16) -> f16;
+        fn sqrtf16(x: f16) -> f16;
+        fn truncf16(x: f16) -> f16;
+    }
+
+    /* Weak linkage is unreliable on Windows and Apple, so we don't expose symbols that we know
+     * the system libc provides in order to avoid conflicts. */
+
+    #[cfg(all(not(windows), not(target_vendor = "apple")))]
+    libm_intrinsics! {
+        /* f32 */
+        fn cbrtf(n: f32) -> f32;
+        fn ceilf(x: f32) -> f32;
+        fn copysignf(x: f32, y: f32) -> f32;
+        fn fabsf(x: f32) -> f32;
+        fn fdimf(a: f32, b: f32) -> f32;
+        fn floorf(x: f32) -> f32;
+        fn fmaf(x: f32, y: f32, z: f32) -> f32;
+        fn fmaxf(x: f32, y: f32) -> f32;
+        fn fminf(x: f32, y: f32) -> f32;
+        fn fmodf(x: f32, y: f32) -> f32;
+        fn rintf(x: f32) -> f32;
+        fn roundf(x: f32) -> f32;
+        fn sqrtf(x: f32) -> f32;
+        fn truncf(x: f32) -> f32;
+
+        /* f64 */
+        fn cbrt(x: f64) -> f64;
+        fn ceil(x: f64) -> f64;
+        fn copysign(x: f64, y: f64) -> f64;
+        fn fabs(x: f64) -> f64;
+        fn fdim(a: f64, b: f64) -> f64;
+        fn floor(x: f64) -> f64;
+        fn fma(x: f64, y: f64, z: f64) -> f64;
+        fn fmax(x: f64, y: f64) -> f64;
+        fn fmin(x: f64, y: f64) -> f64;
+        fn fmod(x: f64, y: f64) -> f64;
+        fn rint(x: f64) -> f64;
+        fn round(x: f64) -> f64;
+        fn sqrt(x: f64) -> f64;
+        fn trunc(x: f64) -> f64;
+    }
+
+    // Windows and macOS do not yet expose roundeven and IEEE 754-2019 `maximum` / `minimum`,
+    // however, so we still provide a fallback.
+    libm_intrinsics! {
+        fn fmaximum(x: f64, y: f64) -> f64;
+        fn fmaximumf(x: f32, y: f32) -> f32;
+        fn fminimum(x: f64, y: f64) -> f64;
+        fn fminimumf(x: f32, y: f32) -> f32;
+        fn roundeven(x: f64) -> f64;
+        fn roundevenf(x: f32) -> f32;
+    }
+
+    #[cfg(f128_enabled)]
+    libm_intrinsics! {
+        fn ceilf128(x: f128) -> f128;
+        fn copysignf128(x: f128, y: f128) -> f128;
+        fn fabsf128(x: f128) -> f128;
+        fn fdimf128(x: f128, y: f128) -> f128;
+        fn floorf128(x: f128) -> f128;
+        fn fmaf128(x: f128, y: f128, z: f128) -> f128;
+        fn fmaxf128(x: f128, y: f128) -> f128;
+        fn fmaximumf128(x: f128, y: f128) -> f128;
+        fn fminf128(x: f128, y: f128) -> f128;
+        fn fminimumf128(x: f128, y: f128) -> f128;
+        fn fmodf128(x: f128, y: f128) -> f128;
+        fn rintf128(x: f128) -> f128;
+        fn roundevenf128(x: f128) -> f128;
+        fn roundf128(x: f128) -> f128;
+        fn sqrtf128(x: f128) -> f128;
+        fn truncf128(x: f128) -> f128;
+    }
+}
+
+/// These functions have more performance or precision issues than the system versions, or
+/// are otherwise less well tested. Provide them only on platforms that have problems with the
+/// system `libm`.
+///
+/// As `libm` improves, more functions will be moved from this group to the first group.
+///
+/// Do not supply them for any of the following:
+/// - x86 without sse2 due to ABI issues
+///   - <https://github.com/rust-lang/rust/issues/114479>
+///   - but exclude UEFI since it is a soft-float target
+///     - <https://github.com/rust-lang/rust/issues/128533>
+/// - All unix targets (linux, macos, freebsd, android, etc)
+/// - wasm with known target_os
+#[cfg(not(any(
+    all(
+        target_arch = "x86",
+        not(target_feature = "sse2"),
+        not(target_os = "uefi"),
+    ),
+    unix,
+    all(target_family = "wasm", not(target_os = "unknown"))
+)))]
+pub mod partial_availability {
+    #[cfg(not(windows))]
+    libm_intrinsics! {
+        fn acos(x: f64) -> f64;
+        fn acosf(n: f32) -> f32;
+        fn asin(x: f64) -> f64;
+        fn asinf(n: f32) -> f32;
+        fn atan(x: f64) -> f64;
+        fn atan2(x: f64, y: f64) -> f64;
+        fn atan2f(a: f32, b: f32) -> f32;
+        fn atanf(n: f32) -> f32;
+        fn cos(x: f64) -> f64;
+        fn cosf(x: f32) -> f32;
+        fn cosh(x: f64) -> f64;
+        fn coshf(n: f32) -> f32;
+        fn erf(x: f64) -> f64;
+        fn erfc(x: f64) -> f64;
+        fn erfcf(x: f32) -> f32;
+        fn erff(x: f32) -> f32;
+        fn exp(x: f64) -> f64;
+        fn exp2(x: f64) -> f64;
+        fn exp2f(x: f32) -> f32;
+        fn expf(x: f32) -> f32;
+        fn expm1(x: f64) -> f64;
+        fn expm1f(n: f32) -> f32;
+        fn hypot(x: f64, y: f64) -> f64;
+        fn hypotf(x: f32, y: f32) -> f32;
+        fn ldexp(f: f64, n: i32) -> f64;
+        fn ldexpf(f: f32, n: i32) -> f32;
+        fn log(x: f64) -> f64;
+        fn log10(x: f64) -> f64;
+        fn log10f(x: f32) -> f32;
+        fn log1p(x: f64) -> f64;
+        fn log1pf(n: f32) -> f32;
+        fn log2(x: f64) -> f64;
+        fn log2f(x: f32) -> f32;
+        fn logf(x: f32) -> f32;
+        fn pow(x: f64, y: f64) -> f64;
+        fn powf(x: f32, y: f32) -> f32;
+        fn sin(x: f64) -> f64;
+        fn sinf(x: f32) -> f32;
+        fn sinh(x: f64) -> f64;
+        fn sinhf(n: f32) -> f32;
+        fn tan(x: f64) -> f64;
+        fn tanf(n: f32) -> f32;
+        fn tanh(x: f64) -> f64;
+        fn tanhf(n: f32) -> f32;
+        fn tgamma(x: f64) -> f64;
+        fn tgammaf(x: f32) -> f32;
+    }
+
+    // Unlike the group above, these are also provided on Windows (and any other remaining targets).
+    intrinsics! {
+        pub extern "C" fn lgamma_r(x: f64, s: &mut i32) -> f64 {
+            let r = super::libm_math::lgamma_r(x);
+            *s = r.1;
+            r.0
+        }
+
+        pub extern "C" fn lgammaf_r(x: f32, s: &mut i32) -> f32 {
+            let r = super::libm_math::lgammaf_r(x);
+            *s = r.1;
+            r.0
+        }
+    }
+}
diff --git a/library/compiler-builtins/compiler-builtins/src/mem/impls.rs b/library/compiler-builtins/compiler-builtins/src/mem/impls.rs
new file mode 100644
index 00000000000..14a4787485d
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/mem/impls.rs
@@ -0,0 +1,408 @@
+// In C and Rust it is UB to read or write to usize::MAX because if an allocation extends to the
+// last byte of address space (there must be an allocation to do the read or write), in C computing
+// its one-past-the-end pointer would be equal to NULL and in Rust computing the address of a
+// trailing ZST member with a safe place projection would wrap (place projection address computation
+// is non-wrapping).
+//
+// However, some embedded systems have special memory at usize::MAX, and need to access that
+// memory. If they do that with the intrinsics provided by compiler-builtins (such as memcpy!), the
+// ptr::add in these loops will wrap. And if compiler-builtins is compiled with cfg(ub_checks),
+// this will fail a UB check at runtime.
+//
+// Since this scenario is UB, we are within our rights to hit this check and halt execution...
+// But we are also within our rights to try to make it work.
+// We use wrapping_add/wrapping_sub for pointer arithmetic in this module in an attempt to support
+// this use. Of course this is not a guarantee that such use will work; it just means that, if
+// something does go wrong at runtime, the cause won't be this crate doing wrapping pointer
+// arithmetic with a method that must not wrap.
+use core::intrinsics::likely;
+
+const WORD_SIZE: usize = core::mem::size_of::<usize>();
+const WORD_MASK: usize = WORD_SIZE - 1;
+
+// If the number of bytes involved exceeds this threshold we opt into word-wise copy.
+// The value selected here is max(2 * WORD_SIZE, 16):
+// * We need at least 2 * WORD_SIZE bytes to guarantee that at least 1 word will be copied through
+//   word-wise copy.
+// * The word-wise copy logic needs to perform some checks, so it has a small fixed overhead. The
+//   minimum of 16 ensures that even on 32-bit platforms we have copied at least 8 bytes through
+//   word-wise copy, so the savings outweigh that fixed overhead.
+const WORD_COPY_THRESHOLD: usize = if 2 * WORD_SIZE > 16 {
+    2 * WORD_SIZE
+} else {
+    16
+};
+
+#[cfg(feature = "mem-unaligned")]
+unsafe fn read_usize_unaligned(x: *const usize) -> usize {
+    // Do not use `core::ptr::read_unaligned` here, since it calls `copy_nonoverlapping` which
+    // is translated to memcpy in LLVM.
+    let x_read = (x as *const [u8; core::mem::size_of::<usize>()]).read();
+    usize::from_ne_bytes(x_read)
+}
+
+/// Loads a `T`-sized chunk from `src` into `dst` at offset `offset`, if that does not exceed
+/// `load_sz`. The offset pointers must both be `T`-aligned. Returns the new offset, advanced by the
+/// chunk size if a load happened.
+#[cfg(not(feature = "mem-unaligned"))]
+#[inline(always)]
+unsafe fn load_chunk_aligned<T: Copy>(
+    src: *const usize,
+    dst: *mut usize,
+    load_sz: usize,
+    offset: usize,
+) -> usize {
+    let chunk_sz = core::mem::size_of::<T>();
+    if (load_sz & chunk_sz) != 0 {
+        *dst.wrapping_byte_add(offset).cast::<T>() = *src.wrapping_byte_add(offset).cast::<T>();
+        offset | chunk_sz
+    } else {
+        offset
+    }
+}
+
+/// Load `load_sz` many bytes from `src`, which must be usize-aligned. Acts as if we did a `usize`
+/// read with the out-of-bounds part filled with 0s.
+/// `load_sz` must be strictly less than `WORD_SIZE`.
+#[cfg(not(feature = "mem-unaligned"))]
+#[inline(always)]
+unsafe fn load_aligned_partial(src: *const usize, load_sz: usize) -> usize {
+    debug_assert!(load_sz < WORD_SIZE);
+    // We can read up to 7 bytes here, which is enough for WORD_SIZE of 8
+    // (since `load_sz < WORD_SIZE`).
+    const { assert!(WORD_SIZE <= 8) };
+
+    let mut i = 0;
+    let mut out = 0usize;
+    // We load in decreasing order, so the pointers remain sufficiently aligned for the next step.
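+    // For example (illustration only): `load_sz == 5` (0b101) loads a `u32` chunk, skips the
+    // `u16` chunk, and finishes with a `u8` chunk, covering exactly 5 bytes.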
+    i = load_chunk_aligned::<u32>(src, &raw mut out, load_sz, i);
+    i = load_chunk_aligned::<u16>(src, &raw mut out, load_sz, i);
+    i = load_chunk_aligned::<u8>(src, &raw mut out, load_sz, i);
+    debug_assert!(i == load_sz);
+    out
+}
+
+/// Load `load_sz` many bytes from `src.wrapping_byte_add(WORD_SIZE - load_sz)`. `src` must be
+/// `usize`-aligned. The bytes are returned as the *last* bytes of the return value, i.e., this acts
+/// as if we had done a `usize` read from `src`, with the out-of-bounds part filled with 0s.
+/// `load_sz` must be strictly less than `WORD_SIZE`.
+#[cfg(not(feature = "mem-unaligned"))]
+#[inline(always)]
+unsafe fn load_aligned_end_partial(src: *const usize, load_sz: usize) -> usize {
+    debug_assert!(load_sz < WORD_SIZE);
+    // We can read up to 7 bytes here, which is enough for WORD_SIZE of 8
+    // (since `load_sz < WORD_SIZE`).
+    const { assert!(WORD_SIZE <= 8) };
+
+    let mut i = 0;
+    let mut out = 0usize;
+    // Obtain pointers pointing to the beginning of the range we want to load.
+    let src_shifted = src.wrapping_byte_add(WORD_SIZE - load_sz);
+    let out_shifted = (&raw mut out).wrapping_byte_add(WORD_SIZE - load_sz);
+    // We load in increasing order, so by the time we reach `u16` things are 2-aligned etc.
+    i = load_chunk_aligned::<u8>(src_shifted, out_shifted, load_sz, i);
+    i = load_chunk_aligned::<u16>(src_shifted, out_shifted, load_sz, i);
+    i = load_chunk_aligned::<u32>(src_shifted, out_shifted, load_sz, i);
+    debug_assert!(i == load_sz);
+    out
+}
+
+#[inline(always)]
+pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, mut n: usize) {
+    #[inline(always)]
+    unsafe fn copy_forward_bytes(mut dest: *mut u8, mut src: *const u8, n: usize) {
+        let dest_end = dest.wrapping_add(n);
+        while dest < dest_end {
+            *dest = *src;
+            dest = dest.wrapping_add(1);
+            src = src.wrapping_add(1);
+        }
+    }
+
+    #[inline(always)]
+    unsafe fn copy_forward_aligned_words(dest: *mut u8, src: *const u8, n: usize) {
+        let mut dest_usize = dest as *mut usize;
+        let mut src_usize = src as *mut usize;
+        let dest_end = dest.wrapping_add(n) as *mut usize;
+
+        while dest_usize < dest_end {
+            *dest_usize = *src_usize;
+            dest_usize = dest_usize.wrapping_add(1);
+            src_usize = src_usize.wrapping_add(1);
+        }
+    }
+
+    /// `n` is in units of bytes, but must be a multiple of the word size and must not be 0.
+    /// `src` *must not* be `usize`-aligned.
+    #[cfg(not(feature = "mem-unaligned"))]
+    #[inline(always)]
+    unsafe fn copy_forward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) {
+        debug_assert!(n > 0 && n % WORD_SIZE == 0);
+        debug_assert!(src.addr() % WORD_SIZE != 0);
+
+        let mut dest_usize = dest as *mut usize;
+        let dest_end = dest.wrapping_add(n) as *mut usize;
+
+        // Calculate the misalignment offset and shift needed to reassemble value.
+        // Since `src` is definitely not aligned, `offset` is in the range 1..WORD_SIZE.
+        let offset = src as usize & WORD_MASK;
+        let shift = offset * 8;
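+        // For example (illustration only): with `WORD_SIZE == 8` and `offset == 3`, `shift` is 24,
+        // so on little-endian each output word combines the upper 5 bytes of `prev_word` with the
+        // lower 3 bytes of `cur_word`.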
+
+        // Realign src
+        let mut src_aligned = src.wrapping_byte_sub(offset) as *mut usize;
+        let mut prev_word = load_aligned_end_partial(src_aligned, WORD_SIZE - offset);
+
+        while dest_usize.wrapping_add(1) < dest_end {
+            src_aligned = src_aligned.wrapping_add(1);
+            let cur_word = *src_aligned;
+            let reassembled = if cfg!(target_endian = "little") {
+                prev_word >> shift | cur_word << (WORD_SIZE * 8 - shift)
+            } else {
+                prev_word << shift | cur_word >> (WORD_SIZE * 8 - shift)
+            };
+            prev_word = cur_word;
+
+            *dest_usize = reassembled;
+            dest_usize = dest_usize.wrapping_add(1);
+        }
+
+        // There's one more element left to go, and we can't use the loop for that as on the `src` side,
+        // it is partially out-of-bounds.
+        src_aligned = src_aligned.wrapping_add(1);
+        let cur_word = load_aligned_partial(src_aligned, offset);
+        let reassembled = if cfg!(target_endian = "little") {
+            prev_word >> shift | cur_word << (WORD_SIZE * 8 - shift)
+        } else {
+            prev_word << shift | cur_word >> (WORD_SIZE * 8 - shift)
+        };
+        // prev_word does not matter any more
+
+        *dest_usize = reassembled;
+        // dest_usize does not matter any more
+    }
+
+    /// `n` is in units of bytes, but must be a multiple of the word size and must not be 0.
+    /// `src` *must not* be `usize`-aligned.
+    #[cfg(feature = "mem-unaligned")]
+    #[inline(always)]
+    unsafe fn copy_forward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) {
+        let mut dest_usize = dest as *mut usize;
+        let mut src_usize = src as *mut usize;
+        let dest_end = dest.wrapping_add(n) as *mut usize;
+
+        while dest_usize < dest_end {
+            *dest_usize = read_usize_unaligned(src_usize);
+            dest_usize = dest_usize.wrapping_add(1);
+            src_usize = src_usize.wrapping_add(1);
+        }
+    }
+
+    if n >= WORD_COPY_THRESHOLD {
+        // Align dest
+        // Because of n >= 2 * WORD_SIZE, dst_misalignment < n
+        let dest_misalignment = (dest as usize).wrapping_neg() & WORD_MASK;
+        copy_forward_bytes(dest, src, dest_misalignment);
+        dest = dest.wrapping_add(dest_misalignment);
+        src = src.wrapping_add(dest_misalignment);
+        n -= dest_misalignment;
+
+        let n_words = n & !WORD_MASK;
+        let src_misalignment = src as usize & WORD_MASK;
+        if likely(src_misalignment == 0) {
+            copy_forward_aligned_words(dest, src, n_words);
+        } else {
+            copy_forward_misaligned_words(dest, src, n_words);
+        }
+        dest = dest.wrapping_add(n_words);
+        src = src.wrapping_add(n_words);
+        n -= n_words;
+    }
+    copy_forward_bytes(dest, src, n);
+}
+
+#[inline(always)]
+pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, mut n: usize) {
+    // The following backward copy helper functions use pointers past the end
+    // as their inputs instead of pointers to the start!
+    #[inline(always)]
+    unsafe fn copy_backward_bytes(mut dest: *mut u8, mut src: *const u8, n: usize) {
+        let dest_start = dest.wrapping_sub(n);
+        while dest_start < dest {
+            dest = dest.wrapping_sub(1);
+            src = src.wrapping_sub(1);
+            *dest = *src;
+        }
+    }
+
+    #[inline(always)]
+    unsafe fn copy_backward_aligned_words(dest: *mut u8, src: *const u8, n: usize) {
+        let mut dest_usize = dest as *mut usize;
+        let mut src_usize = src as *mut usize;
+        let dest_start = dest.wrapping_sub(n) as *mut usize;
+
+        while dest_start < dest_usize {
+            dest_usize = dest_usize.wrapping_sub(1);
+            src_usize = src_usize.wrapping_sub(1);
+            *dest_usize = *src_usize;
+        }
+    }
+
+    /// `n` is in units of bytes, but must be a multiple of the word size and must not be 0.
+    /// `src` *must not* be `usize`-aligned.
+    #[cfg(not(feature = "mem-unaligned"))]
+    #[inline(always)]
+    unsafe fn copy_backward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) {
+        debug_assert!(n > 0 && n % WORD_SIZE == 0);
+        debug_assert!(src.addr() % WORD_SIZE != 0);
+
+        let mut dest_usize = dest as *mut usize;
+        let dest_start = dest.wrapping_sub(n) as *mut usize; // we're moving towards the start
+
+        // Calculate the misalignment offset and shift needed to reassemble value.
+        // Since `src` is definitely not aligned, `offset` is in the range 1..WORD_SIZE.
+        let offset = src as usize & WORD_MASK;
+        let shift = offset * 8;
+
+        // Realign src
+        let mut src_aligned = src.wrapping_byte_sub(offset) as *mut usize;
+        let mut prev_word = load_aligned_partial(src_aligned, offset);
+
+        while dest_start.wrapping_add(1) < dest_usize {
+            src_aligned = src_aligned.wrapping_sub(1);
+            let cur_word = *src_aligned;
+            let reassembled = if cfg!(target_endian = "little") {
+                prev_word << (WORD_SIZE * 8 - shift) | cur_word >> shift
+            } else {
+                prev_word >> (WORD_SIZE * 8 - shift) | cur_word << shift
+            };
+            prev_word = cur_word;
+
+            dest_usize = dest_usize.wrapping_sub(1);
+            *dest_usize = reassembled;
+        }
+
+        // There's one more element left to go, and we can't use the loop for that as on the `src` side,
+        // it is partially out-of-bounds.
+        src_aligned = src_aligned.wrapping_sub(1);
+        let cur_word = load_aligned_end_partial(src_aligned, WORD_SIZE - offset);
+        let reassembled = if cfg!(target_endian = "little") {
+            prev_word << (WORD_SIZE * 8 - shift) | cur_word >> shift
+        } else {
+            prev_word >> (WORD_SIZE * 8 - shift) | cur_word << shift
+        };
+        // prev_word does not matter any more
+
+        dest_usize = dest_usize.wrapping_sub(1);
+        *dest_usize = reassembled;
+    }
+
+    /// `n` is in units of bytes, but must be a multiple of the word size and must not be 0.
+    /// `src` *must not* be `usize`-aligned.
+    #[cfg(feature = "mem-unaligned")]
+    #[inline(always)]
+    unsafe fn copy_backward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) {
+        let mut dest_usize = dest as *mut usize;
+        let mut src_usize = src as *mut usize;
+        let dest_start = dest.wrapping_sub(n) as *mut usize;
+
+        while dest_start < dest_usize {
+            dest_usize = dest_usize.wrapping_sub(1);
+            src_usize = src_usize.wrapping_sub(1);
+            *dest_usize = read_usize_unaligned(src_usize);
+        }
+    }
+
+    let mut dest = dest.wrapping_add(n);
+    let mut src = src.wrapping_add(n);
+
+    if n >= WORD_COPY_THRESHOLD {
+        // Align dest
+        // Because of n >= 2 * WORD_SIZE, dst_misalignment < n
+        let dest_misalignment = dest as usize & WORD_MASK;
+        copy_backward_bytes(dest, src, dest_misalignment);
+        dest = dest.wrapping_sub(dest_misalignment);
+        src = src.wrapping_sub(dest_misalignment);
+        n -= dest_misalignment;
+
+        let n_words = n & !WORD_MASK;
+        let src_misalignment = src as usize & WORD_MASK;
+        if likely(src_misalignment == 0) {
+            copy_backward_aligned_words(dest, src, n_words);
+        } else {
+            copy_backward_misaligned_words(dest, src, n_words);
+        }
+        dest = dest.wrapping_sub(n_words);
+        src = src.wrapping_sub(n_words);
+        n -= n_words;
+    }
+    copy_backward_bytes(dest, src, n);
+}
+
+#[inline(always)]
+pub unsafe fn set_bytes(mut s: *mut u8, c: u8, mut n: usize) {
+    #[inline(always)]
+    pub unsafe fn set_bytes_bytes(mut s: *mut u8, c: u8, n: usize) {
+        let end = s.wrapping_add(n);
+        while s < end {
+            *s = c;
+            s = s.wrapping_add(1);
+        }
+    }
+
+    #[inline(always)]
+    pub unsafe fn set_bytes_words(s: *mut u8, c: u8, n: usize) {
+        let mut broadcast = c as usize;
+        let mut bits = 8;
+        while bits < WORD_SIZE * 8 {
+            broadcast |= broadcast << bits;
+            bits *= 2;
+        }
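+        // For example (illustration only): `c == 0xAB` on a 64-bit target yields
+        // `broadcast == 0xABAB_ABAB_ABAB_ABAB`.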
+
+        let mut s_usize = s as *mut usize;
+        let end = s.wrapping_add(n) as *mut usize;
+
+        while s_usize < end {
+            *s_usize = broadcast;
+            s_usize = s_usize.wrapping_add(1);
+        }
+    }
+
+    if likely(n >= WORD_COPY_THRESHOLD) {
+        // Align s
+        // Because of n >= 2 * WORD_SIZE, dst_misalignment < n
+        let misalignment = (s as usize).wrapping_neg() & WORD_MASK;
+        set_bytes_bytes(s, c, misalignment);
+        s = s.wrapping_add(misalignment);
+        n -= misalignment;
+
+        let n_words = n & !WORD_MASK;
+        set_bytes_words(s, c, n_words);
+        s = s.wrapping_add(n_words);
+        n -= n_words;
+    }
+    set_bytes_bytes(s, c, n);
+}
+
+#[inline(always)]
+pub unsafe fn compare_bytes(s1: *const u8, s2: *const u8, n: usize) -> i32 {
+    let mut i = 0;
+    while i < n {
+        let a = *s1.wrapping_add(i);
+        let b = *s2.wrapping_add(i);
+        if a != b {
+            return a as i32 - b as i32;
+        }
+        i += 1;
+    }
+    0
+}
+
+#[inline(always)]
+pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
+    let mut n = 0;
+    while *s != 0 {
+        n += 1;
+        s = s.wrapping_add(1);
+    }
+    n
+}
diff --git a/library/compiler-builtins/compiler-builtins/src/mem/mod.rs b/library/compiler-builtins/compiler-builtins/src/mem/mod.rs
new file mode 100644
index 00000000000..6828f3804e0
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/mem/mod.rs
@@ -0,0 +1,60 @@
+// Trying to satisfy clippy here is hopeless
+#![allow(clippy::style)]
+// FIXME(e2024): this eventually needs to be removed.
+#![allow(unsafe_op_in_unsafe_fn)]
+
+#[allow(warnings)]
+#[cfg(target_pointer_width = "16")]
+type c_int = i16;
+#[allow(warnings)]
+#[cfg(not(target_pointer_width = "16"))]
+type c_int = i32;
+
+// memcpy/memmove/memset have optimized implementations on some architectures
+#[cfg_attr(
+    all(not(feature = "no-asm"), target_arch = "x86_64"),
+    path = "x86_64.rs"
+)]
+mod impls;
+
+intrinsics! {
+    #[mem_builtin]
+    pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
+        impls::copy_forward(dest, src, n);
+        dest
+    }
+
+    #[mem_builtin]
+    pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
+        let delta = (dest as usize).wrapping_sub(src as usize);
+        if delta >= n {
+            // We can copy forwards because either dest is far enough ahead of src,
+            // or src is ahead of dest (and delta overflowed).
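+            // For example (illustration only): `dest = 100, src = 40, n = 30` gives `delta = 60 >= n`
+            // (no overlap), and `dest = 40, src = 100` wraps `delta` to a huge value (src ahead of
+            // dest), so both take this branch; `dest = 100, src = 90, n = 30` gives `delta = 10 < n`
+            // and falls through to the backward copy.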
+            impls::copy_forward(dest, src, n);
+        } else {
+            impls::copy_backward(dest, src, n);
+        }
+        dest
+    }
+
+    #[mem_builtin]
+    pub unsafe extern "C" fn memset(s: *mut u8, c: crate::mem::c_int, n: usize) -> *mut u8 {
+        impls::set_bytes(s, c as u8, n);
+        s
+    }
+
+    #[mem_builtin]
+    pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 {
+        impls::compare_bytes(s1, s2, n)
+    }
+
+    #[mem_builtin]
+    pub unsafe extern "C" fn bcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 {
+        memcmp(s1, s2, n)
+    }
+
+    #[mem_builtin]
+    pub unsafe extern "C" fn strlen(s: *const core::ffi::c_char) -> usize {
+        impls::c_string_length(s)
+    }
+}
diff --git a/library/compiler-builtins/compiler-builtins/src/mem/x86_64.rs b/library/compiler-builtins/compiler-builtins/src/mem/x86_64.rs
new file mode 100644
index 00000000000..5cbe83ab1e2
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/mem/x86_64.rs
@@ -0,0 +1,313 @@
+// On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have
+// been enhanced to perform better than a simple qword loop, making them ideal
+// for implementing memcpy/memset. Note that "rep cmps" has received no such
+// enhancement, so it is not used to implement memcmp.
+//
+// On certain recent Intel processors, "rep movsb" and "rep stosb" have been
+// further enhanced to automatically select the best microarchitectural
+// implementation based on length and alignment. See the following features from
+// the "Intel® 64 and IA-32 Architectures Optimization Reference Manual":
+//  - ERMSB - Enhanced REP MOVSB and STOSB (Ivy Bridge and later)
+//  - FSRM - Fast Short REP MOV (Ice Lake and later)
+//  - Fast Zero-Length MOVSB (On no current hardware)
+//  - Fast Short STOSB (On no current hardware)
+//
+// To simplify things, we switch to using the byte-based variants if the "ermsb"
+// feature is present at compile-time. We don't bother detecting other features.
+// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".
+
+use core::arch::asm;
+use core::{intrinsics, mem};
+
+#[inline(always)]
+#[cfg(target_feature = "ermsb")]
+pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
+    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
+    core::arch::asm!(
+        "repe movsb (%rsi), (%rdi)",
+        inout("rcx") count => _,
+        inout("rdi") dest => _,
+        inout("rsi") src => _,
+        options(att_syntax, nostack, preserves_flags)
+    );
+}
+
+#[inline(always)]
+#[cfg(not(target_feature = "ermsb"))]
+pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) {
+    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
+    // Separating the blocks gives the compiler more freedom to reorder instructions.
+    asm!(
+        "rep movsb",
+        inout("ecx") pre_byte_count => _,
+        inout("rdi") dest => dest,
+        inout("rsi") src => src,
+        options(att_syntax, nostack, preserves_flags)
+    );
+    asm!(
+        "rep movsq",
+        inout("rcx") qword_count => _,
+        inout("rdi") dest => dest,
+        inout("rsi") src => src,
+        options(att_syntax, nostack, preserves_flags)
+    );
+    asm!(
+        "rep movsb",
+        inout("ecx") byte_count => _,
+        inout("rdi") dest => _,
+        inout("rsi") src => _,
+        options(att_syntax, nostack, preserves_flags)
+    );
+}
+
+#[inline(always)]
+pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
+    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
+    // We can't separate this block due to std/cld
+    asm!(
+        "std",
+        "rep movsb",
+        "sub $7, %rsi",
+        "sub $7, %rdi",
+        "mov {qword_count}, %rcx",
+        "rep movsq",
+        "test {pre_byte_count:e}, {pre_byte_count:e}",
+        "add $7, %rsi",
+        "add $7, %rdi",
+        "mov {pre_byte_count:e}, %ecx",
+        "rep movsb",
+        "cld",
+        pre_byte_count = in(reg) pre_byte_count,
+        qword_count = in(reg) qword_count,
+        inout("ecx") byte_count => _,
+        inout("rdi") dest.add(count - 1) => _,
+        inout("rsi") src.add(count - 1) => _,
+        // We modify flags, but we restore them afterwards
+        options(att_syntax, nostack, preserves_flags)
+    );
+}
+
+#[inline(always)]
+#[cfg(target_feature = "ermsb")]
+pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
+    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
+    core::arch::asm!(
+        "repe stosb %al, (%rdi)",
+        inout("rcx") count => _,
+        inout("rdi") dest => _,
+        inout("al") c => _,
+        options(att_syntax, nostack, preserves_flags)
+    )
+}
+
+#[inline(always)]
+#[cfg(not(target_feature = "ermsb"))]
+pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) {
+    let c = c as u64 * 0x0101_0101_0101_0101;
+    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
+    // Separating the blocks gives the compiler more freedom to reorder instructions.
+    asm!(
+        "rep stosb",
+        inout("ecx") pre_byte_count => _,
+        inout("rdi") dest => dest,
+        in("rax") c,
+        options(att_syntax, nostack, preserves_flags)
+    );
+    asm!(
+        "rep stosq",
+        inout("rcx") qword_count => _,
+        inout("rdi") dest => dest,
+        in("rax") c,
+        options(att_syntax, nostack, preserves_flags)
+    );
+    asm!(
+        "rep stosb",
+        inout("ecx") byte_count => _,
+        inout("rdi") dest => _,
+        in("rax") c,
+        options(att_syntax, nostack, preserves_flags)
+    );
+}
+
+#[inline(always)]
+pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
+    #[inline(always)]
+    unsafe fn cmp<T, U, F>(mut a: *const T, mut b: *const T, n: usize, f: F) -> i32
+    where
+        T: Clone + Copy + Eq,
+        U: Clone + Copy + Eq,
+        F: FnOnce(*const U, *const U, usize) -> i32,
+    {
+        // Ensure T is not a ZST.
+        const { assert!(mem::size_of::<T>() != 0) };
+
+        let end = a.add(intrinsics::unchecked_div(n, mem::size_of::<T>()));
+        while a != end {
+            if a.read_unaligned() != b.read_unaligned() {
+                return f(a.cast(), b.cast(), mem::size_of::<T>());
+            }
+            a = a.add(1);
+            b = b.add(1);
+        }
+        f(
+            a.cast(),
+            b.cast(),
+            intrinsics::unchecked_rem(n, mem::size_of::<T>()),
+        )
+    }
+    let c1 = |mut a: *const u8, mut b: *const u8, n| {
+        for _ in 0..n {
+            if a.read() != b.read() {
+                return i32::from(a.read()) - i32::from(b.read());
+            }
+            a = a.add(1);
+            b = b.add(1);
+        }
+        0
+    };
+    let c2 = |a: *const u16, b, n| cmp(a, b, n, c1);
+    let c4 = |a: *const u32, b, n| cmp(a, b, n, c2);
+    let c8 = |a: *const u64, b, n| cmp(a, b, n, c4);
+    let c16 = |a: *const u128, b, n| cmp(a, b, n, c8);
+    c16(a.cast(), b.cast(), n)
+}
+
+// In order to process more than one byte simultaneously when executing strlen,
+// two things must be considered:
+// * An n byte read with an n-byte aligned address will never cross
+//   a page boundary and will always succeed. Any smaller alignment
+//   may result in a read that will cross a page boundary, which may
+//   trigger an access violation.
+// * Surface Rust considers any kind of out-of-bounds read as undefined
+//   behaviour. To dodge this, memory access operations are written
+//   using inline assembly.
+
+#[cfg(target_feature = "sse2")]
+#[inline(always)]
+pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
+    use core::arch::x86_64::{__m128i, _mm_cmpeq_epi8, _mm_movemask_epi8, _mm_set1_epi8};
+
+    let mut n = 0;
+
+    // The use of _mm_movemask_epi8 and company allows for speedups,
+    // but these operations aren't cheap by themselves. Thus, potentially short
+    // strings are handled byte by byte in a simple loop first.
+
+    for _ in 0..4 {
+        if *s == 0 {
+            return n;
+        }
+
+        n += 1;
+        s = s.add(1);
+    }
+
+    // Shave off the least significant bits to align the address to a 16
+    // byte boundary. The shaved-off bits are used to correct the first iteration.
+
+    let align = s as usize & 15;
+    let mut s = ((s as usize) - align) as *const __m128i;
+    let zero = _mm_set1_epi8(0);
+
+    let x = {
+        let r;
+        asm!(
+            "movdqa ({addr}), {dest}",
+            addr = in(reg) s,
+            dest = out(xmm_reg) r,
+            options(att_syntax, nostack),
+        );
+        r
+    };
+    let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) >> align;
+
+    if cmp != 0 {
+        return n + cmp.trailing_zeros() as usize;
+    }
+
+    n += 16 - align;
+    s = s.add(1);
+
+    loop {
+        let x = {
+            let r;
+            asm!(
+                "movdqa ({addr}), {dest}",
+                addr = in(reg) s,
+                dest = out(xmm_reg) r,
+                options(att_syntax, nostack),
+            );
+            r
+        };
+        let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) as u32;
+        if cmp == 0 {
+            n += 16;
+            s = s.add(1);
+        } else {
+            return n + cmp.trailing_zeros() as usize;
+        }
+    }
+}
+
+// Provided for scenarios like kernel development, where SSE might not
+// be available.
+#[cfg(not(target_feature = "sse2"))]
+#[inline(always)]
+pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
+    let mut n = 0;
+
+    // Check bytes one at a time until either a zero byte is
+    // discovered or the pointer is aligned to an eight-byte boundary.
+
+    while s as usize & 7 != 0 {
+        if *s == 0 {
+            return n;
+        }
+        n += 1;
+        s = s.add(1);
+    }
+
+    // Check bytes in steps of eight until a zero
+    // byte is discovered.
+
+    let mut s = s as *const u64;
+
+    loop {
+        let mut cs = {
+            let r: u64;
+            asm!(
+                "mov ({addr}), {dest}",
+                addr = in(reg) s,
+                dest = out(reg) r,
+                options(att_syntax, nostack),
+            );
+            r
+        };
+        // Detect if a word has a zero byte, taken from
+        // https://graphics.stanford.edu/~seander/bithacks.html
+        if (cs.wrapping_sub(0x0101010101010101) & !cs & 0x8080808080808080) != 0 {
+            loop {
+                if cs & 255 == 0 {
+                    return n;
+                } else {
+                    cs >>= 8;
+                    n += 1;
+                }
+            }
+        } else {
+            n += 8;
+            s = s.add(1);
+        }
+    }
+}
+
+/// Determine optimal parameters for a `rep` instruction.
+fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
+    // Unaligned writes are still slow on modern processors, so align the destination address.
+    let pre_byte_count = ((8 - (dest as usize & 0b111)) & 0b111).min(count);
+    count -= pre_byte_count;
+    let qword_count = count >> 3;
+    let byte_count = count & 0b111;
+    (pre_byte_count, qword_count, byte_count)
+}
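Two of the tricks above are easy to sanity-check in isolation. A small standalone sketch (not part of the crate) of the zero-byte detection used by the non-SSE2 `c_string_length` and of the head/qword/tail split computed by `rep_param`:

```
fn word_has_zero_byte(w: u64) -> bool {
    // Set iff some byte of `w` is zero; see the bithacks reference cited above.
    (w.wrapping_sub(0x0101_0101_0101_0101) & !w & 0x8080_8080_8080_8080) != 0
}

fn rep_param(dest: usize, mut count: usize) -> (usize, usize, usize) {
    // Same arithmetic as above, applied to a plain integer address.
    let pre_byte_count = ((8 - (dest & 0b111)) & 0b111).min(count);
    count -= pre_byte_count;
    (pre_byte_count, count >> 3, count & 0b111)
}

fn main() {
    assert!(word_has_zero_byte(u64::from_le_bytes(*b"abc\0defg")));
    assert!(!word_has_zero_byte(u64::from_le_bytes(*b"abcdefgh")));

    // Misaligned destination 0x1003, 100 bytes: 5 head bytes, 11 qwords, 7 tail bytes.
    assert_eq!(rep_param(0x1003, 100), (5, 11, 7));
}
```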
diff --git a/library/compiler-builtins/compiler-builtins/src/probestack.rs b/library/compiler-builtins/compiler-builtins/src/probestack.rs
new file mode 100644
index 00000000000..5b6abd21a1d
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/probestack.rs
@@ -0,0 +1,350 @@
+// Copyright 2017 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! This module defines the `__rust_probestack` intrinsic which is used in the
+//! implementation of "stack probes" on certain platforms.
+//!
+//! The purpose of a stack probe is to provide a static guarantee that if a
+//! thread has a guard page then a stack overflow is guaranteed to hit that
+//! guard page. If a function did not have a stack probe then there's a risk of
+//! having a stack frame *larger* than the guard page, so a function call could
+//! skip over the guard page entirely and then later hit maybe the heap or
+//! another thread, possibly leading to security vulnerabilities such as [The
+//! Stack Clash], for example.
+//!
+//! [The Stack Clash]: https://blog.qualys.com/securitylabs/2017/06/19/the-stack-clash
+//!
+//! The `__rust_probestack` is called in the prologue of functions whose stack
+//! size is larger than the guard page, for example larger than 4096 bytes on
+//! x86. This function is then responsible for "touching" all pages relevant to
+//! the stack to ensure that if any of them is the guard page, we are
+//! guaranteed to hit it.
+//!
+//! The precise ABI for how this function operates is defined by LLVM. There's
+//! no real documentation as to what this is, so you'd basically need to read
+//! the LLVM source code for reference. Often though the test cases can be
+//! illuminating as to the ABI that's generated, or just looking at the output
+//! of `llc`.
+//!
+//! Note that `#[naked]` is typically used here for the stack probe because the
+//! ABI corresponds to no actual ABI.
+//!
+//! Finally it's worth noting that at the time of this writing LLVM only has
+//! support for stack probes on x86 and x86_64. There's no support for stack
+//! probes on any other architecture such as ARM or PowerPC64, though LLVM would
+//! surely welcome a change adding such support!
+
+#![cfg(not(feature = "mangled-names"))]
+// Windows and Cygwin already have builtins to do this.
+#![cfg(not(any(windows, target_os = "cygwin")))]
+// All these builtins require assembly
+#![cfg(not(feature = "no-asm"))]
+// We only define stack probing for these architectures today.
+#![cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+
+extern "C" {
+    pub fn __rust_probestack();
+}
+
+// A wrapper for our implementation of __rust_probestack, which allows us to
+// keep the assembly inline while controlling all CFI directives in the assembly
+// emitted for the function.
+//
+// This is the ELF version.
+#[cfg(not(any(target_vendor = "apple", target_os = "uefi")))]
+macro_rules! define_rust_probestack {
+    ($body: expr) => {
+        concat!(
+            "
+            .pushsection .text.__rust_probestack
+            .globl __rust_probestack
+            .type  __rust_probestack, @function
+            .hidden __rust_probestack
+        __rust_probestack:
+            ",
+            $body,
+            "
+            .size __rust_probestack, . - __rust_probestack
+            .popsection
+            "
+        )
+    };
+}
+
+#[cfg(all(target_os = "uefi", target_arch = "x86_64"))]
+macro_rules! define_rust_probestack {
+    ($body: expr) => {
+        concat!(
+            "
+            .globl __rust_probestack
+        __rust_probestack:
+            ",
+            $body
+        )
+    };
+}
+
+// Same as above, but for Mach-O. Note that the triple underscore
+// is deliberate
+#[cfg(target_vendor = "apple")]
+macro_rules! define_rust_probestack {
+    ($body: expr) => {
+        concat!(
+            "
+            .globl ___rust_probestack
+        ___rust_probestack:
+            ",
+            $body
+        )
+    };
+}
+
+// In UEFI x86 arch, triple underscore is deliberate.
+#[cfg(all(target_os = "uefi", target_arch = "x86"))]
+macro_rules! define_rust_probestack {
+    ($body: expr) => {
+        concat!(
+            "
+            .globl ___rust_probestack
+        ___rust_probestack:
+            ",
+            $body
+        )
+    };
+}
+
+// Our goal here is to touch each page between %rsp+8 and %rsp+8-%rax,
+// ensuring that if any pages are unmapped we'll make a page fault.
+//
+// The ABI here is that the stack frame size is located in `%rax`. Upon
+// return we're not supposed to modify `%rsp` or `%rax`.
+//
+// Any changes to this function should be replicated to the SGX version below.
+#[cfg(all(
+    target_arch = "x86_64",
+    not(all(target_env = "sgx", target_vendor = "fortanix"))
+))]
+core::arch::global_asm!(
+    define_rust_probestack!(
+        "
+    .cfi_startproc
+    pushq  %rbp
+    .cfi_adjust_cfa_offset 8
+    .cfi_offset %rbp, -16
+    movq   %rsp, %rbp
+    .cfi_def_cfa_register %rbp
+
+    mov    %rax,%r11        // duplicate %rax as we're clobbering %r11
+
+    // Main loop, taken in one page increments. We're decrementing rsp by
+    // a page each time until there's less than a page remaining. We're
+    // guaranteed that this function isn't called unless there's more than a
+    // page needed.
+    //
+    // Note that we're also testing against `8(%rsp)` to account for the 8
+    // bytes pushed on the stack originally with our return address. Using
+    // `8(%rsp)` simulates us testing the stack pointer in the caller's
+    // context.
+
+    // It's usually called when %rax >= 0x1000, but that's not always true.
+    // Dynamic stack allocation, which is needed to implement unsized
+    // rvalues, triggers the stack probe even if %rax < 0x1000.
+    // Thus we have to check %r11 first to avoid a segfault.
+    cmp    $0x1000,%r11
+    jna    3f
+2:
+    sub    $0x1000,%rsp
+    test   %rsp,8(%rsp)
+    sub    $0x1000,%r11
+    cmp    $0x1000,%r11
+    ja     2b
+
+3:
+    // Finish up the last remaining stack space requested, getting the last
+    // bits out of r11
+    sub    %r11,%rsp
+    test   %rsp,8(%rsp)
+
+    // Restore the stack pointer to what it previously was when entering
+    // this function. The caller will readjust the stack pointer after we
+    // return.
+    add    %rax,%rsp
+
+    leave
+    .cfi_def_cfa_register %rsp
+    .cfi_adjust_cfa_offset -8
+    ret
+    .cfi_endproc
+    "
+    ),
+    options(att_syntax)
+);
+
+// This function is the same as above, except that some instructions are
+// [manually patched for LVI].
+//
+// [manually patched for LVI]: https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions
+#[cfg(all(
+    target_arch = "x86_64",
+    all(target_env = "sgx", target_vendor = "fortanix")
+))]
+core::arch::global_asm!(
+    define_rust_probestack!(
+        "
+    .cfi_startproc
+    pushq  %rbp
+    .cfi_adjust_cfa_offset 8
+    .cfi_offset %rbp, -16
+    movq   %rsp, %rbp
+    .cfi_def_cfa_register %rbp
+
+    mov    %rax,%r11        // duplicate %rax as we're clobbering %r11
+
+    // Main loop, taken in one page increments. We're decrementing rsp by
+    // a page each time until there's less than a page remaining. We're
+    // guaranteed that this function isn't called unless there's more than a
+    // page needed.
+    //
+    // Note that we're also testing against `8(%rsp)` to account for the 8
+    // bytes pushed on the stack originally with our return address. Using
+    // `8(%rsp)` simulates us testing the stack pointer in the caller's
+    // context.
+
+    // It's usually called when %rax >= 0x1000, but that's not always true.
+    // Dynamic stack allocation, which is needed to implement unsized
+    // rvalues, triggers the stack probe even if %rax < 0x1000.
+    // Thus we have to check %r11 first to avoid a segfault.
+    cmp    $0x1000,%r11
+    jna    3f
+2:
+    sub    $0x1000,%rsp
+    test   %rsp,8(%rsp)
+    sub    $0x1000,%r11
+    cmp    $0x1000,%r11
+    ja     2b
+
+3:
+    // Finish up the last remaining stack space requested, getting the last
+    // bits out of r11
+    sub    %r11,%rsp
+    test   %rsp,8(%rsp)
+
+    // Restore the stack pointer to what it previously was when entering
+    // this function. The caller will readjust the stack pointer after we
+    // return.
+    add    %rax,%rsp
+
+    leave
+    .cfi_def_cfa_register %rsp
+    .cfi_adjust_cfa_offset -8
+    pop %r11
+    lfence
+    jmp *%r11
+    .cfi_endproc
+    "
+    ),
+    options(att_syntax)
+);
+
+#[cfg(all(target_arch = "x86", not(target_os = "uefi")))]
+// This is the same as x86_64 above, only translated for 32-bit sizes. Note
+// that on Unix we're expected to restore everything as it was, so this
+// function basically can't tamper with anything.
+//
+// The ABI here is the same as x86_64, except everything is 32-bits large.
+core::arch::global_asm!(
+    define_rust_probestack!(
+        "
+    .cfi_startproc
+    push   %ebp
+    .cfi_adjust_cfa_offset 4
+    .cfi_offset %ebp, -8
+    mov    %esp, %ebp
+    .cfi_def_cfa_register %ebp
+    push   %ecx
+    mov    %eax,%ecx
+
+    cmp    $0x1000,%ecx
+    jna    3f
+2:
+    sub    $0x1000,%esp
+    test   %esp,8(%esp)
+    sub    $0x1000,%ecx
+    cmp    $0x1000,%ecx
+    ja     2b
+
+3:
+    sub    %ecx,%esp
+    test   %esp,8(%esp)
+
+    add    %eax,%esp
+    pop    %ecx
+    leave
+    .cfi_def_cfa_register %esp
+    .cfi_adjust_cfa_offset -4
+    ret
+    .cfi_endproc
+    "
+    ),
+    options(att_syntax)
+);
+
+#[cfg(all(target_arch = "x86", target_os = "uefi"))]
+// The UEFI target is a Windows-like target: LLVM emits _chkstk-style probing as it does
+// on Windows, and the probestack function must also behave like MSVC's _chkstk.
+// So we need to subtract %eax from %esp in probestack when the arch is x86.
+//
+// REF: Rust commit(74e80468347)
+// rust\src\llvm-project\llvm\lib\Target\X86\X86FrameLowering.cpp: 805
+// Comments in LLVM:
+//   MSVC x32's _chkstk and cygwin/mingw's _alloca adjust %esp themselves.
+//   MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
+//   themselves.
+core::arch::global_asm!(
+    define_rust_probestack!(
+        "
+    .cfi_startproc
+    push   %ebp
+    .cfi_adjust_cfa_offset 4
+    .cfi_offset %ebp, -8
+    mov    %esp, %ebp
+    .cfi_def_cfa_register %ebp
+    push   %ecx
+    push   %edx
+    mov    %eax,%ecx
+
+    cmp    $0x1000,%ecx
+    jna    3f
+2:
+    sub    $0x1000,%esp
+    test   %esp,8(%esp)
+    sub    $0x1000,%ecx
+    cmp    $0x1000,%ecx
+    ja     2b
+
+3:
+    sub    %ecx,%esp
+    test   %esp,8(%esp)
+    mov    4(%ebp),%edx
+    mov    %edx, 12(%esp)
+    add    %eax,%esp
+    pop    %edx
+    pop    %ecx
+    leave
+
+    sub   %eax, %esp
+    .cfi_def_cfa_register %esp
+    .cfi_adjust_cfa_offset -4
+    ret
+    .cfi_endproc
+    "
+    ),
+    options(att_syntax)
+);
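As a rough model of what the x86_64 probe loop above does (ignoring the `8(%rsp)` return-address adjustment and the CFI bookkeeping), the offsets below the incoming stack pointer that get touched for a requested frame size can be sketched as follows; this is illustrative only, not the actual ABI:

```
// Page-sized steps first, then one final partial step,
// mirroring the `2:` loop and the `3:` tail in the assembly above.
fn probed_offsets(frame_size: u64) -> Vec<u64> {
    const PAGE: u64 = 0x1000;
    let mut offsets = Vec::new();
    let mut remaining = frame_size;
    while remaining > PAGE {
        offsets.push(frame_size - remaining + PAGE);
        remaining -= PAGE;
    }
    offsets.push(frame_size);
    offsets
}

fn main() {
    // A 0x2500-byte frame touches a byte in each of the three pages it spans.
    assert_eq!(probed_offsets(0x2500), vec![0x1000, 0x2000, 0x2500]);
}
```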
diff --git a/library/compiler-builtins/compiler-builtins/src/riscv.rs b/library/compiler-builtins/compiler-builtins/src/riscv.rs
new file mode 100644
index 00000000000..bf312553341
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/riscv.rs
@@ -0,0 +1,50 @@
+intrinsics! {
+    // Ancient Egyptian/Ethiopian/Russian multiplication method
+    // see https://en.wikipedia.org/wiki/Ancient_Egyptian_multiplication
+    //
+    // This is a long-available stock algorithm; e.g. it is documented in
+    // Knuth's "The Art of Computer Programming" volume 2 (under the section
+    // "Evaluation of Powers") since at least the 2nd edition (1981).
+    //
+    // The main attraction of this method is that it implements (software)
+    // multiplication atop four simple operations: doubling, halving, checking
+    // if a value is even/odd, and addition. This is *not* considered to be the
+    // fastest multiplication method, but it may be amongst the simplest (and
+    // smallest with respect to code size).
+    //
+    // for reference, see also implementation from gcc
+    // https://raw.githubusercontent.com/gcc-mirror/gcc/master/libgcc/config/epiphany/mulsi3.c
+    //
+    // and from LLVM (in relatively readable RISC-V assembly):
+    // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/riscv/int_mul_impl.inc
+    pub extern "C" fn __mulsi3(a: u32, b: u32) -> u32 {
+        let (mut a, mut b) = (a, b);
+        let mut r: u32 = 0;
+
+        while a > 0 {
+            if a & 1 > 0 {
+                r = r.wrapping_add(b);
+            }
+            a >>= 1;
+            b <<= 1;
+        }
+
+        r
+    }
+
+    #[cfg(not(target_feature = "m"))]
+    pub extern "C" fn __muldi3(a: u64, b: u64) -> u64 {
+        let (mut a, mut b) = (a, b);
+        let mut r: u64 = 0;
+
+        while a > 0 {
+            if a & 1 > 0 {
+                r = r.wrapping_add(b);
+            }
+            a >>= 1;
+            b <<= 1;
+        }
+
+        r
+    }
+}
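A standalone sketch (not part of the crate) of the same shift-and-add scheme, checked against the native wrapping multiply, is handy for convincing yourself that the loop above computes `a * b` modulo 2^32:

```
fn mul_shift_add(mut a: u32, mut b: u32) -> u32 {
    let mut r: u32 = 0;
    while a > 0 {
        if a & 1 == 1 {
            // This set bit of `a` contributes `b << i`, which `b` already holds.
            r = r.wrapping_add(b);
        }
        a >>= 1;
        b <<= 1;
    }
    r
}

fn main() {
    for &(x, y) in &[(0u32, 7u32), (13, 11), (0xFFFF_FFFF, 3), (12_345, 6_789)] {
        assert_eq!(mul_shift_add(x, y), x.wrapping_mul(y));
    }
}
```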
diff --git a/library/compiler-builtins/compiler-builtins/src/x86.rs b/library/compiler-builtins/compiler-builtins/src/x86.rs
new file mode 100644
index 00000000000..01152d9c798
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/x86.rs
@@ -0,0 +1,53 @@
+#![allow(unused_imports)]
+
+use core::intrinsics;
+
+// NOTE These functions are implemented using assembly because they use a custom
+// calling convention which can't be implemented using a normal Rust function
+
+// NOTE These functions are never mangled as they are not tested against compiler-rt
+
+intrinsics! {
+    #[unsafe(naked)]
+    #[cfg(all(
+        any(all(windows, target_env = "gnu"), target_os = "uefi"),
+        not(feature = "no-asm")
+    ))]
+    pub unsafe extern "C" fn __chkstk() {
+        core::arch::naked_asm!(
+            "jmp __alloca", // Jump to __alloca since fallthrough may be unreliable"
+            options(att_syntax)
+        );
+    }
+
+    #[unsafe(naked)]
+    #[cfg(all(
+        any(all(windows, target_env = "gnu"), target_os = "uefi"),
+        not(feature = "no-asm")
+    ))]
+    pub unsafe extern "C" fn _alloca() {
+        // __chkstk and _alloca are the same function
+        core::arch::naked_asm!(
+            "push   %ecx",
+            "cmp    $0x1000,%eax",
+            "lea    8(%esp),%ecx", // esp before calling this routine -> ecx
+            "jb     1f",
+            "2:",
+            "sub    $0x1000,%ecx",
+            "test   %ecx,(%ecx)",
+            "sub    $0x1000,%eax",
+            "cmp    $0x1000,%eax",
+            "ja     2b",
+            "1:",
+            "sub    %eax,%ecx",
+            "test   %ecx,(%ecx)",
+            "lea    4(%esp),%eax",  // load pointer to the return address into eax
+            "mov    %ecx,%esp",     // install the new top of stack pointer into esp
+            "mov    -4(%eax),%ecx", // restore ecx
+            "push   (%eax)",        // push return address onto the stack
+            "sub    %esp,%eax",     // restore the original value in eax
+            "ret",
+            options(att_syntax)
+        );
+    }
+}
diff --git a/library/compiler-builtins/compiler-builtins/src/x86_64.rs b/library/compiler-builtins/compiler-builtins/src/x86_64.rs
new file mode 100644
index 00000000000..fc1190f79b2
--- /dev/null
+++ b/library/compiler-builtins/compiler-builtins/src/x86_64.rs
@@ -0,0 +1,51 @@
+#![allow(unused_imports)]
+
+use core::intrinsics;
+
+// NOTE These functions are implemented using assembly because they use a custom
+// calling convention which can't be implemented using a normal Rust function
+
+// NOTE These functions are never mangled as they are not tested against compiler-rt
+
+intrinsics! {
+    #[unsafe(naked)]
+    #[cfg(all(
+        any(
+            all(windows, target_env = "gnu"),
+            target_os = "cygwin",
+            target_os = "uefi"
+        ),
+        not(feature = "no-asm")
+    ))]
+    pub unsafe extern "C" fn ___chkstk_ms() {
+        core::arch::naked_asm!(
+            "push   %rcx",
+            "push   %rax",
+            "cmp    $0x1000,%rax",
+            "lea    24(%rsp),%rcx",
+            "jb     1f",
+            "2:",
+            "sub    $0x1000,%rcx",
+            "test   %rcx,(%rcx)",
+            "sub    $0x1000,%rax",
+            "cmp    $0x1000,%rax",
+            "ja     2b",
+            "1:",
+            "sub    %rax,%rcx",
+            "test   %rcx,(%rcx)",
+            "pop    %rax",
+            "pop    %rcx",
+            "ret",
+            options(att_syntax)
+        );
+    }
+}
+
+// HACK(https://github.com/rust-lang/rust/issues/62785): x86_64-unknown-uefi needs special LLVM
+// support unless we emit the _fltused symbol
+mod _fltused {
+    #[unsafe(no_mangle)]
+    #[used]
+    #[cfg(target_os = "uefi")]
+    static _fltused: i32 = 0;
+}
diff --git a/library/compiler-builtins/crates/libm-macros/Cargo.toml b/library/compiler-builtins/crates/libm-macros/Cargo.toml
new file mode 100644
index 00000000000..3929854f08e
--- /dev/null
+++ b/library/compiler-builtins/crates/libm-macros/Cargo.toml
@@ -0,0 +1,22 @@
+[package]
+name = "libm-macros"
+version = "0.1.0"
+edition = "2024"
+publish = false
+license = "MIT OR Apache-2.0"
+
+[lib]
+proc-macro = true
+
+[dependencies]
+heck = "0.5.0"
+proc-macro2 = "1.0.94"
+quote = "1.0.40"
+syn = { version = "2.0.100", features = ["full", "extra-traits", "visit-mut"] }
+
+[lints.rust]
+# Values used during testing
+unexpected_cfgs = { level = "warn", check-cfg = [
+  'cfg(f16_enabled)',
+  'cfg(f128_enabled)',
+] }
diff --git a/library/compiler-builtins/crates/libm-macros/src/enums.rs b/library/compiler-builtins/crates/libm-macros/src/enums.rs
new file mode 100644
index 00000000000..b4646f984d4
--- /dev/null
+++ b/library/compiler-builtins/crates/libm-macros/src/enums.rs
@@ -0,0 +1,171 @@
+use heck::ToUpperCamelCase;
+use proc_macro2 as pm2;
+use proc_macro2::{Ident, Span};
+use quote::quote;
+use syn::spanned::Spanned;
+use syn::{Fields, ItemEnum, Variant};
+
+use crate::{ALL_OPERATIONS, base_name};
+
+/// Implement `#[function_enum]`, see documentation in `lib.rs`.
+pub fn function_enum(
+    mut item: ItemEnum,
+    attributes: pm2::TokenStream,
+) -> syn::Result<pm2::TokenStream> {
+    expect_empty_enum(&item)?;
+    let attr_span = attributes.span();
+    let mut attr = attributes.into_iter();
+
+    // Attribute should be the identifier of the `BaseName` enum.
+    let Some(tt) = attr.next() else {
+        return Err(syn::Error::new(attr_span, "expected one attribute"));
+    };
+
+    let pm2::TokenTree::Ident(base_enum) = tt else {
+        return Err(syn::Error::new(tt.span(), "expected an identifier"));
+    };
+
+    if let Some(tt) = attr.next() {
+        return Err(syn::Error::new(
+            tt.span(),
+            "unexpected token after identifier",
+        ));
+    }
+
+    let enum_name = &item.ident;
+    let mut as_str_arms = Vec::new();
+    let mut from_str_arms = Vec::new();
+    let mut base_arms = Vec::new();
+
+    for func in ALL_OPERATIONS.iter() {
+        let fn_name = func.name;
+        let ident = Ident::new(&fn_name.to_upper_camel_case(), Span::call_site());
+        let bname_ident = Ident::new(&base_name(fn_name).to_upper_camel_case(), Span::call_site());
+
+        // Match arm for `fn as_str(self)` matcher
+        as_str_arms.push(quote! { Self::#ident => #fn_name });
+        from_str_arms.push(quote! { #fn_name => Self::#ident });
+
+        // Match arm for `fn base_name(self)` matcher
+        base_arms.push(quote! { Self::#ident => #base_enum::#bname_ident });
+
+        let variant = Variant {
+            attrs: Vec::new(),
+            ident,
+            fields: Fields::Unit,
+            discriminant: None,
+        };
+
+        item.variants.push(variant);
+    }
+
+    let variants = item.variants.iter();
+
+    let res = quote! {
+        // Instantiate the enum
+        #item
+
+        impl #enum_name {
+            /// All variants of this enum.
+            pub const ALL: &[Self] = &[
+                #( Self::#variants, )*
+            ];
+
+            /// The stringified version of this function name.
+            pub const fn as_str(self) -> &'static str {
+                match self {
+                    #( #as_str_arms , )*
+                }
+            }
+
+            /// If `s` is the name of a function, return it.
+            pub fn from_str(s: &str) -> Option<Self> {
+                let ret = match s {
+                    #( #from_str_arms , )*
+                    _ => return None,
+                };
+                Some(ret)
+            }
+
+            /// The base name enum for this function.
+            pub const fn base_name(self) -> #base_enum {
+                match self {
+                    #( #base_arms, )*
+                }
+            }
+
+            /// Return information about this operation.
+            pub fn math_op(self) -> &'static crate::op::MathOpInfo {
+                crate::op::ALL_OPERATIONS.iter().find(|op| op.name == self.as_str()).unwrap()
+            }
+        }
+    };
+
+    Ok(res)
+}
+
+/// Implement `#[base_name_enum]`, see documentation in `lib.rs`.
+pub fn base_name_enum(
+    mut item: ItemEnum,
+    attributes: pm2::TokenStream,
+) -> syn::Result<pm2::TokenStream> {
+    expect_empty_enum(&item)?;
+    if !attributes.is_empty() {
+        let sp = attributes.span();
+        return Err(syn::Error::new(sp.span(), "no attributes expected"));
+    }
+
+    let mut base_names: Vec<_> = ALL_OPERATIONS
+        .iter()
+        .map(|func| base_name(func.name))
+        .collect();
+    base_names.sort_unstable();
+    base_names.dedup();
+
+    let item_name = &item.ident;
+    let mut as_str_arms = Vec::new();
+
+    for base_name in base_names {
+        let ident = Ident::new(&base_name.to_upper_camel_case(), Span::call_site());
+
+        // Match arm for `fn as_str(self)` matcher
+        as_str_arms.push(quote! { Self::#ident => #base_name });
+
+        let variant = Variant {
+            attrs: Vec::new(),
+            ident,
+            fields: Fields::Unit,
+            discriminant: None,
+        };
+
+        item.variants.push(variant);
+    }
+
+    let res = quote! {
+        // Instantiate the enum
+        #item
+
+        impl #item_name {
+            /// The stringified version of this base name.
+            pub const fn as_str(self) -> &'static str {
+                match self {
+                    #( #as_str_arms ),*
+                }
+            }
+        }
+    };
+
+    Ok(res)
+}
+
+/// Verify that an enum is empty, otherwise return an error
+fn expect_empty_enum(item: &ItemEnum) -> syn::Result<()> {
+    if !item.variants.is_empty() {
+        Err(syn::Error::new(
+            item.variants.span(),
+            "expected an empty enum",
+        ))
+    } else {
+        Ok(())
+    }
+}
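For a concrete picture of what `function_enum` and `base_name_enum` generate, here is a hand-written sketch of roughly the shape the expansion takes if the operation list contained only `sin` and `sinf` (illustrative only; the real output also includes `from_str` and `math_op`):

```
#[derive(Clone, Copy, Debug, PartialEq)]
enum BaseName {
    Sin,
}

#[derive(Clone, Copy, Debug, PartialEq)]
enum Identifier {
    Sin,
    Sinf,
}

impl Identifier {
    // All variants of this enum.
    const ALL: &[Self] = &[Self::Sin, Self::Sinf];

    // The stringified version of this function name.
    const fn as_str(self) -> &'static str {
        match self {
            Self::Sin => "sin",
            Self::Sinf => "sinf",
        }
    }

    // The base name enum for this function.
    const fn base_name(self) -> BaseName {
        match self {
            Self::Sin | Self::Sinf => BaseName::Sin,
        }
    }
}

fn main() {
    assert_eq!(Identifier::Sinf.as_str(), "sinf");
    assert_eq!(Identifier::Sinf.base_name(), BaseName::Sin);
    assert_eq!(Identifier::ALL.len(), 2);
}
```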
diff --git a/library/compiler-builtins/crates/libm-macros/src/lib.rs b/library/compiler-builtins/crates/libm-macros/src/lib.rs
new file mode 100644
index 00000000000..482da974ca8
--- /dev/null
+++ b/library/compiler-builtins/crates/libm-macros/src/lib.rs
@@ -0,0 +1,504 @@
+#![feature(let_chains)]
+
+mod enums;
+mod parse;
+mod shared;
+
+use parse::{Invocation, StructuredInput};
+use proc_macro as pm;
+use proc_macro2::{self as pm2, Span};
+use quote::{ToTokens, quote};
+pub(crate) use shared::{ALL_OPERATIONS, FloatTy, MathOpInfo, Ty};
+use syn::spanned::Spanned;
+use syn::visit_mut::VisitMut;
+use syn::{Ident, ItemEnum};
+
+const KNOWN_TYPES: &[&str] = &[
+    "FTy", "CFn", "CArgs", "CRet", "RustFn", "RustArgs", "RustRet", "public",
+];
+
+/// Populate an enum with a variant representing each function. Names are in upper camel case.
+///
+/// Applied to an empty enum. Expects one attribute `#[function_enum(BaseName)]` that provides
+/// the name of the `BaseName` enum.
+#[proc_macro_attribute]
+pub fn function_enum(attributes: pm::TokenStream, tokens: pm::TokenStream) -> pm::TokenStream {
+    let item = syn::parse_macro_input!(tokens as ItemEnum);
+    let res = enums::function_enum(item, attributes.into());
+
+    match res {
+        Ok(ts) => ts,
+        Err(e) => e.into_compile_error(),
+    }
+    .into()
+}
+
+/// Create an enum representing all possible base names, with names in upper camel case.
+///
+/// Applied to an empty enum.
+#[proc_macro_attribute]
+pub fn base_name_enum(attributes: pm::TokenStream, tokens: pm::TokenStream) -> pm::TokenStream {
+    let item = syn::parse_macro_input!(tokens as ItemEnum);
+    let res = enums::base_name_enum(item, attributes.into());
+
+    match res {
+        Ok(ts) => ts,
+        Err(e) => e.into_compile_error(),
+    }
+    .into()
+}
+
+/// Do something for each function present in this crate.
+///
+/// Takes a callback macro and invokes it multiple times, once for each function that
+/// this crate exports. This makes it easy to create generic tests, benchmarks, or other checks
+/// and apply them to each symbol.
+///
+/// Additionally, the `extra` and `fn_extra` patterns can make use of magic identifiers:
+///
+/// - `MACRO_FN_NAME`: gets replaced with the name of the function on that invocation.
+/// - `MACRO_FN_NAME_NORMALIZED`: similar to the above, but removes suffixes so e.g. `sinf` becomes
+///   `sin`, `cosf128` becomes `cos`, etc.
+///
+/// Invoke as:
+///
+/// ```
+/// // Macro that is invoked once per function
+/// macro_rules! callback_macro {
+///     (
+///         // Name of that function
+///         fn_name: $fn_name:ident,
+///         // The basic float type for this function (e.g. `f32`, `f64`)
+///         FTy: $FTy:ty,
+///         // Function signature of the C version (e.g. `fn(f32, &mut f32) -> f32`)
+///         CFn: $CFn:ty,
+///         // A tuple representing the C version's arguments (e.g. `(f32, &mut f32)`)
+///         CArgs: $CArgs:ty,
+///         // The C version's return type (e.g. `f32`)
+///         CRet: $CRet:ty,
+///         // Function signature of the Rust version (e.g. `fn(f32) -> (f32, f32)`)
+///         RustFn: $RustFn:ty,
+///         // A tuple representing the Rust version's arguments (e.g. `(f32,)`)
+///         RustArgs: $RustArgs:ty,
+///         // The Rust version's return type (e.g. `(f32, f32)`)
+///         RustRet: $RustRet:ty,
+///         // True if this is part of `libm`'s public API
+///         public: $public:expr,
+///         // Attributes for the current function, if any
+///         attrs: [$($attr:meta),*],
+///         // Extra tokens passed directly (if any)
+///         extra: [$extra:ident],
+///         // Extra function-tokens passed directly (if any)
+///         fn_extra: $fn_extra:expr,
+///     ) => { };
+/// }
+///
+/// // All fields except for `callback` are optional.
+/// libm_macros::for_each_function! {
+///     // The macro to invoke as a callback
+///     callback: callback_macro,
+///     // Which types to include either as a list (`[CFn, RustFn, RustArgs]`) or "all"
+///     emit_types: all,
+///     // Functions to skip, i.e. `callback` shouldn't be called at all for these.
+///     skip: [sin, cos],
+///     // Attributes passed as `attrs` for specific functions. For example, here the invocation
+///     // with `sinf` and that with `cosf` will both get `meta1` and `meta2`, but no others will.
+///     //
+///     // Note that `f16_enabled` and `f128_enabled` will always get emitted regardless of whether
+///     // or not this is specified.
+///     attributes: [
+///         #[meta1]
+///         #[meta2]
+///         [sinf, cosf],
+///     ],
+///     // Any tokens that should be passed directly to all invocations of the callback. This can
+///     // be used to pass local variables or other things the macro needs access to.
+///     extra: [foo],
+///     // Similar to `extra`, but allow providing a pattern for only specific functions. Uses
+///     // a simplified match-like syntax.
+///     fn_extra: match MACRO_FN_NAME {
+///         hypot | hypotf => |x| x.hypot(),
+///         // `ALL_*` magic matchers also work to extract specific types
+///         ALL_F64 => |x| x,
+///         // The default pattern gets applied to everything that did not match
+///         _ => |x| x,
+///     },
+/// }
+/// ```
+#[proc_macro]
+pub fn for_each_function(tokens: pm::TokenStream) -> pm::TokenStream {
+    let input = syn::parse_macro_input!(tokens as Invocation);
+
+    let res = StructuredInput::from_fields(input)
+        .and_then(|mut s_in| validate(&mut s_in).map(|fn_list| (s_in, fn_list)))
+        .and_then(|(s_in, fn_list)| expand(s_in, &fn_list));
+
+    match res {
+        Ok(ts) => ts.into(),
+        Err(e) => e.into_compile_error().into(),
+    }
+}
+
+/// Check for any input that is structurally correct but has other problems.
+///
+/// Returns the list of function names that we should expand for.
+fn validate(input: &mut StructuredInput) -> syn::Result<Vec<&'static MathOpInfo>> {
+    // Replace magic mappers with a list of relevant functions.
+    if let Some(map) = &mut input.fn_extra {
+        for (name, ty) in [
+            ("ALL_F16", FloatTy::F16),
+            ("ALL_F32", FloatTy::F32),
+            ("ALL_F64", FloatTy::F64),
+            ("ALL_F128", FloatTy::F128),
+        ] {
+            let Some(k) = map.keys().find(|key| *key == name) else {
+                continue;
+            };
+
+            let key = k.clone();
+            let val = map.remove(&key).unwrap();
+
+            for op in ALL_OPERATIONS.iter().filter(|op| op.float_ty == ty) {
+                map.insert(Ident::new(op.name, key.span()), val.clone());
+            }
+        }
+    }
+
+    // Collect lists of all functions that are provided as macro inputs in various fields (only,
+    // skip, attributes).
+    let attr_mentions = input
+        .attributes
+        .iter()
+        .flat_map(|map_list| map_list.iter())
+        .flat_map(|attr_map| attr_map.names.iter());
+    let only_mentions = input.only.iter().flat_map(|only_list| only_list.iter());
+    let fn_extra_mentions = input
+        .fn_extra
+        .iter()
+        .flat_map(|v| v.keys())
+        .filter(|name| *name != "_");
+    let all_mentioned_fns = input
+        .skip
+        .iter()
+        .chain(only_mentions)
+        .chain(attr_mentions)
+        .chain(fn_extra_mentions);
+
+    // Make sure that every function mentioned is a real function
+    for mentioned in all_mentioned_fns {
+        if !ALL_OPERATIONS.iter().any(|func| mentioned == func.name) {
+            let e = syn::Error::new(
+                mentioned.span(),
+                format!("unrecognized function name `{mentioned}`"),
+            );
+            return Err(e);
+        }
+    }
+
+    if !input.skip.is_empty() && input.only.is_some() {
+        let e = syn::Error::new(
+            input.only_span.unwrap(),
+            "only one of `skip` or `only` may be specified",
+        );
+        return Err(e);
+    }
+
+    // Construct a list of what we intend to expand
+    let mut fn_list = Vec::new();
+    for func in ALL_OPERATIONS.iter() {
+        let fn_name = func.name;
+        // If we have an `only` list and it does _not_ contain this function name, skip it
+        if input
+            .only
+            .as_ref()
+            .is_some_and(|only| !only.iter().any(|o| o == fn_name))
+        {
+            continue;
+        }
+
+        // If there is a `skip` list that contains this function name, skip it
+        if input.skip.iter().any(|s| s == fn_name) {
+            continue;
+        }
+
+        // Omit f16 and f128 functions if requested
+        if input.skip_f16_f128 && (func.float_ty == FloatTy::F16 || func.float_ty == FloatTy::F128)
+        {
+            continue;
+        }
+
+        // Run everything else
+        fn_list.push(func);
+    }
+
+    // Types that the user would like us to provide in the macro
+    let mut add_all_types = false;
+    for ty in &input.emit_types {
+        let ty_name = ty.to_string();
+        if ty_name == "all" {
+            add_all_types = true;
+            continue;
+        }
+
+        // Check that all requested types are valid
+        if !KNOWN_TYPES.contains(&ty_name.as_str()) {
+            let e = syn::Error::new(
+                ty_name.span(),
+                format!("unrecognized type identifier `{ty_name}`"),
+            );
+            return Err(e);
+        }
+    }
+
+    if add_all_types {
+        // Ensure that if `all` was specified that nothing else was
+        if input.emit_types.len() > 1 {
+            let e = syn::Error::new(
+                input.emit_types_span.unwrap(),
+                "if `all` is specified, no other type identifiers may be given",
+            );
+            return Err(e);
+        }
+
+        // ...and then add all types
+        input.emit_types.clear();
+        for ty in KNOWN_TYPES {
+            let ident = Ident::new(ty, Span::call_site());
+            input.emit_types.push(ident);
+        }
+    }
+
+    if let Some(map) = &input.fn_extra
+        && !map.keys().any(|key| key == "_")
+    {
+        // No default provided; make sure every expected function is covered
+        let mut fns_not_covered = Vec::new();
+        for func in &fn_list {
+            if !map.keys().any(|key| key == func.name) {
+                // `name` was not mentioned in the `match` statement
+                fns_not_covered.push(func);
+            }
+        }
+
+        if !fns_not_covered.is_empty() {
+            let e = syn::Error::new(
+                input.fn_extra_span.unwrap(),
+                format!(
+                    "`fn_extra`: no default `_` pattern specified and the following \
+                     patterns are not covered: {fns_not_covered:#?}"
+                ),
+            );
+            return Err(e);
+        }
+    };
+
+    Ok(fn_list)
+}
+
+/// Expand our structured macro input into invocations of the callback macro.
+fn expand(input: StructuredInput, fn_list: &[&MathOpInfo]) -> syn::Result<pm2::TokenStream> {
+    let mut out = pm2::TokenStream::new();
+    let default_ident = Ident::new("_", Span::call_site());
+    let callback = input.callback;
+
+    for func in fn_list {
+        let fn_name = Ident::new(func.name, Span::call_site());
+
+        // Prepare attributes in an `attrs: ...` field
+        let mut meta_fields = Vec::new();
+        if let Some(attrs) = &input.attributes {
+            let meta_iter = attrs
+                .iter()
+                .filter(|map| map.names.contains(&fn_name))
+                .flat_map(|map| &map.meta)
+                .map(|v| v.into_token_stream());
+
+            meta_fields.extend(meta_iter);
+        }
+
+        // Always emit f16 and f128 meta so this doesn't need to be repeated everywhere
+        if func.rust_sig.args.contains(&Ty::F16) || func.rust_sig.returns.contains(&Ty::F16) {
+            let ts = quote! { cfg(f16_enabled) };
+            meta_fields.push(ts);
+        }
+        if func.rust_sig.args.contains(&Ty::F128) || func.rust_sig.returns.contains(&Ty::F128) {
+            let ts = quote! { cfg(f128_enabled) };
+            meta_fields.push(ts);
+        }
+
+        let meta_field = quote! { attrs: [ #( #meta_fields ),* ], };
+
+        // Prepare extra in an `extra: ...` field, running the replacer
+        let extra_field = match input.extra.clone() {
+            Some(mut extra) => {
+                let mut v = MacroReplace::new(func.name);
+                v.visit_expr_mut(&mut extra);
+                v.finish()?;
+
+                quote! { extra: #extra, }
+            }
+            None => pm2::TokenStream::new(),
+        };
+
+        // Prepare function-specific extra in a `fn_extra: ...` field, running the replacer
+        let fn_extra_field = match input.fn_extra {
+            Some(ref map) => {
+                let mut fn_extra = map
+                    .get(&fn_name)
+                    .or_else(|| map.get(&default_ident))
+                    .unwrap()
+                    .clone();
+
+                let mut v = MacroReplace::new(func.name);
+                v.visit_expr_mut(&mut fn_extra);
+                v.finish()?;
+
+                quote! { fn_extra: #fn_extra, }
+            }
+            None => pm2::TokenStream::new(),
+        };
+
+        let base_fty = func.float_ty;
+        let c_args = &func.c_sig.args;
+        let c_ret = &func.c_sig.returns;
+        let rust_args = &func.rust_sig.args;
+        let rust_ret = &func.rust_sig.returns;
+        let public = func.public;
+
+        let mut ty_fields = Vec::new();
+        for ty in &input.emit_types {
+            let field = match ty.to_string().as_str() {
+                "FTy" => quote! { FTy: #base_fty, },
+                "CFn" => quote! { CFn: fn( #(#c_args),* ,) -> ( #(#c_ret),* ), },
+                "CArgs" => quote! { CArgs: ( #(#c_args),* ,), },
+                "CRet" => quote! { CRet: ( #(#c_ret),* ), },
+                "RustFn" => quote! { RustFn: fn( #(#rust_args),* ,) -> ( #(#rust_ret),* ), },
+                "RustArgs" => quote! { RustArgs: ( #(#rust_args),* ,), },
+                "RustRet" => quote! { RustRet: ( #(#rust_ret),* ), },
+                "public" => quote! { public: #public, },
+                _ => unreachable!("checked in validation"),
+            };
+            ty_fields.push(field);
+        }
+
+        let new = quote! {
+            #callback! {
+                fn_name: #fn_name,
+                #( #ty_fields )*
+                #meta_field
+                #extra_field
+                #fn_extra_field
+            }
+        };
+
+        out.extend(new);
+    }
+
+    Ok(out)
+}
+
+/// Visitor to replace "magic" identifiers that we allow: `MACRO_FN_NAME` and
+/// `MACRO_FN_NAME_NORMALIZED`.
+struct MacroReplace {
+    fn_name: &'static str,
+    /// The name with any trailing `f`, `f16`, or `f128` suffix removed (the base name).
+    norm_name: String,
+    error: Option<syn::Error>,
+}
+
+impl MacroReplace {
+    fn new(name: &'static str) -> Self {
+        let norm_name = base_name(name);
+        Self {
+            fn_name: name,
+            norm_name: norm_name.to_owned(),
+            error: None,
+        }
+    }
+
+    fn finish(self) -> syn::Result<()> {
+        match self.error {
+            Some(e) => Err(e),
+            None => Ok(()),
+        }
+    }
+
+    fn visit_ident_inner(&mut self, i: &mut Ident) {
+        let s = i.to_string();
+        if !s.starts_with("MACRO") || self.error.is_some() {
+            return;
+        }
+
+        match s.as_str() {
+            "MACRO_FN_NAME" => *i = Ident::new(self.fn_name, i.span()),
+            "MACRO_FN_NAME_NORMALIZED" => *i = Ident::new(&self.norm_name, i.span()),
+            _ => {
+                self.error = Some(syn::Error::new(
+                    i.span(),
+                    format!("unrecognized meta expression `{s}`"),
+                ));
+            }
+        }
+    }
+}
+
+impl VisitMut for MacroReplace {
+    fn visit_ident_mut(&mut self, i: &mut Ident) {
+        self.visit_ident_inner(i);
+        syn::visit_mut::visit_ident_mut(self, i);
+    }
+}
+
+/// Return the unsuffixed version of a function name; e.g. `abs` and `absf` both return `abs`,
+/// `lgamma_r` and `lgammaf_r` both return `lgamma_r`.
+fn base_name(name: &str) -> &str {
+    let known_mappings = &[
+        ("erff", "erf"),
+        ("erf", "erf"),
+        ("lgammaf_r", "lgamma_r"),
+        ("modff", "modf"),
+        ("modf", "modf"),
+    ];
+
+    match known_mappings.iter().find(|known| known.0 == name) {
+        Some(found) => found.1,
+        None => name
+            .strip_suffix("f")
+            .or_else(|| name.strip_suffix("f16"))
+            .or_else(|| name.strip_suffix("f128"))
+            .unwrap_or(name),
+    }
+}
+
+impl ToTokens for Ty {
+    fn to_tokens(&self, tokens: &mut pm2::TokenStream) {
+        let ts = match self {
+            Ty::F16 => quote! { f16 },
+            Ty::F32 => quote! { f32 },
+            Ty::F64 => quote! { f64 },
+            Ty::F128 => quote! { f128 },
+            Ty::I32 => quote! { i32 },
+            Ty::CInt => quote! { ::core::ffi::c_int },
+            Ty::MutF16 => quote! { &'a mut f16 },
+            Ty::MutF32 => quote! { &'a mut f32 },
+            Ty::MutF64 => quote! { &'a mut f64 },
+            Ty::MutF128 => quote! { &'a mut f128 },
+            Ty::MutI32 => quote! { &'a mut i32 },
+            Ty::MutCInt => quote! { &'a mut core::ffi::c_int },
+        };
+
+        tokens.extend(ts);
+    }
+}
+impl ToTokens for FloatTy {
+    fn to_tokens(&self, tokens: &mut pm2::TokenStream) {
+        let ts = match self {
+            FloatTy::F16 => quote! { f16 },
+            FloatTy::F32 => quote! { f32 },
+            FloatTy::F64 => quote! { f64 },
+            FloatTy::F128 => quote! { f128 },
+        };
+
+        tokens.extend(ts);
+    }
+}
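The suffix-stripping order in `base_name` matters: a plain `f` suffix is tried first, and `f16`/`f128` only match when the name does not simply end in `f`. A quick standalone check (not part of the crate, and omitting the special-case table) of that behaviour:

```
fn base_name(name: &str) -> &str {
    // Same fallback chain as above, minus the `known_mappings` table.
    name.strip_suffix("f")
        .or_else(|| name.strip_suffix("f16"))
        .or_else(|| name.strip_suffix("f128"))
        .unwrap_or(name)
}

fn main() {
    assert_eq!(base_name("sinf"), "sin");    // plain `f` suffix
    assert_eq!(base_name("sinf16"), "sin");  // ends in "16", so only the `f16` arm matches
    assert_eq!(base_name("sinf128"), "sin"); // likewise for `f128`
    assert_eq!(base_name("sqrt"), "sqrt");   // no suffix, returned unchanged
    // Names such as `modf`, whose trailing `f` is part of the name itself,
    // are why the `known_mappings` table exists in the real function.
}
```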
diff --git a/library/compiler-builtins/crates/libm-macros/src/parse.rs b/library/compiler-builtins/crates/libm-macros/src/parse.rs
new file mode 100644
index 00000000000..4876f3ef726
--- /dev/null
+++ b/library/compiler-builtins/crates/libm-macros/src/parse.rs
@@ -0,0 +1,296 @@
+use std::collections::BTreeMap;
+
+use proc_macro2::Span;
+use quote::ToTokens;
+use syn::parse::{Parse, ParseStream, Parser};
+use syn::punctuated::Punctuated;
+use syn::spanned::Spanned;
+use syn::token::{self, Comma};
+use syn::{Arm, Attribute, Expr, ExprMatch, Ident, LitBool, Meta, Token, bracketed};
+
+/// The input to our macro; just a list of `field: value` items.
+#[derive(Debug)]
+pub struct Invocation {
+    fields: Punctuated<Mapping, Comma>,
+}
+
+impl Parse for Invocation {
+    fn parse(input: ParseStream) -> syn::Result<Self> {
+        Ok(Self {
+            fields: input.parse_terminated(Mapping::parse, Token![,])?,
+        })
+    }
+}
+
+/// A `key: expression` mapping with nothing else. Basically a simplified `syn::Field`.
+#[derive(Debug)]
+struct Mapping {
+    name: Ident,
+    _sep: Token![:],
+    expr: Expr,
+}
+
+impl Parse for Mapping {
+    fn parse(input: ParseStream) -> syn::Result<Self> {
+        Ok(Self {
+            name: input.parse()?,
+            _sep: input.parse()?,
+            expr: input.parse()?,
+        })
+    }
+}
+
+/// The input provided to our proc macro, after parsing into the form we expect.
+#[derive(Debug)]
+pub struct StructuredInput {
+    /// Macro to invoke once per function
+    pub callback: Ident,
+    /// Whether or not to provide `CFn`, `CArgs`, `RustFn`, etc. This is really only needed
+    /// once per crate to set up the main trait.
+    pub emit_types: Vec<Ident>,
+    /// Skip these functions
+    pub skip: Vec<Ident>,
+    /// If true, omit f16 and f128 functions that aren't present in other libraries.
+    pub skip_f16_f128: bool,
+    /// Invoke only for these functions
+    pub only: Option<Vec<Ident>>,
+    /// Attributes that get applied to specific functions
+    pub attributes: Option<Vec<AttributeMap>>,
+    /// Extra expressions to pass to all invocations of the macro
+    pub extra: Option<Expr>,
+    /// Per-function extra expressions to pass to the macro
+    pub fn_extra: Option<BTreeMap<Ident, Expr>>,
+    // For diagnostics
+    pub emit_types_span: Option<Span>,
+    pub only_span: Option<Span>,
+    pub fn_extra_span: Option<Span>,
+}
+
+impl StructuredInput {
+    pub fn from_fields(input: Invocation) -> syn::Result<Self> {
+        let mut map: Vec<_> = input.fields.into_iter().collect();
+        let cb_expr = expect_field(&mut map, "callback")?;
+        let emit_types_expr = expect_field(&mut map, "emit_types").ok();
+        let skip_expr = expect_field(&mut map, "skip").ok();
+        let skip_f16_f128 = expect_field(&mut map, "skip_f16_f128").ok();
+        let only_expr = expect_field(&mut map, "only").ok();
+        let attr_expr = expect_field(&mut map, "attributes").ok();
+        let extra = expect_field(&mut map, "extra").ok();
+        let fn_extra = expect_field(&mut map, "fn_extra").ok();
+
+        if !map.is_empty() {
+            Err(syn::Error::new(
+                map.first().unwrap().name.span(),
+                format!("unexpected fields {map:?}"),
+            ))?;
+        }
+
+        let emit_types_span = emit_types_expr.as_ref().map(|expr| expr.span());
+        let emit_types = match emit_types_expr {
+            Some(expr) => Parser::parse2(parse_ident_or_array, expr.into_token_stream())?,
+            None => Vec::new(),
+        };
+
+        let skip = match skip_expr {
+            Some(expr) => Parser::parse2(parse_ident_array, expr.into_token_stream())?,
+            None => Vec::new(),
+        };
+
+        let skip_f16_f128 = match skip_f16_f128 {
+            Some(expr) => expect_litbool(expr)?.value,
+            None => false,
+        };
+
+        let only_span = only_expr.as_ref().map(|expr| expr.span());
+        let only = match only_expr {
+            Some(expr) => Some(Parser::parse2(parse_ident_array, expr.into_token_stream())?),
+            None => None,
+        };
+
+        let attributes = match attr_expr {
+            Some(expr) => {
+                let mut attributes = Vec::new();
+                let attr_exprs = Parser::parse2(parse_expr_array, expr.into_token_stream())?;
+
+                for attr in attr_exprs {
+                    attributes.push(syn::parse2(attr.into_token_stream())?);
+                }
+                Some(attributes)
+            }
+            None => None,
+        };
+
+        let fn_extra_span = fn_extra.as_ref().map(|expr| expr.span());
+        let fn_extra = match fn_extra {
+            Some(expr) => Some(extract_fn_extra_field(expr)?),
+            None => None,
+        };
+
+        Ok(Self {
+            callback: expect_ident(cb_expr)?,
+            emit_types,
+            skip,
+            skip_f16_f128,
+            only,
+            only_span,
+            attributes,
+            extra,
+            fn_extra,
+            fn_extra_span,
+            emit_types_span,
+        })
+    }
+}
+
+fn extract_fn_extra_field(expr: Expr) -> syn::Result<BTreeMap<Ident, Expr>> {
+    let Expr::Match(mexpr) = expr else {
+        let e = syn::Error::new(expr.span(), "`fn_extra` expects a match expression");
+        return Err(e);
+    };
+
+    let ExprMatch {
+        attrs,
+        match_token: _,
+        expr,
+        brace_token: _,
+        arms,
+    } = mexpr;
+
+    expect_empty_attrs(&attrs)?;
+
+    let match_on = expect_ident(*expr)?;
+    if match_on != "MACRO_FN_NAME" {
+        let e = syn::Error::new(match_on.span(), "only allowed to match on `MACRO_FN_NAME`");
+        return Err(e);
+    }
+
+    let mut res = BTreeMap::new();
+
+    for arm in arms {
+        let Arm {
+            attrs,
+            pat,
+            guard,
+            fat_arrow_token: _,
+            body,
+            comma: _,
+        } = arm;
+
+        expect_empty_attrs(&attrs)?;
+
+        let keys = match pat {
+            syn::Pat::Wild(w) => vec![Ident::new("_", w.span())],
+            _ => Parser::parse2(parse_ident_pat, pat.into_token_stream())?,
+        };
+
+        if let Some(guard) = guard {
+            let e = syn::Error::new(guard.0.span(), "no guards allowed in this position");
+            return Err(e);
+        }
+
+        for key in keys {
+            let inserted = res.insert(key.clone(), *body.clone());
+            if inserted.is_some() {
+                let e = syn::Error::new(key.span(), format!("key `{key}` specified twice"));
+                return Err(e);
+            }
+        }
+    }
+
+    Ok(res)
+}
+
+fn expect_empty_attrs(attrs: &[Attribute]) -> syn::Result<()> {
+    if attrs.is_empty() {
+        return Ok(());
+    }
+
+    let e = syn::Error::new(
+        attrs.first().unwrap().span(),
+        "no attributes allowed in this position",
+    );
+    Err(e)
+}
+
+/// Extract a named field from a map, raising an error if it doesn't exist.
+fn expect_field(v: &mut Vec<Mapping>, name: &str) -> syn::Result<Expr> {
+    let pos = v.iter().position(|v| v.name == name).ok_or_else(|| {
+        syn::Error::new(
+            Span::call_site(),
+            format!("missing expected field `{name}`"),
+        )
+    })?;
+
+    Ok(v.remove(pos).expr)
+}
+
+/// Coerce an expression into a simple identifier.
+fn expect_ident(expr: Expr) -> syn::Result<Ident> {
+    syn::parse2(expr.into_token_stream())
+}
+
+/// Coerce an expression into a boolean literal.
+fn expect_litbool(expr: Expr) -> syn::Result<LitBool> {
+    syn::parse2(expr.into_token_stream())
+}
+
+/// Parse either a single identifier (`foo`) or an array of identifiers (`[foo, bar, baz]`).
+fn parse_ident_or_array(input: ParseStream) -> syn::Result<Vec<Ident>> {
+    if !input.peek(token::Bracket) {
+        return Ok(vec![input.parse()?]);
+    }
+
+    parse_ident_array(input)
+}
+
+/// Parse an array of expressions.
+fn parse_expr_array(input: ParseStream) -> syn::Result<Vec<Expr>> {
+    let content;
+    let _ = bracketed!(content in input);
+    let fields = content.parse_terminated(Expr::parse, Token![,])?;
+    Ok(fields.into_iter().collect())
+}
+
+/// Parse an array of idents, e.g. `[foo, bar, baz]`.
+fn parse_ident_array(input: ParseStream) -> syn::Result<Vec<Ident>> {
+    let content;
+    let _ = bracketed!(content in input);
+    let fields = content.parse_terminated(Ident::parse, Token![,])?;
+    Ok(fields.into_iter().collect())
+}
+
+/// Parse a pattern of idents, specifically `(foo | bar | baz)`.
+fn parse_ident_pat(input: ParseStream) -> syn::Result<Vec<Ident>> {
+    if !input.peek2(Token![|]) {
+        return Ok(vec![input.parse()?]);
+    }
+
+    let fields = Punctuated::<Ident, Token![|]>::parse_separated_nonempty(input)?;
+    Ok(fields.into_iter().collect())
+}
+
+/// A mapping of attributes to identifiers (just a simplified `Expr`).
+///
+/// Expressed as:
+///
+/// ```ignore
+/// #[meta1]
+/// #[meta2]
+/// [foo, bar, baz]
+/// ```
+#[derive(Debug)]
+pub struct AttributeMap {
+    pub meta: Vec<Meta>,
+    pub names: Vec<Ident>,
+}
+
+impl Parse for AttributeMap {
+    fn parse(input: ParseStream) -> syn::Result<Self> {
+        let attrs = input.call(Attribute::parse_outer)?;
+
+        Ok(Self {
+            meta: attrs.into_iter().map(|a| a.meta).collect(),
+            names: parse_ident_array(input)?,
+        })
+    }
+}
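Editor's note: a rough sketch of an invocation shape accepted by the field parsing above. The callback and function names are placeholders, not part of this patch; compare with the real invocations in `tests/basic.rs` further down.

    libm_macros::for_each_function! {
        callback: my_callback,              // required; must be a plain identifier
        emit_types: [RustFn, RustArgs],     // optional; single ident or `[ident, ...]`
        skip: [sin, cos],                   // optional ident array
        skip_f16_f128: true,                // optional bool literal
        only: [sinf, cosf],                 // optional ident array
        attributes: [
            #[allow(dead_code)]
            [sinf, cosf]
        ],                                  // optional; attributes applied to the named functions
        extra: ["passed", "through"],       // optional; forwarded verbatim to the callback
        fn_extra: match MACRO_FN_NAME {     // optional; per-function expressions
            sin | cos => 1,
            _ => 0,
        }
    }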
diff --git a/library/compiler-builtins/crates/libm-macros/src/shared.rs b/library/compiler-builtins/crates/libm-macros/src/shared.rs
new file mode 100644
index 00000000000..1cefe4e8c7e
--- /dev/null
+++ b/library/compiler-builtins/crates/libm-macros/src/shared.rs
@@ -0,0 +1,590 @@
+/* List of all functions; this file is shared between `libm-macros` and `libm-test`. */
+
+use std::fmt;
+use std::sync::LazyLock;
+
+struct NestedOp {
+    float_ty: FloatTy,
+    rust_sig: Signature,
+    c_sig: Option<Signature>,
+    fn_list: &'static [&'static str],
+    public: bool,
+}
+
+/// We need a flat list to work with most of the time, but define things as a more convenient
+/// nested list.
+const ALL_OPERATIONS_NESTED: &[NestedOp] = &[
+    NestedOp {
+        // `fn(f16) -> f16`
+        float_ty: FloatTy::F16,
+        rust_sig: Signature {
+            args: &[Ty::F16],
+            returns: &[Ty::F16],
+        },
+        c_sig: None,
+        fn_list: &[
+            "ceilf16",
+            "fabsf16",
+            "floorf16",
+            "rintf16",
+            "roundevenf16",
+            "roundf16",
+            "sqrtf16",
+            "truncf16",
+        ],
+        public: true,
+    },
+    NestedOp {
+        // `fn(f32) -> f32`
+        float_ty: FloatTy::F32,
+        rust_sig: Signature {
+            args: &[Ty::F32],
+            returns: &[Ty::F32],
+        },
+        c_sig: None,
+        fn_list: &[
+            "acosf",
+            "acoshf",
+            "asinf",
+            "asinhf",
+            "atanf",
+            "atanhf",
+            "cbrtf",
+            "ceilf",
+            "cosf",
+            "coshf",
+            "erfcf",
+            "erff",
+            "exp10f",
+            "exp2f",
+            "expf",
+            "expm1f",
+            "fabsf",
+            "floorf",
+            "j0f",
+            "j1f",
+            "lgammaf",
+            "log10f",
+            "log1pf",
+            "log2f",
+            "logf",
+            "rintf",
+            "roundevenf",
+            "roundf",
+            "sinf",
+            "sinhf",
+            "sqrtf",
+            "tanf",
+            "tanhf",
+            "tgammaf",
+            "truncf",
+            "y0f",
+            "y1f",
+        ],
+        public: true,
+    },
+    NestedOp {
+        // `(f64) -> f64`
+        float_ty: FloatTy::F64,
+        rust_sig: Signature {
+            args: &[Ty::F64],
+            returns: &[Ty::F64],
+        },
+        c_sig: None,
+        fn_list: &[
+            "acos",
+            "acosh",
+            "asin",
+            "asinh",
+            "atan",
+            "atanh",
+            "cbrt",
+            "ceil",
+            "cos",
+            "cosh",
+            "erf",
+            "erfc",
+            "exp",
+            "exp10",
+            "exp2",
+            "expm1",
+            "fabs",
+            "floor",
+            "j0",
+            "j1",
+            "lgamma",
+            "log",
+            "log10",
+            "log1p",
+            "log2",
+            "rint",
+            "round",
+            "roundeven",
+            "sin",
+            "sinh",
+            "sqrt",
+            "tan",
+            "tanh",
+            "tgamma",
+            "trunc",
+            "y0",
+            "y1",
+        ],
+        public: true,
+    },
+    NestedOp {
+        // `fn(f128) -> f128`
+        float_ty: FloatTy::F128,
+        rust_sig: Signature {
+            args: &[Ty::F128],
+            returns: &[Ty::F128],
+        },
+        c_sig: None,
+        fn_list: &[
+            "ceilf128",
+            "fabsf128",
+            "floorf128",
+            "rintf128",
+            "roundevenf128",
+            "roundf128",
+            "sqrtf128",
+            "truncf128",
+        ],
+        public: true,
+    },
+    NestedOp {
+        // `(f16, f16) -> f16`
+        float_ty: FloatTy::F16,
+        rust_sig: Signature {
+            args: &[Ty::F16, Ty::F16],
+            returns: &[Ty::F16],
+        },
+        c_sig: None,
+        fn_list: &[
+            "copysignf16",
+            "fdimf16",
+            "fmaxf16",
+            "fmaximum_numf16",
+            "fmaximumf16",
+            "fminf16",
+            "fminimum_numf16",
+            "fminimumf16",
+            "fmodf16",
+        ],
+        public: true,
+    },
+    NestedOp {
+        // `(f32, f32) -> f32`
+        float_ty: FloatTy::F32,
+        rust_sig: Signature {
+            args: &[Ty::F32, Ty::F32],
+            returns: &[Ty::F32],
+        },
+        c_sig: None,
+        fn_list: &[
+            "atan2f",
+            "copysignf",
+            "fdimf",
+            "fmaxf",
+            "fmaximum_numf",
+            "fmaximumf",
+            "fminf",
+            "fminimum_numf",
+            "fminimumf",
+            "fmodf",
+            "hypotf",
+            "nextafterf",
+            "powf",
+            "remainderf",
+        ],
+        public: true,
+    },
+    NestedOp {
+        // `(f64, f64) -> f64`
+        float_ty: FloatTy::F64,
+        rust_sig: Signature {
+            args: &[Ty::F64, Ty::F64],
+            returns: &[Ty::F64],
+        },
+        c_sig: None,
+        fn_list: &[
+            "atan2",
+            "copysign",
+            "fdim",
+            "fmax",
+            "fmaximum",
+            "fmaximum_num",
+            "fmin",
+            "fminimum",
+            "fminimum_num",
+            "fmod",
+            "hypot",
+            "nextafter",
+            "pow",
+            "remainder",
+        ],
+        public: true,
+    },
+    NestedOp {
+        // `(f128, f128) -> f128`
+        float_ty: FloatTy::F128,
+        rust_sig: Signature {
+            args: &[Ty::F128, Ty::F128],
+            returns: &[Ty::F128],
+        },
+        c_sig: None,
+        fn_list: &[
+            "copysignf128",
+            "fdimf128",
+            "fmaxf128",
+            "fmaximum_numf128",
+            "fmaximumf128",
+            "fminf128",
+            "fminimum_numf128",
+            "fminimumf128",
+            "fmodf128",
+        ],
+        public: true,
+    },
+    NestedOp {
+        // `(f32, f32, f32) -> f32`
+        float_ty: FloatTy::F32,
+        rust_sig: Signature {
+            args: &[Ty::F32, Ty::F32, Ty::F32],
+            returns: &[Ty::F32],
+        },
+        c_sig: None,
+        fn_list: &["fmaf"],
+        public: true,
+    },
+    NestedOp {
+        // `(f64, f64, f64) -> f64`
+        float_ty: FloatTy::F64,
+        rust_sig: Signature {
+            args: &[Ty::F64, Ty::F64, Ty::F64],
+            returns: &[Ty::F64],
+        },
+        c_sig: None,
+        fn_list: &["fma"],
+        public: true,
+    },
+    NestedOp {
+        // `(f128, f128, f128) -> f128`
+        float_ty: FloatTy::F128,
+        rust_sig: Signature {
+            args: &[Ty::F128, Ty::F128, Ty::F128],
+            returns: &[Ty::F128],
+        },
+        c_sig: None,
+        fn_list: &["fmaf128"],
+        public: true,
+    },
+    NestedOp {
+        // `(f32) -> i32`
+        float_ty: FloatTy::F32,
+        rust_sig: Signature {
+            args: &[Ty::F32],
+            returns: &[Ty::I32],
+        },
+        c_sig: None,
+        fn_list: &["ilogbf"],
+        public: true,
+    },
+    NestedOp {
+        // `(f64) -> i32`
+        float_ty: FloatTy::F64,
+        rust_sig: Signature {
+            args: &[Ty::F64],
+            returns: &[Ty::I32],
+        },
+        c_sig: None,
+        fn_list: &["ilogb"],
+        public: true,
+    },
+    NestedOp {
+        // `(i32, f32) -> f32`
+        float_ty: FloatTy::F32,
+        rust_sig: Signature {
+            args: &[Ty::I32, Ty::F32],
+            returns: &[Ty::F32],
+        },
+        c_sig: None,
+        fn_list: &["jnf", "ynf"],
+        public: true,
+    },
+    NestedOp {
+        // `(i32, f64) -> f64`
+        float_ty: FloatTy::F64,
+        rust_sig: Signature {
+            args: &[Ty::I32, Ty::F64],
+            returns: &[Ty::F64],
+        },
+        c_sig: None,
+        fn_list: &["jn", "yn"],
+        public: true,
+    },
+    NestedOp {
+        // `(f16, i32) -> f16`
+        float_ty: FloatTy::F16,
+        rust_sig: Signature {
+            args: &[Ty::F16, Ty::I32],
+            returns: &[Ty::F16],
+        },
+        c_sig: None,
+        fn_list: &["ldexpf16", "scalbnf16"],
+        public: true,
+    },
+    NestedOp {
+        // `(f32, i32) -> f32`
+        float_ty: FloatTy::F32,
+        rust_sig: Signature {
+            args: &[Ty::F32, Ty::I32],
+            returns: &[Ty::F32],
+        },
+        c_sig: None,
+        fn_list: &["ldexpf", "scalbnf"],
+        public: true,
+    },
+    NestedOp {
+        // `(f64, i32) -> f64`
+        float_ty: FloatTy::F64,
+        rust_sig: Signature {
+            args: &[Ty::F64, Ty::I32],
+            returns: &[Ty::F64],
+        },
+        c_sig: None,
+        fn_list: &["ldexp", "scalbn"],
+        public: true,
+    },
+    NestedOp {
+        // `(f128, i32) -> f128`
+        float_ty: FloatTy::F128,
+        rust_sig: Signature {
+            args: &[Ty::F128, Ty::I32],
+            returns: &[Ty::F128],
+        },
+        c_sig: None,
+        fn_list: &["ldexpf128", "scalbnf128"],
+        public: true,
+    },
+    NestedOp {
+        // `(f32, &mut f32) -> f32` as `(f32) -> (f32, f32)`
+        float_ty: FloatTy::F32,
+        rust_sig: Signature {
+            args: &[Ty::F32],
+            returns: &[Ty::F32, Ty::F32],
+        },
+        c_sig: Some(Signature {
+            args: &[Ty::F32, Ty::MutF32],
+            returns: &[Ty::F32],
+        }),
+        fn_list: &["modff"],
+        public: true,
+    },
+    NestedOp {
+        // `(f64, &mut f64) -> f64` as `(f64) -> (f64, f64)`
+        float_ty: FloatTy::F64,
+        rust_sig: Signature {
+            args: &[Ty::F64],
+            returns: &[Ty::F64, Ty::F64],
+        },
+        c_sig: Some(Signature {
+            args: &[Ty::F64, Ty::MutF64],
+            returns: &[Ty::F64],
+        }),
+        fn_list: &["modf"],
+        public: true,
+    },
+    NestedOp {
+        // `(f32, &mut c_int) -> f32` as `(f32) -> (f32, i32)`
+        float_ty: FloatTy::F32,
+        rust_sig: Signature {
+            args: &[Ty::F32],
+            returns: &[Ty::F32, Ty::I32],
+        },
+        c_sig: Some(Signature {
+            args: &[Ty::F32, Ty::MutCInt],
+            returns: &[Ty::F32],
+        }),
+        fn_list: &["frexpf", "lgammaf_r"],
+        public: true,
+    },
+    NestedOp {
+        // `(f64, &mut c_int) -> f64` as `(f64) -> (f64, i32)`
+        float_ty: FloatTy::F64,
+        rust_sig: Signature {
+            args: &[Ty::F64],
+            returns: &[Ty::F64, Ty::I32],
+        },
+        c_sig: Some(Signature {
+            args: &[Ty::F64, Ty::MutCInt],
+            returns: &[Ty::F64],
+        }),
+        fn_list: &["frexp", "lgamma_r"],
+        public: true,
+    },
+    NestedOp {
+        // `(f32, f32, &mut c_int) -> f32` as `(f32, f32) -> (f32, i32)`
+        float_ty: FloatTy::F32,
+        rust_sig: Signature {
+            args: &[Ty::F32, Ty::F32],
+            returns: &[Ty::F32, Ty::I32],
+        },
+        c_sig: Some(Signature {
+            args: &[Ty::F32, Ty::F32, Ty::MutCInt],
+            returns: &[Ty::F32],
+        }),
+        fn_list: &["remquof"],
+        public: true,
+    },
+    NestedOp {
+        // `(f64, f64, &mut c_int) -> f64` as `(f64, f64) -> (f64, i32)`
+        float_ty: FloatTy::F64,
+        rust_sig: Signature {
+            args: &[Ty::F64, Ty::F64],
+            returns: &[Ty::F64, Ty::I32],
+        },
+        c_sig: Some(Signature {
+            args: &[Ty::F64, Ty::F64, Ty::MutCInt],
+            returns: &[Ty::F64],
+        }),
+        fn_list: &["remquo"],
+        public: true,
+    },
+    NestedOp {
+        // `(f32, &mut f32, &mut f32)` as `(f32) -> (f32, f32)`
+        float_ty: FloatTy::F32,
+        rust_sig: Signature {
+            args: &[Ty::F32],
+            returns: &[Ty::F32, Ty::F32],
+        },
+        c_sig: Some(Signature {
+            args: &[Ty::F32, Ty::MutF32, Ty::MutF32],
+            returns: &[],
+        }),
+        fn_list: &["sincosf"],
+        public: true,
+    },
+    NestedOp {
+        // `(f64, &mut f64, &mut f64)` as `(f64) -> (f64, f64)`
+        float_ty: FloatTy::F64,
+        rust_sig: Signature {
+            args: &[Ty::F64],
+            returns: &[Ty::F64, Ty::F64],
+        },
+        c_sig: Some(Signature {
+            args: &[Ty::F64, Ty::MutF64, Ty::MutF64],
+            returns: &[],
+        }),
+        fn_list: &["sincos"],
+        public: true,
+    },
+];
+
+/// A type used in a function signature.
+#[allow(dead_code)]
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub enum Ty {
+    F16,
+    F32,
+    F64,
+    F128,
+    I32,
+    CInt,
+    MutF16,
+    MutF32,
+    MutF64,
+    MutF128,
+    MutI32,
+    MutCInt,
+}
+
+/// A subset of [`Ty`] representing only floats.
+#[allow(dead_code)]
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub enum FloatTy {
+    F16,
+    F32,
+    F64,
+    F128,
+}
+
+impl fmt::Display for Ty {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let s = match self {
+            Ty::F16 => "f16",
+            Ty::F32 => "f32",
+            Ty::F64 => "f64",
+            Ty::F128 => "f128",
+            Ty::I32 => "i32",
+            Ty::CInt => "::core::ffi::c_int",
+            Ty::MutF16 => "&mut f16",
+            Ty::MutF32 => "&mut f32",
+            Ty::MutF64 => "&mut f64",
+            Ty::MutF128 => "&mut f128",
+            Ty::MutI32 => "&mut i32",
+            Ty::MutCInt => "&mut ::core::ffi::c_int",
+        };
+        f.write_str(s)
+    }
+}
+
+impl fmt::Display for FloatTy {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let s = match self {
+            FloatTy::F16 => "f16",
+            FloatTy::F32 => "f32",
+            FloatTy::F64 => "f64",
+            FloatTy::F128 => "f128",
+        };
+        f.write_str(s)
+    }
+}
+
+/// Representation of e.g. `(f32, f32) -> f32`
+#[derive(Debug, Clone)]
+pub struct Signature {
+    pub args: &'static [Ty],
+    pub returns: &'static [Ty],
+}
+
+/// Combined information about a function implementation.
+#[derive(Debug, Clone)]
+pub struct MathOpInfo {
+    pub name: &'static str,
+    pub float_ty: FloatTy,
+    /// Function signature for C implementations
+    pub c_sig: Signature,
+    /// Function signature for Rust implementations
+    pub rust_sig: Signature,
+    /// True if part of libm's public API
+    pub public: bool,
+}
+
+/// A flat representation of `ALL_OPERATIONS_NESTED`.
+pub static ALL_OPERATIONS: LazyLock<Vec<MathOpInfo>> = LazyLock::new(|| {
+    let mut ret = Vec::new();
+
+    for op in ALL_OPERATIONS_NESTED {
+        let fn_names = op.fn_list;
+        for name in fn_names {
+            let api = MathOpInfo {
+                name,
+                float_ty: op.float_ty,
+                rust_sig: op.rust_sig.clone(),
+                c_sig: op.c_sig.clone().unwrap_or_else(|| op.rust_sig.clone()),
+                public: op.public,
+            };
+            ret.push(api);
+        }
+
+        if !fn_names.is_sorted() {
+            let mut sorted = (*fn_names).to_owned();
+            sorted.sort_unstable();
+            panic!("names list is not sorted: {fn_names:?}\nExpected: {sorted:?}");
+        }
+    }
+
+    ret.sort_by_key(|item| item.name);
+    ret
+});
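Editor's note: an illustrative sketch (not part of this file) of how a consumer could use the flattened `ALL_OPERATIONS` list, e.g. to print the Rust-side signature of one routine.

    fn describe(name: &str) {
        // `ALL_OPERATIONS` is sorted by name, so a binary search would also work;
        // a linear scan keeps the sketch simple.
        if let Some(op) = ALL_OPERATIONS.iter().find(|op| op.name == name) {
            let args: Vec<String> = op.rust_sig.args.iter().map(|t| t.to_string()).collect();
            let rets: Vec<String> = op.rust_sig.returns.iter().map(|t| t.to_string()).collect();
            println!("{name}: fn({}) -> ({})", args.join(", "), rets.join(", "));
        }
    }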
diff --git a/library/compiler-builtins/crates/libm-macros/tests/basic.rs b/library/compiler-builtins/crates/libm-macros/tests/basic.rs
new file mode 100644
index 00000000000..b4276262229
--- /dev/null
+++ b/library/compiler-builtins/crates/libm-macros/tests/basic.rs
@@ -0,0 +1,177 @@
+#![feature(f16)]
+#![feature(f128)]
+// `STATUS_DLL_NOT_FOUND` on i686 MinGW, not worth looking into.
+#![cfg(not(all(target_arch = "x86", target_os = "windows", target_env = "gnu")))]
+
+macro_rules! basic {
+    (
+        fn_name: $fn_name:ident,
+        FTy: $FTy:ty,
+        CFn: $CFn:ty,
+        CArgs: $CArgs:ty,
+        CRet: $CRet:ty,
+        RustFn: $RustFn:ty,
+        RustArgs: $RustArgs:ty,
+        RustRet: $RustRet:ty,
+        public: $public:expr,
+        attrs: [$($attr:meta),*],
+        extra: [$($extra_tt:tt)*],
+        fn_extra: $fn_extra:expr,
+    ) => {
+        $(#[$attr])*
+        #[allow(dead_code)]
+        pub mod $fn_name {
+            type FTy = $FTy;
+            type CFnTy<'a> = $CFn;
+            type RustFnTy = $RustFn;
+            type RustArgsTy = $RustArgs;
+            type RustRetTy = $RustRet;
+            const PUBLIC: bool = $public;
+            const A: &[&str] = &[$($extra_tt)*];
+            fn foo(a: f32) -> f32 {
+                $fn_extra(a)
+            }
+        }
+    };
+}
+
+mod test_basic {
+    libm_macros::for_each_function! {
+        callback: basic,
+        emit_types: all,
+        skip: [sin, cos],
+        attributes: [
+            // just some random attributes
+            #[allow(clippy::pedantic)]
+            #[allow(dead_code)]
+            [sinf, cosf]
+        ],
+        extra: ["foo", "bar"],
+        fn_extra: match MACRO_FN_NAME {
+            sin => |x| x + 2.0,
+            cos | cosf => |x: f32| x.MACRO_FN_NAME_NORMALIZED(),
+            _ => |_x| 100.0
+        }
+    }
+}
+
+macro_rules! basic_no_extra {
+    (
+        fn_name: $fn_name:ident,
+        attrs: [$($attr:meta),*],
+    ) => {
+        $(#[$attr])*
+        mod $fn_name {}
+    };
+}
+
+mod test_basic_no_extra {
+    // Test with no extra, no skip, and no attributes
+    libm_macros::for_each_function! {
+        callback: basic_no_extra,
+    }
+}
+
+mod test_only {
+    // Test that `only` works
+    libm_macros::for_each_function! {
+        callback: basic_no_extra,
+        only: [sin, sinf],
+    }
+}
+
+macro_rules! specified_types {
+    (
+        fn_name: $fn_name:ident,
+        RustFn: $RustFn:ty,
+        RustArgs: $RustArgs:ty,
+        attrs: [$($attr:meta),*],
+    ) => {
+        $(#[$attr])*
+        #[allow(dead_code)]
+        mod $fn_name {
+            type RustFnTy = $RustFn;
+            type RustArgsTy = $RustArgs;
+        }
+    };
+}
+
+mod test_emit_types {
+    // Test that we can specify a couple of types to emit
+    libm_macros::for_each_function! {
+        callback: specified_types,
+        emit_types: [RustFn, RustArgs],
+    }
+}
+
+#[test]
+fn test_skip_f16_f128() {
+    macro_rules! skip_f16_f128 {
+        (
+        fn_name: $fn_name:ident,
+        attrs: [$($attr:meta),*],
+        extra: $vec:ident,
+    ) => {
+            $vec.push(stringify!($fn_name));
+        };
+    }
+
+    let mut v = Vec::new();
+    // Test that `skip_f16_f128: true` drops the `f16` and `f128` routines
+    libm_macros::for_each_function! {
+        callback: skip_f16_f128,
+        skip_f16_f128: true,
+        extra: v,
+    }
+
+    for name in v {
+        assert!(!name.contains("f16"), "{name}");
+        assert!(!name.contains("f128"), "{name}");
+    }
+}
+
+#[test]
+fn test_fn_extra_expansion() {
+    macro_rules! fn_extra_expansion {
+        (
+            fn_name: $fn_name:ident,
+            attrs: [$($attr:meta),*],
+            fn_extra: $vec:expr,
+        ) => {
+            $vec.push(stringify!($fn_name));
+        };
+    }
+
+    let mut vf16 = Vec::new();
+    let mut vf32 = Vec::new();
+    let mut vf64 = Vec::new();
+    let mut vf128 = Vec::new();
+
+    // Test that the `ALL_F*` groups in `fn_extra` expand to the expected functions
+    libm_macros::for_each_function! {
+        callback: fn_extra_expansion,
+        fn_extra: match MACRO_FN_NAME {
+            ALL_F16 => vf16,
+            ALL_F32 => vf32,
+            ALL_F64 => vf64,
+            ALL_F128 => vf128,
+        }
+    }
+
+    // Skip functions with a suffix after the type spec
+    vf16.retain(|name| !name.ends_with("_r"));
+    vf32.retain(|name| !name.ends_with("_r"));
+    vf64.retain(|name| !name.ends_with("_r"));
+    vf128.retain(|name| !name.ends_with("_r"));
+
+    for name in vf16 {
+        assert!(name.ends_with("f16"), "{name}");
+    }
+    for name in vf32 {
+        assert!(name.ends_with("f"), "{name}");
+    }
+    let _ = vf64;
+    for name in vf128 {
+        assert!(name.ends_with("f128"), "{name}");
+    }
+}
diff --git a/library/compiler-builtins/crates/libm-macros/tests/enum.rs b/library/compiler-builtins/crates/libm-macros/tests/enum.rs
new file mode 100644
index 00000000000..93e209a0dcc
--- /dev/null
+++ b/library/compiler-builtins/crates/libm-macros/tests/enum.rs
@@ -0,0 +1,38 @@
+#[libm_macros::function_enum(BaseName)]
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub enum Identifier {}
+
+#[libm_macros::base_name_enum]
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub enum BaseName {}
+
+#[test]
+fn as_str() {
+    assert_eq!(Identifier::Sin.as_str(), "sin");
+    assert_eq!(Identifier::Sinf.as_str(), "sinf");
+}
+
+#[test]
+fn from_str() {
+    assert_eq!(Identifier::from_str("sin").unwrap(), Identifier::Sin);
+    assert_eq!(Identifier::from_str("sinf").unwrap(), Identifier::Sinf);
+}
+
+#[test]
+fn basename() {
+    assert_eq!(Identifier::Sin.base_name(), BaseName::Sin);
+    assert_eq!(Identifier::Sinf.base_name(), BaseName::Sin);
+}
+
+#[test]
+fn math_op() {
+    assert_eq!(Identifier::Sin.math_op().float_ty, FloatTy::F64);
+    assert_eq!(Identifier::Sinf.math_op().float_ty, FloatTy::F32);
+}
+
+// Replicate the structure that we have in `libm-test`
+mod op {
+    include!("../../libm-macros/src/shared.rs");
+}
+
+use op::FloatTy;
diff --git a/library/compiler-builtins/crates/musl-math-sys/Cargo.toml b/library/compiler-builtins/crates/musl-math-sys/Cargo.toml
new file mode 100644
index 00000000000..d3fb147e526
--- /dev/null
+++ b/library/compiler-builtins/crates/musl-math-sys/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "musl-math-sys"
+version = "0.1.0"
+edition = "2024"
+publish = false
+license = "MIT OR Apache-2.0"
+
+[dependencies]
+
+[dev-dependencies]
+libm = { path = "../../libm" }
+
+[build-dependencies]
+cc = "1.2.16"
diff --git a/library/compiler-builtins/crates/musl-math-sys/build.rs b/library/compiler-builtins/crates/musl-math-sys/build.rs
new file mode 100644
index 00000000000..b00dbc73e28
--- /dev/null
+++ b/library/compiler-builtins/crates/musl-math-sys/build.rs
@@ -0,0 +1,350 @@
+use std::collections::BTreeMap;
+use std::path::{Path, PathBuf};
+use std::process::{Command, Stdio};
+use std::{env, fs, str};
+
+/// Static library that will be built
+const LIB_NAME: &str = "musl_math_prefixed";
+
+/// Files that have more than one symbol. Map of file names to the symbols defined in that file.
+const MULTIPLE_SYMBOLS: &[(&str, &[&str])] = &[
+    (
+        "__invtrigl",
+        &["__invtrigl", "__invtrigl_R", "__pio2_hi", "__pio2_lo"],
+    ),
+    ("__polevll", &["__polevll", "__p1evll"]),
+    ("erf", &["erf", "erfc"]),
+    ("erff", &["erff", "erfcf"]),
+    ("erfl", &["erfl", "erfcl"]),
+    ("exp10", &["exp10", "pow10"]),
+    ("exp10f", &["exp10f", "pow10f"]),
+    ("exp10l", &["exp10l", "pow10l"]),
+    ("exp2f_data", &["exp2f_data", "__exp2f_data"]),
+    ("exp_data", &["exp_data", "__exp_data"]),
+    ("j0", &["j0", "y0"]),
+    ("j0f", &["j0f", "y0f"]),
+    ("j1", &["j1", "y1"]),
+    ("j1f", &["j1f", "y1f"]),
+    ("jn", &["jn", "yn"]),
+    ("jnf", &["jnf", "ynf"]),
+    ("lgamma", &["lgamma", "__lgamma_r"]),
+    ("remainder", &["remainder", "drem"]),
+    ("remainderf", &["remainderf", "dremf"]),
+    ("lgammaf", &["lgammaf", "lgammaf_r", "__lgammaf_r"]),
+    ("lgammal", &["lgammal", "lgammal_r", "__lgammal_r"]),
+    ("log2_data", &["log2_data", "__log2_data"]),
+    ("log2f_data", &["log2f_data", "__log2f_data"]),
+    ("log_data", &["log_data", "__log_data"]),
+    ("logf_data", &["logf_data", "__logf_data"]),
+    ("pow_data", &["pow_data", "__pow_log_data"]),
+    ("powf_data", &["powf_data", "__powf_log2_data"]),
+    ("signgam", &["signgam", "__signgam"]),
+    ("sqrt_data", &["sqrt_data", "__rsqrt_tab"]),
+];
+
+fn main() {
+    let cfg = Config::from_env();
+
+    if cfg.target_env == "msvc"
+        || cfg.target_family == "wasm"
+        || cfg.target_features.iter().any(|f| f == "thumb-mode")
+    {
+        println!(
+            "cargo::warning=Musl doesn't compile with the current \
+            target {}; skipping build",
+            &cfg.target_string
+        );
+        return;
+    }
+
+    build_musl_math(&cfg);
+}
+
+#[allow(dead_code)]
+#[derive(Debug)]
+struct Config {
+    manifest_dir: PathBuf,
+    out_dir: PathBuf,
+    musl_dir: PathBuf,
+    musl_arch: String,
+    target_arch: String,
+    target_env: String,
+    target_family: String,
+    target_os: String,
+    target_string: String,
+    target_vendor: String,
+    target_features: Vec<String>,
+}
+
+impl Config {
+    fn from_env() -> Self {
+        let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
+        let target_features = env::var("CARGO_CFG_TARGET_FEATURE")
+            .map(|feats| feats.split(',').map(ToOwned::to_owned).collect())
+            .unwrap_or_default();
+        let musl_dir = manifest_dir.join("musl");
+
+        let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap();
+        let musl_arch = if target_arch == "x86" {
+            "i386".to_owned()
+        } else {
+            target_arch.clone()
+        };
+
+        println!(
+            "cargo::rerun-if-changed={}/c_patches",
+            manifest_dir.display()
+        );
+        println!("cargo::rerun-if-changed={}", musl_dir.display());
+
+        Self {
+            manifest_dir,
+            out_dir: PathBuf::from(env::var("OUT_DIR").unwrap()),
+            musl_dir,
+            musl_arch,
+            target_arch,
+            target_env: env::var("CARGO_CFG_TARGET_ENV").unwrap(),
+            target_family: env::var("CARGO_CFG_TARGET_FAMILY").unwrap(),
+            target_os: env::var("CARGO_CFG_TARGET_OS").unwrap(),
+            target_string: env::var("TARGET").unwrap(),
+            target_vendor: env::var("CARGO_CFG_TARGET_VENDOR").unwrap(),
+            target_features,
+        }
+    }
+}
+
+/// Build musl math symbols to a static library
+fn build_musl_math(cfg: &Config) {
+    let musl_dir = &cfg.musl_dir;
+    let math = musl_dir.join("src/math");
+    let arch_dir = musl_dir.join("arch").join(&cfg.musl_arch);
+    assert!(
+        math.exists(),
+        "musl source not found. Is the submodule up to date?"
+    );
+
+    let source_map = find_math_source(&math, cfg);
+    let out_path = cfg.out_dir.join(format!("lib{LIB_NAME}.a"));
+
+    // Run configuration steps. Usually done as part of the musl `Makefile`.
+    let obj_include = cfg.out_dir.join("musl_obj/include");
+    fs::create_dir_all(&obj_include).unwrap();
+    fs::create_dir_all(obj_include.join("bits")).unwrap();
+    let sed_stat = Command::new("sed")
+        .arg("-f")
+        .arg(musl_dir.join("tools/mkalltypes.sed"))
+        .arg(arch_dir.join("bits/alltypes.h.in"))
+        .arg(musl_dir.join("include/alltypes.h.in"))
+        .stderr(Stdio::inherit())
+        .output()
+        .unwrap();
+    assert!(
+        sed_stat.status.success(),
+        "sed command failed: {:?}",
+        sed_stat.status
+    );
+
+    fs::write(obj_include.join("bits/alltypes.h"), sed_stat.stdout).unwrap();
+
+    let mut cbuild = cc::Build::new();
+    cbuild
+        .extra_warnings(false)
+        .warnings(false)
+        .flag_if_supported("-Wno-bitwise-op-parentheses")
+        .flag_if_supported("-Wno-literal-range")
+        .flag_if_supported("-Wno-parentheses")
+        .flag_if_supported("-Wno-shift-count-overflow")
+        .flag_if_supported("-Wno-shift-op-parentheses")
+        .flag_if_supported("-Wno-unused-but-set-variable")
+        .flag_if_supported("-std=c99")
+        .flag_if_supported("-ffreestanding")
+        .flag_if_supported("-nostdinc")
+        .define("_ALL_SOURCE", "1")
+        .define(
+            "ROOT_INCLUDE_FEATURES",
+            Some(musl_dir.join("include/features.h").to_str().unwrap()),
+        )
+        // Our overrides are in this directory
+        .include(cfg.manifest_dir.join("c_patches"))
+        .include(musl_dir.join("arch").join(&cfg.musl_arch))
+        .include(musl_dir.join("arch/generic"))
+        .include(musl_dir.join("src/include"))
+        .include(musl_dir.join("src/internal"))
+        .include(obj_include)
+        .include(musl_dir.join("include"))
+        .file(cfg.manifest_dir.join("c_patches/alias.c"));
+
+    for (sym_name, src_file) in source_map {
+        // Build the source file
+        cbuild.file(src_file);
+
+        // Trickery! Redefine the symbol names to have the prefix `musl_`, which allows us to
+        // differentiate these symbols from whatever we provide.
+        if let Some((_names, syms)) = MULTIPLE_SYMBOLS
+            .iter()
+            .find(|(name, _syms)| *name == sym_name)
+        {
+            // Handle the occasional file that defines multiple symbols
+            for sym in *syms {
+                cbuild.define(sym, Some(format!("musl_{sym}").as_str()));
+            }
+        } else {
+            // If the file doesn't define multiple symbols, the file stem is the symbol name
+            cbuild.define(&sym_name, Some(format!("musl_{sym_name}").as_str()));
+        }
+    }
+
+    if cfg!(windows) {
+        // On Windows we don't have a good way to check symbols, so skip that step.
+        cbuild.compile(LIB_NAME);
+        return;
+    }
+
+    let objfiles = cbuild.compile_intermediates();
+
+    // We create the archive ourselves with relocations rather than letting `cc` do it so we can
+    // encourage it to resolve symbols now. This should help avoid accidentally linking the wrong
+    // thing.
+    let stat = cbuild
+        .get_compiler()
+        .to_command()
+        .arg("-r")
+        .arg("-o")
+        .arg(&out_path)
+        .args(objfiles)
+        .status()
+        .unwrap();
+    assert!(stat.success());
+
+    println!("cargo::rustc-link-lib={LIB_NAME}");
+    println!("cargo::rustc-link-search=native={}", cfg.out_dir.display());
+
+    validate_archive_symbols(&out_path);
+}
+
+/// Build a map of `name -> path`. `name` is typically the symbol name, but this doesn't account
+/// for files that provide multiple symbols.
+fn find_math_source(math_root: &Path, cfg: &Config) -> BTreeMap<String, PathBuf> {
+    let mut map = BTreeMap::new();
+    let mut arch_dir = None;
+
+    // Locate all files and directories
+    for item in fs::read_dir(math_root).unwrap() {
+        let path = item.unwrap().path();
+        let meta = fs::metadata(&path).unwrap();
+
+        if meta.is_dir() {
+            // Make note of the arch-specific directory if it exists
+            if path.file_name().unwrap() == cfg.target_arch.as_str() {
+                arch_dir = Some(path);
+            }
+            continue;
+        }
+
+        // Skip non-source files
+        if path.extension().is_some_and(|ext| ext == "h") {
+            continue;
+        }
+
+        let sym_name = path.file_stem().unwrap();
+        map.insert(sym_name.to_str().unwrap().to_owned(), path.to_owned());
+    }
+
+    // If arch-specific versions are available, build those instead.
+    if let Some(arch_dir) = arch_dir {
+        for item in fs::read_dir(arch_dir).unwrap() {
+            let path = item.unwrap().path();
+            let sym_name = path.file_stem().unwrap();
+
+            if path.extension().unwrap() == "s" {
+                // FIXME: we never build assembly versions since we have no good way to
+                // rename the symbol (our options are probably preprocessor or objcopy).
+                continue;
+            }
+            map.insert(sym_name.to_str().unwrap().to_owned(), path);
+        }
+    }
+
+    map
+}
+
+/// Make sure we don't have something like a loose unprefixed `_cos` called somewhere, which could
+/// wind up linking to system libraries rather than the built musl library.
+fn validate_archive_symbols(out_path: &Path) {
+    const ALLOWED_UNDEF_PFX: &[&str] = &[
+        // PIC and arch-specific
+        ".TOC",
+        "_GLOBAL_OFFSET_TABLE_",
+        "__x86.get_pc_thunk",
+        // gcc/compiler-rt/compiler-builtins symbols
+        "__add",
+        "__aeabi_",
+        "__div",
+        "__eq",
+        "__extend",
+        "__fix",
+        "__float",
+        "__gcc_",
+        "__ge",
+        "__gt",
+        "__le",
+        "__lshr",
+        "__lt",
+        "__mul",
+        "__ne",
+        "__stack_chk_fail",
+        "__stack_chk_guard",
+        "__sub",
+        "__trunc",
+        "__undef",
+        // string routines
+        "__bzero",
+        "bzero",
+        // FPENV interfaces
+        "feclearexcept",
+        "fegetround",
+        "feraiseexcept",
+        "fesetround",
+        "fetestexcept",
+    ];
+
+    // List global undefined symbols
+    let out = Command::new("nm")
+        .arg("-guj")
+        .arg(out_path)
+        .stderr(Stdio::inherit())
+        .output()
+        .unwrap();
+
+    let undef = str::from_utf8(&out.stdout).unwrap();
+    let mut undef = undef.lines().collect::<Vec<_>>();
+    undef.retain(|sym| {
+        // Account for file formats that add a leading `_`
+        !ALLOWED_UNDEF_PFX
+            .iter()
+            .any(|pfx| sym.starts_with(pfx) || sym[1..].starts_with(pfx))
+    });
+
+    assert!(
+        undef.is_empty(),
+        "found disallowed undefined symbols: {undef:#?}"
+    );
+
+    // Find any defined symbols that are missing the `musl_` prefix
+    let out = Command::new("nm")
+        .arg("-gUj")
+        .arg(out_path)
+        .stderr(Stdio::inherit())
+        .output()
+        .unwrap();
+
+    let defined = str::from_utf8(&out.stdout).unwrap();
+    let mut defined = defined.lines().collect::<Vec<_>>();
+    defined.retain(|sym| {
+        !(sym.starts_with("_musl_")
+            || sym.starts_with("musl_")
+            || sym.starts_with("__x86.get_pc_thunk"))
+    });
+
+    assert!(defined.is_empty(), "found unprefixed symbols: {defined:#?}");
+}
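Editor's note: an illustrative sketch (not part of the build script) of the renaming rule applied above. Each source file gets one `-Dsym=musl_sym` style define per symbol it exports.

    fn defines_for(file_stem: &str) -> Vec<(String, String)> {
        match MULTIPLE_SYMBOLS.iter().copied().find(|(name, _)| *name == file_stem) {
            // Files listed in `MULTIPLE_SYMBOLS` get one define per exported symbol.
            Some((_, syms)) => syms
                .iter()
                .map(|sym| (sym.to_string(), format!("musl_{sym}")))
                .collect(),
            // Otherwise the file stem doubles as the symbol name.
            None => vec![(file_stem.to_string(), format!("musl_{file_stem}"))],
        }
    }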
diff --git a/library/compiler-builtins/crates/musl-math-sys/c_patches/alias.c b/library/compiler-builtins/crates/musl-math-sys/c_patches/alias.c
new file mode 100644
index 00000000000..63e0f08d5eb
--- /dev/null
+++ b/library/compiler-builtins/crates/musl-math-sys/c_patches/alias.c
@@ -0,0 +1,40 @@
+/* On platforms that don't support weak symbols, define required aliases
+ * as wrappers. See comments in `features.h` for more.
+ */
+#if defined(__APPLE__) || defined(__MINGW32__)
+
+double __lgamma_r(double a, int *b);
+float __lgammaf_r(float a, int *b);
+long double __lgammal_r(long double a, int *b);
+double exp10(double a);
+float exp10f(float a);
+long exp10l(long double a);
+double remainder(double a, double b);
+float remainderf(float a, float b);
+
+double lgamma_r(double a, int *b) {
+	return __lgamma_r(a, b);
+}
+float lgammaf_r(float a, int *b) {
+	return __lgammaf_r(a, b);
+}
+long double lgammal_r(long double a, int *b) {
+	return __lgammal_r(a, b);
+}
+double pow10(double a) {
+	return exp10(a);
+}
+float pow10f(float a) {
+	return exp10f(a);
+}
+long double pow10l(long double a) {
+	return exp10l(a);
+}
+double drem(double a, double b) {
+	return remainder(a, b);
+}
+float dremf(float a, float b) {
+	return remainderf(a, b);
+}
+
+#endif
diff --git a/library/compiler-builtins/crates/musl-math-sys/c_patches/features.h b/library/compiler-builtins/crates/musl-math-sys/c_patches/features.h
new file mode 100644
index 00000000000..97af935979a
--- /dev/null
+++ b/library/compiler-builtins/crates/musl-math-sys/c_patches/features.h
@@ -0,0 +1,39 @@
+/* This is meant to override Musl's src/include/features.h
+ *
+ * We use a separate file here to redefine some attributes that don't work on
+ * all platforms that we would like to build on.
+ */
+
+#ifndef FEATURES_H
+#define FEATURES_H
+
+/* Get the required `#include "../../include/features.h"` since we can't use
+ * the relative path. The C macros need double indirection to get a usable
+ * string. */
+#define _stringify_inner(s) #s
+#define _stringify(s) _stringify_inner(s)
+#include _stringify(ROOT_INCLUDE_FEATURES)
+
+#if defined(__APPLE__)
+#define weak __attribute__((__weak__))
+#define hidden __attribute__((__visibility__("hidden")))
+
+/* We _should_ be able to define this as:
+ *     _Pragma(_stringify(weak musl_ ## new = musl_ ## old))
+ * However, weak symbols aren't handled correctly [1]. So we manually write
+ * wrappers, which are in `alias.c`.
+ *
+ * [1]: https://github.com/llvm/llvm-project/issues/111321
+ */
+#define weak_alias(old, new) /* nothing */
+
+#else
+#define weak __attribute__((__weak__))
+#define hidden __attribute__((__visibility__("hidden")))
+#define weak_alias(old, new) \
+	extern __typeof(old) musl_ ## new \
+	__attribute__((__weak__, __alias__(_stringify(musl_ ## old))))
+
+#endif /* defined(__APPLE__) */
+
+#endif
diff --git a/library/compiler-builtins/crates/musl-math-sys/musl b/library/compiler-builtins/crates/musl-math-sys/musl
new file mode 160000
+Subproject c47ad25ea3b484e10326f933e927c0bc8cded3d
diff --git a/library/compiler-builtins/crates/musl-math-sys/src/lib.rs b/library/compiler-builtins/crates/musl-math-sys/src/lib.rs
new file mode 100644
index 00000000000..6a4bf4859d9
--- /dev/null
+++ b/library/compiler-builtins/crates/musl-math-sys/src/lib.rs
@@ -0,0 +1,287 @@
+//! Bindings to Musl math functions (these are built in `build.rs`).
+
+use std::ffi::{c_char, c_int, c_long};
+
+/// Macro for creating bindings and exposing a safe function (since the implementations have no
+/// preconditions). Included functions must have correct signatures; otherwise this will be
+/// unsound.
+macro_rules! functions {
+    ( $(
+        $( #[$meta:meta] )*
+        $pfx_name:ident: $name:ident( $($arg:ident: $aty:ty),+ ) -> $rty:ty;
+    )* ) => {
+        unsafe extern "C" {
+            $( fn $pfx_name( $($arg: $aty),+ ) -> $rty; )*
+        }
+
+        $(
+            // Expose a safe version
+            $( #[$meta] )*
+            pub fn $name( $($arg: $aty),+ ) -> $rty {
+                // SAFETY: FFI calls with no preconditions
+                unsafe { $pfx_name( $($arg),+ ) }
+            }
+        )*
+
+        #[cfg(test)]
+        mod tests {
+            use super::*;
+            use test_support::CallTest;
+
+            $( functions!(
+                @single_test
+                $name($($arg: $aty),+) -> $rty
+            ); )*
+        }
+    };
+
+    (@single_test
+        $name:ident( $($arg:ident: $aty:ty),+ ) -> $rty:ty
+    ) => {
+        // Run a simple check to ensure we can link and call the function without crashing.
+        #[test]
+        // FIXME(#309): LE PPC crashes calling some musl functions
+        #[cfg_attr(all(target_arch = "powerpc64", target_endian = "little"), ignore)]
+        fn $name() {
+            <fn($($aty),+) -> $rty>::check(super::$name);
+        }
+    };
+}
+
+#[cfg(test)]
+mod test_support {
+    use core::ffi::c_char;
+
+    /// Just verify that we are able to call the function.
+    pub trait CallTest {
+        fn check(f: Self);
+    }
+
+    macro_rules! impl_calltest {
+        ($( ($($arg:ty),*) -> $ret:ty; )*) => {
+            $(
+                impl CallTest for fn($($arg),*) -> $ret {
+                    fn check(f: Self) {
+                        f($(1 as $arg),*);
+                    }
+                }
+            )*
+        };
+    }
+
+    impl_calltest! {
+        (f32) -> f32;
+        (f64) -> f64;
+        (f32, f32) -> f32;
+        (f64, f64) -> f64;
+        (i32, f32) -> f32;
+        (i32, f64) -> f64;
+        (f32, f32, f32) -> f32;
+        (f64, f64, f64) -> f64;
+        (f32, i32) -> f32;
+        (f32, i64) -> f32;
+        (f32) -> i32;
+        (f64) -> i32;
+        (f64, i32) -> f64;
+        (f64, i64) -> f64;
+    }
+
+    impl CallTest for fn(f32, &mut f32) -> f32 {
+        fn check(f: Self) {
+            let mut tmp = 0.0;
+            f(0.0, &mut tmp);
+        }
+    }
+    impl CallTest for fn(f64, &mut f64) -> f64 {
+        fn check(f: Self) {
+            let mut tmp = 0.0;
+            f(0.0, &mut tmp);
+        }
+    }
+    impl CallTest for fn(f32, &mut i32) -> f32 {
+        fn check(f: Self) {
+            let mut tmp = 1;
+            f(0.0, &mut tmp);
+        }
+    }
+    impl CallTest for fn(f64, &mut i32) -> f64 {
+        fn check(f: Self) {
+            let mut tmp = 1;
+            f(0.0, &mut tmp);
+        }
+    }
+    impl CallTest for fn(f32, f32, &mut i32) -> f32 {
+        fn check(f: Self) {
+            let mut tmp = 1;
+            f(0.0, 0.0, &mut tmp);
+        }
+    }
+    impl CallTest for fn(f64, f64, &mut i32) -> f64 {
+        fn check(f: Self) {
+            let mut tmp = 1;
+            f(0.0, 0.0, &mut tmp);
+        }
+    }
+    impl CallTest for fn(f32, &mut f32, &mut f32) {
+        fn check(f: Self) {
+            let mut tmp1 = 1.0;
+            let mut tmp2 = 1.0;
+            f(0.0, &mut tmp1, &mut tmp2);
+        }
+    }
+    impl CallTest for fn(f64, &mut f64, &mut f64) {
+        fn check(f: Self) {
+            let mut tmp1 = 1.0;
+            let mut tmp2 = 1.0;
+            f(0.0, &mut tmp1, &mut tmp2);
+        }
+    }
+    impl CallTest for fn(*const c_char) -> f32 {
+        fn check(f: Self) {
+            f(c"1".as_ptr());
+        }
+    }
+    impl CallTest for fn(*const c_char) -> f64 {
+        fn check(f: Self) {
+            f(c"1".as_ptr());
+        }
+    }
+}
+
+functions! {
+    musl_acos: acos(a: f64) -> f64;
+    musl_acosf: acosf(a: f32) -> f32;
+    musl_acosh: acosh(a: f64) -> f64;
+    musl_acoshf: acoshf(a: f32) -> f32;
+    musl_asin: asin(a: f64) -> f64;
+    musl_asinf: asinf(a: f32) -> f32;
+    musl_asinh: asinh(a: f64) -> f64;
+    musl_asinhf: asinhf(a: f32) -> f32;
+    musl_atan2: atan2(a: f64, b: f64) -> f64;
+    musl_atan2f: atan2f(a: f32, b: f32) -> f32;
+    musl_atan: atan(a: f64) -> f64;
+    musl_atanf: atanf(a: f32) -> f32;
+    musl_atanh: atanh(a: f64) -> f64;
+    musl_atanhf: atanhf(a: f32) -> f32;
+    musl_cbrt: cbrt(a: f64) -> f64;
+    musl_cbrtf: cbrtf(a: f32) -> f32;
+    musl_ceil: ceil(a: f64) -> f64;
+    musl_ceilf: ceilf(a: f32) -> f32;
+    musl_copysign: copysign(a: f64, b: f64) -> f64;
+    musl_copysignf: copysignf(a: f32, b: f32) -> f32;
+    musl_cos: cos(a: f64) -> f64;
+    musl_cosf: cosf(a: f32) -> f32;
+    musl_cosh: cosh(a: f64) -> f64;
+    musl_coshf: coshf(a: f32) -> f32;
+    musl_drem: drem(a: f64, b: f64) -> f64;
+    musl_dremf: dremf(a: f32, b: f32) -> f32;
+    musl_erf: erf(a: f64) -> f64;
+    musl_erfc: erfc(a: f64) -> f64;
+    musl_erfcf: erfcf(a: f32) -> f32;
+    musl_erff: erff(a: f32) -> f32;
+    musl_exp10: exp10(a: f64) -> f64;
+    musl_exp10f: exp10f(a: f32) -> f32;
+    musl_exp2: exp2(a: f64) -> f64;
+    musl_exp2f: exp2f(a: f32) -> f32;
+    musl_exp: exp(a: f64) -> f64;
+    musl_expf: expf(a: f32) -> f32;
+    musl_expm1: expm1(a: f64) -> f64;
+    musl_expm1f: expm1f(a: f32) -> f32;
+    musl_fabs: fabs(a: f64) -> f64;
+    musl_fabsf: fabsf(a: f32) -> f32;
+    musl_fdim: fdim(a: f64, b: f64) -> f64;
+    musl_fdimf: fdimf(a: f32, b: f32) -> f32;
+    musl_finite: finite(a: f64) -> c_int;
+    musl_finitef: finitef(a: f32) -> c_int;
+    musl_floor: floor(a: f64) -> f64;
+    musl_floorf: floorf(a: f32) -> f32;
+    musl_fma: fma(a: f64, b: f64, c: f64) -> f64;
+    musl_fmaf: fmaf(a: f32, b: f32, c: f32) -> f32;
+    musl_fmax: fmax(a: f64, b: f64) -> f64;
+    musl_fmaxf: fmaxf(a: f32, b: f32) -> f32;
+    musl_fmin: fmin(a: f64, b: f64) -> f64;
+    musl_fminf: fminf(a: f32, b: f32) -> f32;
+    musl_fmod: fmod(a: f64, b: f64) -> f64;
+    musl_fmodf: fmodf(a: f32, b: f32) -> f32;
+    musl_frexp: frexp(a: f64, b: &mut c_int) -> f64;
+    musl_frexpf: frexpf(a: f32, b: &mut c_int) -> f32;
+    musl_hypot: hypot(a: f64, b: f64) -> f64;
+    musl_hypotf: hypotf(a: f32, b: f32) -> f32;
+    musl_ilogb: ilogb(a: f64) -> c_int;
+    musl_ilogbf: ilogbf(a: f32) -> c_int;
+    musl_j0: j0(a: f64) -> f64;
+    musl_j0f: j0f(a: f32) -> f32;
+    musl_j1: j1(a: f64) -> f64;
+    musl_j1f: j1f(a: f32) -> f32;
+    musl_jn: jn(a: c_int, b: f64) -> f64;
+    musl_jnf: jnf(a: c_int, b: f32) -> f32;
+    musl_ldexp: ldexp(a: f64, b: c_int) -> f64;
+    musl_ldexpf: ldexpf(a: f32, b: c_int) -> f32;
+    musl_lgamma: lgamma(a: f64) -> f64;
+    musl_lgamma_r: lgamma_r(a: f64, b: &mut c_int) -> f64;
+    musl_lgammaf: lgammaf(a: f32) -> f32;
+    musl_lgammaf_r: lgammaf_r(a: f32, b: &mut c_int) -> f32;
+    musl_log10: log10(a: f64) -> f64;
+    musl_log10f: log10f(a: f32) -> f32;
+    musl_log1p: log1p(a: f64) -> f64;
+    musl_log1pf: log1pf(a: f32) -> f32;
+    musl_log2: log2(a: f64) -> f64;
+    musl_log2f: log2f(a: f32) -> f32;
+    musl_log: log(a: f64) -> f64;
+    musl_logb: logb(a: f64) -> f64;
+    musl_logbf: logbf(a: f32) -> f32;
+    musl_logf: logf(a: f32) -> f32;
+    musl_modf: modf(a: f64, b: &mut f64) -> f64;
+    musl_modff: modff(a: f32, b: &mut f32) -> f32;
+
+    // FIXME: these need to be unsafe
+    #[allow(clippy::not_unsafe_ptr_arg_deref)]
+    musl_nan: nan(a: *const c_char) -> f64;
+    #[allow(clippy::not_unsafe_ptr_arg_deref)]
+    musl_nanf: nanf(a: *const c_char) -> f32;
+
+    musl_nearbyint: nearbyint(a: f64) -> f64;
+    musl_nearbyintf: nearbyintf(a: f32) -> f32;
+    musl_nextafter: nextafter(a: f64, b: f64) -> f64;
+    musl_nextafterf: nextafterf(a: f32, b: f32) -> f32;
+    musl_pow10: pow10(a: f64) -> f64;
+    musl_pow10f: pow10f(a: f32) -> f32;
+    musl_pow: pow(a: f64, b: f64) -> f64;
+    musl_powf: powf(a: f32, b: f32) -> f32;
+    musl_remainder: remainder(a: f64, b: f64) -> f64;
+    musl_remainderf: remainderf(a: f32, b: f32) -> f32;
+    musl_remquo: remquo(a: f64, b: f64, c: &mut c_int) -> f64;
+    musl_remquof: remquof(a: f32, b: f32, c: &mut c_int) -> f32;
+    musl_rint: rint(a: f64) -> f64;
+    musl_rintf: rintf(a: f32) -> f32;
+    musl_round: round(a: f64) -> f64;
+    musl_roundf: roundf(a: f32) -> f32;
+    musl_scalbln: scalbln(a: f64, b: c_long) -> f64;
+    musl_scalblnf: scalblnf(a: f32, b: c_long) -> f32;
+    musl_scalbn: scalbn(a: f64, b: c_int) -> f64;
+    musl_scalbnf: scalbnf(a: f32, b: c_int) -> f32;
+    musl_significand: significand(a: f64) -> f64;
+    musl_significandf: significandf(a: f32) -> f32;
+    musl_sin: sin(a: f64) -> f64;
+    musl_sincos: sincos(a: f64, b: &mut f64, c: &mut f64) -> ();
+    musl_sincosf: sincosf(a: f32, b: &mut f32, c: &mut f32) -> ();
+    musl_sinf: sinf(a: f32) -> f32;
+    musl_sinh: sinh(a: f64) -> f64;
+    musl_sinhf: sinhf(a: f32) -> f32;
+    musl_sqrt: sqrt(a: f64) -> f64;
+    musl_sqrtf: sqrtf(a: f32) -> f32;
+    musl_tan: tan(a: f64) -> f64;
+    musl_tanf: tanf(a: f32) -> f32;
+    musl_tanh: tanh(a: f64) -> f64;
+    musl_tanhf: tanhf(a: f32) -> f32;
+    musl_tgamma: tgamma(a: f64) -> f64;
+    musl_tgammaf: tgammaf(a: f32) -> f32;
+    musl_trunc: trunc(a: f64) -> f64;
+    musl_truncf: truncf(a: f32) -> f32;
+    musl_y0: y0(a: f64) -> f64;
+    musl_y0f: y0f(a: f32) -> f32;
+    musl_y1: y1(a: f64) -> f64;
+    musl_y1f: y1f(a: f32) -> f32;
+    musl_yn: yn(a: c_int, b: f64) -> f64;
+    musl_ynf: ynf(a: c_int, b: f32) -> f32;
+}
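Editor's note: a hypothetical usage sketch (assumes the crate built for the current target; exact results depend on the musl implementation). The macro exposes safe wrappers under the unprefixed names.

    fn demo() {
        let s = musl_math_sys::sinf(1.234_f32);
        assert!((s - 0.943_818_2_f32).abs() < 1e-6);

        // 8.0 == 0.5 * 2^4, so frexp returns mantissa 0.5 and exponent 4.
        let mut exp: core::ffi::c_int = 0;
        let mantissa = musl_math_sys::frexp(8.0_f64, &mut exp);
        assert_eq!((mantissa, exp), (0.5, 4));
    }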
diff --git a/library/compiler-builtins/crates/panic-handler/Cargo.toml b/library/compiler-builtins/crates/panic-handler/Cargo.toml
new file mode 100644
index 00000000000..a6764fc481b
--- /dev/null
+++ b/library/compiler-builtins/crates/panic-handler/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "panic-handler"
+version = "0.1.0"
+authors = ["Alex Crichton <alex@alexcrichton.com>"]
+edition = "2024"
+publish = false
+
+[lib]
+test = false
+bench = false
+
+[dependencies]
diff --git a/library/compiler-builtins/crates/panic-handler/src/lib.rs b/library/compiler-builtins/crates/panic-handler/src/lib.rs
new file mode 100644
index 00000000000..673e005224b
--- /dev/null
+++ b/library/compiler-builtins/crates/panic-handler/src/lib.rs
@@ -0,0 +1,11 @@
+//! This is needed for tests on targets that require a `#[panic_handler]` function
+
+#![feature(no_core)]
+#![no_core]
+
+extern crate core;
+
+#[panic_handler]
+fn panic(_: &core::panic::PanicInfo) -> ! {
+    loop {}
+}
diff --git a/library/compiler-builtins/crates/util/Cargo.toml b/library/compiler-builtins/crates/util/Cargo.toml
new file mode 100644
index 00000000000..614c54bd835
--- /dev/null
+++ b/library/compiler-builtins/crates/util/Cargo.toml
@@ -0,0 +1,19 @@
+[package]
+name = "util"
+version = "0.1.0"
+edition = "2024"
+publish = false
+license = "MIT OR Apache-2.0"
+
+[features]
+default = ["build-musl", "build-mpfr", "unstable-float"]
+build-musl = ["libm-test/build-musl", "dep:musl-math-sys"]
+build-mpfr = ["libm-test/build-mpfr", "dep:rug"]
+unstable-float = ["libm/unstable-float", "libm-test/unstable-float", "rug?/nightly-float"]
+
+[dependencies]
+libm = { path = "../../libm", default-features = false }
+libm-macros = { path = "../libm-macros" }
+libm-test = { path = "../../libm-test", default-features = false }
+musl-math-sys = { path = "../musl-math-sys", optional = true }
+rug = { version = "1.27.0", optional = true, default-features = false, features = ["float", "std"] }
diff --git a/library/compiler-builtins/crates/util/build.rs b/library/compiler-builtins/crates/util/build.rs
new file mode 100644
index 00000000000..a1be4127527
--- /dev/null
+++ b/library/compiler-builtins/crates/util/build.rs
@@ -0,0 +1,10 @@
+#![allow(unexpected_cfgs)]
+
+#[path = "../../libm/configure.rs"]
+mod configure;
+
+fn main() {
+    println!("cargo:rerun-if-changed=../../libm/configure.rs");
+    let cfg = configure::Config::from_env();
+    configure::emit_libm_config(&cfg);
+}
diff --git a/library/compiler-builtins/crates/util/src/main.rs b/library/compiler-builtins/crates/util/src/main.rs
new file mode 100644
index 00000000000..5972181531b
--- /dev/null
+++ b/library/compiler-builtins/crates/util/src/main.rs
@@ -0,0 +1,350 @@
+//! Helper CLI utility for common tasks.
+
+#![cfg_attr(f16_enabled, feature(f16))]
+#![cfg_attr(f128_enabled, feature(f128))]
+
+use std::any::type_name;
+use std::env;
+use std::num::ParseIntError;
+use std::str::FromStr;
+
+use libm::support::{Hexf, hf32, hf64};
+#[cfg(feature = "build-mpfr")]
+use libm_test::mpfloat::MpOp;
+use libm_test::{MathOp, TupleCall};
+#[cfg(feature = "build-mpfr")]
+use rug::az::{self, Az};
+
+const USAGE: &str = "\
+usage:
+
+cargo run -p util -- <SUBCOMMAND>
+
+SUBCOMMAND:
+    eval <BASIS> <OP> inputs...
+        Evaluate the expression with a given basis. This can be useful for
+        running routines with a debugger, or quickly checking input. Examples:
+        * eval musl sinf 1.234 # print the results of musl sinf(1.234f32)
+        * eval mpfr pow 1.234 2.432 # print the results of mpfr pow(1.234, 2.432)
+";
+
+fn main() {
+    let args = env::args().collect::<Vec<_>>();
+    let str_args = args.iter().map(|s| s.as_str()).collect::<Vec<_>>();
+
+    match &str_args.as_slice()[1..] {
+        ["eval", basis, op, inputs @ ..] => do_eval(basis, op, inputs),
+        _ => {
+            println!("{USAGE}\nunrecognized input `{str_args:?}`");
+            std::process::exit(1);
+        }
+    }
+}
+
+macro_rules! handle_call {
+    (
+        fn_name: $fn_name:ident,
+        CFn: $CFn:ty,
+        RustFn: $RustFn:ty,
+        RustArgs: $RustArgs:ty,
+        attrs: [$($attr:meta),*],
+        extra: ($basis:ident, $op:ident, $inputs:ident),
+        fn_extra: $musl_fn:expr,
+    ) => {
+        $(#[$attr])*
+        if $op == stringify!($fn_name) {
+            type Op = libm_test::op::$fn_name::Routine;
+
+            let input = <$RustArgs>::parse($inputs);
+            let libm_fn: <Op as MathOp>::RustFn = libm::$fn_name;
+
+            let output = match $basis {
+                "libm" => input.call_intercept_panics(libm_fn),
+                #[cfg(feature = "build-musl")]
+                "musl" => {
+                    let musl_fn: <Op as MathOp>::CFn =
+                        $musl_fn.unwrap_or_else(|| panic!("no musl function for {}", $op));
+                    input.call(musl_fn)
+                }
+                #[cfg(feature = "build-mpfr")]
+                "mpfr" => {
+                    let mut mp = <Op as MpOp>::new_mp();
+                    Op::run(&mut mp, input)
+                }
+                _ => panic!("unrecognized or disabled basis '{}'", $basis),
+            };
+            println!("{output:?} {:x}", Hexf(output));
+            return;
+        }
+    };
+}
+
+/// Evaluate the specified operation with a given basis.
+fn do_eval(basis: &str, op: &str, inputs: &[&str]) {
+    libm_macros::for_each_function! {
+        callback: handle_call,
+        emit_types: [CFn, RustFn, RustArgs],
+        extra: (basis, op, inputs),
+        fn_extra: match MACRO_FN_NAME {
+            // Not provided by musl
+            fmaximum
+            | fmaximum_num
+            | fmaximum_numf
+            | fmaximumf
+            | fminimum
+            | fminimum_num
+            | fminimum_numf
+            | fminimumf
+            | roundeven
+            | roundevenf
+            | ALL_F16
+            | ALL_F128 => None,
+            _ => Some(musl_math_sys::MACRO_FN_NAME)
+        }
+    }
+
+    panic!("no operation matching {op}");
+}
+
+/// Parse a tuple from a space-delimited string.
+trait ParseTuple {
+    fn parse(input: &[&str]) -> Self;
+}
+
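+// Generates `ParseTuple` impls for the tuple shapes used by routine signatures: unary,
+// binary, ternary, and the mixed float/`i32` pairs used by e.g. `scalbn` and `jn`.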
+macro_rules! impl_parse_tuple {
+    ($ty:ty) => {
+        impl ParseTuple for ($ty,) {
+            fn parse(input: &[&str]) -> Self {
+                assert_eq!(input.len(), 1, "expected a single argument, got {input:?}");
+                (parse(input, 0),)
+            }
+        }
+
+        impl ParseTuple for ($ty, $ty) {
+            fn parse(input: &[&str]) -> Self {
+                assert_eq!(input.len(), 2, "expected two arguments, got {input:?}");
+                (parse(input, 0), parse(input, 1))
+            }
+        }
+
+        impl ParseTuple for ($ty, i32) {
+            fn parse(input: &[&str]) -> Self {
+                assert_eq!(input.len(), 2, "expected two arguments, got {input:?}");
+                (parse(input, 0), parse(input, 1))
+            }
+        }
+
+        impl ParseTuple for (i32, $ty) {
+            fn parse(input: &[&str]) -> Self {
+                assert_eq!(input.len(), 2, "expected two arguments, got {input:?}");
+                (parse(input, 0), parse(input, 1))
+            }
+        }
+
+        impl ParseTuple for ($ty, $ty, $ty) {
+            fn parse(input: &[&str]) -> Self {
+                assert_eq!(input.len(), 3, "expected three arguments, got {input:?}");
+                (parse(input, 0), parse(input, 1), parse(input, 2))
+            }
+        }
+    };
+}
+
+#[allow(unused_macros)]
+#[cfg(feature = "build-mpfr")]
+macro_rules! impl_parse_tuple_via_rug {
+    ($ty:ty) => {
+        impl ParseTuple for ($ty,) {
+            fn parse(input: &[&str]) -> Self {
+                assert_eq!(input.len(), 1, "expected a single argument, got {input:?}");
+                (parse_rug(input, 0),)
+            }
+        }
+
+        impl ParseTuple for ($ty, $ty) {
+            fn parse(input: &[&str]) -> Self {
+                assert_eq!(input.len(), 2, "expected two arguments, got {input:?}");
+                (parse_rug(input, 0), parse_rug(input, 1))
+            }
+        }
+
+        impl ParseTuple for ($ty, i32) {
+            fn parse(input: &[&str]) -> Self {
+                assert_eq!(input.len(), 2, "expected two arguments, got {input:?}");
+                (parse_rug(input, 0), parse(input, 1))
+            }
+        }
+
+        impl ParseTuple for (i32, $ty) {
+            fn parse(input: &[&str]) -> Self {
+                assert_eq!(input.len(), 2, "expected two arguments, got {input:?}");
+                (parse(input, 0), parse_rug(input, 1))
+            }
+        }
+
+        impl ParseTuple for ($ty, $ty, $ty) {
+            fn parse(input: &[&str]) -> Self {
+                assert_eq!(input.len(), 3, "expected three arguments, got {input:?}");
+                (
+                    parse_rug(input, 0),
+                    parse_rug(input, 1),
+                    parse_rug(input, 2),
+                )
+            }
+        }
+    };
+}
+
+// Fallback for when Rug is not built.
+#[allow(unused_macros)]
+#[cfg(not(feature = "build-mpfr"))]
+macro_rules! impl_parse_tuple_via_rug {
+    ($ty:ty) => {
+        impl ParseTuple for ($ty,) {
+            fn parse(_input: &[&str]) -> Self {
+                panic!("parsing this type requires the `build-mpfr` feature")
+            }
+        }
+
+        impl ParseTuple for ($ty, $ty) {
+            fn parse(_input: &[&str]) -> Self {
+                panic!("parsing this type requires the `build-mpfr` feature")
+            }
+        }
+
+        impl ParseTuple for ($ty, i32) {
+            fn parse(_input: &[&str]) -> Self {
+                panic!("parsing this type requires the `build-mpfr` feature")
+            }
+        }
+
+        impl ParseTuple for (i32, $ty) {
+            fn parse(_input: &[&str]) -> Self {
+                panic!("parsing this type requires the `build-mpfr` feature")
+            }
+        }
+
+        impl ParseTuple for ($ty, $ty, $ty) {
+            fn parse(_input: &[&str]) -> Self {
+                panic!("parsing this type requires the `build-mpfr` feature")
+            }
+        }
+    };
+}
+
+impl_parse_tuple!(f32);
+impl_parse_tuple!(f64);
+
+#[cfg(f16_enabled)]
+impl_parse_tuple_via_rug!(f16);
+#[cfg(f128_enabled)]
+impl_parse_tuple_via_rug!(f128);
+
+/// Try to parse the number, printing a nice message on failure.
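+///
+/// For float types, hex input without a `p` exponent is interpreted as a bit pattern
+/// (e.g. `0x3f800000` parses to `1.0f32`); hex-float syntax such as `0x1.8p1` (3.0) is
+/// also accepted.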
+fn parse<T: FromStr + FromStrRadix>(input: &[&str], idx: usize) -> T {
+    let s = input[idx];
+
+    let msg = || format!("invalid {} input '{s}'", type_name::<T>());
+
+    if s.starts_with("0x") || s.starts_with("-0x") {
+        return T::from_str_radix(s, 16).unwrap_or_else(|_| panic!("{}", msg()));
+    }
+
+    if s.starts_with("0b") {
+        return T::from_str_radix(s, 2).unwrap_or_else(|_| panic!("{}", msg()));
+    }
+
+    s.parse().unwrap_or_else(|_| panic!("{}", msg()))
+}
+
+/// Try to parse the float type going via `rug`, for `f16` and `f128` which don't yet implement
+/// `FromStr`.
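+///
+/// Accepts the same bit-pattern and hex-float forms as [`parse`], falling back to
+/// decimal parsing through `rug` followed by a cast to the target type.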
+#[cfg(feature = "build-mpfr")]
+fn parse_rug<F>(input: &[&str], idx: usize) -> F
+where
+    F: libm_test::Float + FromStrRadix,
+    rug::Float: az::Cast<F>,
+{
+    let s = input[idx];
+
+    let msg = || format!("invalid {} input '{s}'", type_name::<F>());
+
+    if s.starts_with("0x") {
+        return F::from_str_radix(s, 16).unwrap_or_else(|_| panic!("{}", msg()));
+    }
+
+    if s.starts_with("0b") {
+        return F::from_str_radix(s, 2).unwrap_or_else(|_| panic!("{}", msg()));
+    }
+
+    let x = rug::Float::parse(s).unwrap_or_else(|_| panic!("{}", msg()));
+    let x = rug::Float::with_val(F::BITS, x);
+    x.az()
+}
+
+trait FromStrRadix: Sized {
+    fn from_str_radix(s: &str, radix: u32) -> Result<Self, ParseIntError>;
+}
+
+impl FromStrRadix for i32 {
+    fn from_str_radix(s: &str, radix: u32) -> Result<Self, ParseIntError> {
+        let s = strip_radix_prefix(s, radix);
+        i32::from_str_radix(s, radix)
+    }
+}
+
+#[cfg(f16_enabled)]
+impl FromStrRadix for f16 {
+    fn from_str_radix(s: &str, radix: u32) -> Result<Self, ParseIntError> {
+        if radix == 16 && s.contains("p") {
+            return Ok(libm::support::hf16(s));
+        }
+
+        let s = strip_radix_prefix(s, radix);
+        u16::from_str_radix(s, radix).map(Self::from_bits)
+    }
+}
+
+impl FromStrRadix for f32 {
+    fn from_str_radix(s: &str, radix: u32) -> Result<Self, ParseIntError> {
+        if radix == 16 && s.contains("p") {
+            // Parse as hex float
+            return Ok(hf32(s));
+        }
+
+        let s = strip_radix_prefix(s, radix);
+        u32::from_str_radix(s, radix).map(Self::from_bits)
+    }
+}
+
+impl FromStrRadix for f64 {
+    fn from_str_radix(s: &str, radix: u32) -> Result<Self, ParseIntError> {
+        if radix == 16 && s.contains("p") {
+            return Ok(hf64(s));
+        }
+
+        let s = strip_radix_prefix(s, radix);
+        u64::from_str_radix(s, radix).map(Self::from_bits)
+    }
+}
+
+#[cfg(f128_enabled)]
+impl FromStrRadix for f128 {
+    fn from_str_radix(s: &str, radix: u32) -> Result<Self, ParseIntError> {
+        if radix == 16 && s.contains("p") {
+            return Ok(libm::support::hf128(s));
+        }
+        let s = strip_radix_prefix(s, radix);
+        u128::from_str_radix(s, radix).map(Self::from_bits)
+    }
+}
+
+fn strip_radix_prefix(s: &str, radix: u32) -> &str {
+    if radix == 16 {
+        s.strip_prefix("0x").unwrap()
+    } else if radix == 2 {
+        s.strip_prefix("0b").unwrap()
+    } else {
+        s
+    }
+}
diff --git a/library/compiler-builtins/etc/function-definitions.json b/library/compiler-builtins/etc/function-definitions.json
new file mode 100644
index 00000000000..4f796905b75
--- /dev/null
+++ b/library/compiler-builtins/etc/function-definitions.json
@@ -0,0 +1,1071 @@
+{
+    "__comment": "Autogenerated by update-api-list.py. List of files that define a function with a given name. This file is checked in to make it obvious if refactoring breaks things",
+    "acos": {
+        "sources": [
+            "libm/src/math/acos.rs"
+        ],
+        "type": "f64"
+    },
+    "acosf": {
+        "sources": [
+            "libm/src/math/acosf.rs"
+        ],
+        "type": "f32"
+    },
+    "acosh": {
+        "sources": [
+            "libm/src/math/acosh.rs"
+        ],
+        "type": "f64"
+    },
+    "acoshf": {
+        "sources": [
+            "libm/src/math/acoshf.rs"
+        ],
+        "type": "f32"
+    },
+    "asin": {
+        "sources": [
+            "libm/src/math/asin.rs"
+        ],
+        "type": "f64"
+    },
+    "asinf": {
+        "sources": [
+            "libm/src/math/asinf.rs"
+        ],
+        "type": "f32"
+    },
+    "asinh": {
+        "sources": [
+            "libm/src/math/asinh.rs"
+        ],
+        "type": "f64"
+    },
+    "asinhf": {
+        "sources": [
+            "libm/src/math/asinhf.rs"
+        ],
+        "type": "f32"
+    },
+    "atan": {
+        "sources": [
+            "libm/src/math/atan.rs"
+        ],
+        "type": "f64"
+    },
+    "atan2": {
+        "sources": [
+            "libm/src/math/atan2.rs"
+        ],
+        "type": "f64"
+    },
+    "atan2f": {
+        "sources": [
+            "libm/src/math/atan2f.rs"
+        ],
+        "type": "f32"
+    },
+    "atanf": {
+        "sources": [
+            "libm/src/math/atanf.rs"
+        ],
+        "type": "f32"
+    },
+    "atanh": {
+        "sources": [
+            "libm/src/math/atanh.rs"
+        ],
+        "type": "f64"
+    },
+    "atanhf": {
+        "sources": [
+            "libm/src/math/atanhf.rs"
+        ],
+        "type": "f32"
+    },
+    "cbrt": {
+        "sources": [
+            "libm/src/math/cbrt.rs"
+        ],
+        "type": "f64"
+    },
+    "cbrtf": {
+        "sources": [
+            "libm/src/math/cbrtf.rs"
+        ],
+        "type": "f32"
+    },
+    "ceil": {
+        "sources": [
+            "libm/src/math/arch/i586.rs",
+            "libm/src/math/arch/wasm32.rs",
+            "libm/src/math/ceil.rs",
+            "libm/src/math/generic/ceil.rs"
+        ],
+        "type": "f64"
+    },
+    "ceilf": {
+        "sources": [
+            "libm/src/math/arch/wasm32.rs",
+            "libm/src/math/ceil.rs",
+            "libm/src/math/generic/ceil.rs"
+        ],
+        "type": "f32"
+    },
+    "ceilf128": {
+        "sources": [
+            "libm/src/math/ceil.rs",
+            "libm/src/math/generic/ceil.rs"
+        ],
+        "type": "f128"
+    },
+    "ceilf16": {
+        "sources": [
+            "libm/src/math/ceil.rs",
+            "libm/src/math/generic/ceil.rs"
+        ],
+        "type": "f16"
+    },
+    "copysign": {
+        "sources": [
+            "libm/src/math/copysign.rs",
+            "libm/src/math/generic/copysign.rs"
+        ],
+        "type": "f64"
+    },
+    "copysignf": {
+        "sources": [
+            "libm/src/math/copysign.rs",
+            "libm/src/math/generic/copysign.rs"
+        ],
+        "type": "f32"
+    },
+    "copysignf128": {
+        "sources": [
+            "libm/src/math/copysign.rs",
+            "libm/src/math/generic/copysign.rs"
+        ],
+        "type": "f128"
+    },
+    "copysignf16": {
+        "sources": [
+            "libm/src/math/copysign.rs",
+            "libm/src/math/generic/copysign.rs"
+        ],
+        "type": "f16"
+    },
+    "cos": {
+        "sources": [
+            "libm/src/math/cos.rs"
+        ],
+        "type": "f64"
+    },
+    "cosf": {
+        "sources": [
+            "libm/src/math/cosf.rs"
+        ],
+        "type": "f32"
+    },
+    "cosh": {
+        "sources": [
+            "libm/src/math/cosh.rs"
+        ],
+        "type": "f64"
+    },
+    "coshf": {
+        "sources": [
+            "libm/src/math/coshf.rs"
+        ],
+        "type": "f32"
+    },
+    "erf": {
+        "sources": [
+            "libm/src/math/erf.rs"
+        ],
+        "type": "f64"
+    },
+    "erfc": {
+        "sources": [
+            "libm/src/math/erf.rs"
+        ],
+        "type": "f64"
+    },
+    "erfcf": {
+        "sources": [
+            "libm/src/math/erff.rs"
+        ],
+        "type": "f32"
+    },
+    "erff": {
+        "sources": [
+            "libm/src/math/erff.rs"
+        ],
+        "type": "f32"
+    },
+    "exp": {
+        "sources": [
+            "libm/src/math/exp.rs"
+        ],
+        "type": "f64"
+    },
+    "exp10": {
+        "sources": [
+            "libm/src/math/exp10.rs"
+        ],
+        "type": "f64"
+    },
+    "exp10f": {
+        "sources": [
+            "libm/src/math/exp10f.rs"
+        ],
+        "type": "f32"
+    },
+    "exp2": {
+        "sources": [
+            "libm/src/math/exp2.rs"
+        ],
+        "type": "f64"
+    },
+    "exp2f": {
+        "sources": [
+            "libm/src/math/exp2f.rs"
+        ],
+        "type": "f32"
+    },
+    "expf": {
+        "sources": [
+            "libm/src/math/expf.rs"
+        ],
+        "type": "f32"
+    },
+    "expm1": {
+        "sources": [
+            "libm/src/math/expm1.rs"
+        ],
+        "type": "f64"
+    },
+    "expm1f": {
+        "sources": [
+            "libm/src/math/expm1f.rs"
+        ],
+        "type": "f32"
+    },
+    "fabs": {
+        "sources": [
+            "libm/src/math/arch/wasm32.rs",
+            "libm/src/math/fabs.rs",
+            "libm/src/math/generic/fabs.rs"
+        ],
+        "type": "f64"
+    },
+    "fabsf": {
+        "sources": [
+            "libm/src/math/arch/wasm32.rs",
+            "libm/src/math/fabs.rs",
+            "libm/src/math/generic/fabs.rs"
+        ],
+        "type": "f32"
+    },
+    "fabsf128": {
+        "sources": [
+            "libm/src/math/fabs.rs",
+            "libm/src/math/generic/fabs.rs"
+        ],
+        "type": "f128"
+    },
+    "fabsf16": {
+        "sources": [
+            "libm/src/math/fabs.rs",
+            "libm/src/math/generic/fabs.rs"
+        ],
+        "type": "f16"
+    },
+    "fdim": {
+        "sources": [
+            "libm/src/math/fdim.rs",
+            "libm/src/math/generic/fdim.rs"
+        ],
+        "type": "f64"
+    },
+    "fdimf": {
+        "sources": [
+            "libm/src/math/fdim.rs",
+            "libm/src/math/generic/fdim.rs"
+        ],
+        "type": "f32"
+    },
+    "fdimf128": {
+        "sources": [
+            "libm/src/math/fdim.rs",
+            "libm/src/math/generic/fdim.rs"
+        ],
+        "type": "f128"
+    },
+    "fdimf16": {
+        "sources": [
+            "libm/src/math/fdim.rs",
+            "libm/src/math/generic/fdim.rs"
+        ],
+        "type": "f16"
+    },
+    "floor": {
+        "sources": [
+            "libm/src/math/arch/i586.rs",
+            "libm/src/math/arch/wasm32.rs",
+            "libm/src/math/floor.rs",
+            "libm/src/math/generic/floor.rs"
+        ],
+        "type": "f64"
+    },
+    "floorf": {
+        "sources": [
+            "libm/src/math/arch/wasm32.rs",
+            "libm/src/math/floor.rs",
+            "libm/src/math/generic/floor.rs"
+        ],
+        "type": "f32"
+    },
+    "floorf128": {
+        "sources": [
+            "libm/src/math/floor.rs",
+            "libm/src/math/generic/floor.rs"
+        ],
+        "type": "f128"
+    },
+    "floorf16": {
+        "sources": [
+            "libm/src/math/floor.rs",
+            "libm/src/math/generic/floor.rs"
+        ],
+        "type": "f16"
+    },
+    "fma": {
+        "sources": [
+            "libm/src/math/arch/aarch64.rs",
+            "libm/src/math/arch/x86/fma.rs",
+            "libm/src/math/fma.rs"
+        ],
+        "type": "f64"
+    },
+    "fmaf": {
+        "sources": [
+            "libm/src/math/arch/aarch64.rs",
+            "libm/src/math/arch/x86/fma.rs",
+            "libm/src/math/fma.rs"
+        ],
+        "type": "f32"
+    },
+    "fmaf128": {
+        "sources": [
+            "libm/src/math/fma.rs"
+        ],
+        "type": "f128"
+    },
+    "fmax": {
+        "sources": [
+            "libm/src/math/fmin_fmax.rs",
+            "libm/src/math/generic/fmax.rs"
+        ],
+        "type": "f64"
+    },
+    "fmaxf": {
+        "sources": [
+            "libm/src/math/fmin_fmax.rs",
+            "libm/src/math/generic/fmax.rs"
+        ],
+        "type": "f32"
+    },
+    "fmaxf128": {
+        "sources": [
+            "libm/src/math/fmin_fmax.rs",
+            "libm/src/math/generic/fmax.rs"
+        ],
+        "type": "f128"
+    },
+    "fmaxf16": {
+        "sources": [
+            "libm/src/math/fmin_fmax.rs",
+            "libm/src/math/generic/fmax.rs"
+        ],
+        "type": "f16"
+    },
+    "fmaximum": {
+        "sources": [
+            "libm/src/math/fminimum_fmaximum.rs",
+            "libm/src/math/generic/fmaximum.rs"
+        ],
+        "type": "f64"
+    },
+    "fmaximum_num": {
+        "sources": [
+            "libm/src/math/fminimum_fmaximum_num.rs",
+            "libm/src/math/generic/fmaximum_num.rs"
+        ],
+        "type": "f64"
+    },
+    "fmaximum_numf": {
+        "sources": [
+            "libm/src/math/fminimum_fmaximum_num.rs",
+            "libm/src/math/generic/fmaximum_num.rs"
+        ],
+        "type": "f32"
+    },
+    "fmaximum_numf128": {
+        "sources": [
+            "libm/src/math/fminimum_fmaximum_num.rs",
+            "libm/src/math/generic/fmaximum_num.rs"
+        ],
+        "type": "f128"
+    },
+    "fmaximum_numf16": {
+        "sources": [
+            "libm/src/math/fminimum_fmaximum_num.rs",
+            "libm/src/math/generic/fmaximum_num.rs"
+        ],
+        "type": "f16"
+    },
+    "fmaximumf": {
+        "sources": [
+            "libm/src/math/fminimum_fmaximum.rs",
+            "libm/src/math/generic/fmaximum.rs"
+        ],
+        "type": "f32"
+    },
+    "fmaximumf128": {
+        "sources": [
+            "libm/src/math/fminimum_fmaximum.rs",
+            "libm/src/math/generic/fmaximum.rs"
+        ],
+        "type": "f128"
+    },
+    "fmaximumf16": {
+        "sources": [
+            "libm/src/math/fminimum_fmaximum.rs",
+            "libm/src/math/generic/fmaximum.rs"
+        ],
+        "type": "f16"
+    },
+    "fmin": {
+        "sources": [
+            "libm/src/math/fmin_fmax.rs",
+            "libm/src/math/generic/fmin.rs"
+        ],
+        "type": "f64"
+    },
+    "fminf": {
+        "sources": [
+            "libm/src/math/fmin_fmax.rs",
+            "libm/src/math/generic/fmin.rs"
+        ],
+        "type": "f32"
+    },
+    "fminf128": {
+        "sources": [
+            "libm/src/math/fmin_fmax.rs",
+            "libm/src/math/generic/fmin.rs"
+        ],
+        "type": "f128"
+    },
+    "fminf16": {
+        "sources": [
+            "libm/src/math/fmin_fmax.rs",
+            "libm/src/math/generic/fmin.rs"
+        ],
+        "type": "f16"
+    },
+    "fminimum": {
+        "sources": [
+            "libm/src/math/fminimum_fmaximum.rs",
+            "libm/src/math/generic/fminimum.rs"
+        ],
+        "type": "f64"
+    },
+    "fminimum_num": {
+        "sources": [
+            "libm/src/math/fminimum_fmaximum_num.rs",
+            "libm/src/math/generic/fminimum_num.rs"
+        ],
+        "type": "f64"
+    },
+    "fminimum_numf": {
+        "sources": [
+            "libm/src/math/fminimum_fmaximum_num.rs",
+            "libm/src/math/generic/fminimum_num.rs"
+        ],
+        "type": "f32"
+    },
+    "fminimum_numf128": {
+        "sources": [
+            "libm/src/math/fminimum_fmaximum_num.rs",
+            "libm/src/math/generic/fminimum_num.rs"
+        ],
+        "type": "f128"
+    },
+    "fminimum_numf16": {
+        "sources": [
+            "libm/src/math/fminimum_fmaximum_num.rs",
+            "libm/src/math/generic/fminimum_num.rs"
+        ],
+        "type": "f16"
+    },
+    "fminimumf": {
+        "sources": [
+            "libm/src/math/fminimum_fmaximum.rs",
+            "libm/src/math/generic/fminimum.rs"
+        ],
+        "type": "f32"
+    },
+    "fminimumf128": {
+        "sources": [
+            "libm/src/math/fminimum_fmaximum.rs",
+            "libm/src/math/generic/fminimum.rs"
+        ],
+        "type": "f128"
+    },
+    "fminimumf16": {
+        "sources": [
+            "libm/src/math/fminimum_fmaximum.rs",
+            "libm/src/math/generic/fminimum.rs"
+        ],
+        "type": "f16"
+    },
+    "fmod": {
+        "sources": [
+            "libm/src/math/fmod.rs",
+            "libm/src/math/generic/fmod.rs"
+        ],
+        "type": "f64"
+    },
+    "fmodf": {
+        "sources": [
+            "libm/src/math/fmod.rs",
+            "libm/src/math/generic/fmod.rs"
+        ],
+        "type": "f32"
+    },
+    "fmodf128": {
+        "sources": [
+            "libm/src/math/fmod.rs",
+            "libm/src/math/generic/fmod.rs"
+        ],
+        "type": "f128"
+    },
+    "fmodf16": {
+        "sources": [
+            "libm/src/math/fmod.rs",
+            "libm/src/math/generic/fmod.rs"
+        ],
+        "type": "f16"
+    },
+    "frexp": {
+        "sources": [
+            "libm/src/math/frexp.rs"
+        ],
+        "type": "f64"
+    },
+    "frexpf": {
+        "sources": [
+            "libm/src/math/frexpf.rs"
+        ],
+        "type": "f32"
+    },
+    "hypot": {
+        "sources": [
+            "libm/src/math/hypot.rs"
+        ],
+        "type": "f64"
+    },
+    "hypotf": {
+        "sources": [
+            "libm/src/math/hypotf.rs"
+        ],
+        "type": "f32"
+    },
+    "ilogb": {
+        "sources": [
+            "libm/src/math/ilogb.rs"
+        ],
+        "type": "f64"
+    },
+    "ilogbf": {
+        "sources": [
+            "libm/src/math/ilogbf.rs"
+        ],
+        "type": "f32"
+    },
+    "j0": {
+        "sources": [
+            "libm/src/math/j0.rs"
+        ],
+        "type": "f64"
+    },
+    "j0f": {
+        "sources": [
+            "libm/src/math/j0f.rs"
+        ],
+        "type": "f32"
+    },
+    "j1": {
+        "sources": [
+            "libm/src/math/j1.rs"
+        ],
+        "type": "f64"
+    },
+    "j1f": {
+        "sources": [
+            "libm/src/math/j1f.rs"
+        ],
+        "type": "f32"
+    },
+    "jn": {
+        "sources": [
+            "libm/src/math/jn.rs"
+        ],
+        "type": "f64"
+    },
+    "jnf": {
+        "sources": [
+            "libm/src/math/jnf.rs"
+        ],
+        "type": "f32"
+    },
+    "ldexp": {
+        "sources": [
+            "libm/src/math/ldexp.rs"
+        ],
+        "type": "f64"
+    },
+    "ldexpf": {
+        "sources": [
+            "libm/src/math/ldexp.rs"
+        ],
+        "type": "f32"
+    },
+    "ldexpf128": {
+        "sources": [
+            "libm/src/math/ldexp.rs"
+        ],
+        "type": "f128"
+    },
+    "ldexpf16": {
+        "sources": [
+            "libm/src/math/ldexp.rs"
+        ],
+        "type": "f16"
+    },
+    "lgamma": {
+        "sources": [
+            "libm/src/math/lgamma.rs"
+        ],
+        "type": "f64"
+    },
+    "lgamma_r": {
+        "sources": [
+            "libm/src/math/lgamma_r.rs"
+        ],
+        "type": "f64"
+    },
+    "lgammaf": {
+        "sources": [
+            "libm/src/math/lgammaf.rs"
+        ],
+        "type": "f32"
+    },
+    "lgammaf_r": {
+        "sources": [
+            "libm/src/math/lgammaf_r.rs"
+        ],
+        "type": "f32"
+    },
+    "log": {
+        "sources": [
+            "libm/src/math/log.rs"
+        ],
+        "type": "f64"
+    },
+    "log10": {
+        "sources": [
+            "libm/src/math/log10.rs"
+        ],
+        "type": "f64"
+    },
+    "log10f": {
+        "sources": [
+            "libm/src/math/log10f.rs"
+        ],
+        "type": "f32"
+    },
+    "log1p": {
+        "sources": [
+            "libm/src/math/log1p.rs"
+        ],
+        "type": "f64"
+    },
+    "log1pf": {
+        "sources": [
+            "libm/src/math/log1pf.rs"
+        ],
+        "type": "f32"
+    },
+    "log2": {
+        "sources": [
+            "libm/src/math/log2.rs"
+        ],
+        "type": "f64"
+    },
+    "log2f": {
+        "sources": [
+            "libm/src/math/log2f.rs"
+        ],
+        "type": "f32"
+    },
+    "logf": {
+        "sources": [
+            "libm/src/math/logf.rs"
+        ],
+        "type": "f32"
+    },
+    "modf": {
+        "sources": [
+            "libm/src/math/modf.rs"
+        ],
+        "type": "f64"
+    },
+    "modff": {
+        "sources": [
+            "libm/src/math/modff.rs"
+        ],
+        "type": "f32"
+    },
+    "nextafter": {
+        "sources": [
+            "libm/src/math/nextafter.rs"
+        ],
+        "type": "f64"
+    },
+    "nextafterf": {
+        "sources": [
+            "libm/src/math/nextafterf.rs"
+        ],
+        "type": "f32"
+    },
+    "pow": {
+        "sources": [
+            "libm/src/math/pow.rs"
+        ],
+        "type": "f64"
+    },
+    "powf": {
+        "sources": [
+            "libm/src/math/powf.rs"
+        ],
+        "type": "f32"
+    },
+    "remainder": {
+        "sources": [
+            "libm/src/math/remainder.rs"
+        ],
+        "type": "f64"
+    },
+    "remainderf": {
+        "sources": [
+            "libm/src/math/remainderf.rs"
+        ],
+        "type": "f32"
+    },
+    "remquo": {
+        "sources": [
+            "libm/src/math/remquo.rs"
+        ],
+        "type": "f64"
+    },
+    "remquof": {
+        "sources": [
+            "libm/src/math/remquof.rs"
+        ],
+        "type": "f32"
+    },
+    "rint": {
+        "sources": [
+            "libm/src/math/arch/aarch64.rs",
+            "libm/src/math/arch/wasm32.rs",
+            "libm/src/math/rint.rs"
+        ],
+        "type": "f64"
+    },
+    "rintf": {
+        "sources": [
+            "libm/src/math/arch/aarch64.rs",
+            "libm/src/math/arch/wasm32.rs",
+            "libm/src/math/rint.rs"
+        ],
+        "type": "f32"
+    },
+    "rintf128": {
+        "sources": [
+            "libm/src/math/rint.rs"
+        ],
+        "type": "f128"
+    },
+    "rintf16": {
+        "sources": [
+            "libm/src/math/arch/aarch64.rs",
+            "libm/src/math/rint.rs"
+        ],
+        "type": "f16"
+    },
+    "round": {
+        "sources": [
+            "libm/src/math/generic/round.rs",
+            "libm/src/math/round.rs"
+        ],
+        "type": "f64"
+    },
+    "roundeven": {
+        "sources": [
+            "libm/src/math/roundeven.rs"
+        ],
+        "type": "f64"
+    },
+    "roundevenf": {
+        "sources": [
+            "libm/src/math/roundeven.rs"
+        ],
+        "type": "f32"
+    },
+    "roundevenf128": {
+        "sources": [
+            "libm/src/math/roundeven.rs"
+        ],
+        "type": "f128"
+    },
+    "roundevenf16": {
+        "sources": [
+            "libm/src/math/roundeven.rs"
+        ],
+        "type": "f16"
+    },
+    "roundf": {
+        "sources": [
+            "libm/src/math/generic/round.rs",
+            "libm/src/math/round.rs"
+        ],
+        "type": "f32"
+    },
+    "roundf128": {
+        "sources": [
+            "libm/src/math/generic/round.rs",
+            "libm/src/math/round.rs"
+        ],
+        "type": "f128"
+    },
+    "roundf16": {
+        "sources": [
+            "libm/src/math/generic/round.rs",
+            "libm/src/math/round.rs"
+        ],
+        "type": "f16"
+    },
+    "scalbn": {
+        "sources": [
+            "libm/src/math/generic/scalbn.rs",
+            "libm/src/math/scalbn.rs"
+        ],
+        "type": "f64"
+    },
+    "scalbnf": {
+        "sources": [
+            "libm/src/math/generic/scalbn.rs",
+            "libm/src/math/scalbn.rs"
+        ],
+        "type": "f32"
+    },
+    "scalbnf128": {
+        "sources": [
+            "libm/src/math/generic/scalbn.rs",
+            "libm/src/math/scalbn.rs"
+        ],
+        "type": "f128"
+    },
+    "scalbnf16": {
+        "sources": [
+            "libm/src/math/generic/scalbn.rs",
+            "libm/src/math/scalbn.rs"
+        ],
+        "type": "f16"
+    },
+    "sin": {
+        "sources": [
+            "libm/src/math/sin.rs"
+        ],
+        "type": "f64"
+    },
+    "sincos": {
+        "sources": [
+            "libm/src/math/sincos.rs"
+        ],
+        "type": "f64"
+    },
+    "sincosf": {
+        "sources": [
+            "libm/src/math/sincosf.rs"
+        ],
+        "type": "f32"
+    },
+    "sinf": {
+        "sources": [
+            "libm/src/math/sinf.rs"
+        ],
+        "type": "f32"
+    },
+    "sinh": {
+        "sources": [
+            "libm/src/math/sinh.rs"
+        ],
+        "type": "f64"
+    },
+    "sinhf": {
+        "sources": [
+            "libm/src/math/sinhf.rs"
+        ],
+        "type": "f32"
+    },
+    "sqrt": {
+        "sources": [
+            "libm/src/math/arch/aarch64.rs",
+            "libm/src/math/arch/wasm32.rs",
+            "libm/src/math/arch/x86.rs",
+            "libm/src/math/generic/sqrt.rs",
+            "libm/src/math/sqrt.rs"
+        ],
+        "type": "f64"
+    },
+    "sqrtf": {
+        "sources": [
+            "libm/src/math/arch/aarch64.rs",
+            "libm/src/math/arch/wasm32.rs",
+            "libm/src/math/arch/x86.rs",
+            "libm/src/math/generic/sqrt.rs",
+            "libm/src/math/sqrt.rs"
+        ],
+        "type": "f32"
+    },
+    "sqrtf128": {
+        "sources": [
+            "libm/src/math/generic/sqrt.rs",
+            "libm/src/math/sqrt.rs"
+        ],
+        "type": "f128"
+    },
+    "sqrtf16": {
+        "sources": [
+            "libm/src/math/arch/aarch64.rs",
+            "libm/src/math/generic/sqrt.rs",
+            "libm/src/math/sqrt.rs"
+        ],
+        "type": "f16"
+    },
+    "tan": {
+        "sources": [
+            "libm/src/math/tan.rs"
+        ],
+        "type": "f64"
+    },
+    "tanf": {
+        "sources": [
+            "libm/src/math/tanf.rs"
+        ],
+        "type": "f32"
+    },
+    "tanh": {
+        "sources": [
+            "libm/src/math/tanh.rs"
+        ],
+        "type": "f64"
+    },
+    "tanhf": {
+        "sources": [
+            "libm/src/math/tanhf.rs"
+        ],
+        "type": "f32"
+    },
+    "tgamma": {
+        "sources": [
+            "libm/src/math/tgamma.rs"
+        ],
+        "type": "f64"
+    },
+    "tgammaf": {
+        "sources": [
+            "libm/src/math/tgammaf.rs"
+        ],
+        "type": "f32"
+    },
+    "trunc": {
+        "sources": [
+            "libm/src/math/arch/wasm32.rs",
+            "libm/src/math/generic/trunc.rs",
+            "libm/src/math/trunc.rs"
+        ],
+        "type": "f64"
+    },
+    "truncf": {
+        "sources": [
+            "libm/src/math/arch/wasm32.rs",
+            "libm/src/math/generic/trunc.rs",
+            "libm/src/math/trunc.rs"
+        ],
+        "type": "f32"
+    },
+    "truncf128": {
+        "sources": [
+            "libm/src/math/generic/trunc.rs",
+            "libm/src/math/trunc.rs"
+        ],
+        "type": "f128"
+    },
+    "truncf16": {
+        "sources": [
+            "libm/src/math/generic/trunc.rs",
+            "libm/src/math/trunc.rs"
+        ],
+        "type": "f16"
+    },
+    "y0": {
+        "sources": [
+            "libm/src/math/j0.rs"
+        ],
+        "type": "f64"
+    },
+    "y0f": {
+        "sources": [
+            "libm/src/math/j0f.rs"
+        ],
+        "type": "f32"
+    },
+    "y1": {
+        "sources": [
+            "libm/src/math/j1.rs"
+        ],
+        "type": "f64"
+    },
+    "y1f": {
+        "sources": [
+            "libm/src/math/j1f.rs"
+        ],
+        "type": "f32"
+    },
+    "yn": {
+        "sources": [
+            "libm/src/math/jn.rs"
+        ],
+        "type": "f64"
+    },
+    "ynf": {
+        "sources": [
+            "libm/src/math/jnf.rs"
+        ],
+        "type": "f32"
+    }
+}
diff --git a/library/compiler-builtins/etc/function-list.txt b/library/compiler-builtins/etc/function-list.txt
new file mode 100644
index 00000000000..1f226c8c0ff
--- /dev/null
+++ b/library/compiler-builtins/etc/function-list.txt
@@ -0,0 +1,164 @@
+# autogenerated by update-api-list.py
+acos
+acosf
+acosh
+acoshf
+asin
+asinf
+asinh
+asinhf
+atan
+atan2
+atan2f
+atanf
+atanh
+atanhf
+cbrt
+cbrtf
+ceil
+ceilf
+ceilf128
+ceilf16
+copysign
+copysignf
+copysignf128
+copysignf16
+cos
+cosf
+cosh
+coshf
+erf
+erfc
+erfcf
+erff
+exp
+exp10
+exp10f
+exp2
+exp2f
+expf
+expm1
+expm1f
+fabs
+fabsf
+fabsf128
+fabsf16
+fdim
+fdimf
+fdimf128
+fdimf16
+floor
+floorf
+floorf128
+floorf16
+fma
+fmaf
+fmaf128
+fmax
+fmaxf
+fmaxf128
+fmaxf16
+fmaximum
+fmaximum_num
+fmaximum_numf
+fmaximum_numf128
+fmaximum_numf16
+fmaximumf
+fmaximumf128
+fmaximumf16
+fmin
+fminf
+fminf128
+fminf16
+fminimum
+fminimum_num
+fminimum_numf
+fminimum_numf128
+fminimum_numf16
+fminimumf
+fminimumf128
+fminimumf16
+fmod
+fmodf
+fmodf128
+fmodf16
+frexp
+frexpf
+hypot
+hypotf
+ilogb
+ilogbf
+j0
+j0f
+j1
+j1f
+jn
+jnf
+ldexp
+ldexpf
+ldexpf128
+ldexpf16
+lgamma
+lgamma_r
+lgammaf
+lgammaf_r
+log
+log10
+log10f
+log1p
+log1pf
+log2
+log2f
+logf
+modf
+modff
+nextafter
+nextafterf
+pow
+powf
+remainder
+remainderf
+remquo
+remquof
+rint
+rintf
+rintf128
+rintf16
+round
+roundeven
+roundevenf
+roundevenf128
+roundevenf16
+roundf
+roundf128
+roundf16
+scalbn
+scalbnf
+scalbnf128
+scalbnf16
+sin
+sincos
+sincosf
+sinf
+sinh
+sinhf
+sqrt
+sqrtf
+sqrtf128
+sqrtf16
+tan
+tanf
+tanh
+tanhf
+tgamma
+tgammaf
+trunc
+truncf
+truncf128
+truncf16
+y0
+y0f
+y1
+y1f
+yn
+ynf
diff --git a/library/compiler-builtins/etc/update-api-list.py b/library/compiler-builtins/etc/update-api-list.py
new file mode 100755
index 00000000000..28ff22f4cbb
--- /dev/null
+++ b/library/compiler-builtins/etc/update-api-list.py
@@ -0,0 +1,361 @@
+#!/usr/bin/env python3
+"""Create a text file listing all public API. This can be used to ensure that all
+functions are covered by our macros.
+
+This file additionally does tidy-esque checks that all functions are listed where
+needed, or that lists are sorted.
+"""
+
+import difflib
+import json
+import re
+import subprocess as sp
+import sys
+from dataclasses import dataclass
+from glob import glob
+from pathlib import Path
+from typing import Any, Callable, TypeAlias
+
+SELF_PATH = Path(__file__)
+ETC_DIR = SELF_PATH.parent
+ROOT_DIR = ETC_DIR.parent
+
+# These files do not trigger a retest.
+IGNORED_SOURCES = ["libm/src/libm_helper.rs", "libm/src/math/support/float_traits.rs"]
+
+IndexTy: TypeAlias = dict[str, dict[str, Any]]
+"""Type of the `index` item in rustdoc's JSON output"""
+
+
+def eprint(*args, **kwargs):
+    """Print to stderr."""
+    print(*args, file=sys.stderr, **kwargs)
+
+
+@dataclass
+class Crate:
+    """Representation of public interfaces and function defintion locations in
+    `libm`.
+    """
+
+    public_functions: list[str]
+    """List of all public functions."""
+    defs: dict[str, list[str]]
+    """Map from `name->[source files]` to find all places that define a public
+    function. We track this to know which tests need to be rerun when specific files
+    get updated.
+    """
+    types: dict[str, str]
+    """Map from `name->type`."""
+
+    def __init__(self) -> None:
+        self.public_functions = []
+        self.defs = {}
+        self.types = {}
+
+        j = self.get_rustdoc_json()
+        index: IndexTy = j["index"]
+        self._init_function_list(index)
+        self._init_defs(index)
+        self._init_types()
+
+    @staticmethod
+    def get_rustdoc_json() -> dict[Any, Any]:
+        """Get rustdoc's JSON output for the `libm` crate."""
+
+        j = sp.check_output(
+            [
+                "rustdoc",
+                "libm/src/lib.rs",
+                "--edition=2021",
+                "--document-private-items",
+                "--output-format=json",
+                "--cfg=f16_enabled",
+                "--cfg=f128_enabled",
+                "-Zunstable-options",
+                "-o-",
+            ],
+            cwd=ROOT_DIR,
+            text=True,
+        )
+        j = json.loads(j)
+        return j
+
+    def _init_function_list(self, index: IndexTy) -> None:
+        """Get a list of public functions from rustdoc JSON output.
+
+        Note that this only finds functions that are reexported in `lib.rs`; it will
+        need to be adjusted if we need to account for functions that are defined there,
+        or for glob reexports in other locations.
+        """
+        # Filter out items that are not public
+        public = [i for i in index.values() if i["visibility"] == "public"]
+
+        # Collect a list of source IDs for reexported items in `lib.rs` or `mod math`.
+        use = (i for i in public if "use" in i["inner"])
+        use = (
+            i
+            for i in use
+            if i["span"]["filename"] in ["libm/src/math/mod.rs", "libm/src/lib.rs"]
+        )
+        reexported_ids = [item["inner"]["use"]["id"] for item in use]
+
+        # Collect a list of reexported items that are functions
+        for id in reexported_ids:
+            srcitem = index.get(str(id))
+            # External crate
+            if srcitem is None:
+                continue
+
+            # Skip if not a function
+            if "function" not in srcitem["inner"]:
+                continue
+
+            self.public_functions.append(srcitem["name"])
+        self.public_functions.sort()
+
+    def _init_defs(self, index: IndexTy) -> None:
+        defs = {name: set() for name in self.public_functions}
+        funcs = (i for i in index.values() if "function" in i["inner"])
+        funcs = (f for f in funcs if f["name"] in self.public_functions)
+        for func in funcs:
+            defs[func["name"]].add(func["span"]["filename"])
+
+        # A lot of the `arch` module is often configured out so doesn't show up in docs. Use
+        # string matching as a fallback.
+        for fname in glob(
+            "libm/src/math/arch/**/*.rs", root_dir=ROOT_DIR, recursive=True
+        ):
+            contents = (ROOT_DIR.joinpath(fname)).read_text()
+
+            for name in self.public_functions:
+                if f"fn {name}" in contents:
+                    defs[name].add(fname)
+
+        for name, sources in defs.items():
+            base_sources = defs[base_name(name)[0]]
+            for src in (s for s in base_sources if "generic" in s):
+                sources.add(src)
+
+            for src in IGNORED_SOURCES:
+                sources.discard(src)
+
+        # Sort the set
+        self.defs = {k: sorted(v) for (k, v) in defs.items()}
+
+    def _init_types(self) -> None:
+        self.types = {name: base_name(name)[1] for name in self.public_functions}
+
+    def write_function_list(self, check: bool) -> None:
+        """Collect the list of public functions to a simple text file."""
+        output = "# autogenerated by update-api-list.py\n"
+        for name in self.public_functions:
+            output += f"{name}\n"
+
+        out_file = ETC_DIR.joinpath("function-list.txt")
+
+        if check:
+            with open(out_file, "r") as f:
+                current = f.read()
+            diff_and_exit(current, output, "function list")
+        else:
+            with open(out_file, "w") as f:
+                f.write(output)
+
+    def write_function_defs(self, check: bool) -> None:
+        """Collect the list of information about public functions to a JSON file ."""
+        comment = (
+            "Autogenerated by update-api-list.py. "
+            "List of files that define a function with a given name. "
+            "This file is checked in to make it obvious if refactoring breaks things"
+        )
+
+        d = {"__comment": comment}
+        d |= {
+            name: {"sources": self.defs[name], "type": self.types[name]}
+            for name in self.public_functions
+        }
+
+        out_file = ETC_DIR.joinpath("function-definitions.json")
+        output = json.dumps(d, indent=4) + "\n"
+
+        if check:
+            with open(out_file, "r") as f:
+                current = f.read()
+            diff_and_exit(current, output, "source list")
+        else:
+            with open(out_file, "w") as f:
+                f.write(output)
+
+    def tidy_lists(self) -> None:
+        """In each file, check annotations indicating blocks of code should be sorted or should
+        include all public API.
+        """
+
+        flist = sp.check_output(["git", "ls-files"], cwd=ROOT_DIR, text=True)
+
+        for path in flist.splitlines():
+            fpath = ROOT_DIR.joinpath(path)
+            if fpath.is_dir() or fpath == SELF_PATH:
+                continue
+
+            lines = fpath.read_text().splitlines()
+
+            validate_delimited_block(
+                fpath,
+                lines,
+                "verify-sorted-start",
+                "verify-sorted-end",
+                ensure_sorted,
+            )
+
+            validate_delimited_block(
+                fpath,
+                lines,
+                "verify-apilist-start",
+                "verify-apilist-end",
+                lambda p, n, lines: self.ensure_contains_api(p, n, lines),
+            )
+
+    def ensure_contains_api(self, fpath: Path, line_num: int, lines: list[str]):
+        """Given a list of strings, ensure that each public function we have is named
+        somewhere.
+        """
+        not_found = []
+        for func in self.public_functions:
+            # The function name may be on its own or somewhere in a snake case string.
+            pat = re.compile(rf"(\b|_){func}(\b|_)")
+            found = next((line for line in lines if pat.search(line)), None)
+
+            if found is None:
+                not_found.append(func)
+
+        if len(not_found) == 0:
+            return
+
+        relpath = fpath.relative_to(ROOT_DIR)
+        eprint(f"functions not found at {relpath}:{line_num}: {not_found}")
+        exit(1)
+
+
+def validate_delimited_block(
+    fpath: Path,
+    lines: list[str],
+    start: str,
+    end: str,
+    validate: Callable[[Path, int, list[str]], None],
+) -> None:
+    """Identify blocks of code wrapped within `start` and `end`, collect their contents
+    to a list of strings, and call `validate` for each of those lists.
+    """
+    relpath = fpath.relative_to(ROOT_DIR)
+    block_lines = []
+    block_start_line: None | int = None
+    for line_num, line in enumerate(lines):
+        line_num += 1
+
+        if start in line:
+            block_start_line = line_num
+            continue
+
+        if end in line:
+            if block_start_line is None:
+                eprint(f"`{end}` without `{start}` at {relpath}:{line_num}")
+                exit(1)
+
+            validate(fpath, block_start_line, block_lines)
+            block_lines = []
+            block_start_line = None
+            continue
+
+        if block_start_line is not None:
+            block_lines.append(line)
+
+    if block_start_line is not None:
+        eprint(f"`{start}` without `{end}` at {relpath}:{block_start_line}")
+        exit(1)
+
+
+def ensure_sorted(fpath: Path, block_start_line: int, lines: list[str]) -> None:
+    """Ensure that a list of lines is sorted, otherwise print a diff and exit."""
+    relpath = fpath.relative_to(ROOT_DIR)
+    diff_and_exit(
+        "\n".join(lines),
+        "\n".join(sorted(lines)),
+        f"sorted block at {relpath}:{block_start_line}",
+    )
+
+
+def diff_and_exit(actual: str, expected: str, name: str):
+    """If the two strings are different, print a diff between them and then exit
+    with an error.
+    """
+    if actual == expected:
+        print(f"{name} output matches expected; success")
+        return
+
+    a = [f"{line}\n" for line in actual.splitlines()]
+    b = [f"{line}\n" for line in expected.splitlines()]
+
+    diff = difflib.unified_diff(a, b, "actual", "expected")
+    sys.stdout.writelines(diff)
+    print(f"mismatched {name}")
+    exit(1)
+
+
+def base_name(name: str) -> tuple[str, str]:
+    """Return the basename and type from a full function name. Keep in sync with Rust's
+    `fn base_name`.
+    """
+    known_mappings = [
+        ("erff", ("erf", "f32")),
+        ("erf", ("erf", "f64")),
+        ("modff", ("modf", "f32")),
+        ("modf", ("modf", "f64")),
+        ("lgammaf_r", ("lgamma_r", "f32")),
+        ("lgamma_r", ("lgamma_r", "f64")),
+    ]
+
+    found = next((base for (full, base) in known_mappings if full == name), None)
+    if found is not None:
+        return found
+
+    if name.endswith("f"):
+        return (name.rstrip("f"), "f32")
+
+    if name.endswith("f16"):
+        return (name.rstrip("f16"), "f16")
+
+    if name.endswith("f128"):
+        return (name.rstrip("f128"), "f128")
+
+    return (name, "f64")
+
+
+def ensure_updated_list(check: bool) -> None:
+    """Runner to update the function list and JSON, or check that it is already up
+    to date.
+    """
+    crate = Crate()
+    crate.write_function_list(check)
+    crate.write_function_defs(check)
+
+    crate.tidy_lists()
+
+
+def main():
+    """By default overwrite the file. If `--check` is passed, print a diff instead and
+    error if the files are different.
+    """
+    match sys.argv:
+        case [_]:
+            ensure_updated_list(False)
+        case [_, "--check"]:
+            ensure_updated_list(True)
+        case _:
+            print("unrecognized arguments")
+            exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/library/compiler-builtins/libm-test/Cargo.toml b/library/compiler-builtins/libm-test/Cargo.toml
new file mode 100644
index 00000000000..7a306e73557
--- /dev/null
+++ b/library/compiler-builtins/libm-test/Cargo.toml
@@ -0,0 +1,74 @@
+[package]
+name = "libm-test"
+version = "0.1.0"
+edition = "2024"
+publish = false
+license = "MIT OR Apache-2.0"
+
+[features]
+default = ["build-mpfr", "build-musl", "unstable-float"]
+
+# Propagated from libm because this affects which functions we test.
+unstable-float = ["libm/unstable-float", "rug?/nightly-float"]
+
+# Check results against MPFR (via the `rug` crate) as a high-precision reference
+# implementation.
+build-mpfr = ["dep:rug", "dep:gmp-mpfr-sys"]
+
+# Build our own musl for testing and benchmarks
+build-musl = ["dep:musl-math-sys"]
+
+# Enable report generation without bringing in more dependencies by default
+benchmarking-reports = ["criterion/plotters", "criterion/html_reports"]
+
+# Enable icount benchmarks (requires iai-callgrind and valgrind)
+icount = ["dep:iai-callgrind"]
+
+# Run with a reduced set of benchmarks, such as for CI
+short-benchmarks = []
+
+[dependencies]
+anyhow = "1.0.97"
+# This is not directly used but is required so we can enable `gmp-mpfr-sys/force-cross`.
+gmp-mpfr-sys = { version = "1.6.4", optional = true, default-features = false }
+iai-callgrind = { version = "0.14.0", optional = true }
+indicatif = { version = "0.17.11", default-features = false }
+libm = { path = "../libm", features = ["unstable-public-internals"] }
+libm-macros = { path = "../crates/libm-macros" }
+musl-math-sys = { path = "../crates/musl-math-sys", optional = true }
+paste = "1.0.15"
+rand = "0.9.0"
+rand_chacha = "0.9.0"
+rayon = "1.10.0"
+rug = { version = "1.27.0", optional = true, default-features = false, features = ["float", "integer", "std"] }
+
+[target.'cfg(target_family = "wasm")'.dependencies]
+getrandom = { version = "0.3.2", features = ["wasm_js"] }
+
+[build-dependencies]
+rand = { version = "0.9.0", optional = true }
+
+[dev-dependencies]
+criterion = { version = "0.5.1", default-features = false, features = ["cargo_bench_support"] }
+libtest-mimic = "0.8.1"
+
+[[bench]]
+name = "icount"
+harness = false
+required-features = ["icount"]
+
+[[bench]]
+name = "random"
+harness = false
+
+[[test]]
+# No harness so that we can skip tests at runtime based on env. Prefixed with
+# `z` so these tests get run last.
+name = "z_extensive"
+harness = false
+
+[lints.rust]
+# Values from the shared config.rs used by `libm` but not the test crate
+unexpected_cfgs = { level = "warn", check-cfg = [
+  'cfg(feature, values("arch", "force-soft-floats", "unstable-intrinsics"))',
+] }
diff --git a/library/compiler-builtins/libm-test/benches/icount.rs b/library/compiler-builtins/libm-test/benches/icount.rs
new file mode 100644
index 00000000000..da8c6bfd15a
--- /dev/null
+++ b/library/compiler-builtins/libm-test/benches/icount.rs
@@ -0,0 +1,316 @@
+//! Benchmarks that use `iai-callgrind` to be reasonably CI-stable.
+
+use std::hint::black_box;
+
+use iai_callgrind::{library_benchmark, library_benchmark_group, main};
+use libm::support::{HInt, u256};
+use libm_test::generate::spaced;
+use libm_test::{CheckBasis, CheckCtx, GeneratorKind, MathOp, OpRustArgs, TupleCall, op};
+
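+/// Number of test cases requested from the logspace generator for each routine.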
+const BENCH_ITER_ITEMS: u64 = 500;
+
+macro_rules! icount_benches {
+    (
+        fn_name: $fn_name:ident,
+        attrs: [$($_attr:meta),*],
+    ) => {
+        paste::paste! {
+            // Construct benchmark inputs from the logspace generator.
+            fn [< setup_ $fn_name >]() -> Vec<OpRustArgs<op::$fn_name::Routine>> {
+                type Op = op::$fn_name::Routine;
+                let mut ctx = CheckCtx::new(
+                    Op::IDENTIFIER,
+                    CheckBasis::None,
+                    GeneratorKind::QuickSpaced
+                );
+                ctx.override_iterations(BENCH_ITER_ITEMS);
+                let ret = spaced::get_test_cases::<Op>(&ctx).0.collect::<Vec<_>>();
+                println!("operation {}, {} steps", Op::NAME, ret.len());
+                ret
+            }
+
+            // Run benchmarks with the above inputs.
+            #[library_benchmark]
+            #[bench::logspace([< setup_ $fn_name >]())]
+            fn [< icount_bench_ $fn_name >](cases: Vec<OpRustArgs<op::$fn_name::Routine>>) {
+                type Op = op::$fn_name::Routine;
+                let f = black_box(Op::ROUTINE);
+                for input in cases.iter().copied() {
+                    input.call(f);
+                }
+            }
+
+            library_benchmark_group!(
+                name = [< icount_bench_ $fn_name _group  >];
+                benchmarks = [< icount_bench_ $fn_name >]
+            );
+        }
+    };
+}
+
+libm_macros::for_each_function! {
+    callback: icount_benches,
+}
+
+fn setup_u128_mul() -> Vec<(u128, u128)> {
+    let step = u128::MAX / 300;
+    let mut x = 0u128;
+    let mut y = 0u128;
+    let mut v = Vec::new();
+
+    loop {
+        'inner: loop {
+            match y.checked_add(step) {
+                Some(new) => y = new,
+                None => break 'inner,
+            }
+
+            v.push((x, y))
+        }
+
+        match x.checked_add(step) {
+            Some(new) => x = new,
+            None => break,
+        }
+    }
+
+    v
+}
+
+fn setup_u256_add() -> Vec<(u256, u256)> {
+    let mut v = Vec::new();
+    for (x, y) in setup_u128_mul() {
+        // square the u128 inputs to cover most of the u256 range
+        v.push((x.widen_mul(x), y.widen_mul(y)));
+    }
+    // Doesn't get covered by `u128::MAX^2`
+    v.push((u256::MAX, u256::MAX));
+    v
+}
+
+fn setup_u256_shift() -> Vec<(u256, u32)> {
+    let mut v = Vec::new();
+
+    for (x, _) in setup_u128_mul() {
+        let x2 = x.widen_mul(x);
+        for y in 0u32..256 {
+            v.push((x2, y));
+        }
+    }
+
+    v
+}
+
+#[library_benchmark]
+#[bench::linspace(setup_u128_mul())]
+fn icount_bench_u128_widen_mul(cases: Vec<(u128, u128)>) {
+    for (x, y) in cases.iter().copied() {
+        black_box(black_box(x).zero_widen_mul(black_box(y)));
+    }
+}
+
+library_benchmark_group!(
+    name = icount_bench_u128_widen_mul_group;
+    benchmarks = icount_bench_u128_widen_mul
+);
+
+#[library_benchmark]
+#[bench::linspace(setup_u256_add())]
+fn icount_bench_u256_add(cases: Vec<(u256, u256)>) {
+    for (x, y) in cases.iter().copied() {
+        black_box(black_box(x) + black_box(y));
+    }
+}
+
+library_benchmark_group!(
+    name = icount_bench_u256_add_group;
+    benchmarks = icount_bench_u256_add
+);
+
+#[library_benchmark]
+#[bench::linspace(setup_u256_shift())]
+fn icount_bench_u256_shr(cases: Vec<(u256, u32)>) {
+    for (x, y) in cases.iter().copied() {
+        black_box(black_box(x) >> black_box(y));
+    }
+}
+
+library_benchmark_group!(
+    name = icount_bench_u256_shr_group;
+    benchmarks = icount_bench_u256_shr
+);
+
+main!(
+    library_benchmark_groups =
+    // u256-related benchmarks
+    icount_bench_u128_widen_mul_group,
+    icount_bench_u256_add_group,
+    icount_bench_u256_shr_group,
+    // verify-apilist-start
+    // verify-sorted-start
+    icount_bench_acos_group,
+    icount_bench_acosf_group,
+    icount_bench_acosh_group,
+    icount_bench_acoshf_group,
+    icount_bench_asin_group,
+    icount_bench_asinf_group,
+    icount_bench_asinh_group,
+    icount_bench_asinhf_group,
+    icount_bench_atan2_group,
+    icount_bench_atan2f_group,
+    icount_bench_atan_group,
+    icount_bench_atanf_group,
+    icount_bench_atanh_group,
+    icount_bench_atanhf_group,
+    icount_bench_cbrt_group,
+    icount_bench_cbrtf_group,
+    icount_bench_ceil_group,
+    icount_bench_ceilf128_group,
+    icount_bench_ceilf16_group,
+    icount_bench_ceilf_group,
+    icount_bench_copysign_group,
+    icount_bench_copysignf128_group,
+    icount_bench_copysignf16_group,
+    icount_bench_copysignf_group,
+    icount_bench_cos_group,
+    icount_bench_cosf_group,
+    icount_bench_cosh_group,
+    icount_bench_coshf_group,
+    icount_bench_erf_group,
+    icount_bench_erfc_group,
+    icount_bench_erfcf_group,
+    icount_bench_erff_group,
+    icount_bench_exp10_group,
+    icount_bench_exp10f_group,
+    icount_bench_exp2_group,
+    icount_bench_exp2f_group,
+    icount_bench_exp_group,
+    icount_bench_expf_group,
+    icount_bench_expm1_group,
+    icount_bench_expm1f_group,
+    icount_bench_fabs_group,
+    icount_bench_fabsf128_group,
+    icount_bench_fabsf16_group,
+    icount_bench_fabsf_group,
+    icount_bench_fdim_group,
+    icount_bench_fdimf128_group,
+    icount_bench_fdimf16_group,
+    icount_bench_fdimf_group,
+    icount_bench_floor_group,
+    icount_bench_floorf128_group,
+    icount_bench_floorf16_group,
+    icount_bench_floorf_group,
+    icount_bench_fma_group,
+    icount_bench_fmaf128_group,
+    icount_bench_fmaf_group,
+    icount_bench_fmax_group,
+    icount_bench_fmaxf128_group,
+    icount_bench_fmaxf16_group,
+    icount_bench_fmaxf_group,
+    icount_bench_fmaximum_group,
+    icount_bench_fmaximum_num_group,
+    icount_bench_fmaximum_numf128_group,
+    icount_bench_fmaximum_numf16_group,
+    icount_bench_fmaximum_numf_group,
+    icount_bench_fmaximumf128_group,
+    icount_bench_fmaximumf16_group,
+    icount_bench_fmaximumf_group,
+    icount_bench_fmin_group,
+    icount_bench_fminf128_group,
+    icount_bench_fminf16_group,
+    icount_bench_fminf_group,
+    icount_bench_fminimum_group,
+    icount_bench_fminimum_num_group,
+    icount_bench_fminimum_numf128_group,
+    icount_bench_fminimum_numf16_group,
+    icount_bench_fminimum_numf_group,
+    icount_bench_fminimumf128_group,
+    icount_bench_fminimumf16_group,
+    icount_bench_fminimumf_group,
+    icount_bench_fmod_group,
+    icount_bench_fmodf128_group,
+    icount_bench_fmodf16_group,
+    icount_bench_fmodf_group,
+    icount_bench_frexp_group,
+    icount_bench_frexpf_group,
+    icount_bench_hypot_group,
+    icount_bench_hypotf_group,
+    icount_bench_ilogb_group,
+    icount_bench_ilogbf_group,
+    icount_bench_j0_group,
+    icount_bench_j0f_group,
+    icount_bench_j1_group,
+    icount_bench_j1f_group,
+    icount_bench_jn_group,
+    icount_bench_jnf_group,
+    icount_bench_ldexp_group,
+    icount_bench_ldexpf128_group,
+    icount_bench_ldexpf16_group,
+    icount_bench_ldexpf_group,
+    icount_bench_lgamma_group,
+    icount_bench_lgamma_r_group,
+    icount_bench_lgammaf_group,
+    icount_bench_lgammaf_r_group,
+    icount_bench_log10_group,
+    icount_bench_log10f_group,
+    icount_bench_log1p_group,
+    icount_bench_log1pf_group,
+    icount_bench_log2_group,
+    icount_bench_log2f_group,
+    icount_bench_log_group,
+    icount_bench_logf_group,
+    icount_bench_modf_group,
+    icount_bench_modff_group,
+    icount_bench_nextafter_group,
+    icount_bench_nextafterf_group,
+    icount_bench_pow_group,
+    icount_bench_powf_group,
+    icount_bench_remainder_group,
+    icount_bench_remainderf_group,
+    icount_bench_remquo_group,
+    icount_bench_remquof_group,
+    icount_bench_rint_group,
+    icount_bench_rintf128_group,
+    icount_bench_rintf16_group,
+    icount_bench_rintf_group,
+    icount_bench_round_group,
+    icount_bench_roundeven_group,
+    icount_bench_roundevenf128_group,
+    icount_bench_roundevenf16_group,
+    icount_bench_roundevenf_group,
+    icount_bench_roundf128_group,
+    icount_bench_roundf16_group,
+    icount_bench_roundf_group,
+    icount_bench_scalbn_group,
+    icount_bench_scalbnf128_group,
+    icount_bench_scalbnf16_group,
+    icount_bench_scalbnf_group,
+    icount_bench_sin_group,
+    icount_bench_sincos_group,
+    icount_bench_sincosf_group,
+    icount_bench_sinf_group,
+    icount_bench_sinh_group,
+    icount_bench_sinhf_group,
+    icount_bench_sqrt_group,
+    icount_bench_sqrtf128_group,
+    icount_bench_sqrtf16_group,
+    icount_bench_sqrtf_group,
+    icount_bench_tan_group,
+    icount_bench_tanf_group,
+    icount_bench_tanh_group,
+    icount_bench_tanhf_group,
+    icount_bench_tgamma_group,
+    icount_bench_tgammaf_group,
+    icount_bench_trunc_group,
+    icount_bench_truncf128_group,
+    icount_bench_truncf16_group,
+    icount_bench_truncf_group,
+    icount_bench_y0_group,
+    icount_bench_y0f_group,
+    icount_bench_y1_group,
+    icount_bench_y1f_group,
+    icount_bench_yn_group,
+    icount_bench_ynf_group,
+    // verify-sorted-end
+    // verify-apilist-end
+);
diff --git a/library/compiler-builtins/libm-test/benches/random.rs b/library/compiler-builtins/libm-test/benches/random.rs
new file mode 100644
index 00000000000..1b17f049eca
--- /dev/null
+++ b/library/compiler-builtins/libm-test/benches/random.rs
@@ -0,0 +1,179 @@
+use std::hint::black_box;
+use std::time::Duration;
+
+use criterion::{Criterion, criterion_main};
+use libm_test::generate::random;
+use libm_test::generate::random::RandomInput;
+use libm_test::{CheckBasis, CheckCtx, GeneratorKind, MathOp, TupleCall};
+
+/// Benchmark with this many items to get a variety of inputs.
+const BENCH_ITER_ITEMS: usize = if cfg!(feature = "short-benchmarks") {
+    50
+} else {
+    500
+};
+
+/// Extra parameters we only care about if we are benchmarking against musl.
+#[allow(dead_code)]
+struct MuslExtra<F> {
+    musl_fn: Option<F>,
+    skip_on_i586: bool,
+}
+
+macro_rules! musl_rand_benches {
+    (
+        fn_name: $fn_name:ident,
+        attrs: [$($attr:meta),*],
+        fn_extra: ($skip_on_i586:expr, $musl_fn:expr),
+    ) => {
+        paste::paste! {
+            $(#[$attr])*
+            fn [< musl_bench_ $fn_name >](c: &mut Criterion) {
+                type Op = libm_test::op::$fn_name::Routine;
+
+                #[cfg(feature = "build-musl")]
+                let musl_extra = MuslExtra::<libm_test::OpCFn<Op>> {
+                    musl_fn: $musl_fn,
+                    skip_on_i586: $skip_on_i586,
+                };
+
+                #[cfg(not(feature = "build-musl"))]
+                let musl_extra = MuslExtra {
+                    musl_fn: None,
+                    skip_on_i586: $skip_on_i586,
+                };
+
+                bench_one::<Op>(c, musl_extra);
+            }
+        }
+    };
+}
+
+fn bench_one<Op>(c: &mut Criterion, musl_extra: MuslExtra<Op::CFn>)
+where
+    Op: MathOp,
+    Op::RustArgs: RandomInput,
+{
+    let name = Op::NAME;
+
+    let ctx = CheckCtx::new(Op::IDENTIFIER, CheckBasis::Musl, GeneratorKind::Random);
+    let benchvec: Vec<_> = random::get_test_cases::<Op::RustArgs>(&ctx)
+        .0
+        .take(BENCH_ITER_ITEMS)
+        .collect();
+
+    // Perform a sanity check that we are benchmarking the same thing
+    // Don't test against musl if it is not available
+    #[cfg(feature = "build-musl")]
+    for input in benchvec.iter().copied() {
+        use anyhow::Context;
+        use libm_test::CheckOutput;
+
+        if cfg!(x86_no_sse) && musl_extra.skip_on_i586 {
+            break;
+        }
+
+        let Some(musl_fn) = musl_extra.musl_fn else {
+            continue;
+        };
+        let musl_res = input.call(musl_fn);
+        let crate_res = input.call(Op::ROUTINE);
+
+        crate_res
+            .validate(musl_res, input, &ctx)
+            .context(name)
+            .unwrap();
+    }
+
+    #[cfg(not(feature = "build-musl"))]
+    let _ = musl_extra; // silence unused warnings
+
+    /* Function pointers are black boxed to avoid inlining in the benchmark loop */
+
+    let mut group = c.benchmark_group(name);
+    group.bench_function("crate", |b| {
+        b.iter(|| {
+            let f = black_box(Op::ROUTINE);
+            for input in benchvec.iter().copied() {
+                input.call(f);
+            }
+        })
+    });
+
+    // Don't test against musl if it is not available
+    #[cfg(feature = "build-musl")]
+    {
+        if let Some(musl_fn) = musl_extra.musl_fn {
+            group.bench_function("musl", |b| {
+                b.iter(|| {
+                    let f = black_box(musl_fn);
+                    for input in benchvec.iter().copied() {
+                        input.call(f);
+                    }
+                })
+            });
+        }
+    }
+}
+
+libm_macros::for_each_function! {
+    callback: musl_rand_benches,
+    skip: [],
+    fn_extra: match MACRO_FN_NAME {
+        // We pass a tuple of `(skip_on_i586, musl_fn)`
+
+        // FIXME(correctness): exp functions have the wrong result on i586
+        exp10 | exp10f | exp2 | exp2f => (true, Some(musl_math_sys::MACRO_FN_NAME)),
+
+        // Musl does not provide `f16` and `f128` functions, nor a handful of others
+        fmaximum
+        | fmaximum_num
+        | fmaximum_numf
+        | fmaximumf
+        | fminimum
+        | fminimum_num
+        | fminimum_numf
+        | fminimumf
+        | roundeven
+        | roundevenf
+        | ALL_F16
+        | ALL_F128 => (false, None),
+
+        // By default we never skip (false) and always have a musl function available
+        _ => (false, Some(musl_math_sys::MACRO_FN_NAME))
+    }
+}
+
+macro_rules! run_callback {
+    (
+        fn_name: $fn_name:ident,
+        attrs: [$($attr:meta),*],
+        extra: [$criterion:ident],
+    ) => {
+        paste::paste! {
+            $(#[$attr])*
+            [< musl_bench_ $fn_name >](&mut $criterion)
+        }
+    };
+}
+
+pub fn musl_random() {
+    let mut criterion = Criterion::default();
+
+    // For CI, run a short 200ms warmup and 600ms measurement time. This makes benchmarks
+    // complete in about the same time as other tests.
+    if cfg!(feature = "short-benchmarks") {
+        criterion = criterion
+            .warm_up_time(Duration::from_millis(200))
+            .measurement_time(Duration::from_millis(600));
+    }
+
+    criterion = criterion.configure_from_args();
+
+    libm_macros::for_each_function! {
+        callback: run_callback,
+        extra: [criterion],
+    };
+}
+
+criterion_main!(musl_random);
diff --git a/library/compiler-builtins/libm-test/build.rs b/library/compiler-builtins/libm-test/build.rs
new file mode 100644
index 00000000000..510ba842f10
--- /dev/null
+++ b/library/compiler-builtins/libm-test/build.rs
@@ -0,0 +1,9 @@
+#[path = "../libm/configure.rs"]
+mod configure;
+use configure::Config;
+
+fn main() {
+    println!("cargo:rerun-if-changed=../libm/configure.rs");
+    let cfg = Config::from_env();
+    configure::emit_test_config(&cfg);
+}
diff --git a/library/compiler-builtins/libm-test/examples/plot_domains.rs b/library/compiler-builtins/libm-test/examples/plot_domains.rs
new file mode 100644
index 00000000000..3563103b8cd
--- /dev/null
+++ b/library/compiler-builtins/libm-test/examples/plot_domains.rs
@@ -0,0 +1,109 @@
+//! Program to write all inputs from a generator to a file, then invoke a Julia script to plot
+//! them. Output is in `target/plots`.
+//!
+//! Requires Julia with the `CairoMakie` dependency.
+//!
+//! Note that running in release mode generates a _lot_ more datapoints by default, which
+//! makes plotting extremely slow (the script could use some simplification to handle this).
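+//!
+//! A typical invocation (assuming the usual Cargo workspace layout) is
+//! `cargo run -p libm-test --example plot_domains`.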
+
+use std::fmt::Write as _;
+use std::io::{BufWriter, Write};
+use std::path::Path;
+use std::process::Command;
+use std::{env, fs};
+
+use libm_test::generate::spaced::SpacedInput;
+use libm_test::generate::{edge_cases, spaced};
+use libm_test::{CheckBasis, CheckCtx, GeneratorKind, MathOp, op};
+
+const JL_PLOT: &str = "examples/plot_file.jl";
+
+fn main() {
+    let manifest_env = env::var("CARGO_MANIFEST_DIR").unwrap();
+    let manifest_dir = Path::new(&manifest_env);
+    let out_dir = manifest_dir.join("../../target/plots");
+    if !out_dir.exists() {
+        fs::create_dir(&out_dir).unwrap();
+    }
+
+    let jl_script = manifest_dir.join(JL_PLOT);
+    let mut config = format!(r#"out_dir = "{}""#, out_dir.display());
+    config.write_str("\n\n").unwrap();
+
+    // Plot a few domains with some functions that use them.
+    plot_one_operator::<op::sqrtf::Routine>(&out_dir, &mut config);
+    plot_one_operator::<op::cosf::Routine>(&out_dir, &mut config);
+    plot_one_operator::<op::cbrtf::Routine>(&out_dir, &mut config);
+
+    let config_path = out_dir.join("config.toml");
+    fs::write(&config_path, config).unwrap();
+
+    // The script expects a path to `config.toml` to be passed as its only argument
+    let mut cmd = Command::new("julia");
+    if cfg!(optimizations_enabled) {
+        cmd.arg("-O3");
+    }
+    cmd.arg(jl_script).arg(config_path);
+
+    println!("launching script... {cmd:?}");
+    cmd.status().unwrap();
+}
+
+/// Run multiple generators for a single operator.
+fn plot_one_operator<Op>(out_dir: &Path, config: &mut String)
+where
+    Op: MathOp<FTy = f32, RustArgs = (f32,)>,
+    Op::RustArgs: SpacedInput<Op>,
+{
+    let mut ctx = CheckCtx::new(Op::IDENTIFIER, CheckBasis::Mpfr, GeneratorKind::QuickSpaced);
+    plot_one_generator(
+        out_dir,
+        &ctx,
+        "logspace",
+        config,
+        spaced::get_test_cases::<Op>(&ctx).0,
+    );
+    ctx.gen_kind = GeneratorKind::EdgeCases;
+    plot_one_generator(
+        out_dir,
+        &ctx,
+        "edge_cases",
+        config,
+        edge_cases::get_test_cases::<Op>(&ctx).0,
+    );
+}
+
+/// Plot the output of a single generator.
+fn plot_one_generator(
+    out_dir: &Path,
+    ctx: &CheckCtx,
+    gen_name: &str,
+    config: &mut String,
+    generator: impl Iterator<Item = (f32,)>,
+) {
+    let fn_name = ctx.base_name_str;
+    let text_file = out_dir.join(format!("input-{fn_name}-{gen_name}.txt"));
+
+    let f = fs::File::create(&text_file).unwrap();
+    let mut w = BufWriter::new(f);
+    let mut count = 0u64;
+
+    for input in generator {
+        writeln!(w, "{:e}", input.0).unwrap();
+        count += 1;
+    }
+
+    w.flush().unwrap();
+    println!("generated {count} inputs for {fn_name}-{gen_name}");
+
+    writeln!(
+        config,
+        r#"[[input]]
+function = "{fn_name}"
+generator = "{gen_name}"
+input_file = "{}"
+"#,
+        text_file.to_str().unwrap()
+    )
+    .unwrap()
+}
diff --git a/library/compiler-builtins/libm-test/examples/plot_file.jl b/library/compiler-builtins/libm-test/examples/plot_file.jl
new file mode 100644
index 00000000000..acffd97569f
--- /dev/null
+++ b/library/compiler-builtins/libm-test/examples/plot_file.jl
@@ -0,0 +1,171 @@
+"A quick script for plotting a list of floats.
+
+Takes a path to a TOML file (Julia has builtin TOML support but not JSON) which
+specifies a list of source files to plot. Plots are done with both a linear and
+a log scale.
+
+Requires [Makie] (specifically CairoMakie) for plotting.
+
+[Makie]: https://docs.makie.org/stable/
+"
+
+using CairoMakie
+using TOML
+
+function main()::Nothing
+    CairoMakie.activate!(px_per_unit = 10)
+    config_path = ARGS[1]
+
+    cfg = Dict()
+    open(config_path, "r") do f
+        cfg = TOML.parse(f)
+    end
+
+    out_dir = cfg["out_dir"]
+    for input in cfg["input"]
+        fn_name = input["function"]
+        gen_name = input["generator"]
+        input_file = input["input_file"]
+
+        plot_one(input_file, out_dir, fn_name, gen_name)
+    end
+end
+
+"Read inputs from a file, create both linear and log plots for one function"
+function plot_one(
+    input_file::String,
+    out_dir::String,
+    fn_name::String,
+    gen_name::String,
+)::Nothing
+    fig = Figure()
+
+    lin_out_file = joinpath(out_dir, "plot-$fn_name-$gen_name.png")
+    log_out_file = joinpath(out_dir, "plot-$fn_name-$gen_name-log.png")
+
+    # Map string function names to callable functions
+    if fn_name == "cos"
+        orig_func = cos
+        xlims = (-6.0, 6.0)
+        xlims_log = (-pi * 10, pi * 10)
+    elseif fn_name == "cbrt"
+        orig_func = cbrt
+        xlims = (-2.0, 2.0)
+        xlims_log = (-1000.0, 1000.0)
+    elseif fn_name == "sqrt"
+        orig_func = sqrt
+        xlims = (-1.1, 6.0)
+        xlims_log = (-1.1, 5000.0)
+    else
+        println("unrecognized function name `$fn_name`; update plot_file.jl")
+        exit(1)
+    end
+
+    # Edge cases don't do much beyond +/-1, except for infinity.
+    if gen_name == "edge_cases"
+        xlims = (-1.1, 1.1)
+        xlims_log = (-1.1, 1.1)
+    end
+
+    # Turn domain errors into NaN
+    func(x) = map_or(x, orig_func, NaN)
+
+    # Parse a series of X values produced by the generator
+    inputs = readlines(input_file)
+    gen_x = map((v) -> parse(Float32, v), inputs)
+
+    do_plot(
+        fig,
+        gen_x,
+        func,
+        xlims[1],
+        xlims[2],
+        "$fn_name $gen_name (linear scale)",
+        lin_out_file,
+        false,
+    )
+
+    do_plot(
+        fig,
+        gen_x,
+        func,
+        xlims_log[1],
+        xlims_log[2],
+        "$fn_name $gen_name (log scale)",
+        log_out_file,
+        true,
+    )
+end
+
+"Create a single plot"
+function do_plot(
+    fig::Figure,
+    gen_x::Vector{F},
+    func::Function,
+    xmin::AbstractFloat,
+    xmax::AbstractFloat,
+    title::String,
+    out_file::String,
+    logscale::Bool,
+)::Nothing where {F<:AbstractFloat}
+    println("plotting $title")
+
+    # `gen_x` holds the values the generator produces; `actual_x` is for plotting a
+    # continuous function.
+    input_min = xmin - 1.0
+    input_max = xmax + 1.0
+    gen_x = filter((v) -> v >= input_min && v <= input_max, gen_x)
+    markersize = length(gen_x) < 10_000 ? 6.0 : 4.0
+
+    steps = 10_000
+    if logscale
+        r = LinRange(symlog10(input_min), symlog10(input_max), steps)
+        actual_x = sympow10.(r)
+        xscale = Makie.pseudolog10
+    else
+        actual_x = LinRange(input_min, input_max, steps)
+        xscale = identity
+    end
+
+    gen_y = @. func(gen_x)
+    actual_y = @. func(actual_x)
+
+    ax = Axis(fig[1, 1], xscale = xscale, title = title)
+
+    lines!(
+        ax,
+        actual_x,
+        actual_y,
+        color = (:lightblue, 0.6),
+        linewidth = 6.0,
+        label = "true function",
+    )
+    scatter!(
+        ax,
+        gen_x,
+        gen_y,
+        color = (:darkblue, 0.9),
+        markersize = markersize,
+        label = "checked inputs",
+    )
+    axislegend(ax, position = :rb, framevisible = false)
+
+    save(out_file, fig)
+    delete!(ax)
+end
+
+"Apply a function, returning the default if there is a domain error"
+function map_or(input::AbstractFloat, f::Function, default::Any)::Union{AbstractFloat,Any}
+    try
+        return f(input)
+    catch
+        return default
+    end
+end
+
+# Operations for logarithms that are symmetric about 0
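+# symlog10(x) = sign(x) * log10(1 + |x|/10^C); sympow10 is its inverse for non-negative
+# inputs, letting `do_plot` space its sample points roughly logarithmically on both sides of 0.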
+C = 10
+symlog10(x::Number) = sign(x) * (log10(1 + abs(x) / (10^C)))
+sympow10(x::Number) = (10^C) * (10^x - 1)
+
+main()
diff --git a/library/compiler-builtins/libm-test/src/domain.rs b/library/compiler-builtins/libm-test/src/domain.rs
new file mode 100644
index 00000000000..94641be9b54
--- /dev/null
+++ b/library/compiler-builtins/libm-test/src/domain.rs
@@ -0,0 +1,292 @@
+//! Traits and operations related to bounds of a function.
+
+use std::fmt;
+use std::ops::Bound;
+
+use libm::support::Int;
+
+use crate::{BaseName, Float, FloatExt, Identifier};
+
+/// Representation of a single dimension of a function's domain.
+#[derive(Clone, Debug)]
+pub struct Domain<T> {
+    /// Start of the region for which a function is defined (ignoring poles).
+    pub start: Bound<T>,
+    /// End of the region for which a function is defined (ignoring poles).
+    pub end: Bound<T>,
+    /// Additional points around which to check more closely. These can be e.g. undefined
+    /// asymptotes or inflection points.
+    pub check_points: Option<fn() -> BoxIter<T>>,
+}
+
+type BoxIter<T> = Box<dyn Iterator<Item = T>>;
+
+impl<F: FloatExt> Domain<F> {
+    /// The start of this domain, saturating at negative infinity.
+    pub fn range_start(&self) -> F {
+        match self.start {
+            Bound::Included(v) => v,
+            Bound::Excluded(v) => v.next_up(),
+            Bound::Unbounded => F::NEG_INFINITY,
+        }
+    }
+
+    /// The end of this domain, saturating at infinity.
+    pub fn range_end(&self) -> F {
+        match self.end {
+            Bound::Included(v) => v,
+            Bound::Excluded(v) => v.next_down(),
+            Bound::Unbounded => F::INFINITY,
+        }
+    }
+}
+
+/// A value that may be any float type or any integer type.
+#[derive(Clone, Debug)]
+pub enum EitherPrim<F, I> {
+    Float(F),
+    Int(I),
+}
+
+impl<F: fmt::Debug, I: fmt::Debug> EitherPrim<F, I> {
+    pub fn unwrap_float(self) -> F {
+        match self {
+            EitherPrim::Float(f) => f,
+            EitherPrim::Int(_) => panic!("expected float; got {self:?}"),
+        }
+    }
+
+    pub fn unwrap_int(self) -> I {
+        match self {
+            EitherPrim::Float(_) => panic!("expected int; got {self:?}"),
+            EitherPrim::Int(i) => i,
+        }
+    }
+}
+
+/// Convenience 1-dimensional float domains.
+impl<F: Float> Domain<F> {
+    /// x ∈ ℝ
+    const UNBOUNDED: Self = Self {
+        start: Bound::Unbounded,
+        end: Bound::Unbounded,
+        check_points: None,
+    };
+
+    /// x ∈ ℝ >= 0
+    const POSITIVE: Self = Self {
+        start: Bound::Included(F::ZERO),
+        end: Bound::Unbounded,
+        check_points: None,
+    };
+
+    /// x ∈ ℝ > 0
+    const STRICTLY_POSITIVE: Self = Self {
+        start: Bound::Excluded(F::ZERO),
+        end: Bound::Unbounded,
+        check_points: None,
+    };
+
+    /// Wrap in the float variant of [`EitherPrim`].
+    const fn into_prim_float<I>(self) -> EitherPrim<Self, Domain<I>> {
+        EitherPrim::Float(self)
+    }
+}
+
+/// Convenience 1-dimensional integer domains.
+impl<I: Int> Domain<I> {
+    /// x ∈ ℝ
+    const UNBOUNDED_INT: Self = Self {
+        start: Bound::Unbounded,
+        end: Bound::Unbounded,
+        check_points: None,
+    };
+
+    /// Wrap in the int variant of [`EitherPrim`].
+    const fn into_prim_int<F>(self) -> EitherPrim<Domain<F>, Self> {
+        EitherPrim::Int(self)
+    }
+}
+
+/// Multidimensional domains, represented as an array of 1-D domains.
+impl<F: Float, I: Int> EitherPrim<Domain<F>, Domain<I>> {
+    /// x ∈ ℝ
+    const UNBOUNDED1: [Self; 1] = [Domain {
+        start: Bound::Unbounded,
+        end: Bound::Unbounded,
+        check_points: None,
+    }
+    .into_prim_float()];
+
+    /// {x1, x2} ∈ ℝ
+    const UNBOUNDED2: [Self; 2] = [
+        Domain::UNBOUNDED.into_prim_float(),
+        Domain::UNBOUNDED.into_prim_float(),
+    ];
+
+    /// {x1, x2, x3} ∈ ℝ
+    const UNBOUNDED3: [Self; 3] = [
+        Domain::UNBOUNDED.into_prim_float(),
+        Domain::UNBOUNDED.into_prim_float(),
+        Domain::UNBOUNDED.into_prim_float(),
+    ];
+
+    /// {x1, x2} ∈ ℝ, one float and one int
+    const UNBOUNDED_F_I: [Self; 2] = [
+        Domain::UNBOUNDED.into_prim_float(),
+        Domain::UNBOUNDED_INT.into_prim_int(),
+    ];
+
+    /// x ∈ ℝ >= 0
+    const POSITIVE: [Self; 1] = [Domain::POSITIVE.into_prim_float()];
+
+    /// x ∈ ℝ > 0
+    const STRICTLY_POSITIVE: [Self; 1] = [Domain::STRICTLY_POSITIVE.into_prim_float()];
+
+    /// Used for versions of `asin` and `acos`.
+    const INVERSE_TRIG_PERIODIC: [Self; 1] = [Domain {
+        start: Bound::Included(F::NEG_ONE),
+        end: Bound::Included(F::ONE),
+        check_points: None,
+    }
+    .into_prim_float()];
+
+    /// Domain for `acosh`
+    const ACOSH: [Self; 1] = [Domain {
+        start: Bound::Included(F::ONE),
+        end: Bound::Unbounded,
+        check_points: None,
+    }
+    .into_prim_float()];
+
+    /// Domain for `atanh`
+    const ATANH: [Self; 1] = [Domain {
+        start: Bound::Excluded(F::NEG_ONE),
+        end: Bound::Excluded(F::ONE),
+        check_points: None,
+    }
+    .into_prim_float()];
+
+    /// Domain for `sin`, `cos`, and `tan`
+    const TRIG: [Self; 1] = [Domain {
+        // Trig functions have special behavior at fractions of π.
+        check_points: Some(|| Box::new([-F::PI, -F::FRAC_PI_2, F::FRAC_PI_2, F::PI].into_iter())),
+        ..Domain::UNBOUNDED
+    }
+    .into_prim_float()];
+
+    /// Domain for `log` in various bases
+    const LOG: [Self; 1] = Self::STRICTLY_POSITIVE;
+
+    /// Domain for `log1p` i.e. `log(1 + x)`
+    const LOG1P: [Self; 1] = [Domain {
+        start: Bound::Excluded(F::NEG_ONE),
+        end: Bound::Unbounded,
+        check_points: None,
+    }
+    .into_prim_float()];
+
+    /// Domain for `sqrt`
+    const SQRT: [Self; 1] = Self::POSITIVE;
+
+    /// Domain for `gamma`
+    const GAMMA: [Self; 1] = [Domain {
+        check_points: Some(|| {
+            // Negative integers are asymptotes
+            Box::new((0..u8::MAX).map(|scale| {
+                let mut base = F::ZERO;
+                for _ in 0..scale {
+                    base = base - F::ONE;
+                }
+                base
+            }))
+        }),
+        // Whether gamma is defined for negative numbers is implementation-dependent
+        ..Domain::UNBOUNDED
+    }
+    .into_prim_float()];
+
+    /// Domain for `loggamma`
+    const LGAMMA: [Self; 1] = Self::STRICTLY_POSITIVE;
+
+    /// Domain for `jn` and `yn`.
+    // FIXME: the domain should provide some sort of "reasonable range" so we don't actually
+    // test the entire unbounded domain.
+    const BESSEL_N: [Self; 2] = [
+        Domain::UNBOUNDED_INT.into_prim_int(),
+        Domain::UNBOUNDED.into_prim_float(),
+    ];
+}
+
+/// Get the domain for a given function.
+pub fn get_domain<F: Float, I: Int>(
+    id: Identifier,
+    argnum: usize,
+) -> EitherPrim<Domain<F>, Domain<I>> {
+    let x = match id.base_name() {
+        BaseName::Acos => &EitherPrim::INVERSE_TRIG_PERIODIC[..],
+        BaseName::Acosh => &EitherPrim::ACOSH[..],
+        BaseName::Asin => &EitherPrim::INVERSE_TRIG_PERIODIC[..],
+        BaseName::Asinh => &EitherPrim::UNBOUNDED1[..],
+        BaseName::Atan => &EitherPrim::UNBOUNDED1[..],
+        BaseName::Atan2 => &EitherPrim::UNBOUNDED2[..],
+        BaseName::Cbrt => &EitherPrim::UNBOUNDED1[..],
+        BaseName::Atanh => &EitherPrim::ATANH[..],
+        BaseName::Ceil => &EitherPrim::UNBOUNDED1[..],
+        BaseName::Cosh => &EitherPrim::UNBOUNDED1[..],
+        BaseName::Copysign => &EitherPrim::UNBOUNDED2[..],
+        BaseName::Cos => &EitherPrim::TRIG[..],
+        BaseName::Exp => &EitherPrim::UNBOUNDED1[..],
+        BaseName::Erf => &EitherPrim::UNBOUNDED1[..],
+        BaseName::Erfc => &EitherPrim::UNBOUNDED1[..],
+        BaseName::Expm1 => &EitherPrim::UNBOUNDED1[..],
+        BaseName::Exp10 => &EitherPrim::UNBOUNDED1[..],
+        BaseName::Exp2 => &EitherPrim::UNBOUNDED1[..],
+        BaseName::Frexp => &EitherPrim::UNBOUNDED1[..],
+        BaseName::Fabs => &EitherPrim::UNBOUNDED1[..],
+        BaseName::Fdim => &EitherPrim::UNBOUNDED2[..],
+        BaseName::Floor => &EitherPrim::UNBOUNDED1[..],
+        BaseName::Fma => &EitherPrim::UNBOUNDED3[..],
+        BaseName::Fmax => &EitherPrim::UNBOUNDED2[..],
+        BaseName::Fmaximum => &EitherPrim::UNBOUNDED2[..],
+        BaseName::FmaximumNum => &EitherPrim::UNBOUNDED2[..],
+        BaseName::Fmin => &EitherPrim::UNBOUNDED2[..],
+        BaseName::Fminimum => &EitherPrim::UNBOUNDED2[..],
+        BaseName::FminimumNum => &EitherPrim::UNBOUNDED2[..],
+        BaseName::Fmod => &EitherPrim::UNBOUNDED2[..],
+        BaseName::Hypot => &EitherPrim::UNBOUNDED2[..],
+        BaseName::Ilogb => &EitherPrim::UNBOUNDED1[..],
+        BaseName::J0 => &EitherPrim::UNBOUNDED1[..],
+        BaseName::J1 => &EitherPrim::UNBOUNDED1[..],
+        BaseName::Jn => &EitherPrim::BESSEL_N[..],
+        BaseName::Ldexp => &EitherPrim::UNBOUNDED_F_I[..],
+        BaseName::Lgamma => &EitherPrim::LGAMMA[..],
+        BaseName::LgammaR => &EitherPrim::LGAMMA[..],
+        BaseName::Log => &EitherPrim::LOG[..],
+        BaseName::Log10 => &EitherPrim::LOG[..],
+        BaseName::Log1p => &EitherPrim::LOG1P[..],
+        BaseName::Log2 => &EitherPrim::LOG[..],
+        BaseName::Modf => &EitherPrim::UNBOUNDED1[..],
+        BaseName::Nextafter => &EitherPrim::UNBOUNDED2[..],
+        BaseName::Pow => &EitherPrim::UNBOUNDED2[..],
+        BaseName::Remainder => &EitherPrim::UNBOUNDED2[..],
+        BaseName::Remquo => &EitherPrim::UNBOUNDED2[..],
+        BaseName::Rint => &EitherPrim::UNBOUNDED1[..],
+        BaseName::Round => &EitherPrim::UNBOUNDED1[..],
+        BaseName::Roundeven => &EitherPrim::UNBOUNDED1[..],
+        BaseName::Scalbn => &EitherPrim::UNBOUNDED_F_I[..],
+        BaseName::Sin => &EitherPrim::TRIG[..],
+        BaseName::Sincos => &EitherPrim::TRIG[..],
+        BaseName::Sinh => &EitherPrim::UNBOUNDED1[..],
+        BaseName::Sqrt => &EitherPrim::SQRT[..],
+        BaseName::Tan => &EitherPrim::TRIG[..],
+        BaseName::Tanh => &EitherPrim::UNBOUNDED1[..],
+        BaseName::Tgamma => &EitherPrim::GAMMA[..],
+        BaseName::Trunc => &EitherPrim::UNBOUNDED1[..],
+        BaseName::Y0 => &EitherPrim::UNBOUNDED1[..],
+        BaseName::Y1 => &EitherPrim::UNBOUNDED1[..],
+        BaseName::Yn => &EitherPrim::BESSEL_N[..],
+    };
+
+    x[argnum].clone()
+}
diff --git a/library/compiler-builtins/libm-test/src/f8_impl.rs b/library/compiler-builtins/libm-test/src/f8_impl.rs
new file mode 100644
index 00000000000..905c7d7fde9
--- /dev/null
+++ b/library/compiler-builtins/libm-test/src/f8_impl.rs
@@ -0,0 +1,505 @@
+//! An IEEE-compliant 8-bit float type for testing purposes.
+
+use std::cmp::{self, Ordering};
+use std::{fmt, ops};
+
+use crate::Float;
+
+/// Sometimes verifying float logic is easiest when all values can quickly be checked exhaustively
+/// or by hand.
+///
+/// IEEE-754 compliant type with a 1-bit sign, 4-bit exponent, and 3-bit significand.
+/// The exponent bias is 7 (the stored exponent minus 7 gives the actual exponent).
+///
+/// Based on <https://en.wikipedia.org/wiki/Minifloat#Example_8-bit_float_(1.4.3)>.
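+///
+/// For example, `0b0_0111_000` encodes 1.0 (sign 0, biased exponent 7, significand 0) and
+/// `0b0_1110_111` encodes the maximum finite value, 240.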
+#[derive(Clone, Copy)]
+#[repr(transparent)]
+#[allow(non_camel_case_types)]
+pub struct f8(u8);
+
+impl Float for f8 {
+    type Int = u8;
+    type SignedInt = i8;
+
+    const ZERO: Self = Self(0b0_0000_000);
+    const NEG_ZERO: Self = Self(0b1_0000_000);
+    const ONE: Self = Self(0b0_0111_000);
+    const NEG_ONE: Self = Self(0b1_0111_000);
+    const MAX: Self = Self(0b0_1110_111);
+    const MIN: Self = Self(0b1_1110_111);
+    const INFINITY: Self = Self(0b0_1111_000);
+    const NEG_INFINITY: Self = Self(0b1_1111_000);
+    const NAN: Self = Self(0b0_1111_100);
+    const NEG_NAN: Self = Self(0b1_1111_100);
+    const MIN_POSITIVE_NORMAL: Self = Self(1 << Self::SIG_BITS);
+    // FIXME: incorrect values
+    const EPSILON: Self = Self::ZERO;
+    const PI: Self = Self::ZERO;
+    const NEG_PI: Self = Self::ZERO;
+    const FRAC_PI_2: Self = Self::ZERO;
+
+    const BITS: u32 = 8;
+    const SIG_BITS: u32 = 3;
+    const SIGN_MASK: Self::Int = 0b1_0000_000;
+    const SIG_MASK: Self::Int = 0b0_0000_111;
+    const EXP_MASK: Self::Int = 0b0_1111_000;
+    const IMPLICIT_BIT: Self::Int = 0b0_0001_000;
+
+    fn to_bits(self) -> Self::Int {
+        self.0
+    }
+
+    fn to_bits_signed(self) -> Self::SignedInt {
+        self.0 as i8
+    }
+
+    fn is_nan(self) -> bool {
+        self.0 & Self::EXP_MASK == Self::EXP_MASK && self.0 & Self::SIG_MASK != 0
+    }
+
+    fn is_infinite(self) -> bool {
+        self.0 & Self::EXP_MASK == Self::EXP_MASK && self.0 & Self::SIG_MASK == 0
+    }
+
+    fn is_sign_negative(self) -> bool {
+        self.0 & Self::SIGN_MASK != 0
+    }
+
+    fn from_bits(a: Self::Int) -> Self {
+        Self(a)
+    }
+
+    fn abs(self) -> Self {
+        libm::generic::fabs(self)
+    }
+
+    fn copysign(self, other: Self) -> Self {
+        libm::generic::copysign(self, other)
+    }
+
+    fn fma(self, _y: Self, _z: Self) -> Self {
+        unimplemented!()
+    }
+
+    fn normalize(_significand: Self::Int) -> (i32, Self::Int) {
+        unimplemented!()
+    }
+}
+
+impl f8 {
+    pub const ALL_LEN: usize = 240;
+
+    /// All non-infinite non-NaN values of `f8`
+    pub const ALL: [Self; Self::ALL_LEN] = [
+        // -m*2^7
+        Self(0b1_1110_111), // -240
+        Self(0b1_1110_110),
+        Self(0b1_1110_101),
+        Self(0b1_1110_100),
+        Self(0b1_1110_011),
+        Self(0b1_1110_010),
+        Self(0b1_1110_001),
+        Self(0b1_1110_000), // -128
+        // -m*2^6
+        Self(0b1_1101_111), // -120
+        Self(0b1_1101_110),
+        Self(0b1_1101_101),
+        Self(0b1_1101_100),
+        Self(0b1_1101_011),
+        Self(0b1_1101_010),
+        Self(0b1_1101_001),
+        Self(0b1_1101_000), // -64
+        // -m*2^5
+        Self(0b1_1100_111), // -60
+        Self(0b1_1100_110),
+        Self(0b1_1100_101),
+        Self(0b1_1100_100),
+        Self(0b1_1100_011),
+        Self(0b1_1100_010),
+        Self(0b1_1100_001),
+        Self(0b1_1100_000), // -32
+        // -m*2^4
+        Self(0b1_1011_111), // -30
+        Self(0b1_1011_110),
+        Self(0b1_1011_101),
+        Self(0b1_1011_100),
+        Self(0b1_1011_011),
+        Self(0b1_1011_010),
+        Self(0b1_1011_001),
+        Self(0b1_1011_000), // -16
+        // -m*2^3
+        Self(0b1_1010_111), // -15
+        Self(0b1_1010_110),
+        Self(0b1_1010_101),
+        Self(0b1_1010_100),
+        Self(0b1_1010_011),
+        Self(0b1_1010_010),
+        Self(0b1_1010_001),
+        Self(0b1_1010_000), // -8
+        // -m*2^2
+        Self(0b1_1001_111), // -7.5
+        Self(0b1_1001_110),
+        Self(0b1_1001_101),
+        Self(0b1_1001_100),
+        Self(0b1_1001_011),
+        Self(0b1_1001_010),
+        Self(0b1_1001_001),
+        Self(0b1_1001_000), // -4
+        // -m*2^1
+        Self(0b1_1000_111), // -3.75
+        Self(0b1_1000_110),
+        Self(0b1_1000_101),
+        Self(0b1_1000_100),
+        Self(0b1_1000_011),
+        Self(0b1_1000_010),
+        Self(0b1_1000_001),
+        Self(0b1_1000_000), // -2
+        // -m*2^0
+        Self(0b1_0111_111), // -1.875
+        Self(0b1_0111_110),
+        Self(0b1_0111_101),
+        Self(0b1_0111_100),
+        Self(0b1_0111_011),
+        Self(0b1_0111_010),
+        Self(0b1_0111_001),
+        Self(0b1_0111_000), // -1
+        // -m*2^-1
+        Self(0b1_0110_111), // −0.9375
+        Self(0b1_0110_110),
+        Self(0b1_0110_101),
+        Self(0b1_0110_100),
+        Self(0b1_0110_011),
+        Self(0b1_0110_010),
+        Self(0b1_0110_001),
+        Self(0b1_0110_000), // -0.5
+        // -m*2^-2
+        Self(0b1_0101_111), // −0.46875
+        Self(0b1_0101_110),
+        Self(0b1_0101_101),
+        Self(0b1_0101_100),
+        Self(0b1_0101_011),
+        Self(0b1_0101_010),
+        Self(0b1_0101_001),
+        Self(0b1_0101_000), // -0.25
+        // -m*2^-3
+        Self(0b1_0100_111), // −0.234375
+        Self(0b1_0100_110),
+        Self(0b1_0100_101),
+        Self(0b1_0100_100),
+        Self(0b1_0100_011),
+        Self(0b1_0100_010),
+        Self(0b1_0100_001),
+        Self(0b1_0100_000), // -0.125
+        // -m*2^-4
+        Self(0b1_0011_111), // −0.1171875
+        Self(0b1_0011_110),
+        Self(0b1_0011_101),
+        Self(0b1_0011_100),
+        Self(0b1_0011_011),
+        Self(0b1_0011_010),
+        Self(0b1_0011_001),
+        Self(0b1_0011_000), // −0.0625
+        // -m*2^-5
+        Self(0b1_0010_111), // −0.05859375
+        Self(0b1_0010_110),
+        Self(0b1_0010_101),
+        Self(0b1_0010_100),
+        Self(0b1_0010_011),
+        Self(0b1_0010_010),
+        Self(0b1_0010_001),
+        Self(0b1_0010_000), // −0.03125
+        // -m*2^-6
+        Self(0b1_0001_111), // −0.029296875
+        Self(0b1_0001_110),
+        Self(0b1_0001_101),
+        Self(0b1_0001_100),
+        Self(0b1_0001_011),
+        Self(0b1_0001_010),
+        Self(0b1_0001_001),
+        Self(0b1_0001_000), // −0.015625
+        // -m*2^-7 subnormal numbers
+        Self(0b1_0000_111), // −0.013671875
+        Self(0b1_0000_110),
+        Self(0b1_0000_101),
+        Self(0b1_0000_100),
+        Self(0b1_0000_011),
+        Self(0b1_0000_010),
+        Self(0b1_0000_001), // −0.001953125
+        // Zeroes
+        Self(0b1_0000_000), // -0.0
+        Self(0b0_0000_000), // 0.0
+        // m*2^-7 subnormal numbers
+        Self(0b0_0000_001),
+        Self(0b0_0000_010),
+        Self(0b0_0000_011),
+        Self(0b0_0000_100),
+        Self(0b0_0000_101),
+        Self(0b0_0000_110),
+        Self(0b0_0000_111), // 0.013671875
+        // m*2^-6
+        Self(0b0_0001_000), // 0.015625
+        Self(0b0_0001_001),
+        Self(0b0_0001_010),
+        Self(0b0_0001_011),
+        Self(0b0_0001_100),
+        Self(0b0_0001_101),
+        Self(0b0_0001_110),
+        Self(0b0_0001_111), // 0.029296875
+        // m*2^-5
+        Self(0b0_0010_000), // 0.03125
+        Self(0b0_0010_001),
+        Self(0b0_0010_010),
+        Self(0b0_0010_011),
+        Self(0b0_0010_100),
+        Self(0b0_0010_101),
+        Self(0b0_0010_110),
+        Self(0b0_0010_111), // 0.05859375
+        // m*2^-4
+        Self(0b0_0011_000), // 0.0625
+        Self(0b0_0011_001),
+        Self(0b0_0011_010),
+        Self(0b0_0011_011),
+        Self(0b0_0011_100),
+        Self(0b0_0011_101),
+        Self(0b0_0011_110),
+        Self(0b0_0011_111), // 0.1171875
+        // m*2^-3
+        Self(0b0_0100_000), // 0.125
+        Self(0b0_0100_001),
+        Self(0b0_0100_010),
+        Self(0b0_0100_011),
+        Self(0b0_0100_100),
+        Self(0b0_0100_101),
+        Self(0b0_0100_110),
+        Self(0b0_0100_111), // 0.234375
+        // m*2^-2
+        Self(0b0_0101_000), // 0.25
+        Self(0b0_0101_001),
+        Self(0b0_0101_010),
+        Self(0b0_0101_011),
+        Self(0b0_0101_100),
+        Self(0b0_0101_101),
+        Self(0b0_0101_110),
+        Self(0b0_0101_111), // 0.46875
+        // m*2^-1
+        Self(0b0_0110_000), // 0.5
+        Self(0b0_0110_001),
+        Self(0b0_0110_010),
+        Self(0b0_0110_011),
+        Self(0b0_0110_100),
+        Self(0b0_0110_101),
+        Self(0b0_0110_110),
+        Self(0b0_0110_111), // 0.9375
+        // m*2^0
+        Self(0b0_0111_000), // 1
+        Self(0b0_0111_001),
+        Self(0b0_0111_010),
+        Self(0b0_0111_011),
+        Self(0b0_0111_100),
+        Self(0b0_0111_101),
+        Self(0b0_0111_110),
+        Self(0b0_0111_111), // 1.875
+        // m*2^1
+        Self(0b0_1000_000), // 2
+        Self(0b0_1000_001),
+        Self(0b0_1000_010),
+        Self(0b0_1000_011),
+        Self(0b0_1000_100),
+        Self(0b0_1000_101),
+        Self(0b0_1000_110),
+        Self(0b0_1000_111), // 3.75
+        // m*2^2
+        Self(0b0_1001_000), // 4
+        Self(0b0_1001_001),
+        Self(0b0_1001_010),
+        Self(0b0_1001_011),
+        Self(0b0_1001_100),
+        Self(0b0_1001_101),
+        Self(0b0_1001_110),
+        Self(0b0_1001_111), // 7.5
+        // m*2^3
+        Self(0b0_1010_000), // 8
+        Self(0b0_1010_001),
+        Self(0b0_1010_010),
+        Self(0b0_1010_011),
+        Self(0b0_1010_100),
+        Self(0b0_1010_101),
+        Self(0b0_1010_110),
+        Self(0b0_1010_111), // 15
+        // m*2^4
+        Self(0b0_1011_000), // 16
+        Self(0b0_1011_001),
+        Self(0b0_1011_010),
+        Self(0b0_1011_011),
+        Self(0b0_1011_100),
+        Self(0b0_1011_101),
+        Self(0b0_1011_110),
+        Self(0b0_1011_111), // 30
+        // m*2^5
+        Self(0b0_1100_000), // 32
+        Self(0b0_1100_001),
+        Self(0b0_1100_010),
+        Self(0b0_1100_011),
+        Self(0b0_1100_100),
+        Self(0b0_1100_101),
+        Self(0b0_1100_110),
+        Self(0b0_1100_111), // 60
+        // m*2^6
+        Self(0b0_1101_000), // 64
+        Self(0b0_1101_001),
+        Self(0b0_1101_010),
+        Self(0b0_1101_011),
+        Self(0b0_1101_100),
+        Self(0b0_1101_101),
+        Self(0b0_1101_110),
+        Self(0b0_1101_111), // 120
+        // m*2^7
+        Self(0b0_1110_000), // 128
+        Self(0b0_1110_001),
+        Self(0b0_1110_010),
+        Self(0b0_1110_011),
+        Self(0b0_1110_100),
+        Self(0b0_1110_101),
+        Self(0b0_1110_110),
+        Self(0b0_1110_111), // 240
+    ];
+}
+
+impl ops::Add for f8 {
+    type Output = Self;
+    fn add(self, _rhs: Self) -> Self::Output {
+        unimplemented!()
+    }
+}
+
+impl ops::Sub for f8 {
+    type Output = Self;
+    fn sub(self, _rhs: Self) -> Self::Output {
+        unimplemented!()
+    }
+}
+impl ops::Mul for f8 {
+    type Output = Self;
+    fn mul(self, _rhs: Self) -> Self::Output {
+        unimplemented!()
+    }
+}
+impl ops::Div for f8 {
+    type Output = Self;
+    fn div(self, _rhs: Self) -> Self::Output {
+        unimplemented!()
+    }
+}
+
+impl ops::Neg for f8 {
+    type Output = Self;
+    fn neg(self) -> Self::Output {
+        Self(self.0 ^ Self::SIGN_MASK)
+    }
+}
+
+impl ops::Rem for f8 {
+    type Output = Self;
+    fn rem(self, _rhs: Self) -> Self::Output {
+        unimplemented!()
+    }
+}
+
+impl ops::AddAssign for f8 {
+    fn add_assign(&mut self, _rhs: Self) {
+        unimplemented!()
+    }
+}
+
+impl ops::SubAssign for f8 {
+    fn sub_assign(&mut self, _rhs: Self) {
+        unimplemented!()
+    }
+}
+
+impl ops::MulAssign for f8 {
+    fn mul_assign(&mut self, _rhs: Self) {
+        unimplemented!()
+    }
+}
+
+impl cmp::PartialEq for f8 {
+    fn eq(&self, other: &Self) -> bool {
+        if self.is_nan() || other.is_nan() {
+            false
+        } else if self.abs().to_bits() | other.abs().to_bits() == 0 {
+            true
+        } else {
+            self.0 == other.0
+        }
+    }
+}
+impl cmp::PartialOrd for f8 {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        let inf_rep = f8::EXP_MASK;
+
+        let a_abs = self.abs().to_bits();
+        let b_abs = other.abs().to_bits();
+
+        // If either a or b is NaN, they are unordered.
+        if a_abs > inf_rep || b_abs > inf_rep {
+            return None;
+        }
+
+        // If a and b are both zeros, they are equal.
+        if a_abs | b_abs == 0 {
+            return Some(Ordering::Equal);
+        }
+
+        let a_srep = self.to_bits_signed();
+        let b_srep = other.to_bits_signed();
+        let res = a_srep.cmp(&b_srep);
+
+        if a_srep & b_srep >= 0 {
+            // If at least one of a and b is positive, we get the same result comparing
+            // a and b as signed integers as we would with a floating-point compare.
+            Some(res)
+        } else {
+            // Otherwise, both are negative, so we need to flip the sense of the
+            // comparison to get the correct result.
+            Some(res.reverse())
+        }
+    }
+}
+impl fmt::Display for f8 {
+    fn fmt(&self, _f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        unimplemented!()
+    }
+}
+
+impl fmt::Debug for f8 {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        fmt::Binary::fmt(self, f)
+    }
+}
+
+impl fmt::Binary for f8 {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let v = self.0;
+        write!(
+            f,
+            "0b{:b}_{:04b}_{:03b}",
+            v >> 7,
+            (v & Self::EXP_MASK) >> Self::SIG_BITS,
+            v & Self::SIG_MASK
+        )
+    }
+}
+
+impl fmt::LowerHex for f8 {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        self.0.fmt(f)
+    }
+}
+
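+/// Parse an `f8` from a hexadecimal float literal, analogous to the `hf64!`/`hf128!` helpers
+/// used elsewhere in these tests; panics if the string cannot be parsed exactly.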
+pub const fn hf8(s: &str) -> f8 {
+    let Ok(bits) = libm::support::hex_float::parse_hex_exact(s, 8, 3) else {
+        panic!()
+    };
+    f8(bits as u8)
+}
diff --git a/library/compiler-builtins/libm-test/src/generate.rs b/library/compiler-builtins/libm-test/src/generate.rs
new file mode 100644
index 00000000000..da080d23fa7
--- /dev/null
+++ b/library/compiler-builtins/libm-test/src/generate.rs
@@ -0,0 +1,50 @@
+//! Different generators that can create random or systematic bit patterns.
+
+pub mod case_list;
+pub mod edge_cases;
+pub mod random;
+pub mod spaced;
+
+/// A wrapper to turn any iterator into an `ExactSizeIterator`. Asserts once the iterator is
+/// exhausted that the provided size was correct.
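+///
+/// For example, `KnownSize::new(0..10u32, 10)` yields the same items as `0..10` while also
+/// reporting an exact length.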
+#[derive(Debug)]
+pub struct KnownSize<I> {
+    total: u64,
+    current: u64,
+    iter: I,
+}
+
+impl<I> KnownSize<I> {
+    pub fn new(iter: I, total: u64) -> Self {
+        Self {
+            total,
+            current: 0,
+            iter,
+        }
+    }
+}
+
+impl<I: Iterator> Iterator for KnownSize<I> {
+    type Item = I::Item;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let next = self.iter.next();
+        if next.is_some() {
+            self.current += 1;
+            return next;
+        }
+
+        assert_eq!(
+            self.current, self.total,
+            "total items did not match expected"
+        );
+        None
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let remaining = usize::try_from(self.total - self.current).unwrap();
+        (remaining, Some(remaining))
+    }
+}
+
+impl<I: Iterator> ExactSizeIterator for KnownSize<I> {}
diff --git a/library/compiler-builtins/libm-test/src/generate/case_list.rs b/library/compiler-builtins/libm-test/src/generate/case_list.rs
new file mode 100644
index 00000000000..43b28722f2d
--- /dev/null
+++ b/library/compiler-builtins/libm-test/src/generate/case_list.rs
@@ -0,0 +1,896 @@
+//! Test cases to verify specific values.
+//!
+//! Each routine can have a set of inputs and, optionally, outputs. If an output is provided,
+//! the result will be checked against it. If only inputs are provided, the case will be
+//! checked against a basis.
+//!
+//! This is useful for adding regression tests or expected failures.
+
+use libm::hf64;
+#[cfg(f128_enabled)]
+use libm::hf128;
+
+use crate::{CheckBasis, CheckCtx, GeneratorKind, MathOp, op};
+
+pub struct TestCase<Op: MathOp> {
+    pub input: Op::RustArgs,
+    pub output: Option<Op::RustRet>,
+}
+
+impl<Op: MathOp> TestCase<Op> {
+    #[expect(dead_code)]
+    fn append_inputs(v: &mut Vec<Self>, l: &[Op::RustArgs]) {
+        v.extend(l.iter().copied().map(|input| Self {
+            input,
+            output: None,
+        }));
+    }
+
+    fn append_pairs(v: &mut Vec<Self>, l: &[(Op::RustArgs, Option<Op::RustRet>)])
+    where
+        Op::RustRet: Copy,
+    {
+        v.extend(
+            l.iter()
+                .copied()
+                .map(|(input, output)| Self { input, output }),
+        );
+    }
+}
+
+fn acos_cases() -> Vec<TestCase<op::acos::Routine>> {
+    vec![]
+}
+
+fn acosf_cases() -> Vec<TestCase<op::acosf::Routine>> {
+    vec![]
+}
+
+fn acosh_cases() -> Vec<TestCase<op::acosh::Routine>> {
+    vec![]
+}
+
+fn acoshf_cases() -> Vec<TestCase<op::acoshf::Routine>> {
+    vec![]
+}
+
+fn asin_cases() -> Vec<TestCase<op::asin::Routine>> {
+    vec![]
+}
+
+fn asinf_cases() -> Vec<TestCase<op::asinf::Routine>> {
+    vec![]
+}
+
+fn asinh_cases() -> Vec<TestCase<op::asinh::Routine>> {
+    vec![]
+}
+
+fn asinhf_cases() -> Vec<TestCase<op::asinhf::Routine>> {
+    vec![]
+}
+
+fn atan_cases() -> Vec<TestCase<op::atan::Routine>> {
+    vec![]
+}
+
+fn atan2_cases() -> Vec<TestCase<op::atan2::Routine>> {
+    vec![]
+}
+
+fn atan2f_cases() -> Vec<TestCase<op::atan2f::Routine>> {
+    vec![]
+}
+
+fn atanf_cases() -> Vec<TestCase<op::atanf::Routine>> {
+    vec![]
+}
+
+fn atanh_cases() -> Vec<TestCase<op::atanh::Routine>> {
+    vec![]
+}
+
+fn atanhf_cases() -> Vec<TestCase<op::atanhf::Routine>> {
+    vec![]
+}
+
+fn cbrt_cases() -> Vec<TestCase<op::cbrt::Routine>> {
+    vec![]
+}
+
+fn cbrtf_cases() -> Vec<TestCase<op::cbrtf::Routine>> {
+    vec![]
+}
+
+fn ceil_cases() -> Vec<TestCase<op::ceil::Routine>> {
+    vec![]
+}
+
+fn ceilf_cases() -> Vec<TestCase<op::ceilf::Routine>> {
+    vec![]
+}
+
+#[cfg(f128_enabled)]
+fn ceilf128_cases() -> Vec<TestCase<op::ceilf128::Routine>> {
+    vec![]
+}
+
+#[cfg(f16_enabled)]
+fn ceilf16_cases() -> Vec<TestCase<op::ceilf16::Routine>> {
+    vec![]
+}
+
+fn copysign_cases() -> Vec<TestCase<op::copysign::Routine>> {
+    vec![]
+}
+
+fn copysignf_cases() -> Vec<TestCase<op::copysignf::Routine>> {
+    vec![]
+}
+
+#[cfg(f128_enabled)]
+fn copysignf128_cases() -> Vec<TestCase<op::copysignf128::Routine>> {
+    vec![]
+}
+
+#[cfg(f16_enabled)]
+fn copysignf16_cases() -> Vec<TestCase<op::copysignf16::Routine>> {
+    vec![]
+}
+
+fn cos_cases() -> Vec<TestCase<op::cos::Routine>> {
+    vec![]
+}
+
+fn cosf_cases() -> Vec<TestCase<op::cosf::Routine>> {
+    vec![]
+}
+
+fn cosh_cases() -> Vec<TestCase<op::cosh::Routine>> {
+    vec![]
+}
+
+fn coshf_cases() -> Vec<TestCase<op::coshf::Routine>> {
+    vec![]
+}
+
+fn erf_cases() -> Vec<TestCase<op::erf::Routine>> {
+    vec![]
+}
+
+fn erfc_cases() -> Vec<TestCase<op::erfc::Routine>> {
+    vec![]
+}
+
+fn erfcf_cases() -> Vec<TestCase<op::erfcf::Routine>> {
+    vec![]
+}
+
+fn erff_cases() -> Vec<TestCase<op::erff::Routine>> {
+    vec![]
+}
+
+fn exp_cases() -> Vec<TestCase<op::exp::Routine>> {
+    vec![]
+}
+
+fn exp10_cases() -> Vec<TestCase<op::exp10::Routine>> {
+    vec![]
+}
+
+fn exp10f_cases() -> Vec<TestCase<op::exp10f::Routine>> {
+    vec![]
+}
+
+fn exp2_cases() -> Vec<TestCase<op::exp2::Routine>> {
+    vec![]
+}
+
+fn exp2f_cases() -> Vec<TestCase<op::exp2f::Routine>> {
+    vec![]
+}
+
+fn expf_cases() -> Vec<TestCase<op::expf::Routine>> {
+    vec![]
+}
+
+fn expm1_cases() -> Vec<TestCase<op::expm1::Routine>> {
+    vec![]
+}
+
+fn expm1f_cases() -> Vec<TestCase<op::expm1f::Routine>> {
+    vec![]
+}
+
+fn fabs_cases() -> Vec<TestCase<op::fabs::Routine>> {
+    vec![]
+}
+
+fn fabsf_cases() -> Vec<TestCase<op::fabsf::Routine>> {
+    vec![]
+}
+
+#[cfg(f128_enabled)]
+fn fabsf128_cases() -> Vec<TestCase<op::fabsf128::Routine>> {
+    vec![]
+}
+
+#[cfg(f16_enabled)]
+fn fabsf16_cases() -> Vec<TestCase<op::fabsf16::Routine>> {
+    vec![]
+}
+
+fn fdim_cases() -> Vec<TestCase<op::fdim::Routine>> {
+    vec![]
+}
+
+fn fdimf_cases() -> Vec<TestCase<op::fdimf::Routine>> {
+    vec![]
+}
+
+#[cfg(f128_enabled)]
+fn fdimf128_cases() -> Vec<TestCase<op::fdimf128::Routine>> {
+    vec![]
+}
+
+#[cfg(f16_enabled)]
+fn fdimf16_cases() -> Vec<TestCase<op::fdimf16::Routine>> {
+    vec![]
+}
+
+fn floor_cases() -> Vec<TestCase<op::floor::Routine>> {
+    vec![]
+}
+
+fn floorf_cases() -> Vec<TestCase<op::floorf::Routine>> {
+    vec![]
+}
+
+#[cfg(f128_enabled)]
+fn floorf128_cases() -> Vec<TestCase<op::floorf128::Routine>> {
+    vec![]
+}
+
+#[cfg(f16_enabled)]
+fn floorf16_cases() -> Vec<TestCase<op::floorf16::Routine>> {
+    vec![]
+}
+
+fn fma_cases() -> Vec<TestCase<op::fma::Routine>> {
+    let mut v = vec![];
+    TestCase::append_pairs(
+        &mut v,
+        &[
+            // Previous failure with incorrect sign
+            ((5e-324, -5e-324, 0.0), Some(-0.0)),
+        ],
+    );
+    v
+}
+
+fn fmaf_cases() -> Vec<TestCase<op::fmaf::Routine>> {
+    let mut v = vec![];
+    TestCase::append_pairs(
+        &mut v,
+        &[
+            // Known rounding error for some implementations (notably MinGW)
+            (
+                (-1.9369631e13f32, 2.1513551e-7, -1.7354427e-24),
+                Some(-4167095.8),
+            ),
+        ],
+    );
+    v
+}
+
+#[cfg(f128_enabled)]
+fn fmaf128_cases() -> Vec<TestCase<op::fmaf128::Routine>> {
+    let mut v = vec![];
+    TestCase::append_pairs(
+        &mut v,
+        &[
+            (
+                // Tricky rounding case that previously failed in extensive tests
+                (
+                    hf128!("-0x1.1966cc01966cc01966cc01966f06p-25"),
+                    hf128!("-0x1.669933fe69933fe69933fe6997c9p-16358"),
+                    hf128!("-0x0.000000000000000000000000048ap-16382"),
+                ),
+                Some(hf128!("0x0.c5171470a3ff5e0f68d751491b18p-16382")),
+            ),
+            (
+                // Subnormal edge case that caused a failure
+                (
+                    hf128!("0x0.7ffffffffffffffffffffffffff7p-16382"),
+                    hf128!("0x1.ffffffffffffffffffffffffffffp-1"),
+                    hf128!("0x0.8000000000000000000000000009p-16382"),
+                ),
+                Some(hf128!("0x1.0000000000000000000000000000p-16382")),
+            ),
+        ],
+    );
+    v
+}
+
+#[cfg(f16_enabled)]
+fn fmaxf16_cases() -> Vec<TestCase<op::fmaxf16::Routine>> {
+    vec![]
+}
+
+fn fmaxf_cases() -> Vec<TestCase<op::fmaxf::Routine>> {
+    vec![]
+}
+
+fn fmax_cases() -> Vec<TestCase<op::fmax::Routine>> {
+    vec![]
+}
+
+#[cfg(f128_enabled)]
+fn fmaxf128_cases() -> Vec<TestCase<op::fmaxf128::Routine>> {
+    vec![]
+}
+
+#[cfg(f16_enabled)]
+fn fmaximumf16_cases() -> Vec<TestCase<op::fmaximumf16::Routine>> {
+    vec![]
+}
+
+fn fmaximumf_cases() -> Vec<TestCase<op::fmaximumf::Routine>> {
+    vec![]
+}
+
+fn fmaximum_cases() -> Vec<TestCase<op::fmaximum::Routine>> {
+    vec![]
+}
+
+#[cfg(f128_enabled)]
+fn fmaximumf128_cases() -> Vec<TestCase<op::fmaximumf128::Routine>> {
+    vec![]
+}
+
+#[cfg(f16_enabled)]
+fn fmaximum_numf16_cases() -> Vec<TestCase<op::fmaximum_numf16::Routine>> {
+    vec![]
+}
+
+fn fmaximum_numf_cases() -> Vec<TestCase<op::fmaximum_numf::Routine>> {
+    vec![]
+}
+
+fn fmaximum_num_cases() -> Vec<TestCase<op::fmaximum_num::Routine>> {
+    vec![]
+}
+
+#[cfg(f128_enabled)]
+fn fmaximum_numf128_cases() -> Vec<TestCase<op::fmaximum_numf128::Routine>> {
+    vec![]
+}
+
+#[cfg(f16_enabled)]
+fn fminf16_cases() -> Vec<TestCase<op::fminf16::Routine>> {
+    vec![]
+}
+
+fn fminf_cases() -> Vec<TestCase<op::fminf::Routine>> {
+    vec![]
+}
+
+fn fmin_cases() -> Vec<TestCase<op::fmin::Routine>> {
+    vec![]
+}
+
+#[cfg(f128_enabled)]
+fn fminf128_cases() -> Vec<TestCase<op::fminf128::Routine>> {
+    vec![]
+}
+
+#[cfg(f16_enabled)]
+fn fminimumf16_cases() -> Vec<TestCase<op::fminimumf16::Routine>> {
+    vec![]
+}
+
+fn fminimumf_cases() -> Vec<TestCase<op::fminimumf::Routine>> {
+    vec![]
+}
+
+fn fminimum_cases() -> Vec<TestCase<op::fminimum::Routine>> {
+    vec![]
+}
+
+#[cfg(f128_enabled)]
+fn fminimumf128_cases() -> Vec<TestCase<op::fminimumf128::Routine>> {
+    vec![]
+}
+
+#[cfg(f16_enabled)]
+fn fminimum_numf16_cases() -> Vec<TestCase<op::fminimum_numf16::Routine>> {
+    vec![]
+}
+
+fn fminimum_numf_cases() -> Vec<TestCase<op::fminimum_numf::Routine>> {
+    vec![]
+}
+
+fn fminimum_num_cases() -> Vec<TestCase<op::fminimum_num::Routine>> {
+    vec![]
+}
+
+#[cfg(f128_enabled)]
+fn fminimum_numf128_cases() -> Vec<TestCase<op::fminimum_numf128::Routine>> {
+    vec![]
+}
+
+fn fmod_cases() -> Vec<TestCase<op::fmod::Routine>> {
+    let mut v = vec![];
+    TestCase::append_pairs(
+        &mut v,
+        &[
+            // Previous failure with incorrect loop iteration
+            // <https://github.com/rust-lang/libm/pull/469#discussion_r2022337272>
+            ((2.1, 3.123e-320), Some(2.0696e-320)),
+            ((2.1, 2.253547e-318), Some(1.772535e-318)),
+        ],
+    );
+    v
+}
+
+fn fmodf_cases() -> Vec<TestCase<op::fmodf::Routine>> {
+    let mut v = vec![];
+    TestCase::append_pairs(
+        &mut v,
+        &[
+            // Previous failure with incorrect loop iteration
+            // <https://github.com/rust-lang/libm/pull/469#discussion_r2022337272>
+            ((2.1, 8.858e-42), Some(8.085e-42)),
+            ((2.1, 6.39164e-40), Some(6.1636e-40)),
+            ((5.5, 6.39164e-40), Some(4.77036e-40)),
+            ((-151.189, 6.39164e-40), Some(-5.64734e-40)),
+        ],
+    );
+    v
+}
+
+#[cfg(f128_enabled)]
+fn fmodf128_cases() -> Vec<TestCase<op::fmodf128::Routine>> {
+    vec![]
+}
+
+#[cfg(f16_enabled)]
+fn fmodf16_cases() -> Vec<TestCase<op::fmodf16::Routine>> {
+    vec![]
+}
+
+fn frexp_cases() -> Vec<TestCase<op::frexp::Routine>> {
+    vec![]
+}
+
+fn frexpf_cases() -> Vec<TestCase<op::frexpf::Routine>> {
+    vec![]
+}
+
+fn hypot_cases() -> Vec<TestCase<op::hypot::Routine>> {
+    vec![]
+}
+
+fn hypotf_cases() -> Vec<TestCase<op::hypotf::Routine>> {
+    vec![]
+}
+
+fn ilogb_cases() -> Vec<TestCase<op::ilogb::Routine>> {
+    vec![]
+}
+
+fn ilogbf_cases() -> Vec<TestCase<op::ilogbf::Routine>> {
+    vec![]
+}
+
+fn j0_cases() -> Vec<TestCase<op::j0::Routine>> {
+    vec![]
+}
+
+fn j0f_cases() -> Vec<TestCase<op::j0f::Routine>> {
+    vec![]
+}
+
+fn j1_cases() -> Vec<TestCase<op::j1::Routine>> {
+    vec![]
+}
+
+fn j1f_cases() -> Vec<TestCase<op::j1f::Routine>> {
+    vec![]
+}
+
+fn jn_cases() -> Vec<TestCase<op::jn::Routine>> {
+    vec![]
+}
+
+fn jnf_cases() -> Vec<TestCase<op::jnf::Routine>> {
+    vec![]
+}
+
+fn ldexp_cases() -> Vec<TestCase<op::ldexp::Routine>> {
+    vec![]
+}
+
+fn ldexpf_cases() -> Vec<TestCase<op::ldexpf::Routine>> {
+    vec![]
+}
+
+#[cfg(f128_enabled)]
+fn ldexpf128_cases() -> Vec<TestCase<op::ldexpf128::Routine>> {
+    vec![]
+}
+
+#[cfg(f16_enabled)]
+fn ldexpf16_cases() -> Vec<TestCase<op::ldexpf16::Routine>> {
+    vec![]
+}
+
+fn lgamma_cases() -> Vec<TestCase<op::lgamma::Routine>> {
+    vec![]
+}
+
+fn lgamma_r_cases() -> Vec<TestCase<op::lgamma_r::Routine>> {
+    vec![]
+}
+
+fn lgammaf_cases() -> Vec<TestCase<op::lgammaf::Routine>> {
+    vec![]
+}
+
+fn lgammaf_r_cases() -> Vec<TestCase<op::lgammaf_r::Routine>> {
+    vec![]
+}
+
+fn log_cases() -> Vec<TestCase<op::log::Routine>> {
+    vec![]
+}
+
+fn log10_cases() -> Vec<TestCase<op::log10::Routine>> {
+    vec![]
+}
+
+fn log10f_cases() -> Vec<TestCase<op::log10f::Routine>> {
+    vec![]
+}
+
+fn log1p_cases() -> Vec<TestCase<op::log1p::Routine>> {
+    vec![]
+}
+
+fn log1pf_cases() -> Vec<TestCase<op::log1pf::Routine>> {
+    vec![]
+}
+
+fn log2_cases() -> Vec<TestCase<op::log2::Routine>> {
+    vec![]
+}
+
+fn log2f_cases() -> Vec<TestCase<op::log2f::Routine>> {
+    vec![]
+}
+
+fn logf_cases() -> Vec<TestCase<op::logf::Routine>> {
+    vec![]
+}
+
+fn modf_cases() -> Vec<TestCase<op::modf::Routine>> {
+    vec![]
+}
+
+fn modff_cases() -> Vec<TestCase<op::modff::Routine>> {
+    vec![]
+}
+
+fn nextafter_cases() -> Vec<TestCase<op::nextafter::Routine>> {
+    vec![]
+}
+
+fn nextafterf_cases() -> Vec<TestCase<op::nextafterf::Routine>> {
+    vec![]
+}
+
+fn pow_cases() -> Vec<TestCase<op::pow::Routine>> {
+    vec![]
+}
+
+fn powf_cases() -> Vec<TestCase<op::powf::Routine>> {
+    vec![]
+}
+
+fn remainder_cases() -> Vec<TestCase<op::remainder::Routine>> {
+    vec![]
+}
+
+fn remainderf_cases() -> Vec<TestCase<op::remainderf::Routine>> {
+    vec![]
+}
+
+fn remquo_cases() -> Vec<TestCase<op::remquo::Routine>> {
+    vec![]
+}
+
+fn remquof_cases() -> Vec<TestCase<op::remquof::Routine>> {
+    vec![]
+}
+
+fn rint_cases() -> Vec<TestCase<op::rint::Routine>> {
+    let mut v = vec![];
+    TestCase::append_pairs(
+        &mut v,
+        &[
+            // Known failure on i586
+            #[cfg(not(x86_no_sse))]
+            (
+                (hf64!("-0x1.e3f13ff995ffcp+38"),),
+                Some(hf64!("-0x1.e3f13ff994000p+38")),
+            ),
+            #[cfg(x86_no_sse)]
+            (
+                (hf64!("-0x1.e3f13ff995ffcp+38"),),
+                Some(hf64!("-0x1.e3f13ff998000p+38")),
+            ),
+        ],
+    );
+    v
+}
+
+fn rintf_cases() -> Vec<TestCase<op::rintf::Routine>> {
+    vec![]
+}
+
+#[cfg(f128_enabled)]
+fn rintf128_cases() -> Vec<TestCase<op::rintf128::Routine>> {
+    vec![]
+}
+
+#[cfg(f16_enabled)]
+fn rintf16_cases() -> Vec<TestCase<op::rintf16::Routine>> {
+    vec![]
+}
+
+#[cfg(f16_enabled)]
+fn roundf16_cases() -> Vec<TestCase<op::roundf16::Routine>> {
+    vec![]
+}
+
+fn round_cases() -> Vec<TestCase<op::round::Routine>> {
+    vec![]
+}
+
+fn roundf_cases() -> Vec<TestCase<op::roundf::Routine>> {
+    vec![]
+}
+
+#[cfg(f128_enabled)]
+fn roundf128_cases() -> Vec<TestCase<op::roundf128::Routine>> {
+    vec![]
+}
+
+#[cfg(f16_enabled)]
+fn roundevenf16_cases() -> Vec<TestCase<op::roundevenf16::Routine>> {
+    vec![]
+}
+
+fn roundeven_cases() -> Vec<TestCase<op::roundeven::Routine>> {
+    let mut v = vec![];
+    TestCase::append_pairs(
+        &mut v,
+        &[
+            // Known failure on i586
+            #[cfg(not(x86_no_sse))]
+            (
+                (hf64!("-0x1.e3f13ff995ffcp+38"),),
+                Some(hf64!("-0x1.e3f13ff994000p+38")),
+            ),
+            #[cfg(x86_no_sse)]
+            (
+                (hf64!("-0x1.e3f13ff995ffcp+38"),),
+                Some(hf64!("-0x1.e3f13ff998000p+38")),
+            ),
+        ],
+    );
+    v
+}
+
+fn roundevenf_cases() -> Vec<TestCase<op::roundevenf::Routine>> {
+    vec![]
+}
+
+#[cfg(f128_enabled)]
+fn roundevenf128_cases() -> Vec<TestCase<op::roundevenf128::Routine>> {
+    vec![]
+}
+
+fn scalbn_cases() -> Vec<TestCase<op::scalbn::Routine>> {
+    vec![]
+}
+
+fn scalbnf_cases() -> Vec<TestCase<op::scalbnf::Routine>> {
+    vec![]
+}
+
+#[cfg(f128_enabled)]
+fn scalbnf128_cases() -> Vec<TestCase<op::scalbnf128::Routine>> {
+    vec![]
+}
+
+#[cfg(f16_enabled)]
+fn scalbnf16_cases() -> Vec<TestCase<op::scalbnf16::Routine>> {
+    vec![]
+}
+
+fn sin_cases() -> Vec<TestCase<op::sin::Routine>> {
+    vec![]
+}
+
+fn sincos_cases() -> Vec<TestCase<op::sincos::Routine>> {
+    vec![]
+}
+
+fn sincosf_cases() -> Vec<TestCase<op::sincosf::Routine>> {
+    vec![]
+}
+
+fn sinf_cases() -> Vec<TestCase<op::sinf::Routine>> {
+    vec![]
+}
+
+fn sinh_cases() -> Vec<TestCase<op::sinh::Routine>> {
+    vec![]
+}
+
+fn sinhf_cases() -> Vec<TestCase<op::sinhf::Routine>> {
+    vec![]
+}
+
+fn sqrt_cases() -> Vec<TestCase<op::sqrt::Routine>> {
+    vec![]
+}
+
+fn sqrtf_cases() -> Vec<TestCase<op::sqrtf::Routine>> {
+    vec![]
+}
+
+#[cfg(f128_enabled)]
+fn sqrtf128_cases() -> Vec<TestCase<op::sqrtf128::Routine>> {
+    vec![]
+}
+
+#[cfg(f16_enabled)]
+fn sqrtf16_cases() -> Vec<TestCase<op::sqrtf16::Routine>> {
+    vec![]
+}
+
+fn tan_cases() -> Vec<TestCase<op::tan::Routine>> {
+    vec![]
+}
+
+fn tanf_cases() -> Vec<TestCase<op::tanf::Routine>> {
+    vec![]
+}
+
+fn tanh_cases() -> Vec<TestCase<op::tanh::Routine>> {
+    vec![]
+}
+
+fn tanhf_cases() -> Vec<TestCase<op::tanhf::Routine>> {
+    vec![]
+}
+
+fn tgamma_cases() -> Vec<TestCase<op::tgamma::Routine>> {
+    vec![]
+}
+
+fn tgammaf_cases() -> Vec<TestCase<op::tgammaf::Routine>> {
+    vec![]
+}
+
+fn trunc_cases() -> Vec<TestCase<op::trunc::Routine>> {
+    vec![]
+}
+
+fn truncf_cases() -> Vec<TestCase<op::truncf::Routine>> {
+    vec![]
+}
+
+#[cfg(f128_enabled)]
+fn truncf128_cases() -> Vec<TestCase<op::truncf128::Routine>> {
+    vec![]
+}
+
+#[cfg(f16_enabled)]
+fn truncf16_cases() -> Vec<TestCase<op::truncf16::Routine>> {
+    vec![]
+}
+
+fn y0_cases() -> Vec<TestCase<op::y0::Routine>> {
+    vec![]
+}
+
+fn y0f_cases() -> Vec<TestCase<op::y0f::Routine>> {
+    vec![]
+}
+
+fn y1_cases() -> Vec<TestCase<op::y1::Routine>> {
+    vec![]
+}
+
+fn y1f_cases() -> Vec<TestCase<op::y1f::Routine>> {
+    vec![]
+}
+
+fn yn_cases() -> Vec<TestCase<op::yn::Routine>> {
+    vec![]
+}
+
+fn ynf_cases() -> Vec<TestCase<op::ynf::Routine>> {
+    vec![]
+}
+
+pub trait CaseListInput: MathOp + Sized {
+    fn get_cases() -> Vec<TestCase<Self>>;
+}
+
+macro_rules! impl_case_list {
+    (
+        fn_name: $fn_name:ident,
+        attrs: [$($attr:meta),*],
+    ) => {
+        paste::paste! {
+            $(#[$attr])*
+            impl CaseListInput for crate::op::$fn_name::Routine {
+                fn get_cases() -> Vec<TestCase<Self>> {
+                    [< $fn_name _cases >]()
+                }
+            }
+        }
+    };
+}
+
+libm_macros::for_each_function! {
+    callback: impl_case_list,
+}
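+
+// For a given function such as `sin`, the macro above expands to roughly the following, wiring
+// the hand-written case list into the `CaseListInput` trait:
+//
+//     impl CaseListInput for crate::op::sin::Routine {
+//         fn get_cases() -> Vec<TestCase<Self>> {
+//             sin_cases()
+//         }
+//     }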
+
+/// Test case generator for standalone tests, i.e. those checked against no basis. It extracts
+/// only the cases that have a known output.
+pub fn get_test_cases_standalone<Op>(
+    ctx: &CheckCtx,
+) -> impl Iterator<Item = (Op::RustArgs, Op::RustRet)> + use<'_, Op>
+where
+    Op: MathOp + CaseListInput,
+{
+    assert_eq!(ctx.basis, CheckBasis::None);
+    assert_eq!(ctx.gen_kind, GeneratorKind::List);
+    Op::get_cases()
+        .into_iter()
+        .filter_map(|x| x.output.map(|o| (x.input, o)))
+}
+
+/// Opposite of the above; extract only test cases that don't have a known output, to be run
+/// against a basis.
+pub fn get_test_cases_basis<Op>(
+    ctx: &CheckCtx,
+) -> (impl Iterator<Item = Op::RustArgs> + use<'_, Op>, u64)
+where
+    Op: MathOp + CaseListInput,
+{
+    assert_ne!(ctx.basis, CheckBasis::None);
+    assert_eq!(ctx.gen_kind, GeneratorKind::List);
+
+    let cases = Op::get_cases();
+    let count: u64 = cases
+        .iter()
+        .filter(|case| case.output.is_none())
+        .count()
+        .try_into()
+        .unwrap();
+
+    (
+        cases
+            .into_iter()
+            .filter(|x| x.output.is_none())
+            .map(|x| x.input),
+        count,
+    )
+}
diff --git a/library/compiler-builtins/libm-test/src/generate/edge_cases.rs b/library/compiler-builtins/libm-test/src/generate/edge_cases.rs
new file mode 100644
index 00000000000..2fb0746388c
--- /dev/null
+++ b/library/compiler-builtins/libm-test/src/generate/edge_cases.rs
@@ -0,0 +1,314 @@
+//! A generator that checks a handful of cases near infinities, zeros, asymptotes, and NaNs.
+
+use libm::support::{CastInto, Float, Int, MinInt};
+
+use crate::domain::get_domain;
+use crate::generate::KnownSize;
+use crate::op::OpITy;
+use crate::run_cfg::{check_near_count, check_point_count};
+use crate::{BaseName, CheckCtx, FloatExt, FloatTy, MathOp, test_log};
+
+/// Generate a sequence of edge cases, e.g. numbers near zeroes and infinities.
+pub trait EdgeCaseInput<Op> {
+    fn get_cases(ctx: &CheckCtx) -> (impl Iterator<Item = Self> + Send, u64);
+}
+
+/// Create a list of values around interesting points (infinities, zeroes, NaNs).
+fn float_edge_cases<Op>(
+    ctx: &CheckCtx,
+    argnum: usize,
+) -> (impl Iterator<Item = Op::FTy> + Clone, u64)
+where
+    Op: MathOp,
+{
+    let mut ret = Vec::new();
+    let one = OpITy::<Op>::ONE;
+    let values = &mut ret;
+    let domain = get_domain::<_, i8>(ctx.fn_ident, argnum).unwrap_float();
+    let domain_start = domain.range_start();
+    let domain_end = domain.range_end();
+
+    let check_points = check_point_count(ctx);
+    let near_points = check_near_count(ctx);
+
+    // Check near some notable constants
+    count_up(Op::FTy::ONE, near_points, values);
+    count_up(Op::FTy::ZERO, near_points, values);
+    count_up(Op::FTy::NEG_ONE, near_points, values);
+    count_down(Op::FTy::ONE, near_points, values);
+    count_down(Op::FTy::ZERO, near_points, values);
+    count_down(Op::FTy::NEG_ONE, near_points, values);
+    values.push(Op::FTy::NEG_ZERO);
+
+    // Check values near the extremes
+    count_up(Op::FTy::NEG_INFINITY, near_points, values);
+    count_down(Op::FTy::INFINITY, near_points, values);
+    count_down(domain_end, near_points, values);
+    count_up(domain_start, near_points, values);
+    count_down(domain_start, near_points, values);
+    count_up(domain_end, near_points, values);
+    count_down(domain_end, near_points, values);
+
+    // Check some special values that aren't included in the above ranges
+    values.push(Op::FTy::NAN);
+    values.extend(Op::FTy::consts().iter());
+
+    // Check around the maximum subnormal value
+    let sub_max = Op::FTy::from_bits(Op::FTy::SIG_MASK);
+    count_up(sub_max, near_points, values);
+    count_down(sub_max, near_points, values);
+    count_up(-sub_max, near_points, values);
+    count_down(-sub_max, near_points, values);
+
+    // Check a few values around the subnormal range
+    for shift in (0..Op::FTy::SIG_BITS).step_by(Op::FTy::SIG_BITS as usize / 5) {
+        let v = Op::FTy::from_bits(one << shift);
+        count_up(v, 2, values);
+        count_down(v, 2, values);
+        count_up(-v, 2, values);
+        count_down(-v, 2, values);
+    }
+
+    // Check around asymptotes
+    if let Some(f) = domain.check_points {
+        let iter = f();
+        for x in iter.take(check_points) {
+            count_up(x, near_points, values);
+            count_down(x, near_points, values);
+        }
+    }
+
+    // Some results may overlap so deduplicate the vector to save test cycles.
+    values.sort_by_key(|x| x.to_bits());
+    values.dedup_by_key(|x| x.to_bits());
+
+    let count = ret.len().try_into().unwrap();
+
+    test_log(&format!(
+        "{gen_kind:?} {basis:?} {fn_ident} arg {arg}/{args}: {count} edge cases",
+        gen_kind = ctx.gen_kind,
+        basis = ctx.basis,
+        fn_ident = ctx.fn_ident,
+        arg = argnum + 1,
+        args = ctx.input_count(),
+    ));
+
+    (ret.into_iter(), count)
+}
+
+/// Add `points` values starting at and including `x` and counting up. Uses the smallest possible
+/// increments (1 ULP).
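+///
+/// For example, `count_up(F::ONE, 3, &mut v)` pushes `1.0`, `1.0.next_up()`, and
+/// `1.0.next_up().next_up()`, stopping early only if infinity is reached first.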
+fn count_up<F: Float>(mut x: F, points: u64, values: &mut Vec<F>) {
+    assert!(!x.is_nan());
+
+    let mut count = 0;
+    while x < F::INFINITY && count < points {
+        values.push(x);
+        x = x.next_up();
+        count += 1;
+    }
+}
+
+/// Add `points` values starting at and including `x` and counting down. Uses the smallest possible
+/// increments (1 ULP).
+fn count_down<F: Float>(mut x: F, points: u64, values: &mut Vec<F>) {
+    assert!(!x.is_nan());
+
+    let mut count = 0;
+    while x > F::NEG_INFINITY && count < points {
+        values.push(x);
+        x = x.next_down();
+        count += 1;
+    }
+}
+
+/// Create a list of values around interesting integer points (min, zero, max).
+pub fn int_edge_cases<I: Int>(
+    ctx: &CheckCtx,
+    argnum: usize,
+) -> (impl Iterator<Item = I> + Clone, u64)
+where
+    i32: CastInto<I>,
+{
+    let mut values = Vec::new();
+    let near_points = check_near_count(ctx);
+
+    // Check around max/min and zero
+    int_count_around(I::MIN, near_points, &mut values);
+    int_count_around(I::MAX, near_points, &mut values);
+    int_count_around(I::ZERO, near_points, &mut values);
+    int_count_around(I::ZERO, near_points, &mut values);
+
+    if matches!(ctx.base_name, BaseName::Scalbn | BaseName::Ldexp) {
+        assert_eq!(argnum, 1, "scalbn integer argument should be arg1");
+        let (emax, emin, emin_sn) = match ctx.fn_ident.math_op().float_ty {
+            FloatTy::F16 => {
+                #[cfg(not(f16_enabled))]
+                unreachable!();
+                #[cfg(f16_enabled)]
+                (f16::EXP_MAX, f16::EXP_MIN, f16::EXP_MIN_SUBNORM)
+            }
+            FloatTy::F32 => (f32::EXP_MAX, f32::EXP_MIN, f32::EXP_MIN_SUBNORM),
+            FloatTy::F64 => (f64::EXP_MAX, f64::EXP_MIN, f64::EXP_MIN_SUBNORM),
+            FloatTy::F128 => {
+                #[cfg(not(f128_enabled))]
+                unreachable!();
+                #[cfg(f128_enabled)]
+                (f128::EXP_MAX, f128::EXP_MIN, f128::EXP_MIN_SUBNORM)
+            }
+        };
+
+        // `scalbn`/`ldexp` have their trickiest behavior around exponent limits
+        int_count_around(emax.cast(), near_points, &mut values);
+        int_count_around(emin.cast(), near_points, &mut values);
+        int_count_around(emin_sn.cast(), near_points, &mut values);
+        int_count_around((-emin_sn).cast(), near_points, &mut values);
+
+        // Also check values that cause the maximum possible difference in exponents
+        int_count_around((emax - emin).cast(), near_points, &mut values);
+        int_count_around((emin - emax).cast(), near_points, &mut values);
+        int_count_around((emax - emin_sn).cast(), near_points, &mut values);
+        int_count_around((emin_sn - emax).cast(), near_points, &mut values);
+    }
+
+    values.sort();
+    values.dedup();
+    let count = values.len().try_into().unwrap();
+
+    test_log(&format!(
+        "{gen_kind:?} {basis:?} {fn_ident} arg {arg}/{args}: {count} edge cases",
+        gen_kind = ctx.gen_kind,
+        basis = ctx.basis,
+        fn_ident = ctx.fn_ident,
+        arg = argnum + 1,
+        args = ctx.input_count(),
+    ));
+
+    (values.into_iter(), count)
+}
+
+/// Add `points` values both up and down, starting at and including `x`.
+fn int_count_around<I: Int>(x: I, points: u64, values: &mut Vec<I>) {
+    let mut current = x;
+    for _ in 0..points {
+        values.push(current);
+        current = match current.checked_add(I::ONE) {
+            Some(v) => v,
+            None => break,
+        };
+    }
+
+    current = x;
+    for _ in 0..points {
+        values.push(current);
+        current = match current.checked_sub(I::ONE) {
+            Some(v) => v,
+            None => break,
+        };
+    }
+}
+
+macro_rules! impl_edge_case_input {
+    ($fty:ty) => {
+        impl<Op> EdgeCaseInput<Op> for ($fty,)
+        where
+            Op: MathOp<RustArgs = Self, FTy = $fty>,
+        {
+            fn get_cases(ctx: &CheckCtx) -> (impl Iterator<Item = Self>, u64) {
+                let (iter0, steps0) = float_edge_cases::<Op>(ctx, 0);
+                let iter0 = iter0.map(|v| (v,));
+                (iter0, steps0)
+            }
+        }
+
+        impl<Op> EdgeCaseInput<Op> for ($fty, $fty)
+        where
+            Op: MathOp<RustArgs = Self, FTy = $fty>,
+        {
+            fn get_cases(ctx: &CheckCtx) -> (impl Iterator<Item = Self>, u64) {
+                let (iter0, steps0) = float_edge_cases::<Op>(ctx, 0);
+                let (iter1, steps1) = float_edge_cases::<Op>(ctx, 1);
+                let iter =
+                    iter0.flat_map(move |first| iter1.clone().map(move |second| (first, second)));
+                let count = steps0.checked_mul(steps1).unwrap();
+                (iter, count)
+            }
+        }
+
+        impl<Op> EdgeCaseInput<Op> for ($fty, $fty, $fty)
+        where
+            Op: MathOp<RustArgs = Self, FTy = $fty>,
+        {
+            fn get_cases(ctx: &CheckCtx) -> (impl Iterator<Item = Self>, u64) {
+                let (iter0, steps0) = float_edge_cases::<Op>(ctx, 0);
+                let (iter1, steps1) = float_edge_cases::<Op>(ctx, 1);
+                let (iter2, steps2) = float_edge_cases::<Op>(ctx, 2);
+
+                let iter = iter0
+                    .flat_map(move |first| iter1.clone().map(move |second| (first, second)))
+                    .flat_map(move |(first, second)| {
+                        iter2.clone().map(move |third| (first, second, third))
+                    });
+                let count = steps0
+                    .checked_mul(steps1)
+                    .unwrap()
+                    .checked_mul(steps2)
+                    .unwrap();
+
+                (iter, count)
+            }
+        }
+
+        impl<Op> EdgeCaseInput<Op> for (i32, $fty)
+        where
+            Op: MathOp<RustArgs = Self, FTy = $fty>,
+        {
+            fn get_cases(ctx: &CheckCtx) -> (impl Iterator<Item = Self>, u64) {
+                let (iter0, steps0) = int_edge_cases(ctx, 0);
+                let (iter1, steps1) = float_edge_cases::<Op>(ctx, 1);
+
+                let iter =
+                    iter0.flat_map(move |first| iter1.clone().map(move |second| (first, second)));
+                let count = steps0.checked_mul(steps1).unwrap();
+
+                (iter, count)
+            }
+        }
+
+        impl<Op> EdgeCaseInput<Op> for ($fty, i32)
+        where
+            Op: MathOp<RustArgs = Self, FTy = $fty>,
+        {
+            fn get_cases(ctx: &CheckCtx) -> (impl Iterator<Item = Self>, u64) {
+                let (iter0, steps0) = float_edge_cases::<Op>(ctx, 0);
+                let (iter1, steps1) = int_edge_cases(ctx, 1);
+
+                let iter =
+                    iter0.flat_map(move |first| iter1.clone().map(move |second| (first, second)));
+                let count = steps0.checked_mul(steps1).unwrap();
+
+                (iter, count)
+            }
+        }
+    };
+}
+
+#[cfg(f16_enabled)]
+impl_edge_case_input!(f16);
+impl_edge_case_input!(f32);
+impl_edge_case_input!(f64);
+#[cfg(f128_enabled)]
+impl_edge_case_input!(f128);
+
+pub fn get_test_cases<Op>(
+    ctx: &CheckCtx,
+) -> (impl Iterator<Item = Op::RustArgs> + Send + use<'_, Op>, u64)
+where
+    Op: MathOp,
+    Op::RustArgs: EdgeCaseInput<Op>,
+{
+    let (iter, count) = Op::RustArgs::get_cases(ctx);
+
+    // Wrap in `KnownSize` so we get an assertion if the count is wrong.
+    (KnownSize::new(iter, count), count)
+}
diff --git a/library/compiler-builtins/libm-test/src/generate/random.rs b/library/compiler-builtins/libm-test/src/generate/random.rs
new file mode 100644
index 00000000000..4ee88946d8e
--- /dev/null
+++ b/library/compiler-builtins/libm-test/src/generate/random.rs
@@ -0,0 +1,128 @@
+use std::env;
+use std::ops::RangeInclusive;
+use std::sync::LazyLock;
+
+use libm::support::Float;
+use rand::distr::{Alphanumeric, StandardUniform};
+use rand::prelude::Distribution;
+use rand::{Rng, SeedableRng};
+use rand_chacha::ChaCha8Rng;
+
+use super::KnownSize;
+use crate::CheckCtx;
+use crate::run_cfg::{int_range, iteration_count};
+
+pub(crate) const SEED_ENV: &str = "LIBM_SEED";
+
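+/// Seed used by all deterministic random generators in this module.
+///
+/// Set the `LIBM_SEED` environment variable to a 32-character alphanumeric string to reproduce a
+/// previous run; otherwise a fresh random seed is generated once per test run.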
+pub static SEED: LazyLock<[u8; 32]> = LazyLock::new(|| {
+    let s = env::var(SEED_ENV).unwrap_or_else(|_| {
+        let mut rng = rand::rng();
+        (0..32).map(|_| rng.sample(Alphanumeric) as char).collect()
+    });
+
+    s.as_bytes().try_into().unwrap_or_else(|_| {
+        panic!("Seed must be 32 characters, got `{s}`");
+    })
+});
+
+/// Generate a sequence of random values of this type.
+pub trait RandomInput: Sized {
+    fn get_cases(ctx: &CheckCtx) -> (impl Iterator<Item = Self> + Send, u64);
+}
+
+/// Generate a sequence of deterministically random floats.
+fn random_floats<F: Float>(count: u64) -> impl Iterator<Item = F>
+where
+    StandardUniform: Distribution<F::Int>,
+{
+    let mut rng = ChaCha8Rng::from_seed(*SEED);
+
+    // Generate integers to get a full range of bitpatterns (including NaNs), then convert back
+    // to the float type.
+    (0..count).map(move |_| F::from_bits(rng.random::<F::Int>()))
+}
+
+/// Generate a sequence of deterministically random `i32`s within a specified range.
+fn random_ints(count: u64, range: RangeInclusive<i32>) -> impl Iterator<Item = i32> {
+    let mut rng = ChaCha8Rng::from_seed(*SEED);
+    (0..count).map(move |_| rng.random_range::<i32, _>(range.clone()))
+}
+
+macro_rules! impl_random_input {
+    ($fty:ty) => {
+        impl RandomInput for ($fty,) {
+            fn get_cases(ctx: &CheckCtx) -> (impl Iterator<Item = Self>, u64) {
+                let count = iteration_count(ctx, 0);
+                let iter = random_floats(count).map(|f: $fty| (f,));
+                (iter, count)
+            }
+        }
+
+        impl RandomInput for ($fty, $fty) {
+            fn get_cases(ctx: &CheckCtx) -> (impl Iterator<Item = Self>, u64) {
+                let count0 = iteration_count(ctx, 0);
+                let count1 = iteration_count(ctx, 1);
+                let iter = random_floats(count0)
+                    .flat_map(move |f1: $fty| random_floats(count1).map(move |f2: $fty| (f1, f2)));
+                (iter, count0 * count1)
+            }
+        }
+
+        impl RandomInput for ($fty, $fty, $fty) {
+            fn get_cases(ctx: &CheckCtx) -> (impl Iterator<Item = Self>, u64) {
+                let count0 = iteration_count(ctx, 0);
+                let count1 = iteration_count(ctx, 1);
+                let count2 = iteration_count(ctx, 2);
+                let iter = random_floats(count0).flat_map(move |f1: $fty| {
+                    random_floats(count1).flat_map(move |f2: $fty| {
+                        random_floats(count2).map(move |f3: $fty| (f1, f2, f3))
+                    })
+                });
+                (iter, count0 * count1 * count2)
+            }
+        }
+
+        impl RandomInput for (i32, $fty) {
+            fn get_cases(ctx: &CheckCtx) -> (impl Iterator<Item = Self>, u64) {
+                let count0 = iteration_count(ctx, 0);
+                let count1 = iteration_count(ctx, 1);
+                let range0 = int_range(ctx, 0);
+                let iter = random_ints(count0, range0)
+                    .flat_map(move |f1: i32| random_floats(count1).map(move |f2: $fty| (f1, f2)));
+                (iter, count0 * count1)
+            }
+        }
+
+        impl RandomInput for ($fty, i32) {
+            fn get_cases(ctx: &CheckCtx) -> (impl Iterator<Item = Self>, u64) {
+                let count0 = iteration_count(ctx, 0);
+                let count1 = iteration_count(ctx, 1);
+                let range1 = int_range(ctx, 1);
+                let iter = random_floats(count0).flat_map(move |f1: $fty| {
+                    random_ints(count1, range1.clone()).map(move |f2: i32| (f1, f2))
+                });
+                (iter, count0 * count1)
+            }
+        }
+    };
+}
+
+#[cfg(f16_enabled)]
+impl_random_input!(f16);
+impl_random_input!(f32);
+impl_random_input!(f64);
+#[cfg(f128_enabled)]
+impl_random_input!(f128);
+
+/// Create a test case iterator.
+pub fn get_test_cases<RustArgs: RandomInput>(
+    ctx: &CheckCtx,
+) -> (
+    impl Iterator<Item = RustArgs> + Send + use<'_, RustArgs>,
+    u64,
+) {
+    let (iter, count) = RustArgs::get_cases(ctx);
+
+    // Wrap in `KnownSize` so we get an assertion if the count is wrong.
+    (KnownSize::new(iter, count), count)
+}
diff --git a/library/compiler-builtins/libm-test/src/generate/spaced.rs b/library/compiler-builtins/libm-test/src/generate/spaced.rs
new file mode 100644
index 00000000000..8e6b376ebd1
--- /dev/null
+++ b/library/compiler-builtins/libm-test/src/generate/spaced.rs
@@ -0,0 +1,258 @@
+use std::fmt;
+use std::ops::RangeInclusive;
+
+use libm::support::{Float, MinInt};
+
+use crate::domain::get_domain;
+use crate::op::OpITy;
+use crate::run_cfg::{int_range, iteration_count};
+use crate::{CheckCtx, MathOp, linear_ints, logspace};
+
+/// Generate a sequence of inputs that either cover the domain in its entirety (for smaller float
+/// types and single-argument functions) or provide evenly spaced inputs across the domain with
+/// approximately `u32::MAX` total iterations.
+pub trait SpacedInput<Op> {
+    fn get_cases(ctx: &CheckCtx) -> (impl Iterator<Item = Self> + Send, u64);
+}
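+
+// As a concrete illustration: a single `f32` argument can be covered exhaustively (all 2^32 bit
+// patterns), while `f64` and `f128` arguments fall back to `logspace` samples capped by the
+// per-argument iteration count.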
+
+/// Construct an iterator from `logspace` and also calculate the total number of steps expected
+/// for that iterator.
+fn logspace_steps<Op>(
+    ctx: &CheckCtx,
+    argnum: usize,
+    max_steps: u64,
+) -> (impl Iterator<Item = Op::FTy> + Clone, u64)
+where
+    Op: MathOp,
+    OpITy<Op>: TryFrom<u64, Error: fmt::Debug>,
+    u64: TryFrom<OpITy<Op>, Error: fmt::Debug>,
+    RangeInclusive<OpITy<Op>>: Iterator,
+{
+    // i8 is a dummy type here, it can be any integer.
+    let domain = get_domain::<Op::FTy, i8>(ctx.fn_ident, argnum).unwrap_float();
+    let start = domain.range_start();
+    let end = domain.range_end();
+
+    let max_steps = OpITy::<Op>::try_from(max_steps).unwrap_or(OpITy::<Op>::MAX);
+    let (iter, steps) = logspace(start, end, max_steps);
+
+    // `steps` will be <= the original `max_steps`, which is a `u64`.
+    (iter, steps.try_into().unwrap())
+}
+
+/// An iterator that holds one of two concrete iterator types (`A` or `B`), so `get_cases` can
+/// return either an exhaustive or a spaced iterator behind a single type.
+enum EitherIter<A, B> {
+    A(A),
+    B(B),
+}
+
+impl<T, A: Iterator<Item = T>, B: Iterator<Item = T>> Iterator for EitherIter<A, B> {
+    type Item = T;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        match self {
+            Self::A(iter) => iter.next(),
+            Self::B(iter) => iter.next(),
+        }
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        match self {
+            Self::A(iter) => iter.size_hint(),
+            Self::B(iter) => iter.size_hint(),
+        }
+    }
+}
+
+/// Gets the total number of possible values, returning `None` if that number doesn't fit in a
+/// `u64`.
+fn value_count<F: Float>() -> Option<u64>
+where
+    u64: TryFrom<F::Int>,
+{
+    u64::try_from(F::Int::MAX)
+        .ok()
+        .and_then(|max| max.checked_add(1))
+}
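+
+// For `f16` and `f32` this is `Some(1 << 16)` and `Some(1 << 32)` respectively; for `f64` and
+// `f128` the number of bit patterns doesn't fit in a `u64`, so `None` is returned and the
+// callers below fall back to spaced sampling instead of exhaustive iteration.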
+
+/// Returns an iterator of every possible value of type `F`.
+fn all_values<F: Float>() -> impl Iterator<Item = F>
+where
+    RangeInclusive<F::Int>: Iterator<Item = F::Int>,
+{
+    (F::Int::MIN..=F::Int::MAX).map(|bits| F::from_bits(bits))
+}
+
+macro_rules! impl_spaced_input {
+    ($fty:ty) => {
+        impl<Op> SpacedInput<Op> for ($fty,)
+        where
+            Op: MathOp<RustArgs = Self, FTy = $fty>,
+        {
+            fn get_cases(ctx: &CheckCtx) -> (impl Iterator<Item = Self>, u64) {
+                let max_steps0 = iteration_count(ctx, 0);
+                // `f16` and `f32` can have exhaustive tests.
+                match value_count::<Op::FTy>() {
+                    Some(steps0) if steps0 <= max_steps0 => {
+                        let iter0 = all_values();
+                        let iter0 = iter0.map(|v| (v,));
+                        (EitherIter::A(iter0), steps0)
+                    }
+                    _ => {
+                        let (iter0, steps0) = logspace_steps::<Op>(ctx, 0, max_steps0);
+                        let iter0 = iter0.map(|v| (v,));
+                        (EitherIter::B(iter0), steps0)
+                    }
+                }
+            }
+        }
+
+        impl<Op> SpacedInput<Op> for ($fty, $fty)
+        where
+            Op: MathOp<RustArgs = Self, FTy = $fty>,
+        {
+            fn get_cases(ctx: &CheckCtx) -> (impl Iterator<Item = Self>, u64) {
+                let max_steps0 = iteration_count(ctx, 0);
+                let max_steps1 = iteration_count(ctx, 1);
+                // `f16` can have exhaustive tests.
+                match value_count::<Op::FTy>() {
+                    Some(count) if count <= max_steps0 && count <= max_steps1 => {
+                        let iter = all_values()
+                            .flat_map(|first| all_values().map(move |second| (first, second)));
+                        (EitherIter::A(iter), count.checked_mul(count).unwrap())
+                    }
+                    _ => {
+                        let (iter0, steps0) = logspace_steps::<Op>(ctx, 0, max_steps0);
+                        let (iter1, steps1) = logspace_steps::<Op>(ctx, 1, max_steps1);
+                        let iter = iter0.flat_map(move |first| {
+                            iter1.clone().map(move |second| (first, second))
+                        });
+                        let count = steps0.checked_mul(steps1).unwrap();
+                        (EitherIter::B(iter), count)
+                    }
+                }
+            }
+        }
+
+        impl<Op> SpacedInput<Op> for ($fty, $fty, $fty)
+        where
+            Op: MathOp<RustArgs = Self, FTy = $fty>,
+        {
+            fn get_cases(ctx: &CheckCtx) -> (impl Iterator<Item = Self>, u64) {
+                let max_steps0 = iteration_count(ctx, 0);
+                let max_steps1 = iteration_count(ctx, 1);
+                let max_steps2 = iteration_count(ctx, 2);
+                // `f16` can be exhaustively tested if `LIBM_EXTENSIVE_TESTS` is increased.
+                match value_count::<Op::FTy>() {
+                    Some(count)
+                        if count <= max_steps0 && count <= max_steps1 && count <= max_steps2 =>
+                    {
+                        let iter = all_values().flat_map(|first| {
+                            all_values().flat_map(move |second| {
+                                all_values().map(move |third| (first, second, third))
+                            })
+                        });
+                        (EitherIter::A(iter), count.checked_pow(3).unwrap())
+                    }
+                    _ => {
+                        let (iter0, steps0) = logspace_steps::<Op>(ctx, 0, max_steps0);
+                        let (iter1, steps1) = logspace_steps::<Op>(ctx, 1, max_steps1);
+                        let (iter2, steps2) = logspace_steps::<Op>(ctx, 2, max_steps2);
+
+                        let iter = iter0
+                            .flat_map(move |first| iter1.clone().map(move |second| (first, second)))
+                            .flat_map(move |(first, second)| {
+                                iter2.clone().map(move |third| (first, second, third))
+                            });
+                        let count = steps0
+                            .checked_mul(steps1)
+                            .unwrap()
+                            .checked_mul(steps2)
+                            .unwrap();
+
+                        (EitherIter::B(iter), count)
+                    }
+                }
+            }
+        }
+
+        impl<Op> SpacedInput<Op> for (i32, $fty)
+        where
+            Op: MathOp<RustArgs = Self, FTy = $fty>,
+        {
+            fn get_cases(ctx: &CheckCtx) -> (impl Iterator<Item = Self>, u64) {
+                let range0 = int_range(ctx, 0);
+                let max_steps0 = iteration_count(ctx, 0);
+                let max_steps1 = iteration_count(ctx, 1);
+                match value_count::<Op::FTy>() {
+                    Some(count1) if count1 <= max_steps1 => {
+                        let (iter0, steps0) = linear_ints(range0, max_steps0);
+                        let iter = iter0
+                            .flat_map(move |first| all_values().map(move |second| (first, second)));
+                        (EitherIter::A(iter), steps0.checked_mul(count1).unwrap())
+                    }
+                    _ => {
+                        let (iter0, steps0) = linear_ints(range0, max_steps0);
+                        let (iter1, steps1) = logspace_steps::<Op>(ctx, 1, max_steps1);
+
+                        let iter = iter0.flat_map(move |first| {
+                            iter1.clone().map(move |second| (first, second))
+                        });
+                        let count = steps0.checked_mul(steps1).unwrap();
+
+                        (EitherIter::B(iter), count)
+                    }
+                }
+            }
+        }
+
+        impl<Op> SpacedInput<Op> for ($fty, i32)
+        where
+            Op: MathOp<RustArgs = Self, FTy = $fty>,
+        {
+            fn get_cases(ctx: &CheckCtx) -> (impl Iterator<Item = Self>, u64) {
+                let max_steps0 = iteration_count(ctx, 0);
+                let range1 = int_range(ctx, 1);
+                let max_steps1 = iteration_count(ctx, 1);
+                match value_count::<Op::FTy>() {
+                    Some(count0) if count0 <= max_steps0 => {
+                        let (iter1, steps1) = linear_ints(range1, max_steps1);
+                        let iter = all_values().flat_map(move |first| {
+                            iter1.clone().map(move |second| (first, second))
+                        });
+                        (EitherIter::A(iter), count0.checked_mul(steps1).unwrap())
+                    }
+                    _ => {
+                        let (iter0, steps0) = logspace_steps::<Op>(ctx, 0, max_steps0);
+                        let (iter1, steps1) = linear_ints(range1, max_steps1);
+
+                        let iter = iter0.flat_map(move |first| {
+                            iter1.clone().map(move |second| (first, second))
+                        });
+                        let count = steps0.checked_mul(steps1).unwrap();
+
+                        (EitherIter::B(iter), count)
+                    }
+                }
+            }
+        }
+    };
+}
+
+#[cfg(f16_enabled)]
+impl_spaced_input!(f16);
+impl_spaced_input!(f32);
+impl_spaced_input!(f64);
+#[cfg(f128_enabled)]
+impl_spaced_input!(f128);
+
+/// Create a test case iterator for extensive inputs. Also returns the total test case count.
+pub fn get_test_cases<Op>(
+    ctx: &CheckCtx,
+) -> (impl Iterator<Item = Op::RustArgs> + Send + use<'_, Op>, u64)
+where
+    Op: MathOp,
+    Op::RustArgs: SpacedInput<Op>,
+{
+    Op::RustArgs::get_cases(ctx)
+}
diff --git a/library/compiler-builtins/libm-test/src/lib.rs b/library/compiler-builtins/libm-test/src/lib.rs
new file mode 100644
index 00000000000..accb39654d1
--- /dev/null
+++ b/library/compiler-builtins/libm-test/src/lib.rs
@@ -0,0 +1,107 @@
+#![cfg_attr(f16_enabled, feature(f16))]
+#![cfg_attr(f128_enabled, feature(f128))]
+#![allow(clippy::unusual_byte_groupings)] // sometimes we group by sign_exp_sig
+
+pub mod domain;
+mod f8_impl;
+pub mod generate;
+#[cfg(feature = "build-mpfr")]
+pub mod mpfloat;
+mod num;
+pub mod op;
+mod precision;
+mod run_cfg;
+mod test_traits;
+
+use std::env;
+use std::fs::File;
+use std::io::Write;
+use std::path::PathBuf;
+use std::sync::LazyLock;
+use std::time::SystemTime;
+
+pub use f8_impl::{f8, hf8};
+pub use libm::support::{Float, Int, IntTy, MinInt};
+pub use num::{FloatExt, linear_ints, logspace};
+pub use op::{
+    BaseName, FloatTy, Identifier, MathOp, OpCFn, OpCRet, OpFTy, OpRustArgs, OpRustFn, OpRustRet,
+    Ty,
+};
+pub use precision::{MaybeOverride, SpecialCase, default_ulp};
+use run_cfg::extensive_max_iterations;
+pub use run_cfg::{
+    CheckBasis, CheckCtx, EXTENSIVE_ENV, GeneratorKind, bigint_fuzz_iteration_count,
+    skip_extensive_test,
+};
+pub use test_traits::{CheckOutput, Hex, TupleCall};
+
+/// Result type used by tests; the error type usually comes from `anyhow`. Most of the time there
+/// is no success value to propagate.
+pub type TestResult<T = (), E = anyhow::Error> = Result<T, E>;
+
+/// True if `EMULATED` is set and nonempty. Used to determine how many iterations to run.
+pub const fn emulated() -> bool {
+    match option_env!("EMULATED") {
+        Some(s) if s.is_empty() => false,
+        None => false,
+        Some(_) => true,
+    }
+}
+
+/// True if `CI` is set and nonempty.
+pub const fn ci() -> bool {
+    match option_env!("CI") {
+        Some(s) if s.is_empty() => false,
+        None => false,
+        Some(_) => true,
+    }
+}
+
+/// Print to stderr and additionally log it to `target/test-log.txt`. This is useful for saving
+/// output that would otherwise be consumed by the test harness.
+pub fn test_log(s: &str) {
+    // Handle to a file opened in append mode, or `None` if a suitable path can't be determined.
+    static OUTFILE: LazyLock<Option<File>> = LazyLock::new(|| {
+        // If the target directory is overridden, use that environment variable. Otherwise, save
+        // at the default path `{workspace_root}/target`.
+        let target_dir = match env::var("CARGO_TARGET_DIR") {
+            Ok(s) => PathBuf::from(s),
+            Err(_) => {
+                let Ok(x) = env::var("CARGO_MANIFEST_DIR") else {
+                    return None;
+                };
+
+                PathBuf::from(x).join("../target")
+            }
+        };
+        let outfile = target_dir.join("test-log.txt");
+
+        let mut f = File::options()
+            .create(true)
+            .append(true)
+            .open(outfile)
+            .expect("failed to open logfile");
+        let now = SystemTime::now()
+            .duration_since(SystemTime::UNIX_EPOCH)
+            .unwrap();
+
+        writeln!(f, "\n\nTest run at {}", now.as_secs()).unwrap();
+        writeln!(f, "arch: {}", env::consts::ARCH).unwrap();
+        writeln!(f, "os: {}", env::consts::OS).unwrap();
+        writeln!(f, "bits: {}", usize::BITS).unwrap();
+        writeln!(f, "emulated: {}", emulated()).unwrap();
+        writeln!(f, "ci: {}", ci()).unwrap();
+        writeln!(f, "cargo features: {}", env!("CFG_CARGO_FEATURES")).unwrap();
+        writeln!(f, "opt level: {}", env!("CFG_OPT_LEVEL")).unwrap();
+        writeln!(f, "target features: {}", env!("CFG_TARGET_FEATURES")).unwrap();
+        writeln!(f, "extensive iterations: {}", extensive_max_iterations()).unwrap();
+
+        Some(f)
+    });
+
+    eprintln!("{s}");
+
+    if let Some(mut f) = OUTFILE.as_ref() {
+        writeln!(f, "{s}").unwrap();
+    }
+}
diff --git a/library/compiler-builtins/libm-test/src/mpfloat.rs b/library/compiler-builtins/libm-test/src/mpfloat.rs
new file mode 100644
index 00000000000..9b51dc6051d
--- /dev/null
+++ b/library/compiler-builtins/libm-test/src/mpfloat.rs
@@ -0,0 +1,603 @@
+//! Interfaces needed to support testing with multi-precision floating point numbers.
+//!
+//! Within this module, macros implement [`MpOp`] for the `Routine` struct in each `libm`
+//! function's `crate::op` submodule.
+
+use std::cmp::Ordering;
+
+use rug::Assign;
+pub use rug::Float as MpFloat;
+use rug::az::{self, Az};
+use rug::float::Round::Nearest;
+use rug::ops::{PowAssignRound, RemAssignRound};
+
+use crate::{Float, MathOp};
+
+/// Create a multiple-precision float with the correct number of bits for a concrete float type.
+fn new_mpfloat<F: Float>() -> MpFloat {
+    MpFloat::new(F::SIG_BITS + 1)
+}
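+
+// For example, this yields 24 bits of precision for `f32` and 53 bits for `f64` (the stored
+// significand bits plus the implicit leading bit), matching the IEEE 754 formats.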
+
+/// Set subnormal emulation and convert to a concrete float type.
+fn prep_retval<F: Float>(mp: &mut MpFloat, ord: Ordering) -> F
+where
+    for<'a> &'a MpFloat: az::Cast<F>,
+{
+    mp.subnormalize_ieee_round(ord, Nearest);
+    (&*mp).az::<F>()
+}
+
+/// A float operation that can be evaluated using multi-precision floats.
+pub trait MpOp: MathOp {
+    /// Type that holds any context reusable between calls to `run`, e.g. pre-allocated
+    /// `MpFloat`s.
+    type MpTy;
+
+    /// Create a new instance.
+    fn new_mp() -> Self::MpTy;
+
+    /// Perform the operation.
+    ///
+    /// Usually this means assigning inputs to cached floats, performing the operation, applying
+    /// subnormal approximation, and converting the result back to concrete values.
+    fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet;
+}
+
+/// Implement `MpOp` for functions with a single return value.
+macro_rules! impl_mp_op {
+    // Matcher for unary functions
+    (
+        fn_name: $fn_name:ident,
+        RustFn: fn($_fty:ty,) -> $_ret:ty,
+        attrs: [$($attr:meta),*],
+        fn_extra: $fn_name_normalized:expr,
+    ) => {
+        paste::paste! {
+            $(#[$attr])*
+            impl MpOp for crate::op::$fn_name::Routine {
+                type MpTy = MpFloat;
+
+                fn new_mp() -> Self::MpTy {
+                    new_mpfloat::<Self::FTy>()
+                }
+
+                fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
+                    this.assign(input.0);
+                    let ord = this.[< $fn_name_normalized _round >](Nearest);
+                    prep_retval::<Self::RustRet>(this, ord)
+                }
+            }
+        }
+    };
+    // Matcher for binary functions
+    (
+        fn_name: $fn_name:ident,
+        RustFn: fn($_fty:ty, $_fty2:ty,) -> $_ret:ty,
+        attrs: [$($attr:meta),*],
+        fn_extra: $fn_name_normalized:expr,
+    ) => {
+        paste::paste! {
+            $(#[$attr])*
+            impl MpOp for crate::op::$fn_name::Routine {
+                type MpTy = (MpFloat, MpFloat);
+
+                fn new_mp() -> Self::MpTy {
+                    (new_mpfloat::<Self::FTy>(), new_mpfloat::<Self::FTy>())
+                }
+
+                fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
+                    this.0.assign(input.0);
+                    this.1.assign(input.1);
+                    let ord = this.0.[< $fn_name_normalized _round >](&this.1, Nearest);
+                    prep_retval::<Self::RustRet>(&mut this.0, ord)
+                }
+            }
+        }
+    };
+    // Matcher for ternary functions
+    (
+        fn_name: $fn_name:ident,
+        RustFn: fn($_fty:ty, $_fty2:ty, $_fty3:ty,) -> $_ret:ty,
+        attrs: [$($attr:meta),*],
+        fn_extra: $fn_name_normalized:expr,
+    ) => {
+        paste::paste! {
+            $(#[$attr])*
+            impl MpOp for crate::op::$fn_name::Routine {
+                type MpTy = (MpFloat, MpFloat, MpFloat);
+
+                fn new_mp() -> Self::MpTy {
+                    (
+                        new_mpfloat::<Self::FTy>(),
+                        new_mpfloat::<Self::FTy>(),
+                        new_mpfloat::<Self::FTy>(),
+                    )
+                }
+
+                fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
+                    this.0.assign(input.0);
+                    this.1.assign(input.1);
+                    this.2.assign(input.2);
+                    let ord = this.0.[< $fn_name_normalized _round >](&this.1, &this.2, Nearest);
+                    prep_retval::<Self::RustRet>(&mut this.0, ord)
+                }
+            }
+        }
+    };
+}
+
+libm_macros::for_each_function! {
+    callback: impl_mp_op,
+    emit_types: [RustFn],
+    skip: [
+        // Most of these need a manual implementation
+        // verify-sorted-start
+        ceil,
+        ceilf,
+        ceilf128,
+        ceilf16,
+        copysign,
+        copysignf,
+        copysignf128,
+        copysignf16,
+        fabs,
+        fabsf,
+        fabsf128,
+        fabsf16,
+        floor,
+        floorf,
+        floorf128,
+        floorf16,
+        fmaximum,
+        fmaximumf,
+        fmaximumf128,
+        fmaximumf16,
+        fminimum,
+        fminimumf,
+        fminimumf128,
+        fminimumf16,
+        fmod,
+        fmodf,
+        fmodf128,
+        fmodf16,
+        frexp,
+        frexpf,
+        ilogb,
+        ilogbf,
+        jn,
+        jnf,
+        ldexp,
+        ldexpf,
+        ldexpf128,
+        ldexpf16,
+        lgamma_r,
+        lgammaf_r,
+        modf,
+        modff,
+        nextafter,
+        nextafterf,
+        pow,
+        powf,
+        remquo,
+        remquof,
+        rint,
+        rintf,
+        rintf128,
+        rintf16,
+        round,
+        roundeven,
+        roundevenf,
+        roundevenf128,
+        roundevenf16,
+        roundf,
+        roundf128,
+        roundf16,
+        scalbn,
+        scalbnf,
+        scalbnf128,
+        scalbnf16,
+        sincos,
+        sincosf,
+        trunc,
+        truncf,
+        truncf128,
+        truncf16,
+        yn,
+        ynf,
+        // verify-sorted-end
+    ],
+    fn_extra: match MACRO_FN_NAME {
+        // Remap function names that are different between mpfr and libm
+        expm1 | expm1f => exp_m1,
+        fabs | fabsf => abs,
+        fdim | fdimf | fdimf16 | fdimf128  => positive_diff,
+        fma | fmaf | fmaf128 => mul_add,
+        fmax | fmaxf | fmaxf16 | fmaxf128 |
+        fmaximum_num | fmaximum_numf | fmaximum_numf16 | fmaximum_numf128 => max,
+        fmin | fminf | fminf16 | fminf128 |
+        fminimum_num | fminimum_numf | fminimum_numf16 | fminimum_numf128 => min,
+        lgamma | lgammaf => ln_gamma,
+        log | logf => ln,
+        log1p | log1pf => ln_1p,
+        tgamma | tgammaf => gamma,
+        _ => MACRO_FN_NAME_NORMALIZED
+    }
+}
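+
+// As an example of the remapping above, `log` and `logf` normalize to `ln`, so the unary matcher
+// in `impl_mp_op!` calls rug's `ln_round(Nearest)` for those routines.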
+
+/// Implement unary functions that don't have a `_round` version
+macro_rules! impl_no_round {
+    // Unary matcher
+    ($($fn_name:ident => $rug_name:ident;)*) => {
+        paste::paste! {
+            $( impl_no_round!{ @inner_unary $fn_name, $rug_name } )*
+        }
+    };
+
+    (@inner_unary $fn_name:ident, $rug_name:ident) => {
+        impl MpOp for crate::op::$fn_name::Routine {
+            type MpTy = MpFloat;
+
+            fn new_mp() -> Self::MpTy {
+                new_mpfloat::<Self::FTy>()
+            }
+
+            fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
+                this.assign(input.0);
+                this.$rug_name();
+                prep_retval::<Self::RustRet>(this, Ordering::Equal)
+            }
+        }
+    };
+}
+
+impl_no_round! {
+    ceil => ceil_mut;
+    ceilf => ceil_mut;
+    fabs => abs_mut;
+    fabsf => abs_mut;
+    floor => floor_mut;
+    floorf => floor_mut;
+    rint => round_even_mut; // FIXME: respect rounding mode
+    rintf => round_even_mut; // FIXME: respect rounding mode
+    round => round_mut;
+    roundeven => round_even_mut;
+    roundevenf => round_even_mut;
+    roundf => round_mut;
+    trunc => trunc_mut;
+    truncf => trunc_mut;
+}
+
+#[cfg(f16_enabled)]
+impl_no_round! {
+    ceilf16 => ceil_mut;
+    fabsf16 => abs_mut;
+    floorf16 => floor_mut;
+    rintf16 => round_even_mut; // FIXME: respect rounding mode
+    roundf16 => round_mut;
+    roundevenf16 => round_even_mut;
+    truncf16 => trunc_mut;
+}
+
+#[cfg(f128_enabled)]
+impl_no_round! {
+    ceilf128 => ceil_mut;
+    fabsf128 => abs_mut;
+    floorf128 => floor_mut;
+    rintf128 => round_even_mut; // FIXME: respect rounding mode
+    roundf128 => round_mut;
+    roundevenf128 => round_even_mut;
+    truncf128 => trunc_mut;
+}
+
+/// Some functions are difficult to do in a generic way. Implement them here.
+macro_rules! impl_op_for_ty {
+    ($fty:ty, $suffix:literal) => {
+        paste::paste! {
+            impl MpOp for crate::op::[<modf $suffix>]::Routine {
+                type MpTy = (MpFloat, MpFloat);
+
+                fn new_mp() -> Self::MpTy {
+                    (new_mpfloat::<Self::FTy>(), new_mpfloat::<Self::FTy>())
+                }
+
+                fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
+                    this.0.assign(input.0);
+                    this.1.assign(&this.0);
+                    let (ord0, ord1) = this.0.trunc_fract_round(&mut this.1, Nearest);
+                    (
+                        prep_retval::<Self::FTy>(&mut this.1, ord0),
+                        prep_retval::<Self::FTy>(&mut this.0, ord1),
+                    )
+                }
+            }
+
+            impl MpOp for crate::op::[<pow $suffix>]::Routine {
+                type MpTy = (MpFloat, MpFloat);
+
+                fn new_mp() -> Self::MpTy {
+                    (new_mpfloat::<Self::FTy>(), new_mpfloat::<Self::FTy>())
+                }
+
+                fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
+                    this.0.assign(input.0);
+                    this.1.assign(input.1);
+                    let ord = this.0.pow_assign_round(&this.1, Nearest);
+                    prep_retval::<Self::RustRet>(&mut this.0, ord)
+                }
+            }
+
+            impl MpOp for crate::op::[<frexp $suffix>]::Routine {
+                type MpTy = MpFloat;
+
+                fn new_mp() -> Self::MpTy {
+                    new_mpfloat::<Self::FTy>()
+                }
+
+                fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
+                    this.assign(input.0);
+                    let exp = this.frexp_mut();
+                    (prep_retval::<Self::FTy>(this, Ordering::Equal), exp)
+                }
+            }
+
+            impl MpOp for crate::op::[<ilogb $suffix>]::Routine {
+                type MpTy = MpFloat;
+
+                fn new_mp() -> Self::MpTy {
+                    new_mpfloat::<Self::FTy>()
+                }
+
+                fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
+                    this.assign(input.0);
+
+                    // `get_exp` follows `frexp` for `0.5 <= |m| < 1.0`. Adjust the exponent by
+                    // one to scale the significand to `1.0 <= |m| < 2.0`.
+                    this.get_exp().map(|v| v - 1).unwrap_or_else(|| {
+                        if this.is_infinite() {
+                            i32::MAX
+                        } else {
+                            // Zero or NaN
+                            i32::MIN
+                        }
+                    })
+                }
+            }
+
+            impl MpOp for crate::op::[<jn $suffix>]::Routine {
+                type MpTy = MpFloat;
+
+                fn new_mp() -> Self::MpTy {
+                    new_mpfloat::<Self::FTy>()
+                }
+
+                fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
+                    let (n, x) = input;
+                    this.assign(x);
+                    let ord = this.jn_round(n, Nearest);
+                    prep_retval::<Self::FTy>(this, ord)
+                }
+            }
+
+            impl MpOp for crate::op::[<sincos $suffix>]::Routine {
+                type MpTy = (MpFloat, MpFloat);
+
+                fn new_mp() -> Self::MpTy {
+                    (new_mpfloat::<Self::FTy>(), new_mpfloat::<Self::FTy>())
+                }
+
+                fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
+                    this.0.assign(input.0);
+                    this.1.assign(0.0);
+                    let (sord, cord) = this.0.sin_cos_round(&mut this.1, Nearest);
+                    (
+                        prep_retval::<Self::FTy>(&mut this.0, sord),
+                        prep_retval::<Self::FTy>(&mut this.1, cord)
+                    )
+                }
+            }
+
+            impl MpOp for crate::op::[<remquo $suffix>]::Routine {
+                type MpTy = (MpFloat, MpFloat);
+
+                fn new_mp() -> Self::MpTy {
+                    (
+                        new_mpfloat::<Self::FTy>(),
+                        new_mpfloat::<Self::FTy>(),
+                    )
+                }
+
+                fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
+                    this.0.assign(input.0);
+                    this.1.assign(input.1);
+                    let (ord, q) = this.0.remainder_quo31_round(&this.1, Nearest);
+                    (prep_retval::<Self::FTy>(&mut this.0, ord), q)
+                }
+            }
+
+            impl MpOp for crate::op::[<yn $suffix>]::Routine {
+                type MpTy = MpFloat;
+
+                fn new_mp() -> Self::MpTy {
+                    new_mpfloat::<Self::FTy>()
+                }
+
+                fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
+                    let (n, x) = input;
+                    this.assign(x);
+                    let ord = this.yn_round(n, Nearest);
+                    prep_retval::<Self::FTy>(this, ord)
+                }
+            }
+        }
+    };
+}
+
+/// Version of `impl_op_for_ty` with only functions that have `f16` and `f128` implementations.
+macro_rules! impl_op_for_ty_all {
+    ($fty:ty, $suffix:literal) => {
+        paste::paste! {
+            impl MpOp for crate::op::[<copysign $suffix>]::Routine {
+                type MpTy = (MpFloat, MpFloat);
+
+                fn new_mp() -> Self::MpTy {
+                    (new_mpfloat::<Self::FTy>(), new_mpfloat::<Self::FTy>())
+                }
+
+                fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
+                    this.0.assign(input.0);
+                    this.1.assign(input.1);
+                    this.0.copysign_mut(&this.1);
+                    prep_retval::<Self::RustRet>(&mut this.0, Ordering::Equal)
+                }
+            }
+
+            impl MpOp for crate::op::[<fmod $suffix>]::Routine {
+                type MpTy = (MpFloat, MpFloat);
+
+                fn new_mp() -> Self::MpTy {
+                    (new_mpfloat::<Self::FTy>(), new_mpfloat::<Self::FTy>())
+                }
+
+                fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
+                    this.0.assign(input.0);
+                    this.1.assign(input.1);
+                    let ord = this.0.rem_assign_round(&this.1, Nearest);
+                    prep_retval::<Self::RustRet>(&mut this.0, ord)
+                }
+            }
+
+            impl MpOp for crate::op::[< fmaximum $suffix >]::Routine {
+                type MpTy = (MpFloat, MpFloat);
+
+                fn new_mp() -> Self::MpTy {
+                    (new_mpfloat::<Self::FTy>(), new_mpfloat::<Self::FTy>())
+                }
+
+                fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
+                    this.0.assign(input.0);
+                    this.1.assign(input.1);
+                    let ord = if this.0.is_nan() || this.1.is_nan() {
+                        this.0.assign($fty::NAN);
+                        Ordering::Equal
+                    } else {
+                        this.0.max_round(&this.1, Nearest)
+                    };
+                    prep_retval::<Self::RustRet>(&mut this.0, ord)
+                }
+            }
+
+            impl MpOp for crate::op::[< fminimum $suffix >]::Routine {
+                type MpTy = (MpFloat, MpFloat);
+
+                fn new_mp() -> Self::MpTy {
+                    (new_mpfloat::<Self::FTy>(), new_mpfloat::<Self::FTy>())
+                }
+
+                fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
+                    this.0.assign(input.0);
+                    this.1.assign(input.1);
+                    let ord = if this.0.is_nan() || this.1.is_nan() {
+                        this.0.assign($fty::NAN);
+                        Ordering::Equal
+                    } else {
+                        this.0.min_round(&this.1, Nearest)
+                    };
+                    prep_retval::<Self::RustRet>(&mut this.0, ord)
+                }
+            }
+
+            // `ldexp` and `scalbn` are the same for binary floating point, so just forward all
+            // methods.
+            impl MpOp for crate::op::[<ldexp $suffix>]::Routine {
+                type MpTy = <crate::op::[<scalbn $suffix>]::Routine as MpOp>::MpTy;
+
+                fn new_mp() -> Self::MpTy {
+                    <crate::op::[<scalbn $suffix>]::Routine as MpOp>::new_mp()
+                }
+
+                fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
+                    <crate::op::[<scalbn $suffix>]::Routine as MpOp>::run(this, input)
+                }
+            }
+
+            impl MpOp for crate::op::[<scalbn $suffix>]::Routine {
+                type MpTy = MpFloat;
+
+                fn new_mp() -> Self::MpTy {
+                    new_mpfloat::<Self::FTy>()
+                }
+
+                fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
+                    this.assign(input.0);
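+                    // In rug, shifting a `Float` left by an integer multiplies it by a power of
+                    // two, which is exactly the `scalbn`/`ldexp` semantics.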
+                    *this <<= input.1;
+                    prep_retval::<Self::FTy>(this, Ordering::Equal)
+                }
+            }
+        }
+    };
+}
+
+impl_op_for_ty!(f32, "f");
+impl_op_for_ty!(f64, "");
+
+#[cfg(f16_enabled)]
+impl_op_for_ty_all!(f16, "f16");
+impl_op_for_ty_all!(f32, "f");
+impl_op_for_ty_all!(f64, "");
+#[cfg(f128_enabled)]
+impl_op_for_ty_all!(f128, "f128");
+
+// `lgamma_r`/`lgammaf_r` don't follow the simple suffix pattern, so the macro above can't be used.
+impl MpOp for crate::op::lgamma_r::Routine {
+    type MpTy = MpFloat;
+
+    fn new_mp() -> Self::MpTy {
+        new_mpfloat::<Self::FTy>()
+    }
+
+    fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
+        this.assign(input.0);
+        let (sign, ord) = this.ln_abs_gamma_round(Nearest);
+        let ret = prep_retval::<Self::FTy>(this, ord);
+        (ret, sign as i32)
+    }
+}
+
+impl MpOp for crate::op::lgammaf_r::Routine {
+    type MpTy = MpFloat;
+
+    fn new_mp() -> Self::MpTy {
+        new_mpfloat::<Self::FTy>()
+    }
+
+    fn run(this: &mut Self::MpTy, input: Self::RustArgs) -> Self::RustRet {
+        this.assign(input.0);
+        let (sign, ord) = this.ln_abs_gamma_round(Nearest);
+        let ret = prep_retval::<Self::FTy>(this, ord);
+        (ret, sign as i32)
+    }
+}
+
+/* stub implementations so we don't need to special case them */
+
+impl MpOp for crate::op::nextafter::Routine {
+    type MpTy = MpFloat;
+
+    fn new_mp() -> Self::MpTy {
+        unimplemented!("nextafter does not yet have a MPFR operation");
+    }
+
+    fn run(_this: &mut Self::MpTy, _input: Self::RustArgs) -> Self::RustRet {
+        unimplemented!("nextafter does not yet have a MPFR operation");
+    }
+}
+
+impl MpOp for crate::op::nextafterf::Routine {
+    type MpTy = MpFloat;
+
+    fn new_mp() -> Self::MpTy {
+        unimplemented!("nextafter does not yet have a MPFR operation");
+    }
+
+    fn run(_this: &mut Self::MpTy, _input: Self::RustArgs) -> Self::RustRet {
+        unimplemented!("nextafter does not yet have a MPFR operation");
+    }
+}
diff --git a/library/compiler-builtins/libm-test/src/num.rs b/library/compiler-builtins/libm-test/src/num.rs
new file mode 100644
index 00000000000..3237c85039d
--- /dev/null
+++ b/library/compiler-builtins/libm-test/src/num.rs
@@ -0,0 +1,586 @@
+//! Helpful numeric operations.
+
+use std::cmp::min;
+use std::ops::RangeInclusive;
+
+use libm::support::Float;
+
+use crate::{Int, MinInt};
+
+/// Extension to `libm`'s `Float` trait with methods that are useful for tests but not
+/// needed in `libm` itself.
+pub trait FloatExt: Float {
+    /// Bit representation of the smallest positive subnormal number.
+    const TINY_BITS: Self::Int = Self::Int::ONE;
+
+    /// Retrieve additional constants for this float type.
+    fn consts() -> Consts<Self> {
+        Consts::new()
+    }
+
+    /// Increment by one ULP, saturating at infinity.
+    fn next_up(self) -> Self {
+        let bits = self.to_bits();
+        if self.is_nan() || bits == Self::INFINITY.to_bits() {
+            return self;
+        }
+
+        let abs = self.abs().to_bits();
+        let next_bits = if abs == Self::Int::ZERO {
+            // Next up from 0 is the smallest subnormal
+            Self::TINY_BITS
+        } else if bits == abs {
+            // Positive: counting up is more positive
+            bits + Self::Int::ONE
+        } else {
+            // Negative: counting down is more positive
+            bits - Self::Int::ONE
+        };
+        Self::from_bits(next_bits)
+    }
+
+    /// A faster way to effectively call `next_up` `n` times.
+    fn n_up(self, n: Self::Int) -> Self {
+        let bits = self.to_bits();
+        if self.is_nan() || bits == Self::INFINITY.to_bits() || n == Self::Int::ZERO {
+            return self;
+        }
+
+        let abs = self.abs().to_bits();
+        let is_positive = bits == abs;
+        let crosses_zero = !is_positive && n > abs;
+        let inf_bits = Self::INFINITY.to_bits();
+
+        let next_bits = if abs == Self::Int::ZERO {
+            min(n, inf_bits)
+        } else if crosses_zero {
+            min(n - abs, inf_bits)
+        } else if is_positive {
+            // Positive, counting up is more positive but this may overflow
+            match bits.checked_add(n) {
+                Some(v) if v >= inf_bits => inf_bits,
+                Some(v) => v,
+                None => inf_bits,
+            }
+        } else {
+            // Negative, counting down is more positive
+            bits - n
+        };
+        Self::from_bits(next_bits)
+    }
+
+    /// Decrement by one ULP, saturating at negative infinity.
+    fn next_down(self) -> Self {
+        let bits = self.to_bits();
+        if self.is_nan() || bits == Self::NEG_INFINITY.to_bits() {
+            return self;
+        }
+
+        let abs = self.abs().to_bits();
+        let next_bits = if abs == Self::Int::ZERO {
+            // Next down from 0 is the negative subnormal closest to zero
+            Self::TINY_BITS | Self::SIGN_MASK
+        } else if bits == abs {
+            // Positive: counting down is more negative
+            bits - Self::Int::ONE
+        } else {
+            // Negative: counting up is more negative
+            bits + Self::Int::ONE
+        };
+        Self::from_bits(next_bits)
+    }
+
+    /// A faster way to effectively call `next_down` `n` times.
+    fn n_down(self, n: Self::Int) -> Self {
+        let bits = self.to_bits();
+        if self.is_nan() || bits == Self::NEG_INFINITY.to_bits() || n == Self::Int::ZERO {
+            return self;
+        }
+
+        let abs = self.abs().to_bits();
+        let is_positive = bits == abs;
+        let crosses_zero = is_positive && n > abs;
+        let inf_bits = Self::INFINITY.to_bits();
+        let ninf_bits = Self::NEG_INFINITY.to_bits();
+
+        let next_bits = if abs == Self::Int::ZERO {
+            min(n, inf_bits) | Self::SIGN_MASK
+        } else if crosses_zero {
+            min(n - abs, inf_bits) | Self::SIGN_MASK
+        } else if is_positive {
+            // Positive, counting down is more negative
+            bits - n
+        } else {
+            // Negative, counting up is more negative but this may overflow
+            match bits.checked_add(n) {
+                Some(v) if v > ninf_bits => ninf_bits,
+                Some(v) => v,
+                None => ninf_bits,
+            }
+        };
+        Self::from_bits(next_bits)
+    }
+}
+
+impl<F> FloatExt for F where F: Float {}
+
+/// Extra constants that are useful for tests.
+#[derive(Debug, Clone, Copy)]
+pub struct Consts<F> {
+    /// The default quiet NaN, which is also the minimum quiet NaN.
+    pub pos_nan: F,
+    /// The default quiet NaN with negative sign.
+    pub neg_nan: F,
+    /// The quiet NaN with the maximum (unsigned) significand, i.e. the significand is saturated.
+    pub max_qnan: F,
+    /// The signaling NaN with the minimum (unsigned) significand.
+    pub min_snan: F,
+    /// The signaling NaN with the maximum (unsigned) significand.
+    pub max_snan: F,
+    /// Negative-sign versions of the three NaNs above.
+    pub neg_max_qnan: F,
+    pub neg_min_snan: F,
+    pub neg_max_snan: F,
+}
+
+impl<F: FloatExt> Consts<F> {
+    fn new() -> Self {
+        let top_sigbit_mask = F::Int::ONE << (F::SIG_BITS - 1);
+        let pos_nan = F::EXP_MASK | top_sigbit_mask;
+        let max_qnan = F::EXP_MASK | F::SIG_MASK;
+        let min_snan = F::EXP_MASK | F::Int::ONE;
+        let max_snan = (F::EXP_MASK | F::SIG_MASK) ^ top_sigbit_mask;
+
+        let neg_nan = pos_nan | F::SIGN_MASK;
+        let neg_max_qnan = max_qnan | F::SIGN_MASK;
+        let neg_min_snan = min_snan | F::SIGN_MASK;
+        let neg_max_snan = max_snan | F::SIGN_MASK;
+
+        Self {
+            pos_nan: F::from_bits(pos_nan),
+            neg_nan: F::from_bits(neg_nan),
+            max_qnan: F::from_bits(max_qnan),
+            min_snan: F::from_bits(min_snan),
+            max_snan: F::from_bits(max_snan),
+            neg_max_qnan: F::from_bits(neg_max_qnan),
+            neg_min_snan: F::from_bits(neg_min_snan),
+            neg_max_snan: F::from_bits(neg_max_snan),
+        }
+    }
+
+    pub fn iter(self) -> impl Iterator<Item = F> {
+        // Destructure so we get unused warnings if we forget a list entry.
+        let Self {
+            pos_nan,
+            neg_nan,
+            max_qnan,
+            min_snan,
+            max_snan,
+            neg_max_qnan,
+            neg_min_snan,
+            neg_max_snan,
+        } = self;
+
+        [
+            pos_nan,
+            neg_nan,
+            max_qnan,
+            min_snan,
+            max_snan,
+            neg_max_qnan,
+            neg_min_snan,
+            neg_max_snan,
+        ]
+        .into_iter()
+    }
+}
+
+/// Return the number of steps between two floats, returning `None` if either input is NaN.
+///
+/// This is the number of steps needed for `n_up` or `n_down` to go between values. Infinities
+/// are treated the same as those functions (will return the nearest finite value), and only one
+/// of `-0` or `+0` is counted. It does not matter which value is greater.
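+///
+/// A rough sketch of the expected behavior (illustrative only, not a doctest):
+///
+/// ```ignore
+/// // Adjacent finite values are one step apart.
+/// let a = 1.0f32;
+/// let b = f32::from_bits(a.to_bits() + 1);
+/// assert_eq!(ulp_between(a, b), Some(1));
+/// // NaN inputs have no meaningful distance.
+/// assert_eq!(ulp_between(f32::NAN, 1.0f32), None);
+/// ```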
+pub fn ulp_between<F: Float>(x: F, y: F) -> Option<F::Int> {
+    let a = as_ulp_steps(x)?;
+    let b = as_ulp_steps(y)?;
+    Some(a.abs_diff(b))
+}
+
+/// Return the (signed) number of steps from zero to `x`.
+fn as_ulp_steps<F: Float>(x: F) -> Option<F::SignedInt> {
+    let s = x.to_bits_signed();
+    let val = if s >= F::SignedInt::ZERO {
+        // each increment from `s = 0` is one step up from `x = 0.0`
+        s
+    } else {
+        // each increment from `s = F::SignedInt::MIN` is one step down from `x = -0.0`
+        F::SignedInt::MIN - s
+    };
+
+    // If `x` is NaN, return `None`
+    (!x.is_nan()).then_some(val)
+}
+
+/// An iterator that returns floats with linearly spaced integer representations, which translates
+/// to logarithmic spacing of their values.
+///
+/// Note that this tends to skip negative zero, so that needs to be checked explicitly.
+///
+/// Returns `(iterator, iterator_length)`.
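+///
+/// A rough sketch of the expected output (illustrative only, not a doctest):
+///
+/// ```ignore
+/// // Three logarithmically spaced samples between 1.0 and 4.0.
+/// let (iter, count) = logspace(1.0f32, 4.0f32, 3);
+/// assert_eq!(count, 3);
+/// assert_eq!(iter.collect::<Vec<_>>(), vec![1.0, 2.0, 4.0]);
+/// ```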
+pub fn logspace<F: FloatExt>(
+    start: F,
+    end: F,
+    steps: F::Int,
+) -> (impl Iterator<Item = F> + Clone, F::Int)
+where
+    RangeInclusive<F::Int>: Iterator,
+{
+    assert!(!start.is_nan());
+    assert!(!end.is_nan());
+    assert!(end >= start);
+
+    let steps = steps
+        .checked_sub(F::Int::ONE)
+        .expect("`steps` must be at least 2");
+    let between = ulp_between(start, end).expect("`start` or `end` is NaN");
+    let spacing = (between / steps).max(F::Int::ONE);
+    let steps = steps.min(between); // At maximum, one step per ULP
+
+    let mut x = start;
+    (
+        (F::Int::ZERO..=steps).map(move |_| {
+            let ret = x;
+            x = x.n_up(spacing);
+            ret
+        }),
+        steps + F::Int::ONE,
+    )
+}
+
+/// Returns an iterator of up to `steps` integers evenly distributed.
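+///
+/// A rough sketch of the expected output (illustrative only, not a doctest):
+///
+/// ```ignore
+/// // Three evenly spaced integers across 0..=10.
+/// let (iter, count) = linear_ints(0..=10, 3);
+/// assert_eq!(count, 3);
+/// assert_eq!(iter.collect::<Vec<_>>(), vec![0, 5, 10]);
+/// ```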
+pub fn linear_ints(
+    range: RangeInclusive<i32>,
+    steps: u64,
+) -> (impl Iterator<Item = i32> + Clone, u64) {
+    let steps = steps.checked_sub(1).unwrap();
+    let between = u64::from(range.start().abs_diff(*range.end()));
+    let spacing = i32::try_from((between / steps).max(1)).unwrap();
+    let steps = steps.min(between);
+    let mut x: i32 = *range.start();
+    (
+        (0..=steps).map(move |_| {
+            let res = x;
+            // Wrapping add to avoid panic on last item (where `x` could overflow past i32::MAX as
+            // there is no next item).
+            x = x.wrapping_add(spacing);
+            res
+        }),
+        steps + 1,
+    )
+}
+
+#[cfg(test)]
+mod tests {
+    use std::cmp::max;
+
+    use super::*;
+    use crate::f8;
+
+    #[test]
+    fn test_next_up_down() {
+        for (i, v) in f8::ALL.into_iter().enumerate() {
+            let down = v.next_down().to_bits();
+            let up = v.next_up().to_bits();
+
+            if i == 0 {
+                assert_eq!(down, f8::NEG_INFINITY.to_bits(), "{i} next_down({v:#010b})");
+            } else {
+                let expected = if v == f8::ZERO {
+                    1 | f8::SIGN_MASK
+                } else {
+                    f8::ALL[i - 1].to_bits()
+                };
+                assert_eq!(down, expected, "{i} next_down({v:#010b})");
+            }
+
+            if i == f8::ALL_LEN - 1 {
+                assert_eq!(up, f8::INFINITY.to_bits(), "{i} next_up({v:#010b})");
+            } else {
+                let expected = if v == f8::NEG_ZERO {
+                    1
+                } else {
+                    f8::ALL[i + 1].to_bits()
+                };
+                assert_eq!(up, expected, "{i} next_up({v:#010b})");
+            }
+        }
+    }
+
+    #[test]
+    fn test_next_up_down_inf_nan() {
+        assert_eq!(f8::NEG_INFINITY.next_up().to_bits(), f8::ALL[0].to_bits(),);
+        assert_eq!(
+            f8::NEG_INFINITY.next_down().to_bits(),
+            f8::NEG_INFINITY.to_bits(),
+        );
+        assert_eq!(
+            f8::INFINITY.next_down().to_bits(),
+            f8::ALL[f8::ALL_LEN - 1].to_bits(),
+        );
+        assert_eq!(f8::INFINITY.next_up().to_bits(), f8::INFINITY.to_bits(),);
+        assert_eq!(f8::NAN.next_up().to_bits(), f8::NAN.to_bits(),);
+        assert_eq!(f8::NAN.next_down().to_bits(), f8::NAN.to_bits(),);
+    }
+
+    #[test]
+    fn test_n_up_down_quick() {
+        assert_eq!(f8::ALL[0].n_up(4).to_bits(), f8::ALL[4].to_bits(),);
+        assert_eq!(
+            f8::ALL[f8::ALL_LEN - 1].n_down(4).to_bits(),
+            f8::ALL[f8::ALL_LEN - 5].to_bits(),
+        );
+
+        // Check around zero
+        assert_eq!(f8::from_bits(0b0).n_up(7).to_bits(), 0b0_0000_111);
+        assert_eq!(f8::from_bits(0b0).n_down(7).to_bits(), 0b1_0000_111);
+
+        // Check across zero
+        assert_eq!(f8::from_bits(0b1_0000_111).n_up(8).to_bits(), 0b0_0000_001);
+        assert_eq!(
+            f8::from_bits(0b0_0000_111).n_down(8).to_bits(),
+            0b1_0000_001
+        );
+    }
+
+    #[test]
+    fn test_n_up_down_one() {
+        // Verify that `n_up(1)` and `n_down(1)` are the same as `next_up()` and `next_down()`.
+        for i in 0..u8::MAX {
+            let v = f8::from_bits(i);
+            assert_eq!(v.next_up().to_bits(), v.n_up(1).to_bits());
+            assert_eq!(v.next_down().to_bits(), v.n_down(1).to_bits());
+        }
+    }
+
+    #[test]
+    fn test_n_up_down_inf_nan_zero() {
+        assert_eq!(f8::NEG_INFINITY.n_up(1).to_bits(), f8::ALL[0].to_bits());
+        assert_eq!(
+            f8::NEG_INFINITY.n_up(239).to_bits(),
+            f8::ALL[f8::ALL_LEN - 1].to_bits()
+        );
+        assert_eq!(f8::NEG_INFINITY.n_up(240).to_bits(), f8::INFINITY.to_bits());
+        assert_eq!(
+            f8::NEG_INFINITY.n_down(u8::MAX).to_bits(),
+            f8::NEG_INFINITY.to_bits()
+        );
+
+        assert_eq!(
+            f8::INFINITY.n_down(1).to_bits(),
+            f8::ALL[f8::ALL_LEN - 1].to_bits()
+        );
+        assert_eq!(f8::INFINITY.n_down(239).to_bits(), f8::ALL[0].to_bits());
+        assert_eq!(
+            f8::INFINITY.n_down(240).to_bits(),
+            f8::NEG_INFINITY.to_bits()
+        );
+        assert_eq!(f8::INFINITY.n_up(u8::MAX).to_bits(), f8::INFINITY.to_bits());
+
+        assert_eq!(f8::NAN.n_up(u8::MAX).to_bits(), f8::NAN.to_bits());
+        assert_eq!(f8::NAN.n_down(u8::MAX).to_bits(), f8::NAN.to_bits());
+
+        assert_eq!(f8::ZERO.n_down(1).to_bits(), f8::TINY_BITS | f8::SIGN_MASK);
+        assert_eq!(f8::NEG_ZERO.n_up(1).to_bits(), f8::TINY_BITS);
+    }
+
+    /// True if the specified range of `f8::ALL` includes both +0 and -0
+    fn crossed_zero(start: usize, end: usize) -> bool {
+        let crossed = &f8::ALL[start..=end];
+        crossed.iter().any(|f| f8::eq_repr(*f, f8::ZERO))
+            && crossed.iter().any(|f| f8::eq_repr(*f, f8::NEG_ZERO))
+    }
+
+    #[test]
+    fn test_n_up_down() {
+        for (i, v) in f8::ALL.into_iter().enumerate() {
+            for n in 0..f8::ALL_LEN {
+                let down = v.n_down(n as u8).to_bits();
+                let up = v.n_up(n as u8).to_bits();
+
+                if let Some(down_exp_idx) = i.checked_sub(n) {
+                    // No overflow
+                    let mut expected = f8::ALL[down_exp_idx].to_bits();
+                    if n >= 1 && crossed_zero(down_exp_idx, i) {
+                        // If both -0 and +0 are included, we need to adjust our expected value
+                        match down_exp_idx.checked_sub(1) {
+                            Some(v) => expected = f8::ALL[v].to_bits(),
+                            // Saturate to -inf if we are out of values
+                            None => expected = f8::NEG_INFINITY.to_bits(),
+                        }
+                    }
+                    assert_eq!(down, expected, "{i} {n} n_down({v:#010b})");
+                } else {
+                    // Overflow to -inf
+                    assert_eq!(
+                        down,
+                        f8::NEG_INFINITY.to_bits(),
+                        "{i} {n} n_down({v:#010b})"
+                    );
+                }
+
+                let mut up_exp_idx = i + n;
+                if up_exp_idx < f8::ALL_LEN {
+                    // No overflow
+                    if n >= 1 && up_exp_idx < f8::ALL_LEN && crossed_zero(i, up_exp_idx) {
+                        // If both -0 and +0 are included, we need to adjust our expected value
+                        up_exp_idx += 1;
+                    }
+
+                    let expected = if up_exp_idx >= f8::ALL_LEN {
+                        f8::INFINITY.to_bits()
+                    } else {
+                        f8::ALL[up_exp_idx].to_bits()
+                    };
+
+                    assert_eq!(up, expected, "{i} {n} n_up({v:#010b})");
+                } else {
+                    // Overflow to +inf
+                    assert_eq!(up, f8::INFINITY.to_bits(), "{i} {n} n_up({v:#010b})");
+                }
+            }
+        }
+    }
+
+    #[test]
+    fn test_ulp_between() {
+        for (i, x) in f8::ALL.into_iter().enumerate() {
+            for (j, y) in f8::ALL.into_iter().enumerate() {
+                let ulp = ulp_between(x, y).unwrap();
+                let make_msg = || format!("i: {i} j: {j} x: {x:b} y: {y:b} ulp {ulp}");
+
+                let i_low = min(i, j);
+                let i_hi = max(i, j);
+                let mut expected = u8::try_from(i_hi - i_low).unwrap();
+                if crossed_zero(i_low, i_hi) {
+                    expected -= 1;
+                }
+
+                assert_eq!(ulp, expected, "{}", make_msg());
+
+                // Skip if either is zero since `next_{up,down}` will count over it
+                let either_zero = x == f8::ZERO || y == f8::ZERO;
+                if x < y && !either_zero {
+                    assert_eq!(x.n_up(ulp).to_bits(), y.to_bits(), "{}", make_msg());
+                    assert_eq!(y.n_down(ulp).to_bits(), x.to_bits(), "{}", make_msg());
+                } else if !either_zero {
+                    assert_eq!(y.n_up(ulp).to_bits(), x.to_bits(), "{}", make_msg());
+                    assert_eq!(x.n_down(ulp).to_bits(), y.to_bits(), "{}", make_msg());
+                }
+            }
+        }
+    }
+
+    #[test]
+    fn test_ulp_between_inf_nan_zero() {
+        assert_eq!(
+            ulp_between(f8::NEG_INFINITY, f8::INFINITY).unwrap(),
+            f8::ALL_LEN as u8
+        );
+        assert_eq!(
+            ulp_between(f8::INFINITY, f8::NEG_INFINITY).unwrap(),
+            f8::ALL_LEN as u8
+        );
+        assert_eq!(
+            ulp_between(f8::NEG_INFINITY, f8::ALL[f8::ALL_LEN - 1]).unwrap(),
+            f8::ALL_LEN as u8 - 1
+        );
+        assert_eq!(
+            ulp_between(f8::INFINITY, f8::ALL[0]).unwrap(),
+            f8::ALL_LEN as u8 - 1
+        );
+
+        assert_eq!(ulp_between(f8::ZERO, f8::NEG_ZERO).unwrap(), 0);
+        assert_eq!(ulp_between(f8::NAN, f8::ZERO), None);
+        assert_eq!(ulp_between(f8::ZERO, f8::NAN), None);
+    }
+
+    #[test]
+    fn test_logspace() {
+        let (ls, count) = logspace(f8::from_bits(0x0), f8::from_bits(0x4), 2);
+        let ls: Vec<_> = ls.collect();
+        let exp = [f8::from_bits(0x0), f8::from_bits(0x4)];
+        assert_eq!(ls, exp);
+        assert_eq!(ls.len(), usize::from(count));
+
+        let (ls, count) = logspace(f8::from_bits(0x0), f8::from_bits(0x4), 3);
+        let ls: Vec<_> = ls.collect();
+        let exp = [f8::from_bits(0x0), f8::from_bits(0x2), f8::from_bits(0x4)];
+        assert_eq!(ls, exp);
+        assert_eq!(ls.len(), usize::from(count));
+
+        // Check that we include all values with no repeats if `steps` exceeds the maximum number
+        // of steps.
+        let (ls, count) = logspace(f8::from_bits(0x0), f8::from_bits(0x3), 10);
+        let ls: Vec<_> = ls.collect();
+        let exp = [
+            f8::from_bits(0x0),
+            f8::from_bits(0x1),
+            f8::from_bits(0x2),
+            f8::from_bits(0x3),
+        ];
+        assert_eq!(ls, exp);
+        assert_eq!(ls.len(), usize::from(count));
+    }
+
+    #[test]
+    fn test_linear_ints() {
+        let (ints, count) = linear_ints(0..=4, 2);
+        let ints: Vec<_> = ints.collect();
+        let exp = [0, 4];
+        assert_eq!(ints, exp);
+        assert_eq!(ints.len(), usize::try_from(count).unwrap());
+
+        let (ints, count) = linear_ints(0..=4, 3);
+        let ints: Vec<_> = ints.collect();
+        let exp = [0, 2, 4];
+        assert_eq!(ints, exp);
+        assert_eq!(ints.len(), usize::try_from(count).unwrap());
+
+        // Check that we include all values with no repeats if `steps` exceeds the maximum number
+        // of steps.
+        let (ints, count) = linear_ints(0x0..=0x3, 10);
+        let ints: Vec<_> = ints.collect();
+        let exp = [0, 1, 2, 3];
+        assert_eq!(ints, exp);
+        assert_eq!(ints.len(), usize::try_from(count).unwrap());
+
+        // Check that there are no panics around `i32::MAX`.
+        let (ints, count) = linear_ints(i32::MAX - 1..=i32::MAX, 5);
+        let ints: Vec<_> = ints.collect();
+        let exp = [i32::MAX - 1, i32::MAX];
+        assert_eq!(ints, exp);
+        assert_eq!(ints.len(), usize::try_from(count).unwrap());
+    }
+
+    #[test]
+    fn test_consts() {
+        let Consts {
+            pos_nan,
+            neg_nan,
+            max_qnan,
+            min_snan,
+            max_snan,
+            neg_max_qnan,
+            neg_min_snan,
+            neg_max_snan,
+        } = f8::consts();
+
+        assert_eq!(pos_nan.to_bits(), 0b0_1111_100);
+        assert_eq!(neg_nan.to_bits(), 0b1_1111_100);
+        assert_eq!(max_qnan.to_bits(), 0b0_1111_111);
+        assert_eq!(min_snan.to_bits(), 0b0_1111_001);
+        assert_eq!(max_snan.to_bits(), 0b0_1111_011);
+        assert_eq!(neg_max_qnan.to_bits(), 0b1_1111_111);
+        assert_eq!(neg_min_snan.to_bits(), 0b1_1111_001);
+        assert_eq!(neg_max_snan.to_bits(), 0b1_1111_011);
+    }
+}
diff --git a/library/compiler-builtins/libm-test/src/op.rs b/library/compiler-builtins/libm-test/src/op.rs
new file mode 100644
index 00000000000..afd445ff9c5
--- /dev/null
+++ b/library/compiler-builtins/libm-test/src/op.rs
@@ -0,0 +1,155 @@
+//! Types representing individual functions.
+//!
+//! Each routine gets a module with its name, e.g. `mod sinf { /* ... */ }`. The module
+//! contains a unit struct `Routine` which implements `MathOp`.
+//!
+//! Basically everything could be called a "function" here, so we loosely use the following
+//! terminology:
+//!
+//! - "Function": the math operation that does not have an associated precision. E.g. `f(x) = e^x`,
+//!   `f(x) = log(x)`.
+//! - "Routine": A code implementation of a math operation with a specific precision. E.g. `exp`,
+//!   `expf`, `expl`, `log`, `logf`.
+//! - "Operation" / "Op": Something that relates a routine to a function or is otherwise higher
+//!   level. `Op` is also used as the name for generic parameters since it is terse.
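+//!
+//! As a rough sketch of how the generated items fit together (illustrative only; `sinf` is just
+//! one example of a generated module):
+//!
+//! ```ignore
+//! use crate::op::{MathOp, sinf};
+//!
+//! // The unit struct carries the routine and its metadata as associated items.
+//! assert_eq!(sinf::Routine::NAME, "sinf");
+//! let routine: <sinf::Routine as MathOp>::RustFn = sinf::Routine::ROUTINE;
+//! ```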
+
+use std::fmt;
+use std::panic::{RefUnwindSafe, UnwindSafe};
+
+pub use shared::{ALL_OPERATIONS, FloatTy, MathOpInfo, Ty};
+
+use crate::{CheckOutput, Float, TupleCall};
+
+mod shared {
+    include!("../../crates/libm-macros/src/shared.rs");
+}
+
+/// An enum representing each possible symbol name (`sin`, `sinf`, `sinl`, etc).
+#[libm_macros::function_enum(BaseName)]
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum Identifier {}
+
+impl fmt::Display for Identifier {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str(self.as_str())
+    }
+}
+
+/// The name without any type specifier, e.g. `sin` and `sinf` both become `sin`.
+#[libm_macros::base_name_enum]
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum BaseName {}
+
+impl fmt::Display for BaseName {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str(self.as_str())
+    }
+}
+
+/// Attributes ascribed to a `libm` routine including signature, type information,
+/// and naming.
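+///
+/// A rough sketch of how the trait is consumed generically (illustrative only; the real test
+/// harness is more involved):
+///
+/// ```ignore
+/// fn call_once<Op: MathOp>(args: Op::RustArgs) -> Op::RustRet {
+///     // `RustArgs` implements `TupleCall` for the Rust function type, so the argument tuple
+///     // can be applied directly to the routine.
+///     args.call(Op::ROUTINE)
+/// }
+/// ```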
+pub trait MathOp {
+    /// The float type used for this operation.
+    type FTy: Float;
+
+    /// The function type representing the signature in a C library.
+    type CFn: Copy;
+
+    /// Arguments passed to the C library function as a tuple. These may include `&mut` return
+    /// values.
+    type CArgs<'a>
+    where
+        Self: 'a;
+
+    /// The type returned by C implementations.
+    type CRet;
+
+    /// The signature of the Rust function as a `fn(...) -> ...` type.
+    type RustFn: Copy + UnwindSafe;
+
+    /// Arguments passed to the Rust library function as a tuple.
+    ///
+    /// The required `TupleCall` bounds ensure this type can be passed either to the C function or
+    /// to the Rust function.
+    type RustArgs: Copy
+        + TupleCall<Self::RustFn, Output = Self::RustRet>
+        + TupleCall<Self::CFn, Output = Self::RustRet>
+        + RefUnwindSafe;
+
+    /// Type returned from the Rust function.
+    type RustRet: CheckOutput<Self::RustArgs>;
+
+    /// The name of this function, including suffix (e.g. `sin`, `sinf`).
+    const IDENTIFIER: Identifier;
+
+    /// The name as a string.
+    const NAME: &'static str = Self::IDENTIFIER.as_str();
+
+    /// The name of the function excluding the type suffix, e.g. `sin` and `sinf` are both `sin`.
+    const BASE_NAME: BaseName = Self::IDENTIFIER.base_name();
+
+    /// The function in `libm` which can be called.
+    const ROUTINE: Self::RustFn;
+
+    /// Whether or not the function is part of the public `libm` API.
+    const PUBLIC: bool;
+}
+
+/// Access the associated `FTy` type from an op (helper to avoid ambiguous associated types).
+pub type OpFTy<Op> = <Op as MathOp>::FTy;
+/// Access the associated `FTy::Int` type from an op (helper to avoid ambiguous associated types).
+pub type OpITy<Op> = <<Op as MathOp>::FTy as Float>::Int;
+/// Access the associated `CFn` type from an op (helper to avoid ambiguous associated types).
+pub type OpCFn<Op> = <Op as MathOp>::CFn;
+/// Access the associated `CRet` type from an op (helper to avoid ambiguous associated types).
+pub type OpCRet<Op> = <Op as MathOp>::CRet;
+/// Access the associated `RustFn` type from an op (helper to avoid ambiguous associated types).
+pub type OpRustFn<Op> = <Op as MathOp>::RustFn;
+/// Access the associated `RustArgs` type from an op (helper to avoid ambiguous associated types).
+pub type OpRustArgs<Op> = <Op as MathOp>::RustArgs;
+/// Access the associated `RustRet` type from an op (helper to avoid ambiguous associated types).
+pub type OpRustRet<Op> = <Op as MathOp>::RustRet;
+
+macro_rules! create_op_modules {
+    // Matcher for unary functions
+    (
+        fn_name: $fn_name:ident,
+        FTy: $FTy:ty,
+        CFn: $CFn:ty,
+        CArgs: $CArgs:ty,
+        CRet: $CRet:ty,
+        RustFn: $RustFn:ty,
+        RustArgs: $RustArgs:ty,
+        RustRet: $RustRet:ty,
+        public: $public:expr,
+        attrs: [$($attr:meta),*],
+    ) => {
+        paste::paste! {
+            $(#[$attr])*
+            pub mod $fn_name {
+                use super::*;
+                pub struct Routine;
+
+                impl MathOp for Routine {
+                    type FTy = $FTy;
+                    type CFn = for<'a> $CFn;
+                    type CArgs<'a> = $CArgs where Self: 'a;
+                    type CRet = $CRet;
+                    type RustFn = $RustFn;
+                    type RustArgs = $RustArgs;
+                    type RustRet = $RustRet;
+
+                    const IDENTIFIER: Identifier = Identifier::[< $fn_name:camel >];
+                    const ROUTINE: Self::RustFn = libm::$fn_name;
+                    const PUBLIC: bool = $public;
+                }
+            }
+
+        }
+    };
+}
+
+libm_macros::for_each_function! {
+    callback: create_op_modules,
+    emit_types: all,
+}
diff --git a/library/compiler-builtins/libm-test/src/precision.rs b/library/compiler-builtins/libm-test/src/precision.rs
new file mode 100644
index 00000000000..f5fb5f6707b
--- /dev/null
+++ b/library/compiler-builtins/libm-test/src/precision.rs
@@ -0,0 +1,573 @@
+//! Configuration for skipping or changing the result for individual test cases (inputs) rather
+//! than ignoring entire tests.
+
+use core::f32;
+
+use CheckBasis::{Mpfr, Musl};
+use libm::support::CastFrom;
+use {BaseName as Bn, Identifier as Id};
+
+use crate::{BaseName, CheckBasis, CheckCtx, Float, Identifier, Int, TestResult};
+
+/// Type implementing [`IgnoreCase`].
+pub struct SpecialCase;
+
+/// ULP allowed to differ from the results returned by a test basis.
+#[allow(clippy::single_match)]
+pub fn default_ulp(ctx: &CheckCtx) -> u32 {
+    // ULP compared to the infinite (MPFR) result.
+    let mut ulp = match ctx.base_name {
+        // Operations that require exact results. This list should correlate with what we
+        // have documented at <https://doc.rust-lang.org/std/primitive.f32.html>.
+        Bn::Ceil
+        | Bn::Copysign
+        | Bn::Fabs
+        | Bn::Fdim
+        | Bn::Floor
+        | Bn::Fma
+        | Bn::Fmax
+        | Bn::Fmaximum
+        | Bn::FmaximumNum
+        | Bn::Fmin
+        | Bn::Fminimum
+        | Bn::FminimumNum
+        | Bn::Fmod
+        | Bn::Frexp
+        | Bn::Ilogb
+        | Bn::Ldexp
+        | Bn::Modf
+        | Bn::Nextafter
+        | Bn::Remainder
+        | Bn::Remquo
+        | Bn::Rint
+        | Bn::Round
+        | Bn::Roundeven
+        | Bn::Scalbn
+        | Bn::Sqrt
+        | Bn::Trunc => 0,
+
+        // Operations that aren't required to be exact, but our implementations are.
+        Bn::Cbrt => 0,
+
+        // Bessel functions have large inaccuracies.
+        Bn::J0 | Bn::J1 | Bn::Y0 | Bn::Y1 | Bn::Jn | Bn::Yn => 8_000_000,
+
+        // For all other operations, specify our implementation's worst case precision.
+        Bn::Acos => 1,
+        Bn::Acosh => 4,
+        Bn::Asin => 1,
+        Bn::Asinh => 2,
+        Bn::Atan => 1,
+        Bn::Atan2 => 2,
+        Bn::Atanh => 2,
+        Bn::Cos => 1,
+        Bn::Cosh => 1,
+        Bn::Erf => 1,
+        Bn::Erfc => 4,
+        Bn::Exp => 1,
+        Bn::Exp10 => 6,
+        Bn::Exp2 => 1,
+        Bn::Expm1 => 1,
+        Bn::Hypot => 1,
+        Bn::Lgamma | Bn::LgammaR => 16,
+        Bn::Log => 1,
+        Bn::Log10 => 1,
+        Bn::Log1p => 1,
+        Bn::Log2 => 1,
+        Bn::Pow => 1,
+        Bn::Sin => 1,
+        Bn::Sincos => 1,
+        Bn::Sinh => 2,
+        Bn::Tan => 1,
+        Bn::Tanh => 2,
+        // tgammaf has higher accuracy than tgamma.
+        Bn::Tgamma if ctx.fn_ident != Id::Tgamma => 1,
+        Bn::Tgamma => 20,
+    };
+
+    // There are some cases where musl's approximation is less accurate than ours. For these
+    // cases, increase the ULP.
+    if ctx.basis == Musl {
+        match ctx.base_name {
+            Bn::Cosh => ulp = 2,
+            Bn::Exp10 if usize::BITS < 64 => ulp = 4,
+            Bn::Lgamma | Bn::LgammaR => ulp = 400,
+            Bn::Tanh => ulp = 4,
+            _ => (),
+        }
+
+        match ctx.fn_ident {
+            Id::Cbrt => ulp = 2,
+            // FIXME(#401): musl has an incorrect result here.
+            Id::Fdim => ulp = 2,
+            Id::Sincosf => ulp = 500,
+            Id::Tgamma => ulp = 20,
+            _ => (),
+        }
+    }
+
+    if cfg!(target_arch = "x86") {
+        match ctx.fn_ident {
+            // Input `fma(0.999999999999999, 1.0000000000000013, 0.0) = 1.0000000000000002` is
+            // incorrect on i586 and i686.
+            Id::Fma => ulp = 1,
+            _ => (),
+        }
+    }
+
+    // In some cases, our implementation is less accurate than musl on i586.
+    if cfg!(x86_no_sse) {
+        match ctx.fn_ident {
+            // FIXME(#401): these need to be correctly rounded but are not.
+            Id::Fmaf => ulp = 1,
+            Id::Fdim => ulp = 1,
+            Id::Round => ulp = 1,
+
+            Id::Asinh => ulp = 3,
+            Id::Asinhf => ulp = 3,
+            Id::Cbrt => ulp = 1,
+            Id::Exp10 | Id::Exp10f => ulp = 1_000_000,
+            Id::Exp2 | Id::Exp2f => ulp = 10_000_000,
+            Id::Log1p | Id::Log1pf => ulp = 2,
+            Id::Tan => ulp = 2,
+            _ => (),
+        }
+    }
+
+    ulp
+}
+
+/// Result of checking for possible overrides.
+#[derive(Debug, Default)]
+pub enum CheckAction {
+    /// The check should pass. Default case.
+    #[default]
+    AssertSuccess,
+
+    /// Override the ULP for this check.
+    AssertWithUlp(u32),
+
+    /// Failure is expected, ensure this is the case (xfail). Takes a context string to help trace
+    /// back exactly why we expect this to fail.
+    AssertFailure(&'static str),
+
+    /// The override somehow validated the result, here it is.
+    Custom(TestResult),
+
+    /// Disregard the output.
+    Skip,
+}
+
+/// Don't run further validation on this test case.
+const SKIP: CheckAction = CheckAction::Skip;
+
+/// Return this to skip checks on a test that currently fails but shouldn't. Takes a description
+/// of context.
+const XFAIL: fn(&'static str) -> CheckAction = CheckAction::AssertFailure;
+
+/// Indicates that we expect a test to fail but we aren't asserting that it does (e.g. some results
+/// within a range do actually pass).
+///
+/// Same as `SKIP`, just indicates we have something to eventually fix.
+const XFAIL_NOCHECK: CheckAction = CheckAction::Skip;
+
+/// By default, all tests should pass.
+const DEFAULT: CheckAction = CheckAction::AssertSuccess;
+
+/// Allow overriding the outputs of specific test cases.
+///
+/// There are some cases where we want to xfail specific cases or handle certain inputs
+/// differently than the rest of calls to `validate`. This provides a hook to do that.
+///
+/// Each hook returns a [`CheckAction`]. `AssertSuccess` (the default) lets the usual checks
+/// proceed; the other variants can override the allowed ULP for the case, mark it as an
+/// expected failure, skip it entirely, or substitute a custom result.
+///
+/// This gets implemented once per input type, then the functions provide further filtering
+/// based on function name and values.
+pub trait MaybeOverride<Input> {
+    fn check_float<F: Float>(
+        _input: Input,
+        _actual: F,
+        _expected: F,
+        _ctx: &CheckCtx,
+    ) -> CheckAction {
+        DEFAULT
+    }
+
+    fn check_int<I: Int>(_input: Input, _actual: I, _expected: I, _ctx: &CheckCtx) -> CheckAction {
+        DEFAULT
+    }
+}
+
+#[cfg(f16_enabled)]
+impl MaybeOverride<(f16,)> for SpecialCase {}
+
+impl MaybeOverride<(f32,)> for SpecialCase {
+    fn check_float<F: Float>(input: (f32,), actual: F, expected: F, ctx: &CheckCtx) -> CheckAction {
+        if ctx.base_name == BaseName::Expm1
+            && !input.0.is_infinite()
+            && input.0 > 80.0
+            && actual.is_infinite()
+            && !expected.is_infinite()
+        {
+            // we return infinity but the number is representable
+            if ctx.basis == CheckBasis::Musl {
+                return XFAIL_NOCHECK;
+            }
+            return XFAIL("expm1 representable numbers");
+        }
+
+        if cfg!(x86_no_sse)
+            && ctx.base_name == BaseName::Exp2
+            && !expected.is_infinite()
+            && actual.is_infinite()
+        {
+            // We return infinity when there is a representable value. Test input: 127.97238
+            return XFAIL("586 exp2 representable numbers");
+        }
+
+        if ctx.base_name == BaseName::Sinh && input.0.abs() > 80.0 && actual.is_nan() {
+            // We return NaN for some inputs whose results should be finite or infinite.
+            if ctx.basis == CheckBasis::Musl {
+                return XFAIL_NOCHECK;
+            }
+            return XFAIL("sinh unexpected NaN");
+        }
+
+        if (ctx.base_name == BaseName::Lgamma || ctx.base_name == BaseName::LgammaR)
+            && input.0 > 4e36
+            && expected.is_infinite()
+            && !actual.is_infinite()
+        {
+            // This result should saturate but we return a finite value.
+            return XFAIL_NOCHECK;
+        }
+
+        if ctx.base_name == BaseName::J0 && input.0 < -1e34 {
+            // Errors get huge close to -inf
+            return XFAIL_NOCHECK;
+        }
+
+        unop_common(input, actual, expected, ctx)
+    }
+
+    fn check_int<I: Int>(input: (f32,), actual: I, expected: I, ctx: &CheckCtx) -> CheckAction {
+        // For `lgammaf_r`, we return -1 as the integer result for negative infinity, but MPFR
+        // returns +1.
+        if ctx.basis == CheckBasis::Mpfr
+            && ctx.base_name == BaseName::LgammaR
+            && input.0 == f32::NEG_INFINITY
+            && actual.abs() == expected.abs()
+        {
+            return XFAIL("lgammar integer result");
+        }
+
+        DEFAULT
+    }
+}
+
+impl MaybeOverride<(f64,)> for SpecialCase {
+    fn check_float<F: Float>(input: (f64,), actual: F, expected: F, ctx: &CheckCtx) -> CheckAction {
+        if cfg!(x86_no_sse)
+            && ctx.base_name == BaseName::Ceil
+            && ctx.basis == CheckBasis::Musl
+            && input.0 < 0.0
+            && input.0 > -1.0
+            && expected == F::ZERO
+            && actual == F::ZERO
+        {
+            // musl returns -0.0, we return +0.0
+            return XFAIL("i586 ceil signed zero");
+        }
+
+        if cfg!(x86_no_sse)
+            && (ctx.base_name == BaseName::Rint || ctx.base_name == BaseName::Roundeven)
+            && (expected - actual).abs() <= F::ONE
+            && (expected - actual).abs() > F::ZERO
+        {
+            // Our rounding mode is incorrect.
+            return XFAIL("i586 rint rounding mode");
+        }
+
+        if cfg!(x86_no_sse)
+            && (ctx.fn_ident == Identifier::Ceil || ctx.fn_ident == Identifier::Floor)
+            && expected.eq_repr(F::NEG_ZERO)
+            && actual.eq_repr(F::ZERO)
+        {
+            // FIXME: the x87 implementations do not keep the distinction between -0.0 and 0.0.
+            // See https://github.com/rust-lang/libm/pull/404#issuecomment-2572399955
+            return XFAIL("i586 ceil/floor signed zero");
+        }
+
+        if cfg!(x86_no_sse)
+            && (ctx.fn_ident == Identifier::Exp10 || ctx.fn_ident == Identifier::Exp2)
+        {
+            // FIXME: i586 has very imprecise results with ULP > u32::MAX for these
+            // operations so we can't reasonably provide a limit.
+            return XFAIL_NOCHECK;
+        }
+
+        if ctx.base_name == BaseName::J0 && input.0 < -1e300 {
+            // Errors get huge close to -inf
+            return XFAIL_NOCHECK;
+        }
+
+        // maybe_check_nan_bits(actual, expected, ctx)
+        unop_common(input, actual, expected, ctx)
+    }
+
+    fn check_int<I: Int>(input: (f64,), actual: I, expected: I, ctx: &CheckCtx) -> CheckAction {
+        // For `lgamma_r`, we return -1 as the integer result for negative infinity, but MPFR
+        // returns +1.
+        if ctx.basis == CheckBasis::Mpfr
+            && ctx.base_name == BaseName::LgammaR
+            && input.0 == f64::NEG_INFINITY
+            && actual.abs() == expected.abs()
+        {
+            return XFAIL("lgammar integer result");
+        }
+
+        DEFAULT
+    }
+}
+
+#[cfg(f128_enabled)]
+impl MaybeOverride<(f128,)> for SpecialCase {}
+
+// F1 and F2 are always the same type, this is just to please generics
+fn unop_common<F1: Float, F2: Float>(
+    input: (F1,),
+    actual: F2,
+    expected: F2,
+    ctx: &CheckCtx,
+) -> CheckAction {
+    if ctx.base_name == BaseName::Acosh
+        && input.0 < F1::NEG_ONE
+        && !(expected.is_nan() && actual.is_nan())
+    {
+        // acosh is undefined for x < 1.0, but we return an arbitrary result for inputs below
+        // the domain.
+
+        if ctx.basis == CheckBasis::Musl {
+            return XFAIL_NOCHECK;
+        }
+
+        return XFAIL("acoshf undefined");
+    }
+
+    if (ctx.base_name == BaseName::Lgamma || ctx.base_name == BaseName::LgammaR)
+        && input.0 < F1::ZERO
+        && !input.0.is_infinite()
+    {
+        // loggamma should not be defined for x < 0, yet both we and the basis return results
+        return XFAIL_NOCHECK;
+    }
+
+    // fabs and copysign must leave NaNs untouched.
+    if ctx.base_name == BaseName::Fabs && input.0.is_nan() {
+        // LLVM currently uses x87 instructions, which quiet signaling NaNs, to handle the i686
+        // `extern "C"` `f32`/`f64` return ABI.
+        // LLVM issue <https://github.com/llvm/llvm-project/issues/66803>
+        // Rust issue <https://github.com/rust-lang/rust/issues/115567>
+        if cfg!(target_arch = "x86") && ctx.basis == CheckBasis::Musl && actual.is_nan() {
+            return XFAIL_NOCHECK;
+        }
+
+        // MPFR only has one NaN bitpattern; allow the default `.is_nan()` checks to validate.
+        if ctx.basis == CheckBasis::Mpfr {
+            return DEFAULT;
+        }
+
+        // abs and copysign require signaling NaNs to be propagated, so verify bit equality.
+        if actual.to_bits() == expected.to_bits() {
+            return CheckAction::Custom(Ok(()));
+        } else {
+            return CheckAction::Custom(Err(anyhow::anyhow!("NaNs have different bitpatterns")));
+        }
+    }
+
+    DEFAULT
+}
+
+#[cfg(f16_enabled)]
+impl MaybeOverride<(f16, f16)> for SpecialCase {
+    fn check_float<F: Float>(
+        input: (f16, f16),
+        actual: F,
+        expected: F,
+        ctx: &CheckCtx,
+    ) -> CheckAction {
+        binop_common(input, actual, expected, ctx)
+    }
+}
+
+impl MaybeOverride<(f32, f32)> for SpecialCase {
+    fn check_float<F: Float>(
+        input: (f32, f32),
+        actual: F,
+        expected: F,
+        ctx: &CheckCtx,
+    ) -> CheckAction {
+        binop_common(input, actual, expected, ctx)
+    }
+}
+
+impl MaybeOverride<(f64, f64)> for SpecialCase {
+    fn check_float<F: Float>(
+        input: (f64, f64),
+        actual: F,
+        expected: F,
+        ctx: &CheckCtx,
+    ) -> CheckAction {
+        binop_common(input, actual, expected, ctx)
+    }
+}
+
+#[cfg(f128_enabled)]
+impl MaybeOverride<(f128, f128)> for SpecialCase {
+    fn check_float<F: Float>(
+        input: (f128, f128),
+        actual: F,
+        expected: F,
+        ctx: &CheckCtx,
+    ) -> CheckAction {
+        binop_common(input, actual, expected, ctx)
+    }
+}
+
+// F1 and F2 are always the same type, this is just to please generics
+fn binop_common<F1: Float, F2: Float>(
+    input: (F1, F1),
+    actual: F2,
+    expected: F2,
+    ctx: &CheckCtx,
+) -> CheckAction {
+    // MPFR only has one NaN bitpattern; allow the default `.is_nan()` checks to validate. Skip if
+    // the first input (magnitude source) is NaN and the output is also a NaN, or if the second
+    // input (sign source) is NaN.
+    if ctx.basis == CheckBasis::Mpfr
+        && ((input.0.is_nan() && actual.is_nan() && expected.is_nan()) || input.1.is_nan())
+    {
+        return SKIP;
+    }
+
+    /* FIXME(#439): our fmin and fmax do not compare signed zeros */
+
+    if ctx.base_name == BaseName::Fmin
+        && input.0.biteq(F1::NEG_ZERO)
+        && input.1.biteq(F1::ZERO)
+        && expected.biteq(F2::NEG_ZERO)
+        && actual.biteq(F2::ZERO)
+    {
+        return XFAIL("fmin signed zeroes");
+    }
+
+    if ctx.base_name == BaseName::Fmax
+        && input.0.biteq(F1::NEG_ZERO)
+        && input.1.biteq(F1::ZERO)
+        && expected.biteq(F2::ZERO)
+        && actual.biteq(F2::NEG_ZERO)
+    {
+        return XFAIL("fmax signed zeroes");
+    }
+
+    // Musl propagates NaNs if one is provided as the input, but we return the other input.
+    if (ctx.base_name == BaseName::Fmax || ctx.base_name == BaseName::Fmin)
+        && ctx.basis == Musl
+        && (input.0.is_nan() ^ input.1.is_nan())
+        && expected.is_nan()
+    {
+        return XFAIL("fmax/fmin musl NaN");
+    }
+
+    DEFAULT
+}
+
+impl MaybeOverride<(i32, f32)> for SpecialCase {
+    fn check_float<F: Float>(
+        input: (i32, f32),
+        actual: F,
+        expected: F,
+        ctx: &CheckCtx,
+    ) -> CheckAction {
+        // `ynf(213, 109.15641) = -inf` with our library, should be finite.
+        if ctx.basis == Mpfr
+            && ctx.base_name == BaseName::Yn
+            && input.0 > 200
+            && !expected.is_infinite()
+            && actual.is_infinite()
+        {
+            return XFAIL("ynf infinity mismatch");
+        }
+
+        int_float_common(input, actual, expected, ctx)
+    }
+}
+
+impl MaybeOverride<(i32, f64)> for SpecialCase {
+    fn check_float<F: Float>(
+        input: (i32, f64),
+        actual: F,
+        expected: F,
+        ctx: &CheckCtx,
+    ) -> CheckAction {
+        int_float_common(input, actual, expected, ctx)
+    }
+}
+
+fn int_float_common<F1: Float, F2: Float>(
+    input: (i32, F1),
+    actual: F2,
+    expected: F2,
+    ctx: &CheckCtx,
+) -> CheckAction {
+    if ctx.basis == Mpfr
+        && (ctx.base_name == BaseName::Jn || ctx.base_name == BaseName::Yn)
+        && input.1 == F1::NEG_INFINITY
+        && actual == F2::ZERO
+        && expected == F2::ZERO
+    {
+        return XFAIL("we disagree with MPFR on the sign of zero");
+    }
+
+    // Values near infinity sometimes get cut off for us. `ynf(681, 509.90924) = -inf` but should
+    // be -3.2161271e38.
+    if ctx.basis == Musl
+        && ctx.fn_ident == Identifier::Ynf
+        && !expected.is_infinite()
+        && actual.is_infinite()
+        && (expected.abs().to_bits().abs_diff(actual.abs().to_bits())
+            < F2::Int::cast_from(10_000_000u32))
+    {
+        return XFAIL_NOCHECK;
+    }
+
+    // Our bessel functions blow up with large N values
+    if ctx.basis == Musl && (ctx.base_name == BaseName::Jn || ctx.base_name == BaseName::Yn) {
+        if cfg!(x86_no_sse) {
+            // Precision is especially bad on i586, not worth checking.
+            return XFAIL_NOCHECK;
+        }
+
+        if input.0 > 4000 {
+            return XFAIL_NOCHECK;
+        } else if input.0 > 100 {
+            return CheckAction::AssertWithUlp(1_000_000);
+        }
+    }
+    DEFAULT
+}
+
+#[cfg(f16_enabled)]
+impl MaybeOverride<(f16, i32)> for SpecialCase {}
+impl MaybeOverride<(f32, i32)> for SpecialCase {}
+impl MaybeOverride<(f64, i32)> for SpecialCase {}
+#[cfg(f128_enabled)]
+impl MaybeOverride<(f128, i32)> for SpecialCase {}
+
+impl MaybeOverride<(f32, f32, f32)> for SpecialCase {}
+impl MaybeOverride<(f64, f64, f64)> for SpecialCase {}
+#[cfg(f128_enabled)]
+impl MaybeOverride<(f128, f128, f128)> for SpecialCase {}
diff --git a/library/compiler-builtins/libm-test/src/run_cfg.rs b/library/compiler-builtins/libm-test/src/run_cfg.rs
new file mode 100644
index 00000000000..3345a01d2de
--- /dev/null
+++ b/library/compiler-builtins/libm-test/src/run_cfg.rs
@@ -0,0 +1,385 @@
+//! Configuration for how tests get run.
+
+use std::ops::RangeInclusive;
+use std::sync::LazyLock;
+use std::{env, str};
+
+use crate::generate::random::{SEED, SEED_ENV};
+use crate::{BaseName, FloatTy, Identifier, test_log};
+
+/// The environment variable indicating which extensive tests should be run.
+pub const EXTENSIVE_ENV: &str = "LIBM_EXTENSIVE_TESTS";
+
+/// Specify the number of iterations via this environment variable, rather than using the default.
+pub const EXTENSIVE_ITER_ENV: &str = "LIBM_EXTENSIVE_ITERATIONS";
+
+/// The override value, if set by the above environment variable.
+static EXTENSIVE_ITER_OVERRIDE: LazyLock<Option<u64>> = LazyLock::new(|| {
+    env::var(EXTENSIVE_ITER_ENV)
+        .map(|v| v.parse().expect("failed to parse iteration count"))
+        .ok()
+});
+
+/// Specific tests that need a reduced number of iterations to complete in a reasonable
+/// amount of time.
+///
+/// Contains the identifier+generator combo to match on, plus the factor to reduce by.
+const EXTREMELY_SLOW_TESTS: &[(Identifier, GeneratorKind, u64)] = &[
+    (Identifier::Fmodf128, GeneratorKind::QuickSpaced, 50),
+    (Identifier::Fmodf128, GeneratorKind::Extensive, 50),
+];
+
+/// Maximum number of iterations to run for a single routine.
+///
+/// The default value of one greater than `u32::MAX` allows testing single-argument `f32` routines
+/// and single- or double-argument `f16` routines exhaustively. `f64` and `f128` can't feasibly
+/// be tested exhaustively; however, [`EXTENSIVE_ITER_ENV`] can be set to run tests for multiple
+/// hours.
+pub fn extensive_max_iterations() -> u64 {
+    let default = 1 << 32; // 2^32, one greater than `u32::MAX`
+    EXTENSIVE_ITER_OVERRIDE.unwrap_or(default)
+}
+
+/// Context passed to [`CheckOutput`].
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub struct CheckCtx {
+    /// Allowed ULP deviation
+    pub ulp: u32,
+    pub fn_ident: Identifier,
+    pub base_name: BaseName,
+    /// Function name.
+    pub fn_name: &'static str,
+    /// The unsuffixed version of the function name.
+    pub base_name_str: &'static str,
+    /// Source of truth for tests.
+    pub basis: CheckBasis,
+    pub gen_kind: GeneratorKind,
+    /// If specified, this value will override the value returned by [`iteration_count`].
+    pub override_iterations: Option<u64>,
+}
+
+impl CheckCtx {
+    /// Create a new check context, using the default ULP for the function.
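+    ///
+    /// A rough sketch of constructing a context (the identifier variant and resulting name are
+    /// illustrative only, not a doctest):
+    ///
+    /// ```ignore
+    /// let ctx = CheckCtx::new(Identifier::Sinf, CheckBasis::Mpfr, GeneratorKind::QuickSpaced);
+    /// // `ulp` is initialized from `default_ulp` for the chosen function and basis.
+    /// assert_eq!(ctx.fn_name, "sinf");
+    /// ```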
+    pub fn new(fn_ident: Identifier, basis: CheckBasis, gen_kind: GeneratorKind) -> Self {
+        let mut ret = Self {
+            ulp: 0,
+            fn_ident,
+            fn_name: fn_ident.as_str(),
+            base_name: fn_ident.base_name(),
+            base_name_str: fn_ident.base_name().as_str(),
+            basis,
+            gen_kind,
+            override_iterations: None,
+        };
+        ret.ulp = crate::default_ulp(&ret);
+        ret
+    }
+
+    /// The number of input arguments for this function.
+    pub fn input_count(&self) -> usize {
+        self.fn_ident.math_op().rust_sig.args.len()
+    }
+
+    pub fn override_iterations(&mut self, count: u64) {
+        self.override_iterations = Some(count)
+    }
+}
+
+/// Possible items to test against
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub enum CheckBasis {
+    /// Check against Musl's math sources.
+    Musl,
+    /// Check against infinite precision (MPFR).
+    Mpfr,
+    /// Benchmarks or other times when this is not relevant.
+    None,
+}
+
+/// The different kinds of generators that provide test input, which account for input pattern
+/// and quantity.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum GeneratorKind {
+    EdgeCases,
+    Extensive,
+    QuickSpaced,
+    Random,
+    List,
+}
+
+/// A list of all functions that should get extensive tests.
+///
+/// This also supports the special test name `all` to run all tests, as well as `all_f16`,
+/// `all_f32`, `all_f64`, and `all_f128` to run all tests for a specific float type.
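+///
+/// For example, `LIBM_EXTENSIVE_TESTS=all_f16,sinf,cosf` would select the full `f16` suite plus
+/// the two listed `f32` routines (the function names here are illustrative).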
+static EXTENSIVE: LazyLock<Vec<Identifier>> = LazyLock::new(|| {
+    let var = env::var(EXTENSIVE_ENV).unwrap_or_default();
+    let list = var.split(",").filter(|s| !s.is_empty()).collect::<Vec<_>>();
+    let mut ret = Vec::new();
+
+    let append_ty_ops = |ret: &mut Vec<_>, fty: FloatTy| {
+        let iter = Identifier::ALL
+            .iter()
+            .filter(move |id| id.math_op().float_ty == fty)
+            .copied();
+        ret.extend(iter);
+    };
+
+    for item in list {
+        match item {
+            "all" => ret = Identifier::ALL.to_owned(),
+            "all_f16" => append_ty_ops(&mut ret, FloatTy::F16),
+            "all_f32" => append_ty_ops(&mut ret, FloatTy::F32),
+            "all_f64" => append_ty_ops(&mut ret, FloatTy::F64),
+            "all_f128" => append_ty_ops(&mut ret, FloatTy::F128),
+            s => {
+                let id = Identifier::from_str(s)
+                    .unwrap_or_else(|| panic!("unrecognized test name `{s}`"));
+                ret.push(id);
+            }
+        }
+    }
+
+    ret
+});
+
+/// Information about the function to be tested.
+#[derive(Debug)]
+struct TestEnv {
+    /// Tests should be reduced because the platform is slow. E.g. 32-bit or emulated.
+    slow_platform: bool,
+    /// The float cannot be tested exhaustively, `f64` or `f128`.
+    large_float_ty: bool,
+    /// Env indicates that an extensive test should be run.
+    should_run_extensive: bool,
+    /// Multiprecision tests will be run.
+    mp_tests_enabled: bool,
+    /// The number of inputs to the function.
+    input_count: usize,
+}
+
+impl TestEnv {
+    fn from_env(ctx: &CheckCtx) -> Self {
+        let id = ctx.fn_ident;
+        let op = id.math_op();
+
+        let will_run_mp = cfg!(feature = "build-mpfr");
+        let large_float_ty = match op.float_ty {
+            FloatTy::F16 | FloatTy::F32 => false,
+            FloatTy::F64 | FloatTy::F128 => true,
+        };
+
+        let will_run_extensive = EXTENSIVE.contains(&id);
+
+        let input_count = op.rust_sig.args.len();
+
+        Self {
+            slow_platform: slow_platform(),
+            large_float_ty,
+            should_run_extensive: will_run_extensive,
+            mp_tests_enabled: will_run_mp,
+            input_count,
+        }
+    }
+}
+
+/// Tests are pretty slow on non-64-bit targets, x86_64 macOS, and targets that run in QEMU. Start
+/// with a reduced number on these platforms.
+fn slow_platform() -> bool {
+    let slow_on_ci = crate::emulated()
+        || usize::BITS < 64
+        || cfg!(all(target_arch = "x86_64", target_vendor = "apple"));
+
+    // If not running in CI, there is no need to reduce iteration count.
+    slow_on_ci && crate::ci()
+}
+
+/// The number of iterations to run for a given test.
+pub fn iteration_count(ctx: &CheckCtx, argnum: usize) -> u64 {
+    let t_env = TestEnv::from_env(ctx);
+
+    // Aim for roughly 4M tests by default.
+    let mut domain_iter_count: u64 = 4_000_000;
+
+    // Start with a reduced number of tests on slow platforms.
+    if t_env.slow_platform {
+        domain_iter_count = 100_000;
+    }
+
+    // If we will be running tests against MPFR, we don't need to test as much against musl.
+    // However, there are some platforms where we have to test against musl since MPFR can't be
+    // built.
+    if t_env.mp_tests_enabled && ctx.basis == CheckBasis::Musl {
+        domain_iter_count /= 100;
+    }
+
+    // Run fewer random tests than domain tests.
+    let random_iter_count = domain_iter_count / 100;
+
+    let mut total_iterations = match ctx.gen_kind {
+        GeneratorKind::QuickSpaced => domain_iter_count,
+        GeneratorKind::Random => random_iter_count,
+        GeneratorKind::Extensive => extensive_max_iterations(),
+        GeneratorKind::EdgeCases | GeneratorKind::List => {
+            unimplemented!("shouldn't need `iteration_count` for {:?}", ctx.gen_kind)
+        }
+    };
+
+    // Larger float types get more iterations. Extensive tests are excluded here since they
+    // already have a high iteration count.
+    if t_env.large_float_ty && ctx.gen_kind != GeneratorKind::Extensive {
+        total_iterations *= 4;
+    }
+
+    // Functions with more arguments get more iterations.
+    let arg_multiplier = 1 << (t_env.input_count - 1);
+    total_iterations *= arg_multiplier;
+
+    // FMA has a huge domain but is reasonably fast to run, so increase another 1.5x.
+    if ctx.base_name == BaseName::Fma {
+        total_iterations = 3 * total_iterations / 2;
+    }
+
+    // Some tests are significantly slower than others and need to be further reduced.
+    if let Some((_id, _gen, scale)) = EXTREMELY_SLOW_TESTS
+        .iter()
+        .find(|(id, generator, _scale)| *id == ctx.fn_ident && *generator == ctx.gen_kind)
+    {
+        // However, do not override if the extensive iteration count has been manually set.
+        if !(ctx.gen_kind == GeneratorKind::Extensive && EXTENSIVE_ITER_OVERRIDE.is_some()) {
+            total_iterations /= scale;
+        }
+    }
+
+    if cfg!(optimizations_enabled) {
+        // Always run at least 10,000 tests.
+        total_iterations = total_iterations.max(10_000);
+    } else {
+        // Without optimizations, just run a quick check regardless of other parameters.
+        total_iterations = 800;
+    }
+
+    let mut overridden = false;
+    if let Some(count) = ctx.override_iterations {
+        total_iterations = count;
+        overridden = true;
+    }
+
+    // Adjust for the number of inputs: each argument gets `ntests` values and the cases are
+    // combined, so take the k-th root to keep the combined total near `total_iterations`.
+    let ntests = match t_env.input_count {
+        1 => total_iterations,
+        2 => (total_iterations as f64).sqrt().ceil() as u64,
+        3 => (total_iterations as f64).cbrt().ceil() as u64,
+        _ => panic!("test has more than three arguments"),
+    };
+
+    let total = ntests.pow(t_env.input_count.try_into().unwrap());
+
+    let seed_msg = match ctx.gen_kind {
+        GeneratorKind::QuickSpaced | GeneratorKind::Extensive => String::new(),
+        GeneratorKind::Random => {
+            format!(
+                " using `{SEED_ENV}={}`",
+                str::from_utf8(SEED.as_slice()).unwrap()
+            )
+        }
+        GeneratorKind::EdgeCases | GeneratorKind::List => unimplemented!(),
+    };
+
+    test_log(&format!(
+        "{gen_kind:?} {basis:?} {fn_ident} arg {arg}/{args}: {ntests} iterations \
+         ({total} total){seed_msg}{omsg}",
+        gen_kind = ctx.gen_kind,
+        basis = ctx.basis,
+        fn_ident = ctx.fn_ident,
+        arg = argnum + 1,
+        args = t_env.input_count,
+        omsg = if overridden { " (overridden)" } else { "" }
+    ));
+
+    ntests
+}
+
+/// Some tests require that an integer be kept within reasonable limits; generate that here.
+pub fn int_range(ctx: &CheckCtx, argnum: usize) -> RangeInclusive<i32> {
+    let t_env = TestEnv::from_env(ctx);
+
+    if !matches!(ctx.base_name, BaseName::Jn | BaseName::Yn) {
+        return i32::MIN..=i32::MAX;
+    }
+
+    assert_eq!(
+        argnum, 0,
+        "For `jn`/`yn`, only the first argument takes an integer"
+    );
+
+    // The integer argument to `jn` is an iteration count. Limit this to ensure tests can be
+    // completed in a reasonable amount of time.
+    let non_extensive_range = if t_env.slow_platform || !cfg!(optimizations_enabled) {
+        (-0xf)..=0xff
+    } else {
+        (-0xff)..=0xffff
+    };
+
+    let extensive_range = (-0xfff)..=0xfffff;
+
+    match ctx.gen_kind {
+        GeneratorKind::Extensive => extensive_range,
+        GeneratorKind::QuickSpaced | GeneratorKind::Random => non_extensive_range,
+        GeneratorKind::EdgeCases => extensive_range,
+        GeneratorKind::List => unimplemented!("shouldn't need range for {:?}", ctx.gen_kind),
+    }
+}
+
+/// For domain tests, limit how many asymptotes or specified check points we test.
+pub fn check_point_count(ctx: &CheckCtx) -> usize {
+    assert_eq!(
+        ctx.gen_kind,
+        GeneratorKind::EdgeCases,
+        "check_point_count is intended for edge case tests"
+    );
+    let t_env = TestEnv::from_env(ctx);
+    if t_env.slow_platform || !cfg!(optimizations_enabled) {
+        4
+    } else {
+        10
+    }
+}
+
+/// When validating points of interest (e.g. asymptotes, inflection points, extremes), also check
+/// this many surrounding values.
+pub fn check_near_count(ctx: &CheckCtx) -> u64 {
+    assert_eq!(
+        ctx.gen_kind,
+        GeneratorKind::EdgeCases,
+        "check_near_count is intended for edge case tests"
+    );
+    if cfg!(optimizations_enabled) {
+        // Taper based on the number of inputs.
+        match ctx.input_count() {
+            1 | 2 => 100,
+            3 => 50,
+            x => panic!("unexpected argument count {x}"),
+        }
+    } else {
+        8
+    }
+}
+
+/// Check whether extensive actions should be run or skipped.
+pub fn skip_extensive_test(ctx: &CheckCtx) -> bool {
+    let t_env = TestEnv::from_env(ctx);
+    !t_env.should_run_extensive
+}
+
+/// The number of iterations to run for `u256` fuzz tests.
+pub fn bigint_fuzz_iteration_count() -> u64 {
+    if !cfg!(optimizations_enabled) {
+        return 1000;
+    }
+
+    if slow_platform() { 100_000 } else { 5_000_000 }
+}
diff --git a/library/compiler-builtins/libm-test/src/test_traits.rs b/library/compiler-builtins/libm-test/src/test_traits.rs
new file mode 100644
index 00000000000..dbb97016153
--- /dev/null
+++ b/library/compiler-builtins/libm-test/src/test_traits.rs
@@ -0,0 +1,453 @@
+//! Traits related to testing.
+//!
+//! There are two main traits in this module:
+//!
+//! - `TupleCall`: implemented on tuples to allow calling them as function arguments.
+//! - `CheckOutput`: implemented on anything that is an output type for validation against an
+//!   expected value.
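+//!
+//! As a rough usage sketch (the function and values are illustrative only; `ctx` stands in for a
+//! `CheckCtx` constructed elsewhere):
+//!
+//! ```ignore
+//! let input = (4.0f32,);
+//! let actual = input.call(libm::sqrtf as fn(f32) -> f32);
+//! actual.validate(2.0f32, input, &ctx).unwrap();
+//! ```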
+
+use std::panic::{RefUnwindSafe, UnwindSafe};
+use std::{fmt, panic};
+
+use anyhow::{Context, anyhow, bail, ensure};
+use libm::support::Hexf;
+
+use crate::precision::CheckAction;
+use crate::{
+    CheckBasis, CheckCtx, Float, GeneratorKind, Int, MaybeOverride, SpecialCase, TestResult,
+};
+
+/// Trait for calling a function with a tuple as arguments.
+///
+/// Implemented on the tuple with the function signature as the generic (so we can use the same
+/// tuple for multiple signatures).
+pub trait TupleCall<Func>: fmt::Debug {
+    type Output;
+    fn call(self, f: Func) -> Self::Output;
+
+    /// Intercept panics, print the input to stderr, then resume unwinding.
+    fn call_intercept_panics(self, f: Func) -> Self::Output
+    where
+        Self: RefUnwindSafe + Copy,
+        Func: UnwindSafe,
+    {
+        let res = panic::catch_unwind(|| self.call(f));
+        match res {
+            Ok(v) => v,
+            Err(e) => {
+                eprintln!("panic with the following input: {self:?}");
+                panic::resume_unwind(e)
+            }
+        }
+    }
+}
+
+/// A trait to implement on any output type so we can verify it in a generic way.
+pub trait CheckOutput<Input>: Sized {
+    /// Validate `self` (actual) and `expected` are the same.
+    ///
+    /// `input` is only used here for error messages.
+    fn validate(self, expected: Self, input: Input, ctx: &CheckCtx) -> TestResult;
+}
+
+/// A helper trait to print something as hex with the correct number of nibbles, e.g. a `u32`
+/// will always print with `0x` followed by 8 digits.
+///
+/// This is only used for printing errors so allocating is okay.
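+///
+/// As a rough illustration (these strings describe the intended formatting, not a tested
+/// guarantee): `0x12u32.hex()` yields `"0x00000012"` and `1.0f32.hex()` yields `"0x3f800000"`,
+/// so mismatched values line up column-wise in error output.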
+pub trait Hex: Copy {
+    /// Hex integer syntax.
+    fn hex(self) -> String;
+    /// Hex float syntax.
+    fn hexf(self) -> String;
+}
+
+/* implement `TupleCall` */
+
+impl<T1, R> TupleCall<fn(T1) -> R> for (T1,)
+where
+    T1: fmt::Debug,
+{
+    type Output = R;
+
+    fn call(self, f: fn(T1) -> R) -> Self::Output {
+        f(self.0)
+    }
+}
+
+impl<T1, T2, R> TupleCall<fn(T1, T2) -> R> for (T1, T2)
+where
+    T1: fmt::Debug,
+    T2: fmt::Debug,
+{
+    type Output = R;
+
+    fn call(self, f: fn(T1, T2) -> R) -> Self::Output {
+        f(self.0, self.1)
+    }
+}
+
+impl<T1, T2, R> TupleCall<fn(T1, &mut T2) -> R> for (T1,)
+where
+    T1: fmt::Debug,
+    T2: fmt::Debug + Default,
+{
+    type Output = (R, T2);
+
+    fn call(self, f: fn(T1, &mut T2) -> R) -> Self::Output {
+        let mut t2 = T2::default();
+        (f(self.0, &mut t2), t2)
+    }
+}
+
+impl<T1, T2, T3, R> TupleCall<fn(T1, T2, T3) -> R> for (T1, T2, T3)
+where
+    T1: fmt::Debug,
+    T2: fmt::Debug,
+    T3: fmt::Debug,
+{
+    type Output = R;
+
+    fn call(self, f: fn(T1, T2, T3) -> R) -> Self::Output {
+        f(self.0, self.1, self.2)
+    }
+}
+
+impl<T1, T2, T3, R> TupleCall<fn(T1, T2, &mut T3) -> R> for (T1, T2)
+where
+    T1: fmt::Debug,
+    T2: fmt::Debug,
+    T3: fmt::Debug + Default,
+{
+    type Output = (R, T3);
+
+    fn call(self, f: fn(T1, T2, &mut T3) -> R) -> Self::Output {
+        let mut t3 = T3::default();
+        (f(self.0, self.1, &mut t3), t3)
+    }
+}
+
+impl<T1, T2, T3> TupleCall<for<'a> fn(T1, &'a mut T2, &'a mut T3)> for (T1,)
+where
+    T1: fmt::Debug,
+    T2: fmt::Debug + Default,
+    T3: fmt::Debug + Default,
+{
+    type Output = (T2, T3);
+
+    fn call(self, f: for<'a> fn(T1, &'a mut T2, &'a mut T3)) -> Self::Output {
+        let mut t2 = T2::default();
+        let mut t3 = T3::default();
+        f(self.0, &mut t2, &mut t3);
+        (t2, t3)
+    }
+}
+
+/* implement `Hex` */
+
+impl<T1> Hex for (T1,)
+where
+    T1: Hex,
+{
+    fn hex(self) -> String {
+        format!("({},)", self.0.hex())
+    }
+
+    fn hexf(self) -> String {
+        format!("({},)", self.0.hexf())
+    }
+}
+
+impl<T1, T2> Hex for (T1, T2)
+where
+    T1: Hex,
+    T2: Hex,
+{
+    fn hex(self) -> String {
+        format!("({}, {})", self.0.hex(), self.1.hex())
+    }
+
+    fn hexf(self) -> String {
+        format!("({}, {})", self.0.hexf(), self.1.hexf())
+    }
+}
+
+impl<T1, T2, T3> Hex for (T1, T2, T3)
+where
+    T1: Hex,
+    T2: Hex,
+    T3: Hex,
+{
+    fn hex(self) -> String {
+        format!("({}, {}, {})", self.0.hex(), self.1.hex(), self.2.hex())
+    }
+
+    fn hexf(self) -> String {
+        format!("({}, {}, {})", self.0.hexf(), self.1.hexf(), self.2.hexf())
+    }
+}
+
+/* trait implementations for ints */
+
+macro_rules! impl_int {
+    ($($ty:ty),*) => {
+        $(
+            impl Hex for $ty {
+                fn hex(self) -> String {
+                    format!("{self:#0width$x}", width = ((Self::BITS / 4) + 2) as usize)
+                }
+
+                fn hexf(self) -> String {
+                    String::new()
+                }
+            }
+
+            impl<Input> $crate::CheckOutput<Input> for $ty
+            where
+                Input: Hex + fmt::Debug,
+                SpecialCase: MaybeOverride<Input>,
+            {
+                fn validate(
+                    self,
+                    expected: Self,
+                    input: Input,
+                    ctx: &$crate::CheckCtx,
+                ) -> TestResult {
+                    validate_int(self, expected, input, ctx)
+                }
+            }
+        )*
+    };
+}
+
+fn validate_int<I, Input>(actual: I, expected: I, input: Input, ctx: &CheckCtx) -> TestResult
+where
+    I: Int + Hex,
+    Input: Hex + fmt::Debug,
+    SpecialCase: MaybeOverride<Input>,
+{
+    let (result, xfail_msg) = match SpecialCase::check_int(input, actual, expected, ctx) {
+        // Results from an explicit case list require exact equality; overrides are ignored.
+        _ if ctx.gen_kind == GeneratorKind::List => (actual == expected, None),
+        CheckAction::AssertSuccess => (actual == expected, None),
+        CheckAction::AssertFailure(msg) => (actual != expected, Some(msg)),
+        CheckAction::Custom(res) => return res,
+        CheckAction::Skip => return Ok(()),
+        CheckAction::AssertWithUlp(_) => panic!("ulp has no meaning for integer checks"),
+    };
+
+    let make_xfail_msg = || match xfail_msg {
+        Some(m) => format!(
+            "expected failure but test passed. Does an XFAIL need to be updated?\n\
+            failed at: {m}",
+        ),
+        None => String::new(),
+    };
+
+    anyhow::ensure!(
+        result,
+        "\
+        \n    input:    {input:?} {ibits}\
+        \n    expected: {expected:<22?} {expbits}\
+        \n    actual:   {actual:<22?} {actbits}\
+        \n    {msg}\
+        ",
+        actbits = actual.hex(),
+        expbits = expected.hex(),
+        ibits = input.hex(),
+        msg = make_xfail_msg()
+    );
+
+    Ok(())
+}
+
+impl_int!(u32, i32, u64, i64);
+
+/* trait implementations for floats */
+
+macro_rules! impl_float {
+    ($($ty:ty),*) => {
+        $(
+            impl Hex for $ty {
+                fn hex(self) -> String {
+                    format!(
+                        "{:#0width$x}",
+                        self.to_bits(),
+                        width = ((Self::BITS / 4) + 2) as usize
+                    )
+                }
+
+                fn hexf(self) -> String {
+                    format!("{}", Hexf(self))
+                }
+            }
+
+            impl<Input> $crate::CheckOutput<Input> for $ty
+            where
+                Input: Hex + fmt::Debug,
+                SpecialCase: MaybeOverride<Input>,
+            {
+                fn validate(
+                    self,
+                    expected: Self,
+                    input: Input,
+                    ctx: &$crate::CheckCtx,
+                ) -> TestResult {
+                    validate_float(self, expected, input, ctx)
+                }
+            }
+        )*
+    };
+}
+
+fn validate_float<F, Input>(actual: F, expected: F, input: Input, ctx: &CheckCtx) -> TestResult
+where
+    F: Float + Hex,
+    Input: Hex + fmt::Debug,
+    u32: TryFrom<F::SignedInt, Error: fmt::Debug>,
+    SpecialCase: MaybeOverride<Input>,
+{
+    let mut assert_failure_msg = None;
+
+    // Create a wrapper function so we only need to `.with_context` once.
+    let mut inner = || -> TestResult {
+        let mut allowed_ulp = ctx.ulp;
+
+        // Forbid overrides if the items came from an explicit list, as long as we are checking
+        // against MPFR or a standalone expected result (i.e. any basis other than musl).
+        let require_biteq = ctx.gen_kind == GeneratorKind::List && ctx.basis != CheckBasis::Musl;
+
+        match SpecialCase::check_float(input, actual, expected, ctx) {
+            _ if require_biteq => (),
+            CheckAction::AssertSuccess => (),
+            CheckAction::AssertFailure(msg) => assert_failure_msg = Some(msg),
+            CheckAction::Custom(res) => return res,
+            CheckAction::Skip => return Ok(()),
+            CheckAction::AssertWithUlp(ulp_override) => allowed_ulp = ulp_override,
+        };
+
+        // Check when both are NaNs
+        if actual.is_nan() && expected.is_nan() {
+            if require_biteq && ctx.basis == CheckBasis::None {
+                ensure!(
+                    actual.to_bits() == expected.to_bits(),
+                    "mismatched NaN bitpatterns"
+                );
+            }
+            // By default, NaNs have nothing special to check.
+            return Ok(());
+        } else if actual.is_nan() || expected.is_nan() {
+            // Check when only one is a NaN
+            bail!("real value != NaN")
+        }
+
+        // Make sure that the signs are the same before checking ULP to avoid wraparound
+        let act_sig = actual.signum();
+        let exp_sig = expected.signum();
+        ensure!(
+            act_sig == exp_sig,
+            "mismatched signs {act_sig:?} {exp_sig:?}"
+        );
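+        // (Illustrative example: `1.0e-40_f32` and `-1.0e-40_f32` are numerically close, but
+        // their bit patterns interpreted as signed integers differ by roughly 2^31, so a sign
+        // mismatch is reported directly rather than as a huge ULP error.)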
+
+        if actual.is_infinite() ^ expected.is_infinite() {
+            bail!("mismatched infinities");
+        }
+
+        let act_bits = actual.to_bits().signed();
+        let exp_bits = expected.to_bits().signed();
+
+        let ulp_diff = act_bits.checked_sub(exp_bits).unwrap().abs();
+
+        let ulp_u32 = u32::try_from(ulp_diff)
+            .map_err(|e| anyhow!("{e:?}: ulp of {ulp_diff} exceeds u32::MAX"))?;
+
+        ensure!(ulp_u32 <= allowed_ulp, "ulp {ulp_diff} > {allowed_ulp}",);
+
+        Ok(())
+    };
+
+    let mut res = inner();
+
+    if let Some(msg) = assert_failure_msg {
+        // Invert `Ok` and `Err` if the test is an xfail.
+        if res.is_ok() {
+            let e = anyhow!(
+                "expected failure but test passed. Does an XFAIL need to be updated?\n\
+                failed at: {msg}",
+            );
+            res = Err(e)
+        } else {
+            res = Ok(())
+        }
+    }
+
+    res.with_context(|| {
+        format!(
+            "\
+            \n    input:    {input:?}\
+            \n    as hex:   {ihex}\
+            \n    as bits:  {ibits}\
+            \n    expected: {expected:<22?} {exphex} {expbits}\
+            \n    actual:   {actual:<22?} {acthex} {actbits}\
+            ",
+            ihex = input.hexf(),
+            ibits = input.hex(),
+            exphex = expected.hexf(),
+            expbits = expected.hex(),
+            actbits = actual.hex(),
+            acthex = actual.hexf(),
+        )
+    })
+}
+
+impl_float!(f32, f64);
+
+#[cfg(f16_enabled)]
+impl_float!(f16);
+
+#[cfg(f128_enabled)]
+impl_float!(f128);
+
+/* trait implementations for compound types */
+
+/// Implement `CheckOutput` for combinations of types.
+macro_rules! impl_tuples {
+    ($(($a:ty, $b:ty);)*) => {
+        $(
+            impl<Input> CheckOutput<Input> for ($a, $b)
+            where
+                Input: Hex + fmt::Debug,
+                SpecialCase: MaybeOverride<Input>,
+              {
+                fn validate(
+                    self,
+                    expected: Self,
+                    input: Input,
+                    ctx: &CheckCtx,
+                ) -> TestResult {
+                    self.0.validate(expected.0, input, ctx)
+                        .and_then(|()| self.1.validate(expected.1, input, ctx))
+                        .with_context(|| format!(
+                            "full context:\
+                            \n    input:    {input:?} {ibits}\
+                            \n    as hex:   {ihex}\
+                            \n    as bits:  {ibits}\
+                            \n    expected: {expected:?} {expbits}\
+                            \n    actual:   {self:?} {actbits}\
+                            ",
+                            ihex = input.hexf(),
+                            ibits = input.hex(),
+                            expbits = expected.hex(),
+                            actbits = self.hex(),
+                        ))
+                }
+            }
+        )*
+    };
+}
+
+impl_tuples!(
+    (f32, i32);
+    (f64, i32);
+    (f32, f32);
+    (f64, f64);
+);
diff --git a/library/compiler-builtins/libm-test/tests/check_coverage.rs b/library/compiler-builtins/libm-test/tests/check_coverage.rs
new file mode 100644
index 00000000000..3b445a3de9d
--- /dev/null
+++ b/library/compiler-builtins/libm-test/tests/check_coverage.rs
@@ -0,0 +1,61 @@
+//! Ensure that `for_each_function!` isn't missing any symbols.
+
+use std::collections::HashSet;
+use std::env;
+use std::path::Path;
+use std::process::Command;
+
+macro_rules! callback {
+    (
+        fn_name: $name:ident,
+        attrs: [$($attr:meta),*],
+        extra: [$set:ident],
+    ) => {
+        let name = stringify!($name);
+        let new = $set.insert(name);
+        assert!(new, "duplicate function `{name}` in `ALL_OPERATIONS`");
+    };
+}
+
+#[test]
+fn test_for_each_function_all_included() {
+    let all_functions: HashSet<_> = include_str!("../../etc/function-list.txt")
+        .lines()
+        .filter(|line| !line.starts_with("#"))
+        .collect();
+
+    let mut tested = HashSet::new();
+
+    libm_macros::for_each_function! {
+        callback: callback,
+        extra: [tested],
+    };
+
+    let untested = all_functions.difference(&tested);
+    if untested.clone().next().is_some() {
+        panic!(
+            "missing tests for the following: {untested:#?} \
+            \nmake sure any new functions are entered in \
+            `ALL_OPERATIONS` (in `libm-macros`)."
+        );
+    }
+    assert_eq!(all_functions, tested);
+}
+
+#[test]
+fn ensure_list_updated() {
+    if libm_test::ci() {
+        // Most CI tests run in Docker, where Python and rustdoc aren't available, so CI runs
+        // the Python file directly rather than through this test.
+        eprintln!("skipping test; CI runs the python file directly");
+        return;
+    }
+
+    let res = Command::new("python3")
+        .arg(Path::new(env!("CARGO_MANIFEST_DIR")).join("../etc/update-api-list.py"))
+        .arg("--check")
+        .status()
+        .unwrap();
+
+    assert!(res.success(), "May need to run `./etc/update-api-list.py`");
+}
diff --git a/library/compiler-builtins/libm-test/tests/compare_built_musl.rs b/library/compiler-builtins/libm-test/tests/compare_built_musl.rs
new file mode 100644
index 00000000000..6ccbb6f4c51
--- /dev/null
+++ b/library/compiler-builtins/libm-test/tests/compare_built_musl.rs
@@ -0,0 +1,106 @@
+//! Compare our implementations with the result of musl functions, as provided by `musl-math-sys`.
+//!
+//! This checks case lists as well as random, spaced, and edge-case inputs; exhaustive runs are
+//! not performed against musl.
+//!
+//! Note that musl functions do not always provide 0.5ULP rounding, so our functions can do better
+//! than these results.
+
+// There are some targets we can't build musl for
+#![cfg(feature = "build-musl")]
+
+use libm_test::generate::{case_list, edge_cases, random, spaced};
+use libm_test::{CheckBasis, CheckCtx, CheckOutput, GeneratorKind, MathOp, TupleCall};
+
+const BASIS: CheckBasis = CheckBasis::Musl;
+
+fn musl_runner<Op: MathOp>(
+    ctx: &CheckCtx,
+    cases: impl Iterator<Item = Op::RustArgs>,
+    musl_fn: Op::CFn,
+) {
+    for input in cases {
+        let musl_res = input.call(musl_fn);
+        let crate_res = input.call_intercept_panics(Op::ROUTINE);
+
+        crate_res.validate(musl_res, input, ctx).unwrap();
+    }
+}
+
+/// Test against musl with generators from a domain.
+macro_rules! musl_tests {
+    (
+        fn_name: $fn_name:ident,
+        attrs: [$($attr:meta),*],
+    ) => {
+        paste::paste! {
+            #[test]
+            $(#[$attr])*
+            fn [< musl_case_list_ $fn_name >]() {
+                type Op = libm_test::op::$fn_name::Routine;
+                let ctx = CheckCtx::new(Op::IDENTIFIER, BASIS, GeneratorKind::List);
+                let cases = case_list::get_test_cases_basis::<Op>(&ctx).0;
+                musl_runner::<Op>(&ctx, cases, musl_math_sys::$fn_name);
+            }
+
+            #[test]
+            $(#[$attr])*
+            fn [< musl_random_ $fn_name >]() {
+                type Op = libm_test::op::$fn_name::Routine;
+                let ctx = CheckCtx::new(Op::IDENTIFIER, BASIS, GeneratorKind::Random);
+                let cases = random::get_test_cases::<<Op as MathOp>::RustArgs>(&ctx).0;
+                musl_runner::<Op>(&ctx, cases, musl_math_sys::$fn_name);
+            }
+
+            #[test]
+            $(#[$attr])*
+            fn [< musl_edge_case_ $fn_name >]() {
+                type Op = libm_test::op::$fn_name::Routine;
+                let ctx = CheckCtx::new(Op::IDENTIFIER, BASIS, GeneratorKind::EdgeCases);
+                let cases = edge_cases::get_test_cases::<Op>(&ctx).0;
+                musl_runner::<Op>(&ctx, cases, musl_math_sys::$fn_name);
+            }
+
+            #[test]
+            $(#[$attr])*
+            fn [< musl_quickspace_ $fn_name >]() {
+                type Op = libm_test::op::$fn_name::Routine;
+                let ctx = CheckCtx::new(Op::IDENTIFIER, BASIS, GeneratorKind::QuickSpaced);
+                let cases = spaced::get_test_cases::<Op>(&ctx).0;
+                musl_runner::<Op>(&ctx, cases, musl_math_sys::$fn_name);
+            }
+        }
+    };
+}
+
+libm_macros::for_each_function! {
+    callback: musl_tests,
+    attributes: [],
+    // Not provided by musl
+    skip_f16_f128: true,
+    skip: [
+        // TODO integer inputs
+        jn,
+        jnf,
+        ldexp,
+        ldexpf,
+        scalbn,
+        scalbnf,
+        yn,
+        ynf,
+
+        // Not provided by musl
+        // verify-sorted-start
+        fmaximum,
+        fmaximum_num,
+        fmaximum_numf,
+        fmaximumf,
+        fminimum,
+        fminimum_num,
+        fminimum_numf,
+        fminimumf,
+        roundeven,
+        roundevenf,
+        // verify-sorted-end
+    ],
+}
diff --git a/library/compiler-builtins/libm-test/tests/multiprecision.rs b/library/compiler-builtins/libm-test/tests/multiprecision.rs
new file mode 100644
index 00000000000..80b2c78688e
--- /dev/null
+++ b/library/compiler-builtins/libm-test/tests/multiprecision.rs
@@ -0,0 +1,79 @@
+//! Test with "infinite precision"
+
+#![cfg(feature = "build-mpfr")]
+
+use libm_test::generate::{case_list, edge_cases, random, spaced};
+use libm_test::mpfloat::MpOp;
+use libm_test::{CheckBasis, CheckCtx, CheckOutput, GeneratorKind, MathOp, TupleCall};
+
+const BASIS: CheckBasis = CheckBasis::Mpfr;
+
+fn mp_runner<Op: MathOp + MpOp>(ctx: &CheckCtx, cases: impl Iterator<Item = Op::RustArgs>) {
+    let mut mp_vals = Op::new_mp();
+    for input in cases {
+        let mp_res = Op::run(&mut mp_vals, input);
+        let crate_res = input.call_intercept_panics(Op::ROUTINE);
+
+        crate_res.validate(mp_res, input, ctx).unwrap();
+    }
+}
+
+macro_rules! mp_tests {
+    (
+        fn_name: $fn_name:ident,
+        attrs: [$($attr:meta),*],
+    ) => {
+        paste::paste! {
+            #[test]
+            $(#[$attr])*
+            fn [< mp_case_list_ $fn_name >]() {
+                type Op = libm_test::op::$fn_name::Routine;
+                let ctx = CheckCtx::new(Op::IDENTIFIER, BASIS, GeneratorKind::List);
+                let cases = case_list::get_test_cases_basis::<Op>(&ctx).0;
+                mp_runner::<Op>(&ctx, cases);
+            }
+
+            #[test]
+            $(#[$attr])*
+            fn [< mp_random_ $fn_name >]() {
+                type Op = libm_test::op::$fn_name::Routine;
+                let ctx = CheckCtx::new(Op::IDENTIFIER, BASIS, GeneratorKind::Random);
+                let cases = random::get_test_cases::<<Op as MathOp>::RustArgs>(&ctx).0;
+                mp_runner::<Op>(&ctx, cases);
+            }
+
+            #[test]
+            $(#[$attr])*
+            fn [< mp_edge_case_ $fn_name >]() {
+                type Op = libm_test::op::$fn_name::Routine;
+                let ctx = CheckCtx::new(Op::IDENTIFIER, BASIS, GeneratorKind::EdgeCases);
+                let cases = edge_cases::get_test_cases::<Op>(&ctx).0;
+                mp_runner::<Op>(&ctx, cases);
+            }
+
+            #[test]
+            $(#[$attr])*
+            fn [< mp_quickspace_ $fn_name >]() {
+                type Op = libm_test::op::$fn_name::Routine;
+                let ctx = CheckCtx::new(Op::IDENTIFIER, BASIS, GeneratorKind::QuickSpaced);
+                let cases = spaced::get_test_cases::<Op>(&ctx).0;
+                mp_runner::<Op>(&ctx, cases);
+            }
+        }
+    };
+}
+
+libm_macros::for_each_function! {
+    callback: mp_tests,
+    attributes: [
+        // Also an assertion failure on i686: at `MPFR_ASSERTN (! mpfr_erangeflag_p ())`
+        #[ignore = "large values are infeasible in MPFR"]
+        [jn, jnf, yn, ynf],
+    ],
+    skip: [
+        // FIXME: test needed, see
+        // https://github.com/rust-lang/libm/pull/311#discussion_r1818273392
+        nextafter,
+        nextafterf,
+    ],
+}
diff --git a/library/compiler-builtins/libm-test/tests/standalone.rs b/library/compiler-builtins/libm-test/tests/standalone.rs
new file mode 100644
index 00000000000..7b30a3b48d7
--- /dev/null
+++ b/library/compiler-builtins/libm-test/tests/standalone.rs
@@ -0,0 +1,38 @@
+//! Test cases that have both an input and an expected output, so they do not require a
+//! comparison basis.
+
+use libm_test::generate::case_list;
+use libm_test::{CheckBasis, CheckCtx, CheckOutput, GeneratorKind, MathOp, TupleCall};
+
+const BASIS: CheckBasis = CheckBasis::None;
+
+fn standalone_runner<Op: MathOp>(
+    ctx: &CheckCtx,
+    cases: impl Iterator<Item = (Op::RustArgs, Op::RustRet)>,
+) {
+    for (input, expected) in cases {
+        let crate_res = input.call_intercept_panics(Op::ROUTINE);
+        crate_res.validate(expected, input, ctx).unwrap();
+    }
+}
+
+macro_rules! standalone_tests {
+    (
+        fn_name: $fn_name:ident,
+        attrs: [$($attr:meta),*],
+    ) => {
+        paste::paste! {
+            #[test]
+            $(#[$attr])*
+            fn [< standalone_ $fn_name >]() {
+                type Op = libm_test::op::$fn_name::Routine;
+                let ctx = CheckCtx::new(Op::IDENTIFIER, BASIS, GeneratorKind::List);
+                let cases = case_list::get_test_cases_standalone::<Op>(&ctx);
+                standalone_runner::<Op>(&ctx, cases);
+            }
+        }
+    };
+}
+
+libm_macros::for_each_function! {
+    callback: standalone_tests,
+}
diff --git a/library/compiler-builtins/libm-test/tests/u256.rs b/library/compiler-builtins/libm-test/tests/u256.rs
new file mode 100644
index 00000000000..8cbb3ad226f
--- /dev/null
+++ b/library/compiler-builtins/libm-test/tests/u256.rs
@@ -0,0 +1,155 @@
+//! Test the u256 implementation. The ops already get exercised reasonably well through the `f128`
+//! routines, so this only does a few million fuzz iterations against GMP.
+
+#![cfg(feature = "build-mpfr")]
+
+use std::sync::LazyLock;
+
+use libm::support::{HInt, u256};
+type BigInt = rug::Integer;
+
+use libm_test::bigint_fuzz_iteration_count;
+use libm_test::generate::random::SEED;
+use rand::{Rng, SeedableRng};
+use rand_chacha::ChaCha8Rng;
+use rug::Assign;
+use rug::integer::Order;
+use rug::ops::NotAssign;
+
+static BIGINT_U256_MAX: LazyLock<BigInt> =
+    LazyLock::new(|| BigInt::from_digits(&[u128::MAX, u128::MAX], Order::Lsf));
+
+/// Copied from the test module.
+fn hexu(v: u256) -> String {
+    format!("0x{:032x}{:032x}", v.hi, v.lo)
+}
+
+fn random_u256(rng: &mut ChaCha8Rng) -> u256 {
+    let lo: u128 = rng.random();
+    let hi: u128 = rng.random();
+    u256 { lo, hi }
+}
+
+fn assign_bigint(bx: &mut BigInt, x: u256) {
+    bx.assign_digits(&[x.lo, x.hi], Order::Lsf);
+}
+
+fn from_bigint(bx: &mut BigInt) -> u256 {
+    // Truncate so the result fits into `[u128; 2]`; this makes all ops wrap modulo 2^256.
+    *bx &= &*BIGINT_U256_MAX;
+    let mut bres = [0u128, 0];
+    bx.write_digits(&mut bres, Order::Lsf);
+    bx.assign(0);
+    u256 {
+        lo: bres[0],
+        hi: bres[1],
+    }
+}
+
+fn check_one(
+    x: impl FnOnce() -> String,
+    y: impl FnOnce() -> Option<String>,
+    actual: u256,
+    expected: &mut BigInt,
+) {
+    let expected = from_bigint(expected);
+    if actual != expected {
+        let xmsg = x();
+        let ymsg = y().map(|y| format!("y:        {y}\n")).unwrap_or_default();
+        panic!(
+            "Results do not match\n\
+            input:    {xmsg}\n\
+            {ymsg}\
+            actual:   {}\n\
+            expected: {}\
+            ",
+            hexu(actual),
+            hexu(expected),
+        )
+    }
+}
+
+#[test]
+fn mp_u256_bitor() {
+    let mut rng = ChaCha8Rng::from_seed(*SEED);
+    let mut bx = BigInt::new();
+    let mut by = BigInt::new();
+
+    for _ in 0..bigint_fuzz_iteration_count() {
+        let x = random_u256(&mut rng);
+        let y = random_u256(&mut rng);
+        assign_bigint(&mut bx, x);
+        assign_bigint(&mut by, y);
+        let actual = x | y;
+        bx |= &by;
+        check_one(|| hexu(x), || Some(hexu(y)), actual, &mut bx);
+    }
+}
+
+#[test]
+fn mp_u256_not() {
+    let mut rng = ChaCha8Rng::from_seed(*SEED);
+    let mut bx = BigInt::new();
+
+    for _ in 0..bigint_fuzz_iteration_count() {
+        let x = random_u256(&mut rng);
+        assign_bigint(&mut bx, x);
+        let actual = !x;
+        bx.not_assign();
+        check_one(|| hexu(x), || None, actual, &mut bx);
+    }
+}
+
+#[test]
+fn mp_u256_add() {
+    let mut rng = ChaCha8Rng::from_seed(*SEED);
+    let mut bx = BigInt::new();
+    let mut by = BigInt::new();
+
+    for _ in 0..bigint_fuzz_iteration_count() {
+        let x = random_u256(&mut rng);
+        let y = random_u256(&mut rng);
+        assign_bigint(&mut bx, x);
+        assign_bigint(&mut by, y);
+        let actual = x + y;
+        bx += &by;
+        check_one(|| hexu(x), || Some(hexu(y)), actual, &mut bx);
+    }
+}
+
+#[test]
+fn mp_u256_shr() {
+    let mut rng = ChaCha8Rng::from_seed(*SEED);
+    let mut bx = BigInt::new();
+
+    for _ in 0..bigint_fuzz_iteration_count() {
+        let x = random_u256(&mut rng);
+        let shift: u32 = rng.random_range(0..255);
+        assign_bigint(&mut bx, x);
+        let actual = x >> shift;
+        bx >>= shift;
+        check_one(|| hexu(x), || Some(shift.to_string()), actual, &mut bx);
+    }
+}
+
+#[test]
+fn mp_u256_widen_mul() {
+    let mut rng = ChaCha8Rng::from_seed(*SEED);
+    let mut bx = BigInt::new();
+    let mut by = BigInt::new();
+
+    for _ in 0..bigint_fuzz_iteration_count() {
+        let x: u128 = rng.random();
+        let y: u128 = rng.random();
+        bx.assign(x);
+        by.assign(y);
+        let actual = x.widen_mul(y);
+        bx *= &by;
+        check_one(
+            || format!("{x:#034x}"),
+            || Some(format!("{y:#034x}")),
+            actual,
+            &mut bx,
+        );
+    }
+}
diff --git a/library/compiler-builtins/libm-test/tests/z_extensive/main.rs b/library/compiler-builtins/libm-test/tests/z_extensive/main.rs
new file mode 100644
index 00000000000..5448cb6eaa5
--- /dev/null
+++ b/library/compiler-builtins/libm-test/tests/z_extensive/main.rs
@@ -0,0 +1,14 @@
+//! `main` is just a wrapper to handle configuration.
+
+#[cfg(not(feature = "build-mpfr"))]
+fn main() {
+    eprintln!("multiprecision not enabled; skipping extensive tests");
+}
+
+#[cfg(feature = "build-mpfr")]
+mod run;
+
+#[cfg(feature = "build-mpfr")]
+fn main() {
+    run::run();
+}
diff --git a/library/compiler-builtins/libm-test/tests/z_extensive/run.rs b/library/compiler-builtins/libm-test/tests/z_extensive/run.rs
new file mode 100644
index 00000000000..59c806ce73e
--- /dev/null
+++ b/library/compiler-builtins/libm-test/tests/z_extensive/run.rs
@@ -0,0 +1,247 @@
+//! Exhaustive tests for `f16` and `f32`, high-iteration for `f64` and `f128`.
+
+use std::fmt;
+use std::io::{self, IsTerminal};
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::time::Duration;
+
+use indicatif::{ProgressBar, ProgressStyle};
+use libm_test::generate::spaced;
+use libm_test::mpfloat::MpOp;
+use libm_test::{
+    CheckBasis, CheckCtx, CheckOutput, GeneratorKind, MathOp, TestResult, TupleCall,
+    skip_extensive_test,
+};
+use libtest_mimic::{Arguments, Trial};
+use rayon::prelude::*;
+use spaced::SpacedInput;
+
+const BASIS: CheckBasis = CheckBasis::Mpfr;
+const GEN_KIND: GeneratorKind = GeneratorKind::Extensive;
+
+/// Run the extensive test suite.
+pub fn run() {
+    let mut args = Arguments::from_args();
+    // Prevent multiple tests from running in parallel; each test is parallelized internally.
+    args.test_threads = Some(1);
+    let tests = register_all_tests();
+
+    // With default parallelism, the CPU doesn't saturate. We don't need to be nice to
+    // other processes, so do 1.5x to make sure we use all available resources.
+    let threads = std::thread::available_parallelism()
+        .map(Into::into)
+        .unwrap_or(0)
+        * 3
+        / 2;
+    rayon::ThreadPoolBuilder::new()
+        .num_threads(threads)
+        .build_global()
+        .unwrap();
+
+    libtest_mimic::run(&args, tests).exit();
+}
+
+macro_rules! mp_extensive_tests {
+    (
+        fn_name: $fn_name:ident,
+        attrs: [$($attr:meta),*],
+        extra: [$push_to:ident],
+    ) => {
+        $(#[$attr])*
+        register_single_test::<libm_test::op::$fn_name::Routine>(&mut $push_to);
+    };
+}
+
+/// Create a list of tests for consumption by `libtest_mimic`.
+fn register_all_tests() -> Vec<Trial> {
+    let mut all_tests = Vec::new();
+
+    libm_macros::for_each_function! {
+        callback: mp_extensive_tests,
+        extra: [all_tests],
+        skip: [
+            // FIXME: test needed, see
+            // https://github.com/rust-lang/libm/pull/311#discussion_r1818273392
+            nextafter,
+            nextafterf,
+        ],
+    }
+
+    all_tests
+}
+
+/// Add a single test to the list.
+fn register_single_test<Op>(all: &mut Vec<Trial>)
+where
+    Op: MathOp + MpOp,
+    Op::RustArgs: SpacedInput<Op> + Send,
+{
+    let test_name = format!("mp_extensive_{}", Op::NAME);
+    let ctx = CheckCtx::new(Op::IDENTIFIER, BASIS, GEN_KIND);
+    let skip = skip_extensive_test(&ctx);
+
+    let runner = move || {
+        if !cfg!(optimizations_enabled) {
+            panic!("extensive tests should be run with --release");
+        }
+
+        let res = run_single_test::<Op>(&ctx);
+        let e = match res {
+            Ok(()) => return Ok(()),
+            Err(e) => e,
+        };
+
+        // Format with the `Debug` implementation so we get the error cause chain, and print it
+        // here so we see the result immediately (rather than waiting for all tests to conclude).
+        let e = format!("{e:?}");
+        eprintln!("failure testing {}:{e}\n", Op::IDENTIFIER);
+
+        Err(e.into())
+    };
+
+    all.push(Trial::test(test_name, runner).with_ignored_flag(skip));
+}
+
+/// Test runner for a single routine.
+fn run_single_test<Op>(ctx: &CheckCtx) -> TestResult
+where
+    Op: MathOp + MpOp,
+    Op::RustArgs: SpacedInput<Op> + Send,
+{
+    // Small delay before printing anything so other output from the runner has a chance to flush.
+    std::thread::sleep(Duration::from_millis(500));
+    eprintln!();
+
+    let completed = AtomicU64::new(0);
+    let (ref mut cases, total) = spaced::get_test_cases::<Op>(ctx);
+    let pb = Progress::new(Op::NAME, total);
+
+    let test_single_chunk = |mp_vals: &mut Op::MpTy, input_vec: Vec<Op::RustArgs>| -> TestResult {
+        for input in input_vec {
+            // Test the input.
+            let mp_res = Op::run(mp_vals, input);
+            let crate_res = input.call_intercept_panics(Op::ROUTINE);
+            crate_res.validate(mp_res, input, ctx)?;
+
+            let completed = completed.fetch_add(1, Ordering::Relaxed) + 1;
+            pb.update(completed, input);
+        }
+
+        Ok(())
+    };
+
+    // Chunk the cases so Rayon doesn't switch threads between each iterator item. 50k seems near
+    // a performance sweet spot. Ideally we would reuse these allocations rather than discarding,
+    // but that is difficult with Rayon's API.
+    let chunk_size = 50_000;
+    let chunks = std::iter::from_fn(move || {
+        let mut v = Vec::with_capacity(chunk_size);
+        v.extend(cases.take(chunk_size));
+        (!v.is_empty()).then_some(v)
+    });
+
+    // Run the actual tests
+    let res = chunks
+        .par_bridge()
+        .try_for_each_init(Op::new_mp, test_single_chunk);
+
+    let real_total = completed.load(Ordering::Relaxed);
+    pb.complete(real_total);
+
+    if res.is_ok() && real_total != total {
+        // Provide a warning if our estimate needs to be updated.
+        panic!("total run {real_total} does not match expected {total}");
+    }
+
+    res
+}
+
+/// Wrapper around a `ProgressBar` that handles styles and non-TTY messages.
+struct Progress {
+    pb: ProgressBar,
+    name_padded: String,
+    final_style: ProgressStyle,
+    is_tty: bool,
+}
+
+impl Progress {
+    const PB_TEMPLATE: &str = "[{elapsed:3} {percent:3}%] {bar:20.cyan/blue} NAME \
+        {human_pos:>13}/{human_len:13} {per_sec:18} eta {eta:8} {msg}";
+    const PB_TEMPLATE_FINAL: &str = "[{elapsed:3} {percent:3}%] {bar:20.cyan/blue} NAME \
+        {human_pos:>13}/{human_len:13} {per_sec:18} done in {elapsed_precise}";
+
+    fn new(name: &str, total: u64) -> Self {
+        eprintln!("starting extensive tests for `{name}`");
+        let name_padded = format!("{name:9}");
+        let is_tty = io::stderr().is_terminal();
+
+        let initial_style =
+            ProgressStyle::with_template(&Self::PB_TEMPLATE.replace("NAME", &name_padded))
+                .unwrap()
+                .progress_chars("##-");
+
+        let final_style =
+            ProgressStyle::with_template(&Self::PB_TEMPLATE_FINAL.replace("NAME", &name_padded))
+                .unwrap()
+                .progress_chars("##-");
+
+        let pb = ProgressBar::new(total);
+        pb.set_style(initial_style);
+
+        Self {
+            pb,
+            final_style,
+            name_padded,
+            is_tty,
+        }
+    }
+
+    fn update(&self, completed: u64, input: impl fmt::Debug) {
+        // Infrequently update the progress bar.
+        if completed % 20_000 == 0 {
+            self.pb.set_position(completed);
+        }
+
+        if completed % 500_000 == 0 {
+            self.pb.set_message(format!("input: {input:<24?}"));
+        }
+
+        if !self.is_tty && completed % 5_000_000 == 0 {
+            let len = self.pb.length().unwrap_or_default();
+            eprintln!(
+                "[{elapsed:3?}s {percent:3.0}%] {name} \
+                {human_pos:>10}/{human_len:<10} {per_sec:14.2}/s eta {eta:4}s {input:<24?}",
+                elapsed = self.pb.elapsed().as_secs(),
+                percent = completed as f32 * 100.0 / len as f32,
+                name = self.name_padded,
+                human_pos = completed,
+                human_len = len,
+                per_sec = self.pb.per_sec(),
+                eta = self.pb.eta().as_secs()
+            );
+        }
+    }
+
+    fn complete(self, real_total: u64) {
+        self.pb.set_style(self.final_style);
+        self.pb.set_position(real_total);
+        self.pb.abandon();
+
+        if !self.is_tty {
+            let len = self.pb.length().unwrap_or_default();
+            eprintln!(
+                "[{elapsed:3}s {percent:3.0}%] {name} \
+                {human_pos:>10}/{human_len:<10} {per_sec:14.2}/s done in {elapsed_precise}",
+                elapsed = self.pb.elapsed().as_secs(),
+                percent = real_total as f32 * 100.0 / len as f32,
+                name = self.name_padded,
+                human_pos = real_total,
+                human_len = len,
+                per_sec = self.pb.per_sec(),
+                elapsed_precise = self.pb.elapsed().as_secs(),
+            );
+        }
+
+        eprintln!();
+    }
+}
diff --git a/library/compiler-builtins/libm/CHANGELOG.md b/library/compiler-builtins/libm/CHANGELOG.md
new file mode 100644
index 00000000000..33fec06aa23
--- /dev/null
+++ b/library/compiler-builtins/libm/CHANGELOG.md
@@ -0,0 +1,229 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to
+[Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [Unreleased]
+
+## [0.2.15](https://github.com/rust-lang/compiler-builtins/compare/libm-v0.2.14...libm-v0.2.15) - 2025-05-06
+
+### Other
+
+- Require `target_has_atomic = "ptr"` for runtime feature detection
+
+## [0.2.14](https://github.com/rust-lang/compiler-builtins/compare/libm-v0.2.13...libm-v0.2.14) - 2025-05-03
+
+### Other
+
+- Use runtime feature detection for fma routines on x86
+
+## [0.2.13](https://github.com/rust-lang/compiler-builtins/compare/libm-v0.2.12...libm-v0.2.13) - 2025-04-21
+
+### Fixed
+
+- Switch back to workspace resolver v2 to unbreak builds without the 2024 edition
+
+## [0.2.12](https://github.com/rust-lang/compiler-builtins/compare/libm-v0.2.11...libm-v0.2.12) - 2025-04-21
+
+- Mark generic functions `#[inline]`
+- Combine the source files for `fmod`
+- Ensure all public functions are marked `no_panic`
+- Add assembly version of simple operations on aarch64
+- Add `roundeven{,f,f16,f128}`
+- Add `fminimum`, `fmaximum`, `fminimum_num`, and `fmaximum_num`
+- Eliminate the use of `force_eval!` in `ceil`, `floor`, and `trunc`
+- Port the CORE-MATH version of `cbrt`
+- Add `fmaf128`
+- fma: Ensure zero has the correct sign
+- Add `scalbnf16`, `scalbnf128`, `ldexpf16`, and `ldexpf128`
+- Specify license as just MIT
+- Add `fmodf128`
+- Add `fmodf16` using the generic implementation
+- Add `fminf16`, `fmaxf16`, `fminf128`, and `fmaxf128`
+- Add `roundf16` and `roundf128`
+- Add `rintf16` and `rintf128`
+- Add `floorf16` and `floorf128`
+- Add `ceilf16` and `ceilf128`
+- Add `sqrtf16` and `sqrtf128`
+- Simplify and optimize `fdim` ([#442](https://github.com/rust-lang/libm/pull/442))
+- Add `fdimf16` and `fdimf128`
+- Add `truncf16` and `truncf128`
+- Add `fabsf16`, `fabsf128`, `copysignf16`, and `copysignf128`
+- Move some numeric trait logic to default implementations
+- Add some more basic docstrings ([#352](https://github.com/rust-lang/libm/pull/352))
+- Add support for loongarch64-unknown-linux-gnu
+- Add an "arch" Cargo feature that is on by default
+- Rename the `special_case` module to `precision` and move default ULP
+- Move the existing "unstable" feature to "unstable-intrinsics"
+
+A number of things changed internally; see the git log for a full list of changes.
+
+## [0.2.11](https://github.com/rust-lang/libm/compare/libm-v0.2.10...libm-v0.2.11) - 2024-10-28
+
+### Fixed
+
+- fix type of constants in ported sincosf ([#331](https://github.com/rust-lang/libm/pull/331))
+
+### Other
+
+- Disable a unit test that is failing on i586
+- Add a procedural macro for expanding all function signatures
+- Introduce `musl-math-sys` for bindings to musl math symbols
+- Add basic docstrings to some functions ([#337](https://github.com/rust-lang/libm/pull/337))
+
+## [0.2.10](https://github.com/rust-lang/libm/compare/libm-v0.2.9...libm-v0.2.10) - 2024-10-28
+
+### Other
+
+- Set the MSRV to 1.63 and test this in CI
+
+## [0.2.9](https://github.com/rust-lang/libm/compare/libm-v0.2.8...libm-v0.2.9) - 2024-10-26
+
+### Fixed
+
+- Update exponent calculations in nextafter to match musl
+
+### Changed
+
+- Update licensing to MIT AND (MIT OR Apache-2.0), as this is derivative from
+  MIT-licensed musl.
+- Set edition to 2021 for all crates
+- Upgrade all dependencies
+
+### Other
+
+- Don't deny warnings in lib.rs
+- Rename the `musl-bitwise-tests` feature to `test-musl-serialized`
+- Rename the `musl-reference-tests` feature to `musl-bitwise-tests`
+- Move `musl-reference-tests` to a new `libm-test` crate
+- Add a `force-soft-floats` feature to prevent using any intrinsics or
+  arch-specific code
+- Deny warnings in CI
+- Fix `clippy::deprecated_cfg_attr` on compiler_builtins
+- Corrected English typos
+- Remove unneeded `extern core` in `tgamma`
+- Allow internal_features lint when building with "unstable"
+
+## [v0.2.1] - 2019-11-22
+
+### Fixed
+
+- sincosf
+
+## [v0.2.0] - 2019-10-18
+
+### Added
+
+- Benchmarks
+- signum
+- remainder
+- remainderf
+- nextafter
+- nextafterf
+
+### Fixed
+
+- Rounding to negative zero
+- Overflows in rem_pio2 and remquo
+- Overflows in fma
+- sincosf
+
+### Removed
+
+- F32Ext and F64Ext traits
+
+## [v0.1.4] - 2019-06-12
+
+### Fixed
+
+- Restored compatibility with Rust 1.31.0
+
+## [v0.1.3] - 2019-05-14
+
+### Added
+
+- minf
+- fmin
+- fmaxf
+- fmax
+
+## [v0.1.2] - 2018-07-18
+
+### Added
+
+- acosf
+- asin
+- asinf
+- atan
+- atan2
+- atan2f
+- atanf
+- cos
+- cosf
+- cosh
+- coshf
+- exp2
+- expm1
+- expm1f
+- expo2
+- fmaf
+- pow
+- sin
+- sinf
+- sinh
+- sinhf
+- tan
+- tanf
+- tanh
+- tanhf
+
+## [v0.1.1] - 2018-07-14
+
+### Added
+
+- acos
+- acosf
+- asin
+- asinf
+- atanf
+- cbrt
+- cbrtf
+- ceil
+- ceilf
+- cosf
+- exp
+- exp2
+- exp2f
+- expm1
+- expm1f
+- fdim
+- fdimf
+- floorf
+- fma
+- fmod
+- log
+- log2
+- log10
+- log10f
+- log1p
+- log1pf
+- log2f
+- roundf
+- sinf
+- tanf
+
+## v0.1.0 - 2018-07-13
+
+- Initial release
+
+[Unreleased]: https://github.com/japaric/libm/compare/v0.2.1...HEAD
+[v0.2.1]: https://github.com/japaric/libm/compare/0.2.0...v0.2.1
+[v0.2.0]: https://github.com/japaric/libm/compare/0.1.4...v0.2.0
+[v0.1.4]: https://github.com/japaric/libm/compare/0.1.3...v0.1.4
+[v0.1.3]: https://github.com/japaric/libm/compare/v0.1.2...0.1.3
+[v0.1.2]: https://github.com/japaric/libm/compare/v0.1.1...v0.1.2
+[v0.1.1]: https://github.com/japaric/libm/compare/v0.1.0...v0.1.1
diff --git a/library/compiler-builtins/libm/Cargo.toml b/library/compiler-builtins/libm/Cargo.toml
new file mode 100644
index 00000000000..b6fb5efcf76
--- /dev/null
+++ b/library/compiler-builtins/libm/Cargo.toml
@@ -0,0 +1,49 @@
+[package]
+authors = ["Jorge Aparicio <jorge@japaric.io>"]
+categories = ["no-std"]
+description = "libm in pure Rust"
+documentation = "https://docs.rs/libm"
+keywords = ["libm", "math"]
+license = "MIT"
+name = "libm"
+readme = "README.md"
+repository = "https://github.com/rust-lang/compiler-builtins"
+version = "0.2.15"
+edition = "2021"
+rust-version = "1.63"
+
+[features]
+default = ["arch"]
+
+# Enable architecture-specific features such as SIMD or assembly routines.
+arch = []
+
+# This tells the compiler to assume that a Nightly toolchain is being used and
+# that it should activate any useful Nightly things accordingly.
+unstable = ["unstable-intrinsics", "unstable-float"]
+
+# Enable calls to functions in `core::intrinsics`
+unstable-intrinsics = []
+
+# Make some internal things public for testing.
+unstable-public-internals = []
+
+# Enable the nightly-only `f16` and `f128`.
+unstable-float = []
+
+# Used to prevent using any intrinsics or arch-specific code.
+#
+# HACK: this is a negative feature which is generally a bad idea in Cargo, but
+# we need it to be able to forbid other features when this crate is used in
+# Rust dependencies. Setting this overrides all features that may enable
+# hard float operations.
+force-soft-floats = []
+
+[dev-dependencies]
+no-panic = "0.1.35"
+
+[lints.rust]
+unexpected_cfgs = { level = "warn", check-cfg = [
+  # compiler-builtins sets this feature, but we use it in `libm`
+  'cfg(feature, values("compiler-builtins"))',
+] }
diff --git a/library/compiler-builtins/libm/LICENSE.txt b/library/compiler-builtins/libm/LICENSE.txt
new file mode 100644
index 00000000000..2f8e41f1474
--- /dev/null
+++ b/library/compiler-builtins/libm/LICENSE.txt
@@ -0,0 +1,258 @@
+rust-lang/libm as a whole is available for use under the MIT license:
+
+------------------------------------------------------------------------------
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+
+As a contributor, you agree that your code can be used under either the MIT
+license or the Apache-2.0 license:
+
+------------------------------------------------------------------------------
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+------------------------------------------------------------------------------
+
+This Rust library contains the following copyrights:
+
+    Copyright (c) 2018 Jorge Aparicio
+
+Portions of this software are derived from third-party works licensed under
+terms compatible with the above MIT license:
+
+* musl libc https://www.musl-libc.org/. This library contains the following
+  copyright:
+
+      Copyright © 2005-2020 Rich Felker, et al.
+
+* The CORE-MATH project https://core-math.gitlabpages.inria.fr/. CORE-MATH
+  routines are available under the MIT license on a per-file basis.
+
+The musl libc COPYRIGHT file also includes the following notice relevant to
+math portions of the library:
+
+------------------------------------------------------------------------------
+Much of the math library code (src/math/* and src/complex/*) is
+Copyright © 1993,2004 Sun Microsystems or
+Copyright © 2003-2011 David Schultz or
+Copyright © 2003-2009 Steven G. Kargl or
+Copyright © 2003-2009 Bruce D. Evans or
+Copyright © 2008 Stephen L. Moshier or
+Copyright © 2017-2018 Arm Limited
+and labelled as such in comments in the individual source files. All
+have been licensed under extremely permissive terms.
+------------------------------------------------------------------------------
+
+Copyright notices are retained in src/* files where relevant.
diff --git a/library/compiler-builtins/libm/README.md b/library/compiler-builtins/libm/README.md
new file mode 100644
index 00000000000..349e892dfcf
--- /dev/null
+++ b/library/compiler-builtins/libm/README.md
@@ -0,0 +1,42 @@
+# `libm`
+
+A Rust implementation of the C math library.
+
+## Usage
+
+`libm` provides fallback implementations for Rust's [float math functions] in
+`core` and for the [`core_float_math`] feature. If what is available there
+suits your needs, there is no need to add `libm` as a dependency.
+
+If more functionality is needed, this crate can also be used directly:
+
+```toml
+[dependencies]
+libm = "0.2.11"
+```
+
+[float math functions]: https://doc.rust-lang.org/std/primitive.f32.html
+[`core_float_math`]: https://github.com/rust-lang/rust/issues/137578
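+
+As a quick illustrative sketch (not part of the crate docs; it assumes the
+`0.2` API shown above), the functions follow the C naming convention: `f64`
+variants use the plain name and `f32` variants carry an `f` suffix:
+
+```rust
+fn main() {
+    // Illustrative sketch: `sqrt` operates on `f64`; `cosf` is the `f32` variant of `cos`.
+    let root = libm::sqrt(2.0_f64);
+    let one = libm::cosf(0.0_f32);
+    assert!((root * root - 2.0).abs() < 1e-15);
+    assert_eq!(one, 1.0);
+}
+```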
+
+## Contributing
+
+Please check [CONTRIBUTING.md](../CONTRIBUTING.md).
+
+## Minimum Rust version policy
+
+This crate supports rustc 1.63 and newer.
+
+## License
+
+Usage is under the MIT license, available at
+<https://opensource.org/license/mit>.
+
+### Contribution
+
+Contributions are licensed under both the MIT license and the Apache License,
+Version 2.0, available at <https://www.apache.org/licenses/LICENSE-2.0>. Unless
+you explicitly state otherwise, any contribution intentionally submitted for
+inclusion in the work by you, as defined in the Apache-2.0 license, shall be
+dual licensed as mentioned, without any additional terms or conditions.
+
+See [LICENSE.txt](LICENSE.txt) for full details.
diff --git a/library/compiler-builtins/libm/build.rs b/library/compiler-builtins/libm/build.rs
new file mode 100644
index 00000000000..07d08ed4364
--- /dev/null
+++ b/library/compiler-builtins/libm/build.rs
@@ -0,0 +1,18 @@
+use std::env;
+
+mod configure;
+
+fn main() {
+    let cfg = configure::Config::from_env();
+
+    println!("cargo:rerun-if-changed=build.rs");
+    println!("cargo:rerun-if-changed=configure.rs");
+    println!("cargo:rustc-check-cfg=cfg(assert_no_panic)");
+
+    // If set, enable `no-panic`. Requires LTO (`release-opt` profile).
+    if env::var("ENSURE_NO_PANIC").is_ok() {
+        println!("cargo:rustc-cfg=assert_no_panic");
+    }
+
+    configure::emit_libm_config(&cfg);
+}
diff --git a/library/compiler-builtins/libm/configure.rs b/library/compiler-builtins/libm/configure.rs
new file mode 100644
index 00000000000..2a497c7b117
--- /dev/null
+++ b/library/compiler-builtins/libm/configure.rs
@@ -0,0 +1,189 @@
+// Configuration shared with both libm and libm-test
+
+use std::env;
+use std::path::PathBuf;
+
+#[allow(dead_code)]
+pub struct Config {
+    pub manifest_dir: PathBuf,
+    pub out_dir: PathBuf,
+    pub opt_level: String,
+    pub cargo_features: Vec<String>,
+    pub target_arch: String,
+    pub target_env: String,
+    pub target_family: Option<String>,
+    pub target_os: String,
+    pub target_string: String,
+    pub target_vendor: String,
+    pub target_features: Vec<String>,
+}
+
+impl Config {
+    pub fn from_env() -> Self {
+        let target_features = env::var("CARGO_CFG_TARGET_FEATURE")
+            .map(|feats| feats.split(',').map(ToOwned::to_owned).collect())
+            .unwrap_or_default();
+        let cargo_features = env::vars()
+            .filter_map(|(name, _value)| name.strip_prefix("CARGO_FEATURE_").map(ToOwned::to_owned))
+            .map(|s| s.to_lowercase().replace("_", "-"))
+            .collect();
+
+        Self {
+            manifest_dir: PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()),
+            out_dir: PathBuf::from(env::var("OUT_DIR").unwrap()),
+            opt_level: env::var("OPT_LEVEL").unwrap(),
+            cargo_features,
+            target_arch: env::var("CARGO_CFG_TARGET_ARCH").unwrap(),
+            target_env: env::var("CARGO_CFG_TARGET_ENV").unwrap(),
+            target_family: env::var("CARGO_CFG_TARGET_FAMILY").ok(),
+            target_os: env::var("CARGO_CFG_TARGET_OS").unwrap(),
+            target_string: env::var("TARGET").unwrap(),
+            target_vendor: env::var("CARGO_CFG_TARGET_VENDOR").unwrap(),
+            target_features,
+        }
+    }
+}
+
+/// Most config options are made available to libm.
+#[allow(dead_code)]
+pub fn emit_libm_config(cfg: &Config) {
+    emit_intrinsics_cfg();
+    emit_arch_cfg();
+    emit_optimization_cfg(cfg);
+    emit_cfg_shorthands(cfg);
+    emit_cfg_env(cfg);
+    emit_f16_f128_cfg(cfg);
+}
+
+/// Tests don't need most feature-related config.
+#[allow(dead_code)]
+pub fn emit_test_config(cfg: &Config) {
+    emit_optimization_cfg(cfg);
+    emit_cfg_shorthands(cfg);
+    emit_cfg_env(cfg);
+    emit_f16_f128_cfg(cfg);
+}
+
+/// Simplify the feature logic for enabling intrinsics so code only needs to use
+/// `cfg(intrinsics_enabled)`.
+fn emit_intrinsics_cfg() {
+    println!("cargo:rustc-check-cfg=cfg(intrinsics_enabled)");
+
+    // Disabled by default; `unstable-intrinsics` enables again; `force-soft-floats` overrides
+    // to disable.
+    if cfg!(feature = "unstable-intrinsics") && !cfg!(feature = "force-soft-floats") {
+        println!("cargo:rustc-cfg=intrinsics_enabled");
+    }
+}
+
+/// Simplify the feature logic for enabling arch-specific features so code only needs to use
+/// `cfg(arch_enabled)`.
+fn emit_arch_cfg() {
+    println!("cargo:rustc-check-cfg=cfg(arch_enabled)");
+
+    // Enabled by default via the "arch" feature, `force-soft-floats` overrides to disable.
+    if cfg!(feature = "arch") && !cfg!(feature = "force-soft-floats") {
+        println!("cargo:rustc-cfg=arch_enabled");
+    }
+}
+
+/// Some tests are extremely slow. Emit a config option based on optimization level.
+fn emit_optimization_cfg(cfg: &Config) {
+    println!("cargo:rustc-check-cfg=cfg(optimizations_enabled)");
+
+    if !matches!(cfg.opt_level.as_str(), "0" | "1") {
+        println!("cargo:rustc-cfg=optimizations_enabled");
+    }
+}
+
+/// Provide an alias for common longer config combinations.
+fn emit_cfg_shorthands(cfg: &Config) {
+    println!("cargo:rustc-check-cfg=cfg(x86_no_sse)");
+    if cfg.target_arch == "x86" && !cfg.target_features.iter().any(|f| f == "sse") {
+        // Shorthand to detect i586 targets
+        println!("cargo:rustc-cfg=x86_no_sse");
+    }
+}
+
+/// Reemit config that we make use of for test logging.
+fn emit_cfg_env(cfg: &Config) {
+    println!(
+        "cargo:rustc-env=CFG_CARGO_FEATURES={:?}",
+        cfg.cargo_features
+    );
+    println!("cargo:rustc-env=CFG_OPT_LEVEL={}", cfg.opt_level);
+    println!(
+        "cargo:rustc-env=CFG_TARGET_FEATURES={:?}",
+        cfg.target_features
+    );
+}
+
+/// Configure whether or not `f16` and `f128` support should be enabled.
+fn emit_f16_f128_cfg(cfg: &Config) {
+    println!("cargo:rustc-check-cfg=cfg(f16_enabled)");
+    println!("cargo:rustc-check-cfg=cfg(f128_enabled)");
+
+    // `unstable-float` enables these features.
+    if !cfg!(feature = "unstable-float") {
+        return;
+    }
+
+    // Set whether or not `f16` and `f128` are supported at a basic level by LLVM. This only means
+    // that the backend will not crash when using these types and generates code that can be called
+    // without crashing (no infinite recursion). This does not mean that the platform doesn't have
+    // ABI or other bugs.
+    //
+    // We do this here rather than in `rust-lang/rust` because configuring via cargo features is
+    // not straightforward.
+    //
+    // Original source of this list:
+    // <https://github.com/rust-lang/compiler-builtins/pull/652#issuecomment-2266151350>
+    let f16_enabled = match cfg.target_arch.as_str() {
+        // Unsupported <https://github.com/llvm/llvm-project/issues/94434>
+        "arm64ec" => false,
+        // Selection failure <https://github.com/llvm/llvm-project/issues/50374>
+        "s390x" => false,
+        // Infinite recursion <https://github.com/llvm/llvm-project/issues/97981>
+        // FIXME(llvm): loongarch fixed by <https://github.com/llvm/llvm-project/pull/107791>
+        "csky" => false,
+        "hexagon" => false,
+        "loongarch64" => false,
+        "mips" | "mips64" | "mips32r6" | "mips64r6" => false,
+        "powerpc" | "powerpc64" => false,
+        "sparc" | "sparc64" => false,
+        "wasm32" | "wasm64" => false,
+        // Most everything else works as of LLVM 19
+        _ => true,
+    };
+
+    let f128_enabled = match cfg.target_arch.as_str() {
+        // Unsupported (libcall is not supported) <https://github.com/llvm/llvm-project/issues/121122>
+        "amdgpu" => false,
+        // Unsupported <https://github.com/llvm/llvm-project/issues/94434>
+        "arm64ec" => false,
+        // Selection failure <https://github.com/llvm/llvm-project/issues/96432>
+        "mips64" | "mips64r6" => false,
+        // Selection failure <https://github.com/llvm/llvm-project/issues/95471>
+        "nvptx64" => false,
+        // Selection failure <https://github.com/llvm/llvm-project/issues/101545>
+        "powerpc64" if &cfg.target_os == "aix" => false,
+        // Selection failure <https://github.com/llvm/llvm-project/issues/41838>
+        "sparc" => false,
+        // Most everything else works as of LLVM 19
+        _ => true,
+    };
+
+    // If the feature is set, disable these types.
+    let disable_both = env::var_os("CARGO_FEATURE_NO_F16_F128").is_some();
+
+    println!("cargo:rustc-check-cfg=cfg(f16_enabled)");
+    println!("cargo:rustc-check-cfg=cfg(f128_enabled)");
+
+    if f16_enabled && !disable_both {
+        println!("cargo:rustc-cfg=f16_enabled");
+    }
+
+    if f128_enabled && !disable_both {
+        println!("cargo:rustc-cfg=f128_enabled");
+    }
+}
diff --git a/library/compiler-builtins/libm/src/lib.rs b/library/compiler-builtins/libm/src/lib.rs
new file mode 100644
index 00000000000..31b12235314
--- /dev/null
+++ b/library/compiler-builtins/libm/src/lib.rs
@@ -0,0 +1,33 @@
+//! libm in pure Rust
+#![no_std]
+#![cfg_attr(intrinsics_enabled, allow(internal_features))]
+#![cfg_attr(intrinsics_enabled, feature(core_intrinsics))]
+#![cfg_attr(
+    all(intrinsics_enabled, target_family = "wasm"),
+    feature(wasm_numeric_instr)
+)]
+#![cfg_attr(f128_enabled, feature(f128))]
+#![cfg_attr(f16_enabled, feature(f16))]
+#![allow(clippy::assign_op_pattern)]
+#![allow(clippy::deprecated_cfg_attr)]
+#![allow(clippy::eq_op)]
+#![allow(clippy::excessive_precision)]
+#![allow(clippy::float_cmp)]
+#![allow(clippy::int_plus_one)]
+#![allow(clippy::just_underscores_and_digits)]
+#![allow(clippy::many_single_char_names)]
+#![allow(clippy::mixed_case_hex_literals)]
+#![allow(clippy::needless_late_init)]
+#![allow(clippy::needless_return)]
+#![allow(clippy::unreadable_literal)]
+#![allow(clippy::zero_divided_by_zero)]
+#![forbid(unsafe_op_in_unsafe_fn)]
+
+mod libm_helper;
+mod math;
+
+use core::{f32, f64};
+
+pub use libm_helper::*;
+
+pub use self::math::*;
diff --git a/library/compiler-builtins/libm/src/libm_helper.rs b/library/compiler-builtins/libm/src/libm_helper.rs
new file mode 100644
index 00000000000..dfa1ff77bf2
--- /dev/null
+++ b/library/compiler-builtins/libm/src/libm_helper.rs
@@ -0,0 +1,244 @@
+use core::marker::PhantomData;
+
+use crate::*;
+
+/// Generic helper for libm functions, abstracting over f32 and f64. <br/>
+/// # Type Parameter:
+/// - `T`: Either `f32` or `f64`
+///
+/// # Examples
+/// ```rust
+/// use libm::{self, Libm};
+///
+/// const PI_F32: f32 = 3.1415927410e+00;
+/// const PI_F64: f64 = 3.1415926535897931160e+00;
+///
+/// assert!(Libm::<f32>::cos(0.0f32) == libm::cosf(0.0));
+/// assert!(Libm::<f32>::sin(PI_F32) == libm::sinf(PI_F32));
+///
+/// assert!(Libm::<f64>::cos(0.0f64) == libm::cos(0.0));
+/// assert!(Libm::<f64>::sin(PI_F64) == libm::sin(PI_F64));
+/// ```
+pub struct Libm<T>(PhantomData<T>);
+
+macro_rules! libm_helper {
+    ($t:ident, funcs: $funcs:tt) => {
+        impl Libm<$t> {
+            #![allow(unused_parens)]
+
+            libm_helper! { $funcs }
+        }
+    };
+
+    ({$($func:tt;)*}) => {
+        $(
+            libm_helper! { $func }
+        )*
+    };
+
+    ((fn $func:ident($($arg:ident: $arg_typ:ty),*) -> ($($ret_typ:ty),*); => $libm_fn:ident)) => {
+        #[inline(always)]
+        pub fn $func($($arg: $arg_typ),*) -> ($($ret_typ),*) {
+            $libm_fn($($arg),*)
+        }
+    };
+}
+
+// verify-apilist-start
+libm_helper! {
+    f32,
+    funcs: {
+        // verify-sorted-start
+        (fn acos(x: f32) -> (f32);                  => acosf);
+        (fn acosh(x: f32) -> (f32);                 => acoshf);
+        (fn asin(x: f32) -> (f32);                  => asinf);
+        (fn asinh(x: f32) -> (f32);                 => asinhf);
+        (fn atan(x: f32) -> (f32);                  => atanf);
+        (fn atan2(y: f32, x: f32) -> (f32);         => atan2f);
+        (fn atanh(x: f32) -> (f32);                 => atanhf);
+        (fn cbrt(x: f32) -> (f32);                  => cbrtf);
+        (fn ceil(x: f32) -> (f32);                  => ceilf);
+        (fn copysign(x: f32, y: f32) -> (f32);      => copysignf);
+        (fn cos(x: f32) -> (f32);                   => cosf);
+        (fn cosh(x: f32) -> (f32);                  => coshf);
+        (fn erf(x: f32) -> (f32);                   => erff);
+        (fn erfc(x: f32) -> (f32);                  => erfcf);
+        (fn exp(x: f32) -> (f32);                   => expf);
+        (fn exp10(x: f32) -> (f32);                 => exp10f);
+        (fn exp2(x: f32) -> (f32);                  => exp2f);
+        (fn expm1(x: f32) -> (f32);                 => expm1f);
+        (fn fabs(x: f32) -> (f32);                  => fabsf);
+        (fn fdim(x: f32, y: f32) -> (f32);          => fdimf);
+        (fn floor(x: f32) -> (f32);                 => floorf);
+        (fn fma(x: f32, y: f32, z: f32) -> (f32);   => fmaf);
+        (fn fmax(x: f32, y: f32) -> (f32);          => fmaxf);
+        (fn fmin(x: f32, y: f32) -> (f32);          => fminf);
+        (fn fmod(x: f32, y: f32) -> (f32);          => fmodf);
+        (fn frexp(x: f32) -> (f32, i32);            => frexpf);
+        (fn hypot(x: f32, y: f32) -> (f32);         => hypotf);
+        (fn ilogb(x: f32) -> (i32);                 => ilogbf);
+        (fn j0(x: f32) -> (f32);                    => j0f);
+        (fn j1(x: f32) -> (f32);                    => j1f);
+        (fn jn(n: i32, x: f32) -> (f32);            => jnf);
+        (fn ldexp(x: f32, n: i32) -> (f32);         => ldexpf);
+        (fn lgamma(x: f32) -> (f32);                => lgammaf);
+        (fn lgamma_r(x: f32) -> (f32, i32);         => lgammaf_r);
+        (fn log(x: f32) -> (f32);                   => logf);
+        (fn log10(x: f32) -> (f32);                 => log10f);
+        (fn log1p(x: f32) -> (f32);                 => log1pf);
+        (fn log2(x: f32) -> (f32);                  => log2f);
+        (fn modf(x: f32) -> (f32, f32);             => modff);
+        (fn nextafter(x: f32, y: f32) -> (f32);     => nextafterf);
+        (fn pow(x: f32, y: f32) -> (f32);           => powf);
+        (fn remainder(x: f32, y: f32) -> (f32);     => remainderf);
+        (fn remquo(x: f32, y: f32) -> (f32, i32);   => remquof);
+        (fn rint(x: f32) -> (f32);                  => rintf);
+        (fn round(x: f32) -> (f32);                 => roundf);
+        (fn roundeven(x: f32) -> (f32);             => roundevenf);
+        (fn scalbn(x: f32, n: i32) -> (f32);        => scalbnf);
+        (fn sin(x: f32) -> (f32);                   => sinf);
+        (fn sincos(x: f32) -> (f32, f32);           => sincosf);
+        (fn sinh(x: f32) -> (f32);                  => sinhf);
+        (fn sqrt(x: f32) -> (f32);                  => sqrtf);
+        (fn tan(x: f32) -> (f32);                   => tanf);
+        (fn tanh(x: f32) -> (f32);                  => tanhf);
+        (fn tgamma(x: f32) -> (f32);                => tgammaf);
+        (fn trunc(x: f32) -> (f32);                 => truncf);
+        (fn y0(x: f32) -> (f32);                    => y0f);
+        (fn y1(x: f32) -> (f32);                    => y1f);
+        (fn yn(n: i32, x: f32) -> (f32);            => ynf);
+        // verify-sorted-end
+    }
+}
+
+libm_helper! {
+    f64,
+    funcs: {
+        // verify-sorted-start
+        (fn acos(x: f64) -> (f64);                  => acos);
+        (fn acosh(x: f64) -> (f64);                 => acosh);
+        (fn asin(x: f64) -> (f64);                  => asin);
+        (fn asinh(x: f64) -> (f64);                 => asinh);
+        (fn atan(x: f64) -> (f64);                  => atan);
+        (fn atan2(y: f64, x: f64) -> (f64);         => atan2);
+        (fn atanh(x: f64) -> (f64);                 => atanh);
+        (fn cbrt(x: f64) -> (f64);                  => cbrt);
+        (fn ceil(x: f64) -> (f64);                  => ceil);
+        (fn copysign(x: f64, y: f64) -> (f64);      => copysign);
+        (fn cos(x: f64) -> (f64);                   => cos);
+        (fn cosh(x: f64) -> (f64);                  => cosh);
+        (fn erf(x: f64) -> (f64);                   => erf);
+        (fn erfc(x: f64) -> (f64);                  => erfc);
+        (fn exp(x: f64) -> (f64);                   => exp);
+        (fn exp10(x: f64) -> (f64);                 => exp10);
+        (fn exp2(x: f64) -> (f64);                  => exp2);
+        (fn expm1(x: f64) -> (f64);                 => expm1);
+        (fn fabs(x: f64) -> (f64);                  => fabs);
+        (fn fdim(x: f64, y: f64) -> (f64);          => fdim);
+        (fn floor(x: f64) -> (f64);                 => floor);
+        (fn fma(x: f64, y: f64, z: f64) -> (f64);   => fma);
+        (fn fmax(x: f64, y: f64) -> (f64);          => fmax);
+        (fn fmaximum(x: f64, y: f64) -> (f64);      => fmaximum);
+        (fn fmaximum_num(x: f64, y: f64) -> (f64);  => fmaximum_num);
+        (fn fmaximum_numf(x: f32, y: f32) -> (f32); => fmaximum_numf);
+        (fn fmaximumf(x: f32, y: f32) -> (f32);     => fmaximumf);
+        (fn fmin(x: f64, y: f64) -> (f64);          => fmin);
+        (fn fminimum(x: f64, y: f64) -> (f64);      => fminimum);
+        (fn fminimum_num(x: f64, y: f64) -> (f64);  => fminimum_num);
+        (fn fminimum_numf(x: f32, y: f32) -> (f32); => fminimum_numf);
+        (fn fminimumf(x: f32, y: f32) -> (f32);     => fminimumf);
+        (fn fmod(x: f64, y: f64) -> (f64);          => fmod);
+        (fn frexp(x: f64) -> (f64, i32);            => frexp);
+        (fn hypot(x: f64, y: f64) -> (f64);         => hypot);
+        (fn ilogb(x: f64) -> (i32);                 => ilogb);
+        (fn j0(x: f64) -> (f64);                    => j0);
+        (fn j1(x: f64) -> (f64);                    => j1);
+        (fn jn(n: i32, x: f64) -> (f64);            => jn);
+        (fn ldexp(x: f64, n: i32) -> (f64);         => ldexp);
+        (fn lgamma(x: f64) -> (f64);                => lgamma);
+        (fn lgamma_r(x: f64) -> (f64, i32);         => lgamma_r);
+        (fn log(x: f64) -> (f64);                   => log);
+        (fn log10(x: f64) -> (f64);                 => log10);
+        (fn log1p(x: f64) -> (f64);                 => log1p);
+        (fn log2(x: f64) -> (f64);                  => log2);
+        (fn modf(x: f64) -> (f64, f64);             => modf);
+        (fn nextafter(x: f64, y: f64) -> (f64);     => nextafter);
+        (fn pow(x: f64, y: f64) -> (f64);           => pow);
+        (fn remainder(x: f64, y: f64) -> (f64);     => remainder);
+        (fn remquo(x: f64, y: f64) -> (f64, i32);   => remquo);
+        (fn rint(x: f64) -> (f64);                  => rint);
+        (fn round(x: f64) -> (f64);                 => round);
+        (fn roundeven(x: f64) -> (f64);             => roundeven);
+        (fn scalbn(x: f64, n: i32) -> (f64);        => scalbn);
+        (fn sin(x: f64) -> (f64);                   => sin);
+        (fn sincos(x: f64) -> (f64, f64);           => sincos);
+        (fn sinh(x: f64) -> (f64);                  => sinh);
+        (fn sqrt(x: f64) -> (f64);                  => sqrt);
+        (fn tan(x: f64) -> (f64);                   => tan);
+        (fn tanh(x: f64) -> (f64);                  => tanh);
+        (fn tgamma(x: f64) -> (f64);                => tgamma);
+        (fn trunc(x: f64) -> (f64);                 => trunc);
+        (fn y0(x: f64) -> (f64);                    => y0);
+        (fn y1(x: f64) -> (f64);                    => y1);
+        (fn yn(n: i32, x: f64) -> (f64);            => yn);
+        // verify-sorted-end
+    }
+}
+
+#[cfg(f16_enabled)]
+libm_helper! {
+    f16,
+    funcs: {
+        // verify-sorted-start
+        (fn ceil(x: f16) -> (f16);                  => ceilf16);
+        (fn copysign(x: f16, y: f16) -> (f16);      => copysignf16);
+        (fn fabs(x: f16) -> (f16);                  => fabsf16);
+        (fn fdim(x: f16, y: f16) -> (f16);          => fdimf16);
+        (fn floor(x: f16) -> (f16);                 => floorf16);
+        (fn fmax(x: f16, y: f16) -> (f16);          => fmaxf16);
+        (fn fmaximum_num(x: f16, y: f16) -> (f16);  => fmaximum_numf16);
+        (fn fmaximumf16(x: f16, y: f16) -> (f16);   => fmaximumf16);
+        (fn fmin(x: f16, y: f16) -> (f16);          => fminf16);
+        (fn fminimum(x: f16, y: f16) -> (f16);      => fminimumf16);
+        (fn fminimum_num(x: f16, y: f16) -> (f16);  => fminimum_numf16);
+        (fn fmod(x: f16, y: f16) -> (f16);          => fmodf16);
+        (fn ldexp(x: f16, n: i32) -> (f16);         => ldexpf16);
+        (fn rint(x: f16) -> (f16);                  => rintf16);
+        (fn round(x: f16) -> (f16);                 => roundf16);
+        (fn roundeven(x: f16) -> (f16);             => roundevenf16);
+        (fn scalbn(x: f16, n: i32) -> (f16);        => scalbnf16);
+        (fn sqrtf(x: f16) -> (f16);                 => sqrtf16);
+        (fn truncf(x: f16) -> (f16);                => truncf16);
+        // verify-sorted-end
+    }
+}
+
+#[cfg(f128_enabled)]
+libm_helper! {
+    f128,
+    funcs: {
+        // verify-sorted-start
+        (fn ceil(x: f128) -> (f128);                => ceilf128);
+        (fn copysign(x: f128, y: f128) -> (f128);   => copysignf128);
+        (fn fabs(x: f128) -> (f128);                => fabsf128);
+        (fn fdim(x: f128, y: f128) -> (f128);       => fdimf128);
+        (fn floor(x: f128) -> (f128);               => floorf128);
+        (fn fma(x: f128, y: f128, z: f128) -> (f128); => fmaf128);
+        (fn fmax(x: f128, y: f128) -> (f128);       => fmaxf128);
+        (fn fmaximum(x: f128, y: f128) -> (f128);      => fmaximumf128);
+        (fn fmaximum_num(x: f128, y: f128) -> (f128);  => fmaximum_numf128);
+        (fn fmin(x: f128, y: f128) -> (f128);       => fminf128);
+        (fn fminimum(x: f128, y: f128) -> (f128);      => fminimumf128);
+        (fn fminimum_num(x: f128, y: f128) -> (f128);  => fminimum_numf128);
+        (fn fmod(x: f128, y: f128) -> (f128);       => fmodf128);
+        (fn ldexp(x: f128, n: i32) -> (f128);       => ldexpf128);
+        (fn rint(x: f128) -> (f128);                => rintf128);
+        (fn round(x: f128) -> (f128);               => roundf128);
+        (fn roundeven(x: f128) -> (f128);           => roundevenf128);
+        (fn scalbn(x: f128, n: i32) -> (f128);      => scalbnf128);
+        (fn sqrt(x: f128) -> (f128);                => sqrtf128);
+        (fn trunc(x: f128) -> (f128);               => truncf128);
+        // verify-sorted-end
+    }
+}
+// verify-apilist-end
diff --git a/library/compiler-builtins/libm/src/math/acos.rs b/library/compiler-builtins/libm/src/math/acos.rs
new file mode 100644
index 00000000000..23b13251ee2
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/acos.rs
@@ -0,0 +1,112 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/e_acos.c */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunSoft, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+/* acos(x)
+ * Method :
+ *      acos(x)  = pi/2 - asin(x)
+ *      acos(-x) = pi/2 + asin(x)
+ * For |x|<=0.5
+ *      acos(x) = pi/2 - (x + x*x^2*R(x^2))     (see asin.c)
+ * For x>0.5
+ *      acos(x) = pi/2 - (pi/2 - 2asin(sqrt((1-x)/2)))
+ *              = 2asin(sqrt((1-x)/2))
+ *              = 2s + 2s*z*R(z)        ...z=(1-x)/2, s=sqrt(z)
+ *              = 2f + (2c + 2s*z*R(z))
+ *     where f=hi part of s, and c = (z-f*f)/(s+f) is the correction term
+ *     for f so that f+c ~ sqrt(z).
+ * For x<-0.5
+ *      acos(x) = pi - 2asin(sqrt((1-|x|)/2))
+ *              = pi - 0.5*(s+s*z*R(z)), where z=(1-|x|)/2,s=sqrt(z)
+ *
+ * Special cases:
+ *      if x is NaN, return x itself;
+ *      if |x|>1, return NaN with invalid signal.
+ *
+ * Function needed: sqrt
+ */
+
+use super::sqrt;
+
+const PIO2_HI: f64 = 1.57079632679489655800e+00; /* 0x3FF921FB, 0x54442D18 */
+const PIO2_LO: f64 = 6.12323399573676603587e-17; /* 0x3C91A626, 0x33145C07 */
+const PS0: f64 = 1.66666666666666657415e-01; /* 0x3FC55555, 0x55555555 */
+const PS1: f64 = -3.25565818622400915405e-01; /* 0xBFD4D612, 0x03EB6F7D */
+const PS2: f64 = 2.01212532134862925881e-01; /* 0x3FC9C155, 0x0E884455 */
+const PS3: f64 = -4.00555345006794114027e-02; /* 0xBFA48228, 0xB5688F3B */
+const PS4: f64 = 7.91534994289814532176e-04; /* 0x3F49EFE0, 0x7501B288 */
+const PS5: f64 = 3.47933107596021167570e-05; /* 0x3F023DE1, 0x0DFDF709 */
+const QS1: f64 = -2.40339491173441421878e+00; /* 0xC0033A27, 0x1C8A2D4B */
+const QS2: f64 = 2.02094576023350569471e+00; /* 0x40002AE5, 0x9C598AC8 */
+const QS3: f64 = -6.88283971605453293030e-01; /* 0xBFE6066C, 0x1B8D0159 */
+const QS4: f64 = 7.70381505559019352791e-02; /* 0x3FB3B8C5, 0xB12E9282 */
+
+fn r(z: f64) -> f64 {
+    let p: f64 = z * (PS0 + z * (PS1 + z * (PS2 + z * (PS3 + z * (PS4 + z * PS5)))));
+    let q: f64 = 1.0 + z * (QS1 + z * (QS2 + z * (QS3 + z * QS4)));
+    p / q
+}
+
+/// Arccosine (f64)
+///
+/// Computes the inverse cosine (arc cosine) of the input value.
+/// Arguments must be in the range -1 to 1.
+/// Returns values in radians, in the range of 0 to pi.
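+///
+/// # Examples
+///
+/// An illustrative check (this assumes the crate-level `acos` re-export; it is
+/// not part of the original documentation):
+///
+/// ```rust
+/// assert_eq!(libm::acos(1.0), 0.0);
+/// assert!((libm::acos(-1.0) - core::f64::consts::PI).abs() < 1e-9);
+/// ```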
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn acos(x: f64) -> f64 {
+    let x1p_120f = f64::from_bits(0x3870000000000000); // 0x1p-120 === 2 ^ -120
+    let z: f64;
+    let w: f64;
+    let s: f64;
+    let c: f64;
+    let df: f64;
+    let hx: u32;
+    let ix: u32;
+
+    hx = (x.to_bits() >> 32) as u32;
+    ix = hx & 0x7fffffff;
+    /* |x| >= 1 or nan */
+    if ix >= 0x3ff00000 {
+        let lx: u32 = x.to_bits() as u32;
+
+        if ((ix - 0x3ff00000) | lx) == 0 {
+            /* acos(1)=0, acos(-1)=pi */
+            if (hx >> 31) != 0 {
+                return 2. * PIO2_HI + x1p_120f;
+            }
+            return 0.;
+        }
+        return 0. / (x - x);
+    }
+    /* |x| < 0.5 */
+    if ix < 0x3fe00000 {
+        if ix <= 0x3c600000 {
+            /* |x| < 2**-57 */
+            return PIO2_HI + x1p_120f;
+        }
+        return PIO2_HI - (x - (PIO2_LO - x * r(x * x)));
+    }
+    /* x < -0.5 */
+    if (hx >> 31) != 0 {
+        z = (1.0 + x) * 0.5;
+        s = sqrt(z);
+        w = r(z) * s - PIO2_LO;
+        return 2. * (PIO2_HI - (s + w));
+    }
+    /* x > 0.5 */
+    z = (1.0 - x) * 0.5;
+    s = sqrt(z);
+    // Set the low 4 bytes to zero
+    df = f64::from_bits(s.to_bits() & 0xff_ff_ff_ff_00_00_00_00);
+
+    c = (z - df * df) / (s + df);
+    w = r(z) * s + c;
+    2. * (df + w)
+}
diff --git a/library/compiler-builtins/libm/src/math/acosf.rs b/library/compiler-builtins/libm/src/math/acosf.rs
new file mode 100644
index 00000000000..dd88eea5b13
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/acosf.rs
@@ -0,0 +1,79 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/e_acosf.c */
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
+ */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+use super::sqrt::sqrtf;
+
+const PIO2_HI: f32 = 1.5707962513e+00; /* 0x3fc90fda */
+const PIO2_LO: f32 = 7.5497894159e-08; /* 0x33a22168 */
+const P_S0: f32 = 1.6666586697e-01;
+const P_S1: f32 = -4.2743422091e-02;
+const P_S2: f32 = -8.6563630030e-03;
+const Q_S1: f32 = -7.0662963390e-01;
+
+fn r(z: f32) -> f32 {
+    let p = z * (P_S0 + z * (P_S1 + z * P_S2));
+    let q = 1. + z * Q_S1;
+    p / q
+}
+
+/// Arccosine (f32)
+///
+/// Computes the inverse cosine (arc cosine) of the input value.
+/// Arguments must be in the range -1 to 1.
+/// Returns values in radians, in the range of 0 to pi.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn acosf(x: f32) -> f32 {
+    let x1p_120 = f32::from_bits(0x03800000); // 0x1p-120 === 2 ^ (-120)
+
+    let z: f32;
+    let w: f32;
+    let s: f32;
+
+    let mut hx = x.to_bits();
+    let ix = hx & 0x7fffffff;
+    /* |x| >= 1 or nan */
+    if ix >= 0x3f800000 {
+        if ix == 0x3f800000 {
+            if (hx >> 31) != 0 {
+                return 2. * PIO2_HI + x1p_120;
+            }
+            return 0.;
+        }
+        return 0. / (x - x);
+    }
+    /* |x| < 0.5 */
+    if ix < 0x3f000000 {
+        if ix <= 0x32800000 {
+            /* |x| < 2**-26 */
+            return PIO2_HI + x1p_120;
+        }
+        return PIO2_HI - (x - (PIO2_LO - x * r(x * x)));
+    }
+    /* x < -0.5 */
+    if (hx >> 31) != 0 {
+        z = (1. + x) * 0.5;
+        s = sqrtf(z);
+        w = r(z) * s - PIO2_LO;
+        return 2. * (PIO2_HI - (s + w));
+    }
+    /* x > 0.5 */
+    z = (1. - x) * 0.5;
+    s = sqrtf(z);
+    hx = s.to_bits();
+    let df = f32::from_bits(hx & 0xfffff000);
+    let c = (z - df * df) / (s + df);
+    w = r(z) * s + c;
+    2. * (df + w)
+}
diff --git a/library/compiler-builtins/libm/src/math/acosh.rs b/library/compiler-builtins/libm/src/math/acosh.rs
new file mode 100644
index 00000000000..d1f5b9fa937
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/acosh.rs
@@ -0,0 +1,27 @@
+use super::{log, log1p, sqrt};
+
+const LN2: f64 = 0.693147180559945309417232121458176568; /* 0x3fe62e42,  0xfefa39ef*/
+
+/// Inverse hyperbolic cosine (f64)
+///
+/// Calculates the inverse hyperbolic cosine of `x`.
+/// Is defined as `log(x + sqrt(x*x-1))`.
+/// `x` must be a number greater than or equal to 1.
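+///
+/// # Examples
+///
+/// A small illustrative check (assuming the crate-level `acosh` re-export; not
+/// part of the original documentation):
+///
+/// ```rust
+/// assert_eq!(libm::acosh(1.0), 0.0);
+/// ```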
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn acosh(x: f64) -> f64 {
+    let u = x.to_bits();
+    let e = ((u >> 52) as usize) & 0x7ff;
+
+    /* x < 1 domain error is handled in the called functions */
+
+    if e < 0x3ff + 1 {
+        /* |x| < 2, up to 2ulp error in [1,1.125] */
+        return log1p(x - 1.0 + sqrt((x - 1.0) * (x - 1.0) + 2.0 * (x - 1.0)));
+    }
+    if e < 0x3ff + 26 {
+        /* |x| < 0x1p26 */
+        return log(2.0 * x - 1.0 / (x + sqrt(x * x - 1.0)));
+    }
+    /* |x| >= 0x1p26 or nan */
+    return log(x) + LN2;
+}
diff --git a/library/compiler-builtins/libm/src/math/acoshf.rs b/library/compiler-builtins/libm/src/math/acoshf.rs
new file mode 100644
index 00000000000..ad3455fdd48
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/acoshf.rs
@@ -0,0 +1,26 @@
+use super::{log1pf, logf, sqrtf};
+
+const LN2: f32 = 0.693147180559945309417232121458176568;
+
+/// Inverse hyperbolic cosine (f32)
+///
+/// Calculates the inverse hyperbolic cosine of `x`.
+/// Is defined as `log(x + sqrt(x*x-1))`.
+/// `x` must be a number greater than or equal to 1.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn acoshf(x: f32) -> f32 {
+    let u = x.to_bits();
+    let a = u & 0x7fffffff;
+
+    if a < 0x3f800000 + (1 << 23) {
+        /* |x| < 2, invalid if x < 1 or nan */
+        /* up to 2ulp error in [1,1.125] */
+        return log1pf(x - 1.0 + sqrtf((x - 1.0) * (x - 1.0) + 2.0 * (x - 1.0)));
+    }
+    if a < 0x3f800000 + (12 << 23) {
+        /* |x| < 0x1p12 */
+        return logf(2.0 * x - 1.0 / (x + sqrtf(x * x - 1.0)));
+    }
+    /* x >= 0x1p12 */
+    return logf(x) + LN2;
+}
diff --git a/library/compiler-builtins/libm/src/math/arch/aarch64.rs b/library/compiler-builtins/libm/src/math/arch/aarch64.rs
new file mode 100644
index 00000000000..020bb731cdc
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/arch/aarch64.rs
@@ -0,0 +1,115 @@
+//! Architecture-specific support for aarch64 with neon.
+
+use core::arch::asm;
+
+pub fn fma(mut x: f64, y: f64, z: f64) -> f64 {
+    // SAFETY: `fmadd` is available with neon and has no side effects.
+    unsafe {
+        asm!(
+            "fmadd {x:d}, {x:d}, {y:d}, {z:d}",
+            x = inout(vreg) x,
+            y = in(vreg) y,
+            z = in(vreg) z,
+            options(nomem, nostack, pure)
+        );
+    }
+    x
+}
+
+pub fn fmaf(mut x: f32, y: f32, z: f32) -> f32 {
+    // SAFETY: `fmadd` is available with neon and has no side effects.
+    unsafe {
+        asm!(
+            "fmadd {x:s}, {x:s}, {y:s}, {z:s}",
+            x = inout(vreg) x,
+            y = in(vreg) y,
+            z = in(vreg) z,
+            options(nomem, nostack, pure)
+        );
+    }
+    x
+}
+
+pub fn rint(mut x: f64) -> f64 {
+    // SAFETY: `frintn` is available with neon and has no side effects.
+    //
+    // `frintn` is always round-to-nearest which does not match the C specification, but Rust does
+    // not support rounding modes.
+    unsafe {
+        asm!(
+            "frintn {x:d}, {x:d}",
+            x = inout(vreg) x,
+            options(nomem, nostack, pure)
+        );
+    }
+    x
+}
+
+pub fn rintf(mut x: f32) -> f32 {
+    // SAFETY: `frintn` is available with neon and has no side effects.
+    //
+    // `frintn` is always round-to-nearest which does not match the C specification, but Rust does
+    // not support rounding modes.
+    unsafe {
+        asm!(
+            "frintn {x:s}, {x:s}",
+            x = inout(vreg) x,
+            options(nomem, nostack, pure)
+        );
+    }
+    x
+}
+
+#[cfg(all(f16_enabled, target_feature = "fp16"))]
+pub fn rintf16(mut x: f16) -> f16 {
+    // SAFETY: `frintn` is available for `f16` with `fp16` (implies `neon`) and has no side effects.
+    //
+    // `frintn` is always round-to-nearest which does not match the C specification, but Rust does
+    // not support rounding modes.
+    unsafe {
+        asm!(
+            "frintn {x:h}, {x:h}",
+            x = inout(vreg) x,
+            options(nomem, nostack, pure)
+        );
+    }
+    x
+}
+
+pub fn sqrt(mut x: f64) -> f64 {
+    // SAFETY: `fsqrt` is available with neon and has no side effects.
+    unsafe {
+        asm!(
+            "fsqrt {x:d}, {x:d}",
+            x = inout(vreg) x,
+            options(nomem, nostack, pure)
+        );
+    }
+    x
+}
+
+pub fn sqrtf(mut x: f32) -> f32 {
+    // SAFETY: `fsqrt` is available with neon and has no side effects.
+    unsafe {
+        asm!(
+            "fsqrt {x:s}, {x:s}",
+            x = inout(vreg) x,
+            options(nomem, nostack, pure)
+        );
+    }
+    x
+}
+
+#[cfg(all(f16_enabled, target_feature = "fp16"))]
+pub fn sqrtf16(mut x: f16) -> f16 {
+    // SAFETY: `fsqrt` is available for `f16` with `fp16` (implies `neon`) and has no
+    // side effects.
+    unsafe {
+        asm!(
+            "fsqrt {x:h}, {x:h}",
+            x = inout(vreg) x,
+            options(nomem, nostack, pure)
+        );
+    }
+    x
+}
diff --git a/library/compiler-builtins/libm/src/math/arch/i586.rs b/library/compiler-builtins/libm/src/math/arch/i586.rs
new file mode 100644
index 00000000000..f92b9a2af71
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/arch/i586.rs
@@ -0,0 +1,37 @@
+//! Architecture-specific support for x86-32 without SSE2
+
+use super::super::fabs;
+
+/// Use an alternative implementation on x86, because the
+/// main implementation fails with the x87 FPU used by
+/// debian i386, probably due to excess precision issues.
+/// Basic implementation taken from https://github.com/rust-lang/libm/issues/219.
+pub fn ceil(x: f64) -> f64 {
+    if fabs(x).to_bits() < 4503599627370496.0_f64.to_bits() {
+        let truncated = x as i64 as f64;
+        if truncated < x {
+            return truncated + 1.0;
+        } else {
+            return truncated;
+        }
+    } else {
+        return x;
+    }
+}
+
+/// Use an alternative implementation on x86, because the
+/// main implementation fails with the x87 FPU used by
+/// debian i386, probably due to excess precision issues.
+/// Basic implementation taken from https://github.com/rust-lang/libm/issues/219.
+pub fn floor(x: f64) -> f64 {
+    if fabs(x).to_bits() < 4503599627370496.0_f64.to_bits() {
+        let truncated = x as i64 as f64;
+        if truncated > x {
+            return truncated - 1.0;
+        } else {
+            return truncated;
+        }
+    } else {
+        return x;
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/arch/mod.rs b/library/compiler-builtins/libm/src/math/arch/mod.rs
new file mode 100644
index 00000000000..984ae7f3129
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/arch/mod.rs
@@ -0,0 +1,50 @@
+//! Architecture-specific routines and operations.
+//!
+//! LLVM will already optimize calls to some of these in cases that there are hardware
+//! instructions. Providing an implementation here just ensures that the faster implementation
+//! is used when calling the function directly. This helps anyone who uses `libm` directly, as
+//! well as improving things when these routines are called as part of other implementations.
+
+// Most implementations should be defined here, to ensure they are not made available when
+// soft floats are required.
+#[cfg(arch_enabled)]
+cfg_if! {
+    if #[cfg(all(target_arch = "wasm32", intrinsics_enabled))] {
+        mod wasm32;
+        pub use wasm32::{
+            ceil, ceilf, fabs, fabsf, floor, floorf, rint, rintf, sqrt, sqrtf, trunc, truncf,
+        };
+    } else if #[cfg(target_feature = "sse2")] {
+        mod x86;
+        pub use x86::{sqrt, sqrtf, fma, fmaf};
+    } else if #[cfg(all(
+        any(target_arch = "aarch64", target_arch = "arm64ec"),
+        target_feature = "neon"
+    ))] {
+        mod aarch64;
+
+        pub use aarch64::{
+            fma,
+            fmaf,
+            rint,
+            rintf,
+            sqrt,
+            sqrtf,
+        };
+
+        #[cfg(all(f16_enabled, target_feature = "fp16"))]
+        pub use aarch64::{
+            rintf16,
+            sqrtf16,
+        };
+    }
+}
+
+// There are certain architecture-specific implementations that are needed for correctness
+// even with `force-soft-float`. These are configured here.
+cfg_if! {
+    if #[cfg(all(target_arch = "x86", not(target_feature = "sse2")))] {
+        mod i586;
+        pub use i586::{ceil, floor};
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/arch/wasm32.rs b/library/compiler-builtins/libm/src/math/arch/wasm32.rs
new file mode 100644
index 00000000000..de80c8a5817
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/arch/wasm32.rs
@@ -0,0 +1,50 @@
+//! Wasm has builtins for simple float operations. Use the unstable `core::arch` intrinsics which
+//! are significantly faster than soft float operations.
+
+pub fn ceil(x: f64) -> f64 {
+    core::arch::wasm32::f64_ceil(x)
+}
+
+pub fn ceilf(x: f32) -> f32 {
+    core::arch::wasm32::f32_ceil(x)
+}
+
+pub fn fabs(x: f64) -> f64 {
+    x.abs()
+}
+
+pub fn fabsf(x: f32) -> f32 {
+    x.abs()
+}
+
+pub fn floor(x: f64) -> f64 {
+    core::arch::wasm32::f64_floor(x)
+}
+
+pub fn floorf(x: f32) -> f32 {
+    core::arch::wasm32::f32_floor(x)
+}
+
+pub fn rint(x: f64) -> f64 {
+    core::arch::wasm32::f64_nearest(x)
+}
+
+pub fn rintf(x: f32) -> f32 {
+    core::arch::wasm32::f32_nearest(x)
+}
+
+pub fn sqrt(x: f64) -> f64 {
+    core::arch::wasm32::f64_sqrt(x)
+}
+
+pub fn sqrtf(x: f32) -> f32 {
+    core::arch::wasm32::f32_sqrt(x)
+}
+
+pub fn trunc(x: f64) -> f64 {
+    core::arch::wasm32::f64_trunc(x)
+}
+
+pub fn truncf(x: f32) -> f32 {
+    core::arch::wasm32::f32_trunc(x)
+}
diff --git a/library/compiler-builtins/libm/src/math/arch/x86.rs b/library/compiler-builtins/libm/src/math/arch/x86.rs
new file mode 100644
index 00000000000..454aa285074
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/arch/x86.rs
@@ -0,0 +1,32 @@
+//! Architecture-specific support for x86-32 and x86-64 with SSE2
+
+mod detect;
+mod fma;
+
+pub use fma::{fma, fmaf};
+
+pub fn sqrtf(mut x: f32) -> f32 {
+    // SAFETY: `sqrtss` is part of `sse2`, which this module is gated behind. It has no memory
+    // access or side effects.
+    unsafe {
+        core::arch::asm!(
+            "sqrtss {x}, {x}",
+            x = inout(xmm_reg) x,
+            options(nostack, nomem, pure),
+        )
+    };
+    x
+}
+
+pub fn sqrt(mut x: f64) -> f64 {
+    // SAFETY: `sqrtsd` is part of `sse2`, which this module is gated behind. It has no memory
+    // access or side effects.
+    unsafe {
+        core::arch::asm!(
+            "sqrtsd {x}, {x}",
+            x = inout(xmm_reg) x,
+            options(nostack, nomem, pure),
+        )
+    };
+    x
+}
diff --git a/library/compiler-builtins/libm/src/math/arch/x86/detect.rs b/library/compiler-builtins/libm/src/math/arch/x86/detect.rs
new file mode 100644
index 00000000000..e6d9b040bfa
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/arch/x86/detect.rs
@@ -0,0 +1,232 @@
+// Using runtime feature detection requires atomics. Currently there are no x86 targets
+// that support sse but not `AtomicPtr`.
+
+#[cfg(target_arch = "x86")]
+use core::arch::x86::{__cpuid, __cpuid_count, _xgetbv, CpuidResult};
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::{__cpuid, __cpuid_count, _xgetbv, CpuidResult};
+
+use crate::support::feature_detect::{Flags, get_or_init_flags_cache, unique_masks};
+
+/// CPU features that get cached (doesn't correlate to anything on the CPU).
+pub mod cpu_flags {
+    use super::unique_masks;
+
+    unique_masks! {
+        u32,
+        SSE3,
+        F16C,
+        SSE,
+        SSE2,
+        ERMSB,
+        MOVRS,
+        FMA,
+        FMA4,
+        AVX512FP16,
+        AVX512BF16,
+    }
+}
+
+/// Get CPU features, loading from a cache if available.
+pub fn get_cpu_features() -> Flags {
+    use core::sync::atomic::AtomicU32;
+    static CACHE: AtomicU32 = AtomicU32::new(0);
+    get_or_init_flags_cache(&CACHE, load_x86_features)
+}
+
+/// Read from cpuid and translate to a `Flags` instance, using `cpu_flags`.
+///
+/// Implementation is taken from [std-detect][std-detect].
+///
+/// [std-detect]: https://github.com/rust-lang/stdarch/blob/690b3a6334d482874163bd6fcef408e0518febe9/crates/std_detect/src/detect/os/x86.rs#L142
+fn load_x86_features() -> Flags {
+    let mut value = Flags::empty();
+
+    if cfg!(target_env = "sgx") {
+        // doesn't support this because it is untrusted data
+        return Flags::empty();
+    }
+
+    // Calling `__cpuid`/`__cpuid_count` from here on is safe because the CPU
+    // has `cpuid` support.
+
+    // 0. EAX = 0: Basic Information:
+    // - EAX returns the "Highest Function Parameter", that is, the maximum leaf
+    //   value for subsequent calls of `cpuinfo` in range [0, 0x8000_0000].
+    // - The vendor ID is stored in 12 u8 ascii chars, returned in EBX, EDX, and ECX
+    //   (in that order)
+    let mut vendor_id = [0u8; 12];
+    let max_basic_leaf;
+    unsafe {
+        let CpuidResult { eax, ebx, ecx, edx } = __cpuid(0);
+        max_basic_leaf = eax;
+        vendor_id[0..4].copy_from_slice(&ebx.to_ne_bytes());
+        vendor_id[4..8].copy_from_slice(&edx.to_ne_bytes());
+        vendor_id[8..12].copy_from_slice(&ecx.to_ne_bytes());
+    }
+
+    if max_basic_leaf < 1 {
+        // Earlier Intel 486, CPUID not implemented
+        return value;
+    }
+
+    // EAX = 1, ECX = 0: Queries "Processor Info and Feature Bits";
+    // Contains information about most x86 features.
+    let CpuidResult { ecx, edx, .. } = unsafe { __cpuid(0x0000_0001_u32) };
+    let proc_info_ecx = Flags::from_bits(ecx);
+    let proc_info_edx = Flags::from_bits(edx);
+
+    // EAX = 7: Queries "Extended Features";
+    // Contains information about bmi, bmi2, and avx2 support.
+    let mut extended_features_ebx = Flags::empty();
+    let mut extended_features_edx = Flags::empty();
+    let mut extended_features_eax_leaf_1 = Flags::empty();
+    if max_basic_leaf >= 7 {
+        let CpuidResult { ebx, edx, .. } = unsafe { __cpuid(0x0000_0007_u32) };
+        extended_features_ebx = Flags::from_bits(ebx);
+        extended_features_edx = Flags::from_bits(edx);
+
+        let CpuidResult { eax, .. } = unsafe { __cpuid_count(0x0000_0007_u32, 0x0000_0001_u32) };
+        extended_features_eax_leaf_1 = Flags::from_bits(eax)
+    }
+
+    // EAX = 0x8000_0000, ECX = 0: Get Highest Extended Function Supported
+    // - EAX returns the max leaf value for extended information, that is,
+    //   `cpuid` calls in range [0x8000_0000; u32::MAX]:
+    let extended_max_basic_leaf = unsafe { __cpuid(0x8000_0000_u32) }.eax;
+
+    // EAX = 0x8000_0001, ECX=0: Queries "Extended Processor Info and Feature Bits"
+    let mut extended_proc_info_ecx = Flags::empty();
+    if extended_max_basic_leaf >= 1 {
+        let CpuidResult { ecx, .. } = unsafe { __cpuid(0x8000_0001_u32) };
+        extended_proc_info_ecx = Flags::from_bits(ecx);
+    }
+
+    let mut enable = |regflags: Flags, regbit, flag| {
+        if regflags.test_nth(regbit) {
+            value.insert(flag);
+        }
+    };
+
+    enable(proc_info_ecx, 0, cpu_flags::SSE3);
+    enable(proc_info_ecx, 29, cpu_flags::F16C);
+    enable(proc_info_edx, 25, cpu_flags::SSE);
+    enable(proc_info_edx, 26, cpu_flags::SSE2);
+    enable(extended_features_ebx, 9, cpu_flags::ERMSB);
+    enable(extended_features_eax_leaf_1, 31, cpu_flags::MOVRS);
+
+    // `XSAVE` and `AVX` support:
+    let cpu_xsave = proc_info_ecx.test_nth(26);
+    if cpu_xsave {
+        // 0. Here the CPU supports `XSAVE`.
+
+        // 1. Detect `OSXSAVE`, that is, whether the OS is AVX enabled and
+        //    supports saving the state of the AVX/AVX2 vector registers on
+        //    context-switches, see:
+        //
+        // - [intel: is avx enabled?][is_avx_enabled],
+        // - [mozilla: sse.cpp][mozilla_sse_cpp].
+        //
+        // [is_avx_enabled]: https://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled
+        // [mozilla_sse_cpp]: https://hg.mozilla.org/mozilla-central/file/64bab5cbb9b6/mozglue/build/SSE.cpp#l190
+        let cpu_osxsave = proc_info_ecx.test_nth(27);
+
+        if cpu_osxsave {
+            // 2. The OS must have signaled the CPU that it supports saving and
+            // restoring the:
+            //
+            // * SSE -> `XCR0.SSE[1]`
+            // * AVX -> `XCR0.AVX[2]`
+            // * AVX-512 -> `XCR0.AVX-512[7:5]`.
+            // * AMX -> `XCR0.AMX[18:17]`
+            //
+            // by setting the corresponding bits of `XCR0` to `1`.
+            //
+            // This is safe because the CPU supports `xsave` and the OS has set `osxsave`.
+            let xcr0 = unsafe { _xgetbv(0) };
+            // Test `XCR0.SSE[1]` and `XCR0.AVX[2]` with the mask `0b110 == 6`:
+            let os_avx_support = xcr0 & 6 == 6;
+            // Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 0xe0`:
+            let os_avx512_support = xcr0 & 0xe0 == 0xe0;
+
+            // Enable `xsave` support only if both the OS and the CPU support
+            // saving/restoring the AVX registers:
+            if os_avx_support {
+                // See "13.3 ENABLING THE XSAVE FEATURE SET AND XSAVE-ENABLED
+                // FEATURES" in the "Intel® 64 and IA-32 Architectures Software
+                // Developer’s Manual, Volume 1: Basic Architecture":
+                //
+                // "Software enables the XSAVE feature set by setting
+                // CR4.OSXSAVE[bit 18] to 1 (e.g., with the MOV to CR4
+                // instruction). If this bit is 0, execution of any of XGETBV,
+                // XRSTOR, XRSTORS, XSAVE, XSAVEC, XSAVEOPT, XSAVES, and XSETBV
+                // causes an invalid-opcode exception (#UD)"
+
+                // FMA (uses 256-bit wide registers):
+                enable(proc_info_ecx, 12, cpu_flags::FMA);
+
+                // For AVX-512 the OS also needs to support saving/restoring
+                // the extended state, only then we enable AVX-512 support:
+                if os_avx512_support {
+                    enable(extended_features_edx, 23, cpu_flags::AVX512FP16);
+                    enable(extended_features_eax_leaf_1, 5, cpu_flags::AVX512BF16);
+                }
+            }
+        }
+    }
+
+    // Hygon Dhyana originates from AMD technology and shares most of the architecture with
+    // AMD's family 17h, but has a different CPU vendor ID ("HygonGenuine") and family series
+    // number (family 18h).
+    //
+    // For CPUID feature bits, Hygon Dhyana(family 18h) share the same definition with AMD
+    // family 17h.
+    //
+    // Related AMD CPUID specification is https://www.amd.com/system/files/TechDocs/25481.pdf
+    // (AMD64 Architecture Programmer's Manual, Appendix E).
+    // Related Hygon kernel patch can be found on
+    // http://lkml.kernel.org/r/5ce86123a7b9dad925ac583d88d2f921040e859b.1538583282.git.puwen@hygon.cn
+    if vendor_id == *b"AuthenticAMD" || vendor_id == *b"HygonGenuine" {
+        // These features are available on AMD arch CPUs:
+        enable(extended_proc_info_ecx, 16, cpu_flags::FMA4);
+    }
+
+    value
+}
+
+#[cfg(test)]
+mod tests {
+    extern crate std;
+    use std::is_x86_feature_detected;
+
+    use super::*;
+
+    #[test]
+    fn check_matches_std() {
+        let features = get_cpu_features();
+        for i in 0..cpu_flags::ALL.len() {
+            let flag = cpu_flags::ALL[i];
+            let name = cpu_flags::NAMES[i];
+
+            let std_detected = match flag {
+                cpu_flags::SSE3 => is_x86_feature_detected!("sse3"),
+                cpu_flags::F16C => is_x86_feature_detected!("f16c"),
+                cpu_flags::SSE => is_x86_feature_detected!("sse"),
+                cpu_flags::SSE2 => is_x86_feature_detected!("sse2"),
+                cpu_flags::ERMSB => is_x86_feature_detected!("ermsb"),
+                cpu_flags::MOVRS => continue, // only very recent support in std
+                cpu_flags::FMA => is_x86_feature_detected!("fma"),
+                cpu_flags::FMA4 => continue, // not yet supported in std
+                cpu_flags::AVX512FP16 => is_x86_feature_detected!("avx512fp16"),
+                cpu_flags::AVX512BF16 => is_x86_feature_detected!("avx512bf16"),
+                _ => panic!("untested CPU flag {name}"),
+            };
+
+            assert_eq!(
+                std_detected,
+                features.contains(flag),
+                "different flag {name}. flags: {features:?}"
+            );
+        }
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/arch/x86/fma.rs b/library/compiler-builtins/libm/src/math/arch/x86/fma.rs
new file mode 100644
index 00000000000..43ac187792d
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/arch/x86/fma.rs
@@ -0,0 +1,135 @@
+//! Use assembly fma if the `fma` or `fma4` feature is detected at runtime.
+
+use core::arch::asm;
+
+use super::super::super::generic;
+use super::detect::{cpu_flags, get_cpu_features};
+use crate::support::Round;
+use crate::support::feature_detect::select_once;
+
+pub fn fma(x: f64, y: f64, z: f64) -> f64 {
+    select_once! {
+        sig: fn(x: f64, y: f64, z: f64) -> f64,
+        init: || {
+            let features = get_cpu_features();
+            if features.contains(cpu_flags::FMA) {
+                fma_with_fma
+            } else if features.contains(cpu_flags::FMA4) {
+                fma_with_fma4
+            } else {
+                fma_fallback as Func
+            }
+        },
+        // SAFETY: `fn_ptr` is the result of `init`, preconditions have been checked.
+        call: |fn_ptr: Func| unsafe { fn_ptr(x, y, z) },
+    }
+}
+
+pub fn fmaf(x: f32, y: f32, z: f32) -> f32 {
+    select_once! {
+        sig: fn(x: f32, y: f32, z: f32) -> f32,
+        init: || {
+            let features = get_cpu_features();
+            if features.contains(cpu_flags::FMA) {
+                fmaf_with_fma
+            } else if features.contains(cpu_flags::FMA4) {
+                fmaf_with_fma4
+            } else {
+                fmaf_fallback as Func
+            }
+        },
+        // SAFETY: `fn_ptr` is the result of `init`, preconditions have been checked.
+        call: |fn_ptr: Func| unsafe { fn_ptr(x, y, z) },
+    }
+}
+
+/// # Safety
+///
+/// Must have +fma available.
+unsafe fn fma_with_fma(mut x: f64, y: f64, z: f64) -> f64 {
+    debug_assert!(get_cpu_features().contains(cpu_flags::FMA));
+
+    // SAFETY: fma is asserted available by precondition, which provides the instruction. No
+    // memory access or side effects.
+    unsafe {
+        asm!(
+            "vfmadd213sd {x}, {y}, {z}",
+            x = inout(xmm_reg) x,
+            y = in(xmm_reg) y,
+            z = in(xmm_reg) z,
+            options(nostack, nomem, pure),
+        );
+    }
+    x
+}
+
+/// # Safety
+///
+/// Must have +fma available.
+unsafe fn fmaf_with_fma(mut x: f32, y: f32, z: f32) -> f32 {
+    debug_assert!(get_cpu_features().contains(cpu_flags::FMA));
+
+    // SAFETY: fma is asserted available by precondition, which provides the instruction. No
+    // memory access or side effects.
+    unsafe {
+        asm!(
+            "vfmadd213ss {x}, {y}, {z}",
+            x = inout(xmm_reg) x,
+            y = in(xmm_reg) y,
+            z = in(xmm_reg) z,
+            options(nostack, nomem, pure),
+        );
+    }
+    x
+}
+
+/// # Safety
+///
+/// Must have +fma4 available.
+unsafe fn fma_with_fma4(mut x: f64, y: f64, z: f64) -> f64 {
+    debug_assert!(get_cpu_features().contains(cpu_flags::FMA4));
+
+    // SAFETY: fma4 is asserted available by precondition, which provides the instruction. No
+    // memory access or side effects.
+    unsafe {
+        asm!(
+            "vfmaddsd {x}, {x}, {y}, {z}",
+            x = inout(xmm_reg) x,
+            y = in(xmm_reg) y,
+            z = in(xmm_reg) z,
+            options(nostack, nomem, pure),
+        );
+    }
+    x
+}
+
+/// # Safety
+///
+/// Must have +fma4 available.
+unsafe fn fmaf_with_fma4(mut x: f32, y: f32, z: f32) -> f32 {
+    debug_assert!(get_cpu_features().contains(cpu_flags::FMA4));
+
+    // SAFETY: fma4 is asserted available by precondition, which provides the instruction. No
+    // memory access or side effects.
+    unsafe {
+        asm!(
+            "vfmaddss {x}, {x}, {y}, {z}",
+            x = inout(xmm_reg) x,
+            y = in(xmm_reg) y,
+            z = in(xmm_reg) z,
+            options(nostack, nomem, pure),
+        );
+    }
+    x
+}
+
+// FIXME: the `select_implementation` macro should handle arch implementations that want
+// to use the fallback, so we don't need to recreate the body.
+
+fn fma_fallback(x: f64, y: f64, z: f64) -> f64 {
+    generic::fma_round(x, y, z, Round::Nearest).val
+}
+
+fn fmaf_fallback(x: f32, y: f32, z: f32) -> f32 {
+    generic::fma_wide_round(x, y, z, Round::Nearest).val
+}
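
The `select_once!` macro used by `fma` and `fmaf` is defined elsewhere in the crate. As an assumption about its general shape (not its actual expansion), the pattern it implements looks roughly like the sketch below: run feature detection on the first call, cache the chosen function pointer, and dispatch through the cached pointer afterwards. `fma_native` and `fma_soft` are illustrative stand-ins for the assembly and generic paths, not the real implementations.

use std::is_x86_feature_detected;
use std::sync::OnceLock;

type FmaFn = fn(f64, f64, f64) -> f64;

// Stand-in for the hand-written `vfmadd213sd` path.
fn fma_native(x: f64, y: f64, z: f64) -> f64 {
    x.mul_add(y, z)
}

// Stand-in for the generic soft-float fallback (not actually fused here).
fn fma_soft(x: f64, y: f64, z: f64) -> f64 {
    x * y + z
}

static SELECTED: OnceLock<FmaFn> = OnceLock::new();

pub fn fma(x: f64, y: f64, z: f64) -> f64 {
    // Detection runs once; every later call reuses the cached pointer.
    let f = SELECTED.get_or_init(|| {
        if is_x86_feature_detected!("fma") {
            fma_native
        } else {
            fma_soft
        }
    });
    f(x, y, z)
}

fn main() {
    assert_eq!(fma(2.0, 3.0, 4.0), 10.0);
}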
diff --git a/library/compiler-builtins/libm/src/math/asin.rs b/library/compiler-builtins/libm/src/math/asin.rs
new file mode 100644
index 00000000000..12d0cd35fa5
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/asin.rs
@@ -0,0 +1,115 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/e_asin.c */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunSoft, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+/* asin(x)
+ * Method :
+ *      Since  asin(x) = x + x^3/6 + x^5*3/40 + x^7*15/336 + ...
+ *      we approximate asin(x) on [0,0.5] by
+ *              asin(x) = x + x*x^2*R(x^2)
+ *      where
+ *              R(x^2) is a rational approximation of (asin(x)-x)/x^3
+ *      and its remez error is bounded by
+ *              |(asin(x)-x)/x^3 - R(x^2)| < 2^(-58.75)
+ *
+ *      For x in [0.5,1]
+ *              asin(x) = pi/2-2*asin(sqrt((1-x)/2))
+ *      Let y = (1-x), z = y/2, s := sqrt(z), and pio2_hi+pio2_lo=pi/2;
+ *      then for x>0.98
+ *              asin(x) = pi/2 - 2*(s+s*z*R(z))
+ *                      = pio2_hi - (2*(s+s*z*R(z)) - pio2_lo)
+ *      For x<=0.98, let pio4_hi = pio2_hi/2, then
+ *              f = hi part of s;
+ *              c = sqrt(z) - f = (z-f*f)/(s+f)         ...f+c=sqrt(z)
+ *      and
+ *              asin(x) = pi/2 - 2*(s+s*z*R(z))
+ *                      = pio4_hi+(pio4-2s)-(2s*z*R(z)-pio2_lo)
+ *                      = pio4_hi+(pio4-2f)-(2s*z*R(z)-(pio2_lo+2c))
+ *
+ * Special cases:
+ *      if x is NaN, return x itself;
+ *      if |x|>1, return NaN with invalid signal.
+ *
+ */
+
+use super::{fabs, get_high_word, get_low_word, sqrt, with_set_low_word};
+
+const PIO2_HI: f64 = 1.57079632679489655800e+00; /* 0x3FF921FB, 0x54442D18 */
+const PIO2_LO: f64 = 6.12323399573676603587e-17; /* 0x3C91A626, 0x33145C07 */
+/* coefficients for R(x^2) */
+const P_S0: f64 = 1.66666666666666657415e-01; /* 0x3FC55555, 0x55555555 */
+const P_S1: f64 = -3.25565818622400915405e-01; /* 0xBFD4D612, 0x03EB6F7D */
+const P_S2: f64 = 2.01212532134862925881e-01; /* 0x3FC9C155, 0x0E884455 */
+const P_S3: f64 = -4.00555345006794114027e-02; /* 0xBFA48228, 0xB5688F3B */
+const P_S4: f64 = 7.91534994289814532176e-04; /* 0x3F49EFE0, 0x7501B288 */
+const P_S5: f64 = 3.47933107596021167570e-05; /* 0x3F023DE1, 0x0DFDF709 */
+const Q_S1: f64 = -2.40339491173441421878e+00; /* 0xC0033A27, 0x1C8A2D4B */
+const Q_S2: f64 = 2.02094576023350569471e+00; /* 0x40002AE5, 0x9C598AC8 */
+const Q_S3: f64 = -6.88283971605453293030e-01; /* 0xBFE6066C, 0x1B8D0159 */
+const Q_S4: f64 = 7.70381505559019352791e-02; /* 0x3FB3B8C5, 0xB12E9282 */
+
+fn comp_r(z: f64) -> f64 {
+    let p = z * (P_S0 + z * (P_S1 + z * (P_S2 + z * (P_S3 + z * (P_S4 + z * P_S5)))));
+    let q = 1.0 + z * (Q_S1 + z * (Q_S2 + z * (Q_S3 + z * Q_S4)));
+    p / q
+}
+
+/// Arcsine (f64)
+///
+/// Computes the inverse sine (arc sine) of the argument `x`.
+/// Arguments to asin must be in the range -1 to 1.
+/// Returns values in radians, in the range of -pi/2 to pi/2.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn asin(mut x: f64) -> f64 {
+    let z: f64;
+    let r: f64;
+    let s: f64;
+    let hx: u32;
+    let ix: u32;
+
+    hx = get_high_word(x);
+    ix = hx & 0x7fffffff;
+    /* |x| >= 1 or nan */
+    if ix >= 0x3ff00000 {
+        let lx: u32;
+        lx = get_low_word(x);
+        if ((ix - 0x3ff00000) | lx) == 0 {
+            /* asin(1) = +-pi/2 with inexact */
+            return x * PIO2_HI + f64::from_bits(0x3870000000000000);
+        } else {
+            return 0.0 / (x - x);
+        }
+    }
+    /* |x| < 0.5 */
+    if ix < 0x3fe00000 {
+        /* if 0x1p-1022 <= |x| < 0x1p-26, avoid raising underflow */
+        if (0x00100000..0x3e500000).contains(&ix) {
+            return x;
+        } else {
+            return x + x * comp_r(x * x);
+        }
+    }
+    /* 1 > |x| >= 0.5 */
+    z = (1.0 - fabs(x)) * 0.5;
+    s = sqrt(z);
+    r = comp_r(z);
+    if ix >= 0x3fef3333 {
+        /* if |x| > 0.975 */
+        x = PIO2_HI - (2. * (s + s * r) - PIO2_LO);
+    } else {
+        let f: f64;
+        let c: f64;
+        /* f+c = sqrt(z) */
+        f = with_set_low_word(s, 0);
+        c = (z - f * f) / (s + f);
+        x = 0.5 * PIO2_HI - (2.0 * s * r - (PIO2_LO - 2.0 * c) - (0.5 * PIO2_HI - 2.0 * f));
+    }
+    if hx >> 31 != 0 { -x } else { x }
+}
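
As a quick numeric illustration (using std's `asin`, not this implementation) of the [0.5, 1] reduction quoted in the method comment, the identity asin(x) = pi/2 - 2*asin(sqrt((1 - x)/2)) can be spot-checked directly:

fn main() {
    for &x in &[0.5_f64, 0.75, 0.9, 0.975, 0.999] {
        // Reduce to an argument in [0, 0.5], where the rational approximation R is used.
        let reduced = std::f64::consts::FRAC_PI_2 - 2.0 * ((1.0 - x) / 2.0).sqrt().asin();
        assert!((x.asin() - reduced).abs() < 1e-12);
    }
}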
diff --git a/library/compiler-builtins/libm/src/math/asinf.rs b/library/compiler-builtins/libm/src/math/asinf.rs
new file mode 100644
index 00000000000..ed685556730
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/asinf.rs
@@ -0,0 +1,68 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/e_asinf.c */
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
+ */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+use super::sqrt::sqrt;
+use super::support::Float;
+
+const PIO2: f64 = 1.570796326794896558e+00;
+
+/* coefficients for R(x^2) */
+const P_S0: f32 = 1.6666586697e-01;
+const P_S1: f32 = -4.2743422091e-02;
+const P_S2: f32 = -8.6563630030e-03;
+const Q_S1: f32 = -7.0662963390e-01;
+
+fn r(z: f32) -> f32 {
+    let p = z * (P_S0 + z * (P_S1 + z * P_S2));
+    let q = 1. + z * Q_S1;
+    p / q
+}
+
+/// Arcsine (f32)
+///
+/// Computes the inverse sine (arc sine) of the argument `x`.
+/// Arguments to asin must be in the range -1 to 1.
+/// Returns values in radians, in the range of -pi/2 to pi/2.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn asinf(mut x: f32) -> f32 {
+    let x1p_120 = f64::from_bits(0x3870000000000000); // 0x1p-120 === 2 ^ (-120)
+
+    let hx = x.to_bits();
+    let ix = hx & 0x7fffffff;
+
+    if ix >= 0x3f800000 {
+        /* |x| >= 1 */
+        if ix == 0x3f800000 {
+            /* |x| == 1 */
+            return ((x as f64) * PIO2 + x1p_120) as f32; /* asin(+-1) = +-pi/2 with inexact */
+        }
+        return 0. / (x - x); /* asin(|x|>1) is NaN */
+    }
+
+    if ix < 0x3f000000 {
+        /* |x| < 0.5 */
+        /* if 0x1p-126 <= |x| < 0x1p-12, avoid raising underflow */
+        if (0x00800000..0x39800000).contains(&ix) {
+            return x;
+        }
+        return x + x * r(x * x);
+    }
+
+    /* 1 > |x| >= 0.5 */
+    let z = (1. - Float::abs(x)) * 0.5;
+    let s = sqrt(z as f64);
+    x = (PIO2 - 2. * (s + s * (r(z) as f64))) as f32;
+    if (hx >> 31) != 0 { -x } else { x }
+}
diff --git a/library/compiler-builtins/libm/src/math/asinh.rs b/library/compiler-builtins/libm/src/math/asinh.rs
new file mode 100644
index 00000000000..75d3c3ad462
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/asinh.rs
@@ -0,0 +1,36 @@
+use super::{log, log1p, sqrt};
+
+const LN2: f64 = 0.693147180559945309417232121458176568; /* 0x3fe62e42,  0xfefa39ef*/
+
+/* asinh(x) = sign(x)*log(|x|+sqrt(x*x+1)) ~= x - x^3/6 + o(x^5) */
+/// Inverse hyperbolic sine (f64)
+///
+/// Calculates the inverse hyperbolic sine of `x`.
+/// Is defined as `sgn(x)*log(|x|+sqrt(x*x+1))`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn asinh(mut x: f64) -> f64 {
+    let mut u = x.to_bits();
+    let e = ((u >> 52) as usize) & 0x7ff;
+    let sign = (u >> 63) != 0;
+
+    /* |x| */
+    u &= (!0) >> 1;
+    x = f64::from_bits(u);
+
+    if e >= 0x3ff + 26 {
+        /* |x| >= 0x1p26 or inf or nan */
+        x = log(x) + LN2;
+    } else if e >= 0x3ff + 1 {
+        /* |x| >= 2 */
+        x = log(2.0 * x + 1.0 / (sqrt(x * x + 1.0) + x));
+    } else if e >= 0x3ff - 26 {
+        /* |x| >= 0x1p-26, up to 1.6ulp error in [0.125,0.5] */
+        x = log1p(x + x * x / (sqrt(x * x + 1.0) + 1.0));
+    } else {
+        /* |x| < 0x1p-26, raise inexact if x != 0 */
+        let x1p120 = f64::from_bits(0x4770000000000000);
+        force_eval!(x + x1p120);
+    }
+
+    if sign { -x } else { x }
+}
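
The `log1p` branch above exists because the textbook formula ln(x + sqrt(x*x + 1)) suffers cancellation for small |x|: the sum rounds to 1 + x, and the logarithm then loses roughly half of its significant digits. A std-based comparison (illustrative only) makes the difference visible:

fn main() {
    let x = 1e-8_f64;
    // Naive formula: sqrt(1 + x*x) rounds to 1.0, so the log sees a coarse argument.
    let naive = (x + (x * x + 1.0).sqrt()).ln();
    // Rewrite used above: log1p of a small, accurately computed quantity.
    let stable = (x + x * x / ((x * x + 1.0).sqrt() + 1.0)).ln_1p();
    println!("naive  = {naive:.17e}");
    println!("stable = {stable:.17e}"); // essentially x, since asinh(x) ~= x - x^3/6
}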
diff --git a/library/compiler-builtins/libm/src/math/asinhf.rs b/library/compiler-builtins/libm/src/math/asinhf.rs
new file mode 100644
index 00000000000..27ed9dd372d
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/asinhf.rs
@@ -0,0 +1,35 @@
+use super::{log1pf, logf, sqrtf};
+
+const LN2: f32 = 0.693147180559945309417232121458176568;
+
+/* asinh(x) = sign(x)*log(|x|+sqrt(x*x+1)) ~= x - x^3/6 + o(x^5) */
+/// Inverse hyperbolic sine (f32)
+///
+/// Calculates the inverse hyperbolic sine of `x`.
+/// Is defined as `sgn(x)*log(|x|+sqrt(x*x+1))`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn asinhf(mut x: f32) -> f32 {
+    let u = x.to_bits();
+    let i = u & 0x7fffffff;
+    let sign = (u >> 31) != 0;
+
+    /* |x| */
+    x = f32::from_bits(i);
+
+    if i >= 0x3f800000 + (12 << 23) {
+        /* |x| >= 0x1p12 or inf or nan */
+        x = logf(x) + LN2;
+    } else if i >= 0x3f800000 + (1 << 23) {
+        /* |x| >= 2 */
+        x = logf(2.0 * x + 1.0 / (sqrtf(x * x + 1.0) + x));
+    } else if i >= 0x3f800000 - (12 << 23) {
+        /* |x| >= 0x1p-12, up to 1.6ulp error in [0.125,0.5] */
+        x = log1pf(x + x * x / (sqrtf(x * x + 1.0) + 1.0));
+    } else {
+        /* |x| < 0x1p-12, raise inexact if x!=0 */
+        let x1p120 = f32::from_bits(0x7b800000);
+        force_eval!(x + x1p120);
+    }
+
+    if sign { -x } else { x }
+}
diff --git a/library/compiler-builtins/libm/src/math/atan.rs b/library/compiler-builtins/libm/src/math/atan.rs
new file mode 100644
index 00000000000..4ca5cc91a1e
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/atan.rs
@@ -0,0 +1,182 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/s_atan.c */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+/* atan(x)
+ * Method
+ *   1. Reduce x to positive by atan(x) = -atan(-x).
+ *   2. According to the integer k=4t+0.25 chopped, t=x, the argument
+ *      is further reduced to one of the following intervals and the
+ *      arctangent of t is evaluated by the corresponding formula:
+ *
+ *      [0,7/16]      atan(x) = t-t^3*(a1+t^2*(a2+...(a10+t^2*a11)...)
+ *      [7/16,11/16]  atan(x) = atan(1/2) + atan( (t-0.5)/(1+t/2) )
+ *      [11/16,19/16] atan(x) = atan( 1 ) + atan( (t-1)/(1+t) )
+ *      [19/16,39/16] atan(x) = atan(3/2) + atan( (t-1.5)/(1+1.5t) )
+ *      [39/16,INF]   atan(x) = atan(INF) + atan( -1/t )
+ *
+ * Constants:
+ * The hexadecimal values are the intended ones for the following
+ * constants. The decimal values may be used, provided that the
+ * compiler will convert from decimal to binary accurately enough
+ * to produce the hexadecimal values shown.
+ */
+
+use core::f64;
+
+use super::fabs;
+
+const ATANHI: [f64; 4] = [
+    4.63647609000806093515e-01, /* atan(0.5)hi 0x3FDDAC67, 0x0561BB4F */
+    7.85398163397448278999e-01, /* atan(1.0)hi 0x3FE921FB, 0x54442D18 */
+    9.82793723247329054082e-01, /* atan(1.5)hi 0x3FEF730B, 0xD281F69B */
+    1.57079632679489655800e+00, /* atan(inf)hi 0x3FF921FB, 0x54442D18 */
+];
+
+const ATANLO: [f64; 4] = [
+    2.26987774529616870924e-17, /* atan(0.5)lo 0x3C7A2B7F, 0x222F65E2 */
+    3.06161699786838301793e-17, /* atan(1.0)lo 0x3C81A626, 0x33145C07 */
+    1.39033110312309984516e-17, /* atan(1.5)lo 0x3C700788, 0x7AF0CBBD */
+    6.12323399573676603587e-17, /* atan(inf)lo 0x3C91A626, 0x33145C07 */
+];
+
+const AT: [f64; 11] = [
+    3.33333333333329318027e-01,  /* 0x3FD55555, 0x5555550D */
+    -1.99999999998764832476e-01, /* 0xBFC99999, 0x9998EBC4 */
+    1.42857142725034663711e-01,  /* 0x3FC24924, 0x920083FF */
+    -1.11111104054623557880e-01, /* 0xBFBC71C6, 0xFE231671 */
+    9.09088713343650656196e-02,  /* 0x3FB745CD, 0xC54C206E */
+    -7.69187620504482999495e-02, /* 0xBFB3B0F2, 0xAF749A6D */
+    6.66107313738753120669e-02,  /* 0x3FB10D66, 0xA0D03D51 */
+    -5.83357013379057348645e-02, /* 0xBFADDE2D, 0x52DEFD9A */
+    4.97687799461593236017e-02,  /* 0x3FA97B4B, 0x24760DEB */
+    -3.65315727442169155270e-02, /* 0xBFA2B444, 0x2C6A6C2F */
+    1.62858201153657823623e-02,  /* 0x3F90AD3A, 0xE322DA11 */
+];
+
+/// Arctangent (f64)
+///
+/// Computes the inverse tangent (arc tangent) of the input value.
+/// Returns a value in radians, in the range of -pi/2 to pi/2.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn atan(x: f64) -> f64 {
+    let mut x = x;
+    let mut ix = (x.to_bits() >> 32) as u32;
+    let sign = ix >> 31;
+    ix &= 0x7fff_ffff;
+    if ix >= 0x4410_0000 {
+        if x.is_nan() {
+            return x;
+        }
+
+        let z = ATANHI[3] + f64::from_bits(0x0380_0000); // 0x1p-120f
+        return if sign != 0 { -z } else { z };
+    }
+
+    let id = if ix < 0x3fdc_0000 {
+        /* |x| < 0.4375 */
+        if ix < 0x3e40_0000 {
+            /* |x| < 2^-27 */
+            if ix < 0x0010_0000 {
+                /* raise underflow for subnormal x */
+                force_eval!(x as f32);
+            }
+
+            return x;
+        }
+
+        -1
+    } else {
+        x = fabs(x);
+        if ix < 0x3ff30000 {
+            /* |x| < 1.1875 */
+            if ix < 0x3fe60000 {
+                /* 7/16 <= |x| < 11/16 */
+                x = (2. * x - 1.) / (2. + x);
+                0
+            } else {
+                /* 11/16 <= |x| < 19/16 */
+                x = (x - 1.) / (x + 1.);
+                1
+            }
+        } else if ix < 0x40038000 {
+            /* |x| < 2.4375 */
+            x = (x - 1.5) / (1. + 1.5 * x);
+            2
+        } else {
+            /* 2.4375 <= |x| < 2^66 */
+            x = -1. / x;
+            3
+        }
+    };
+
+    let z = x * x;
+    let w = z * z;
+    /* break sum from i=0 to 10 AT[i]z**(i+1) into odd and even poly */
+    let s1 = z * (AT[0] + w * (AT[2] + w * (AT[4] + w * (AT[6] + w * (AT[8] + w * AT[10])))));
+    let s2 = w * (AT[1] + w * (AT[3] + w * (AT[5] + w * (AT[7] + w * AT[9]))));
+
+    if id < 0 {
+        return x - x * (s1 + s2);
+    }
+
+    let z = i!(ATANHI, id as usize) - (x * (s1 + s2) - i!(ATANLO, id as usize) - x);
+
+    if sign != 0 { -z } else { z }
+}
+
+#[cfg(test)]
+mod tests {
+    use core::f64;
+
+    use super::atan;
+
+    #[test]
+    fn sanity_check() {
+        for (input, answer) in [
+            (3.0_f64.sqrt() / 3.0, f64::consts::FRAC_PI_6),
+            (1.0, f64::consts::FRAC_PI_4),
+            (3.0_f64.sqrt(), f64::consts::FRAC_PI_3),
+            (-3.0_f64.sqrt() / 3.0, -f64::consts::FRAC_PI_6),
+            (-1.0, -f64::consts::FRAC_PI_4),
+            (-3.0_f64.sqrt(), -f64::consts::FRAC_PI_3),
+        ]
+        .iter()
+        {
+            assert!(
+                (atan(*input) - answer) / answer < 1e-5,
+                "\natan({:.4}/16) = {:.4}, actual: {}",
+                input * 16.0,
+                answer,
+                atan(*input)
+            );
+        }
+    }
+
+    #[test]
+    fn zero() {
+        assert_eq!(atan(0.0), 0.0);
+    }
+
+    #[test]
+    fn infinity() {
+        assert_eq!(atan(f64::INFINITY), f64::consts::FRAC_PI_2);
+    }
+
+    #[test]
+    fn minus_infinity() {
+        assert_eq!(atan(f64::NEG_INFINITY), -f64::consts::FRAC_PI_2);
+    }
+
+    #[test]
+    fn nan() {
+        assert!(atan(f64::NAN).is_nan());
+    }
+}
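
The interval table in the method comment rests on the tangent subtraction formula, atan(x) = atan(c) + atan((x - c)/(1 + c*x)), evaluated at the pivots c = 0.5, 1, 1.5 stored in ATANHI/ATANLO. A std-based spot check of the identity (illustrative only):

fn main() {
    for &(x, c) in &[(0.6_f64, 0.5_f64), (1.1, 1.0), (2.0, 1.5)] {
        let reduced = c.atan() + ((x - c) / (1.0 + c * x)).atan();
        assert!((x.atan() - reduced).abs() < 1e-14);
    }
}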
diff --git a/library/compiler-builtins/libm/src/math/atan2.rs b/library/compiler-builtins/libm/src/math/atan2.rs
new file mode 100644
index 00000000000..c668731cf37
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/atan2.rs
@@ -0,0 +1,131 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/e_atan2.c */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunSoft, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ *
+ */
+/* atan2(y,x)
+ * Method :
+ *      1. Reduce y to positive by atan2(y,x)=-atan2(-y,x).
+ *      2. Reduce x to positive by (if x and y are unexceptional):
+ *              ARG (x+iy) = arctan(y/x)           ... if x > 0,
+ *              ARG (x+iy) = pi - arctan[y/(-x)]   ... if x < 0,
+ *
+ * Special cases:
+ *
+ *      ATAN2((anything), NaN ) is NaN;
+ *      ATAN2(NAN , (anything) ) is NaN;
+ *      ATAN2(+-0, +(anything but NaN)) is +-0  ;
+ *      ATAN2(+-0, -(anything but NaN)) is +-pi ;
+ *      ATAN2(+-(anything but 0 and NaN), 0) is +-pi/2;
+ *      ATAN2(+-(anything but INF and NaN), +INF) is +-0 ;
+ *      ATAN2(+-(anything but INF and NaN), -INF) is +-pi;
+ *      ATAN2(+-INF,+INF ) is +-pi/4 ;
+ *      ATAN2(+-INF,-INF ) is +-3pi/4;
+ *      ATAN2(+-INF, (anything but,0,NaN, and INF)) is +-pi/2;
+ *
+ * Constants:
+ * The hexadecimal values are the intended ones for the following
+ * constants. The decimal values may be used, provided that the
+ * compiler will convert from decimal to binary accurately enough
+ * to produce the hexadecimal values shown.
+ */
+
+use super::{atan, fabs};
+
+const PI: f64 = 3.1415926535897931160E+00; /* 0x400921FB, 0x54442D18 */
+const PI_LO: f64 = 1.2246467991473531772E-16; /* 0x3CA1A626, 0x33145C07 */
+
+/// Arctangent of y/x (f64)
+///
+/// Computes the inverse tangent (arc tangent) of `y/x`.
+/// Produces the correct result even for angles near pi/2 or -pi/2 (that is, when `x` is near 0).
+/// Returns a value in radians, in the range of -pi to pi.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn atan2(y: f64, x: f64) -> f64 {
+    if x.is_nan() || y.is_nan() {
+        return x + y;
+    }
+    let mut ix = (x.to_bits() >> 32) as u32;
+    let lx = x.to_bits() as u32;
+    let mut iy = (y.to_bits() >> 32) as u32;
+    let ly = y.to_bits() as u32;
+    if ((ix.wrapping_sub(0x3ff00000)) | lx) == 0 {
+        /* x = 1.0 */
+        return atan(y);
+    }
+    let m = ((iy >> 31) & 1) | ((ix >> 30) & 2); /* 2*sign(x)+sign(y) */
+    ix &= 0x7fffffff;
+    iy &= 0x7fffffff;
+
+    /* when y = 0 */
+    if (iy | ly) == 0 {
+        return match m {
+            0 | 1 => y, /* atan(+-0,+anything)=+-0 */
+            2 => PI,    /* atan(+0,-anything) = PI */
+            _ => -PI,   /* atan(-0,-anything) =-PI */
+        };
+    }
+    /* when x = 0 */
+    if (ix | lx) == 0 {
+        return if m & 1 != 0 { -PI / 2.0 } else { PI / 2.0 };
+    }
+    /* when x is INF */
+    if ix == 0x7ff00000 {
+        if iy == 0x7ff00000 {
+            return match m {
+                0 => PI / 4.0,        /* atan(+INF,+INF) */
+                1 => -PI / 4.0,       /* atan(-INF,+INF) */
+                2 => 3.0 * PI / 4.0,  /* atan(+INF,-INF) */
+                _ => -3.0 * PI / 4.0, /* atan(-INF,-INF) */
+            };
+        } else {
+            return match m {
+                0 => 0.0,  /* atan(+...,+INF) */
+                1 => -0.0, /* atan(-...,+INF) */
+                2 => PI,   /* atan(+...,-INF) */
+                _ => -PI,  /* atan(-...,-INF) */
+            };
+        }
+    }
+    /* |y/x| > 0x1p64 */
+    if ix.wrapping_add(64 << 20) < iy || iy == 0x7ff00000 {
+        return if m & 1 != 0 { -PI / 2.0 } else { PI / 2.0 };
+    }
+
+    /* z = atan(|y/x|) without spurious underflow */
+    let z = if (m & 2 != 0) && iy.wrapping_add(64 << 20) < ix {
+        /* |y/x| < 0x1p-64, x<0 */
+        0.0
+    } else {
+        atan(fabs(y / x))
+    };
+    match m {
+        0 => z,                /* atan(+,+) */
+        1 => -z,               /* atan(-,+) */
+        2 => PI - (z - PI_LO), /* atan(+,-) */
+        _ => (z - PI_LO) - PI, /* atan(-,-) */
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    #[cfg_attr(x86_no_sse, ignore = "FIXME(i586): possible incorrect rounding")]
+    fn sanity_check() {
+        assert_eq!(atan2(0.0, 1.0), 0.0);
+        assert_eq!(atan2(0.0, -1.0), PI);
+        assert_eq!(atan2(-0.0, -1.0), -PI);
+        assert_eq!(atan2(3.0, 2.0), atan(3.0 / 2.0));
+        assert_eq!(atan2(2.0, -1.0), atan(2.0 / -1.0) + PI);
+        assert_eq!(atan2(-2.0, -1.0), atan(-2.0 / -1.0) - PI);
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/atan2f.rs b/library/compiler-builtins/libm/src/math/atan2f.rs
new file mode 100644
index 00000000000..95b466fff4e
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/atan2f.rs
@@ -0,0 +1,90 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/e_atan2f.c */
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
+ */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+use super::{atanf, fabsf};
+
+const PI: f32 = 3.1415927410e+00; /* 0x40490fdb */
+const PI_LO: f32 = -8.7422776573e-08; /* 0xb3bbbd2e */
+
+/// Arctangent of y/x (f32)
+///
+/// Computes the inverse tangent (arc tangent) of `y/x`.
+/// Produces the correct result even for angles near pi/2 or -pi/2 (that is, when `x` is near 0).
+/// Returns a value in radians, in the range of -pi to pi.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn atan2f(y: f32, x: f32) -> f32 {
+    if x.is_nan() || y.is_nan() {
+        return x + y;
+    }
+    let mut ix = x.to_bits();
+    let mut iy = y.to_bits();
+
+    if ix == 0x3f800000 {
+        /* x=1.0 */
+        return atanf(y);
+    }
+    let m = ((iy >> 31) & 1) | ((ix >> 30) & 2); /* 2*sign(x)+sign(y) */
+    ix &= 0x7fffffff;
+    iy &= 0x7fffffff;
+
+    /* when y = 0 */
+    if iy == 0 {
+        return match m {
+            0 | 1 => y, /* atan(+-0,+anything)=+-0 */
+            2 => PI,    /* atan(+0,-anything) = pi */
+            _ => -PI,   /* atan(-0,-anything) =-pi */
+        };
+    }
+    /* when x = 0 */
+    if ix == 0 {
+        return if m & 1 != 0 { -PI / 2. } else { PI / 2. };
+    }
+    /* when x is INF */
+    if ix == 0x7f800000 {
+        return if iy == 0x7f800000 {
+            match m {
+                0 => PI / 4.,       /* atan(+INF,+INF) */
+                1 => -PI / 4.,      /* atan(-INF,+INF) */
+                2 => 3. * PI / 4.,  /* atan(+INF,-INF)*/
+                _ => -3. * PI / 4., /* atan(-INF,-INF)*/
+            }
+        } else {
+            match m {
+                0 => 0.,  /* atan(+...,+INF) */
+                1 => -0., /* atan(-...,+INF) */
+                2 => PI,  /* atan(+...,-INF) */
+                _ => -PI, /* atan(-...,-INF) */
+            }
+        };
+    }
+    /* |y/x| > 0x1p26 */
+    if (ix + (26 << 23) < iy) || (iy == 0x7f800000) {
+        return if m & 1 != 0 { -PI / 2. } else { PI / 2. };
+    }
+
+    /* z = atan(|y/x|) with correct underflow */
+    let z = if (m & 2 != 0) && (iy + (26 << 23) < ix) {
+        /*|y/x| < 0x1p-26, x < 0 */
+        0.
+    } else {
+        atanf(fabsf(y / x))
+    };
+    match m {
+        0 => z,                /* atan(+,+) */
+        1 => -z,               /* atan(-,+) */
+        2 => PI - (z - PI_LO), /* atan(+,-) */
+        _ => (z - PI_LO) - PI, /* case 3 */ /* atan(-,-) */
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/atanf.rs b/library/compiler-builtins/libm/src/math/atanf.rs
new file mode 100644
index 00000000000..da8daa41a01
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/atanf.rs
@@ -0,0 +1,108 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/s_atanf.c */
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
+ */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+use super::fabsf;
+
+const ATAN_HI: [f32; 4] = [
+    4.6364760399e-01, /* atan(0.5)hi 0x3eed6338 */
+    7.8539812565e-01, /* atan(1.0)hi 0x3f490fda */
+    9.8279368877e-01, /* atan(1.5)hi 0x3f7b985e */
+    1.5707962513e+00, /* atan(inf)hi 0x3fc90fda */
+];
+
+const ATAN_LO: [f32; 4] = [
+    5.0121582440e-09, /* atan(0.5)lo 0x31ac3769 */
+    3.7748947079e-08, /* atan(1.0)lo 0x33222168 */
+    3.4473217170e-08, /* atan(1.5)lo 0x33140fb4 */
+    7.5497894159e-08, /* atan(inf)lo 0x33a22168 */
+];
+
+const A_T: [f32; 5] = [
+    3.3333328366e-01,
+    -1.9999158382e-01,
+    1.4253635705e-01,
+    -1.0648017377e-01,
+    6.1687607318e-02,
+];
+
+/// Arctangent (f32)
+///
+/// Computes the inverse tangent (arc tangent) of the input value.
+/// Returns a value in radians, in the range of -pi/2 to pi/2.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn atanf(mut x: f32) -> f32 {
+    let x1p_120 = f32::from_bits(0x03800000); // 0x1p-120 === 2 ^ (-120)
+
+    let z: f32;
+
+    let mut ix = x.to_bits();
+    let sign = (ix >> 31) != 0;
+    ix &= 0x7fffffff;
+
+    if ix >= 0x4c800000 {
+        /* if |x| >= 2**26 */
+        if x.is_nan() {
+            return x;
+        }
+        z = i!(ATAN_HI, 3) + x1p_120;
+        return if sign { -z } else { z };
+    }
+    let id = if ix < 0x3ee00000 {
+        /* |x| < 0.4375 */
+        if ix < 0x39800000 {
+            /* |x| < 2**-12 */
+            if ix < 0x00800000 {
+                /* raise underflow for subnormal x */
+                force_eval!(x * x);
+            }
+            return x;
+        }
+        -1
+    } else {
+        x = fabsf(x);
+        if ix < 0x3f980000 {
+            /* |x| < 1.1875 */
+            if ix < 0x3f300000 {
+                /*  7/16 <= |x| < 11/16 */
+                x = (2. * x - 1.) / (2. + x);
+                0
+            } else {
+                /* 11/16 <= |x| < 19/16 */
+                x = (x - 1.) / (x + 1.);
+                1
+            }
+        } else if ix < 0x401c0000 {
+            /* |x| < 2.4375 */
+            x = (x - 1.5) / (1. + 1.5 * x);
+            2
+        } else {
+            /* 2.4375 <= |x| < 2**26 */
+            x = -1. / x;
+            3
+        }
+    };
+    /* end of argument reduction */
+    z = x * x;
+    let w = z * z;
+    /* break sum from i=0 to 10 aT[i]z**(i+1) into odd and even poly */
+    let s1 = z * (i!(A_T, 0) + w * (i!(A_T, 2) + w * i!(A_T, 4)));
+    let s2 = w * (i!(A_T, 1) + w * i!(A_T, 3));
+    if id < 0 {
+        return x - x * (s1 + s2);
+    }
+    let id = id as usize;
+    let z = i!(ATAN_HI, id) - ((x * (s1 + s2) - i!(ATAN_LO, id)) - x);
+    if sign { -z } else { z }
+}
diff --git a/library/compiler-builtins/libm/src/math/atanh.rs b/library/compiler-builtins/libm/src/math/atanh.rs
new file mode 100644
index 00000000000..9dc826f5605
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/atanh.rs
@@ -0,0 +1,33 @@
+use super::log1p;
+
+/* atanh(x) = log((1+x)/(1-x))/2 = log1p(2x/(1-x))/2 ~= x + x^3/3 + o(x^5) */
+/// Inverse hyperbolic tangent (f64)
+///
+/// Calculates the inverse hyperbolic tangent of `x`.
+/// Is defined as `log((1+x)/(1-x))/2 = log1p(2x/(1-x))/2`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn atanh(x: f64) -> f64 {
+    let u = x.to_bits();
+    let e = ((u >> 52) as usize) & 0x7ff;
+    let sign = (u >> 63) != 0;
+
+    /* |x| */
+    let mut y = f64::from_bits(u & 0x7fff_ffff_ffff_ffff);
+
+    if e < 0x3ff - 1 {
+        if e < 0x3ff - 32 {
+            /* handle underflow */
+            if e == 0 {
+                force_eval!(y as f32);
+            }
+        } else {
+            /* |x| < 0.5, up to 1.7ulp error */
+            y = 0.5 * log1p(2.0 * y + 2.0 * y * y / (1.0 - y));
+        }
+    } else {
+        /* avoid overflow */
+        y = 0.5 * log1p(2.0 * (y / (1.0 - y)));
+    }
+
+    if sign { -y } else { y }
+}
diff --git a/library/compiler-builtins/libm/src/math/atanhf.rs b/library/compiler-builtins/libm/src/math/atanhf.rs
new file mode 100644
index 00000000000..80ccec1f67f
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/atanhf.rs
@@ -0,0 +1,33 @@
+use super::log1pf;
+
+/* atanh(x) = log((1+x)/(1-x))/2 = log1p(2x/(1-x))/2 ~= x + x^3/3 + o(x^5) */
+/// Inverse hyperbolic tangent (f32)
+///
+/// Calculates the inverse hyperbolic tangent of `x`.
+/// Is defined as `log((1+x)/(1-x))/2 = log1p(2x/(1-x))/2`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn atanhf(mut x: f32) -> f32 {
+    let mut u = x.to_bits();
+    let sign = (u >> 31) != 0;
+
+    /* |x| */
+    u &= 0x7fffffff;
+    x = f32::from_bits(u);
+
+    if u < 0x3f800000 - (1 << 23) {
+        if u < 0x3f800000 - (32 << 23) {
+            /* handle underflow */
+            if u < (1 << 23) {
+                force_eval!(x * x);
+            }
+        } else {
+            /* |x| < 0.5, up to 1.7ulp error */
+            x = 0.5 * log1pf(2.0 * x + 2.0 * x * x / (1.0 - x));
+        }
+    } else {
+        /* avoid overflow */
+        x = 0.5 * log1pf(2.0 * (x / (1.0 - x)));
+    }
+
+    if sign { -x } else { x }
+}
diff --git a/library/compiler-builtins/libm/src/math/cbrt.rs b/library/compiler-builtins/libm/src/math/cbrt.rs
new file mode 100644
index 00000000000..cf56f7a9792
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/cbrt.rs
@@ -0,0 +1,219 @@
+/* SPDX-License-Identifier: MIT */
+/* origin: core-math/src/binary64/cbrt/cbrt.c
+ * Copyright (c) 2021-2022 Alexei Sibidanov.
+ * Ported to Rust in 2025 by Trevor Gross.
+ */
+
+use super::Float;
+use super::support::{FpResult, Round, cold_path};
+
+/// Compute the cube root of the argument.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn cbrt(x: f64) -> f64 {
+    cbrt_round(x, Round::Nearest).val
+}
+
+pub fn cbrt_round(x: f64, round: Round) -> FpResult<f64> {
+    const ESCALE: [f64; 3] = [
+        1.0,
+        hf64!("0x1.428a2f98d728bp+0"), /* 2^(1/3) */
+        hf64!("0x1.965fea53d6e3dp+0"), /* 2^(2/3) */
+    ];
+
+    /* the polynomial c0+c1*x+c2*x^2+c3*x^3 approximates x^(1/3) on [1,2]
+    with maximal error < 9.2e-5 (attained at x=2) */
+    const C: [f64; 4] = [
+        hf64!("0x1.1b0babccfef9cp-1"),
+        hf64!("0x1.2c9a3e94d1da5p-1"),
+        hf64!("-0x1.4dc30b1a1ddbap-3"),
+        hf64!("0x1.7a8d3e4ec9b07p-6"),
+    ];
+
+    let u0: f64 = hf64!("0x1.5555555555555p-2");
+    let u1: f64 = hf64!("0x1.c71c71c71c71cp-3");
+
+    let rsc = [1.0, -1.0, 0.5, -0.5, 0.25, -0.25];
+
+    let off = [hf64!("0x1p-53"), 0.0, 0.0, 0.0];
+
+    /* rm=0 for rounding to nearest, and other values for directed roundings */
+    let hx: u64 = x.to_bits();
+    let mut mant: u64 = hx & f64::SIG_MASK;
+    let sign: u64 = hx >> 63;
+
+    let mut e: u32 = (hx >> f64::SIG_BITS) as u32 & f64::EXP_SAT;
+
+    if ((e + 1) & f64::EXP_SAT) < 2 {
+        cold_path();
+
+        let ix: u64 = hx & !f64::SIGN_MASK;
+
+        /* 0, inf, nan: we return x + x instead of simply x,
+        so that for x a signaling NaN, it correctly triggers
+        the invalid exception. */
+        if e == f64::EXP_SAT || ix == 0 {
+            return FpResult::ok(x + x);
+        }
+
+        let nz = ix.leading_zeros() - 11; /* subnormal */
+        mant <<= nz;
+        mant &= f64::SIG_MASK;
+        e = e.wrapping_sub(nz - 1);
+    }
+
+    e = e.wrapping_add(3072);
+    let cvt1: u64 = mant | (0x3ffu64 << 52);
+    let mut cvt5: u64 = cvt1;
+
+    let et: u32 = e / 3;
+    let it: u32 = e % 3;
+
+    /* 2^(3k+it) <= x < 2^(3k+it+1), with 0 <= it <= 3 */
+    cvt5 += u64::from(it) << f64::SIG_BITS;
+    cvt5 |= sign << 63;
+    let zz: f64 = f64::from_bits(cvt5);
+
+    /* cbrt(x) = cbrt(zz)*2^(et-1365) where 1 <= zz < 8 */
+    let mut isc: u64 = ESCALE[it as usize].to_bits(); // todo: index
+    isc |= sign << 63;
+    let cvt2: u64 = isc;
+    let z: f64 = f64::from_bits(cvt1);
+
+    /* cbrt(zz) = cbrt(z)*isc, where isc encodes 1, 2^(1/3) or 2^(2/3),
+    and 1 <= z < 2 */
+    let r: f64 = 1.0 / z;
+    let rr: f64 = r * rsc[((it as usize) << 1) | sign as usize];
+    let z2: f64 = z * z;
+    let c0: f64 = C[0] + z * C[1];
+    let c2: f64 = C[2] + z * C[3];
+    let mut y: f64 = c0 + z2 * c2;
+    let mut y2: f64 = y * y;
+
+    /* y is an approximation of z^(1/3) */
+    let mut h: f64 = y2 * (y * r) - 1.0;
+
+    /* h determines the error between y and z^(1/3) */
+    y -= (h * y) * (u0 - u1 * h);
+
+    /* The correction y -= (h*y)*(u0 - u1*h) corresponds to a cubic variant
+    of Newton's method, with the function f(y) = 1-z/y^3. */
+    y *= f64::from_bits(cvt2);
+
+    /* Now y is an approximation of zz^(1/3),
+     * and rr an approximation of 1/zz. We now perform another iteration of
+     * Newton-Raphson, this time with a linear approximation only. */
+    y2 = y * y;
+    let mut y2l: f64 = y.fma(y, -y2);
+
+    /* y2 + y2l = y^2 exactly */
+    let mut y3: f64 = y2 * y;
+    let mut y3l: f64 = y.fma(y2, -y3) + y * y2l;
+
+    /* y3 + y3l approximates y^3 with about 106 bits of accuracy */
+    h = ((y3 - zz) + y3l) * rr;
+    let mut dy: f64 = h * (y * u0);
+
+    /* the approximation of zz^(1/3) is y - dy */
+    let mut y1: f64 = y - dy;
+    dy = (y - y1) - dy;
+
+    /* the approximation of zz^(1/3) is now y1 + dy, where |dy| < 1/2 ulp(y)
+     * (for rounding to nearest) */
+    let mut ady: f64 = dy.abs();
+
+    /* For directed roundings, ady0 is tiny when dy is tiny, or ady0 is close
+     * to ulp(1);
+     * for rounding to nearest, ady0 is tiny when dy is close to 1/2 ulp(1)
+     * or to 3/2 ulp(1). */
+    let mut ady0: f64 = (ady - off[round as usize]).abs();
+    let mut ady1: f64 = (ady - (hf64!("0x1p-52") + off[round as usize])).abs();
+
+    if ady0 < hf64!("0x1p-75") || ady1 < hf64!("0x1p-75") {
+        cold_path();
+
+        y2 = y1 * y1;
+        y2l = y1.fma(y1, -y2);
+        y3 = y2 * y1;
+        y3l = y1.fma(y2, -y3) + y1 * y2l;
+        h = ((y3 - zz) + y3l) * rr;
+        dy = h * (y1 * u0);
+        y = y1 - dy;
+        dy = (y1 - y) - dy;
+        y1 = y;
+        ady = dy.abs();
+        ady0 = (ady - off[round as usize]).abs();
+        ady1 = (ady - (hf64!("0x1p-52") + off[round as usize])).abs();
+
+        if ady0 < hf64!("0x1p-98") || ady1 < hf64!("0x1p-98") {
+            cold_path();
+            let azz: f64 = zz.abs();
+
+            // ~ 0x1.79d15d0e8d59b80000000000000ffc3dp+0
+            if azz == hf64!("0x1.9b78223aa307cp+1") {
+                y1 = hf64!("0x1.79d15d0e8d59cp+0").copysign(zz);
+            }
+
+            // ~ 0x1.de87aa837820e80000000000001c0f08p+0
+            if azz == hf64!("0x1.a202bfc89ddffp+2") {
+                y1 = hf64!("0x1.de87aa837820fp+0").copysign(zz);
+            }
+
+            if round != Round::Nearest {
+                let wlist = [
+                    (hf64!("0x1.3a9ccd7f022dbp+0"), hf64!("0x1.1236160ba9b93p+0")), // ~ 0x1.1236160ba9b930000000000001e7e8fap+0
+                    (hf64!("0x1.7845d2faac6fep+0"), hf64!("0x1.23115e657e49cp+0")), // ~ 0x1.23115e657e49c0000000000001d7a799p+0
+                    (hf64!("0x1.d1ef81cbbbe71p+0"), hf64!("0x1.388fb44cdcf5ap+0")), // ~ 0x1.388fb44cdcf5a0000000000002202c55p+0
+                    (hf64!("0x1.0a2014f62987cp+1"), hf64!("0x1.46bcbf47dc1e8p+0")), // ~ 0x1.46bcbf47dc1e8000000000000303aa2dp+0
+                    (hf64!("0x1.fe18a044a5501p+1"), hf64!("0x1.95decfec9c904p+0")), // ~ 0x1.95decfec9c9040000000000000159e8ep+0
+                    (hf64!("0x1.a6bb8c803147bp+2"), hf64!("0x1.e05335a6401dep+0")), // ~ 0x1.e05335a6401de00000000000027ca017p+0
+                    (hf64!("0x1.ac8538a031cbdp+2"), hf64!("0x1.e281d87098de8p+0")), // ~ 0x1.e281d87098de80000000000000ee9314p+0
+                ];
+
+                for (a, b) in wlist {
+                    if azz == a {
+                        let tmp = if round as u64 + sign == 2 {
+                            hf64!("0x1p-52")
+                        } else {
+                            0.0
+                        };
+                        y1 = (b + tmp).copysign(zz);
+                    }
+                }
+            }
+        }
+    }
+
+    let mut cvt3: u64 = y1.to_bits();
+    cvt3 = cvt3.wrapping_add(((et.wrapping_sub(342).wrapping_sub(1023)) as u64) << 52);
+    let m0: u64 = cvt3 << 30;
+    let m1 = m0 >> 63;
+
+    if (m0 ^ m1) <= (1u64 << 30) {
+        cold_path();
+
+        let mut cvt4: u64 = y1.to_bits();
+        cvt4 = (cvt4 + (164 << 15)) & 0xffffffffffff0000u64;
+
+        if ((f64::from_bits(cvt4) - y1) - dy).abs() < hf64!("0x1p-60") || (zz).abs() == 1.0 {
+            cvt3 = (cvt3 + (1u64 << 15)) & 0xffffffffffff0000u64;
+        }
+    }
+
+    FpResult::ok(f64::from_bits(cvt3))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn spot_checks() {
+        if !cfg!(x86_no_sse) {
+            // Exposes a rounding mode problem. Ignored on i586 because of inaccurate FMA.
+            assert_biteq!(
+                cbrt(f64::from_bits(0xf7f792b28f600000)),
+                f64::from_bits(0xd29ce68655d962f3)
+            );
+        }
+    }
+}
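
The correction `y -= (h*y)*(u0 - u1*h)` above, with u0 = 1/3, u1 = 2/9, and h = y^3/z - 1, is a third-order (Halley-style) refinement: one step roughly cubes the relative error of y as an approximation to z^(1/3). The sketch below isolates just that step on the reduced range [1, 2]; it is not the ported core-math algorithm, which also tracks a low part and handles directed rounding.

fn cbrt_step_sketch(z: f64) -> f64 {
    // Intended for z in [1, 2], mirroring the reduced range used above.
    let (u0, u1) = (1.0 / 3.0, 2.0 / 9.0);
    // Crude linear seed; the real code uses a degree-3 polynomial instead.
    let mut y = 0.7 + 0.35 * z;
    for _ in 0..3 {
        let h = y * y * y / z - 1.0;
        y -= h * y * (u0 - u1 * h);
    }
    y
}

fn main() {
    let z = 1.7_f64;
    assert!((cbrt_step_sketch(z) - z.cbrt()).abs() < 1e-14);
}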
diff --git a/library/compiler-builtins/libm/src/math/cbrtf.rs b/library/compiler-builtins/libm/src/math/cbrtf.rs
new file mode 100644
index 00000000000..9d70305c647
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/cbrtf.rs
@@ -0,0 +1,75 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/s_cbrtf.c */
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
+ * Debugged and optimized by Bruce D. Evans.
+ */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+/* cbrtf(x)
+ * Return cube root of x
+ */
+
+use core::f32;
+
+const B1: u32 = 709958130; /* B1 = (127-127.0/3-0.03306235651)*2**23 */
+const B2: u32 = 642849266; /* B2 = (127-127.0/3-24/3-0.03306235651)*2**23 */
+
+/// Cube root (f32)
+///
+/// Computes the cube root of the argument.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn cbrtf(x: f32) -> f32 {
+    let x1p24 = f32::from_bits(0x4b800000); // 0x1p24f === 2 ^ 24
+
+    let mut r: f64;
+    let mut t: f64;
+    let mut ui: u32 = x.to_bits();
+    let mut hx: u32 = ui & 0x7fffffff;
+
+    if hx >= 0x7f800000 {
+        /* cbrt(NaN,INF) is itself */
+        return x + x;
+    }
+
+    /* rough cbrt to 5 bits */
+    if hx < 0x00800000 {
+        /* zero or subnormal? */
+        if hx == 0 {
+            return x; /* cbrt(+-0) is itself */
+        }
+        ui = (x * x1p24).to_bits();
+        hx = ui & 0x7fffffff;
+        hx = hx / 3 + B2;
+    } else {
+        hx = hx / 3 + B1;
+    }
+    ui &= 0x80000000;
+    ui |= hx;
+
+    /*
+     * First step Newton iteration (solving t*t-x/t == 0) to 16 bits.  In
+     * double precision so that its terms can be arranged for efficiency
+     * without causing overflow or underflow.
+     */
+    t = f32::from_bits(ui) as f64;
+    r = t * t * t;
+    t = t * (x as f64 + x as f64 + r) / (x as f64 + r + r);
+
+    /*
+     * Second step Newton iteration to 47 bits.  In double precision for
+     * efficiency and accuracy.
+     */
+    r = t * t * t;
+    t = t * (x as f64 + x as f64 + r) / (x as f64 + r + r);
+
+    /* rounding to 24 bits is perfect in round-to-nearest mode */
+    t as f32
+}
diff --git a/library/compiler-builtins/libm/src/math/ceil.rs b/library/compiler-builtins/libm/src/math/ceil.rs
new file mode 100644
index 00000000000..4e103545727
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/ceil.rs
@@ -0,0 +1,46 @@
+/// Ceil (f16)
+///
+/// Finds the nearest integer greater than or equal to `x`.
+#[cfg(f16_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn ceilf16(x: f16) -> f16 {
+    super::generic::ceil(x)
+}
+
+/// Ceil (f32)
+///
+/// Finds the nearest integer greater than or equal to `x`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn ceilf(x: f32) -> f32 {
+    select_implementation! {
+        name: ceilf,
+        use_arch: all(target_arch = "wasm32", intrinsics_enabled),
+        args: x,
+    }
+
+    super::generic::ceil(x)
+}
+
+/// Ceil (f64)
+///
+/// Finds the nearest integer greater than or equal to `x`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn ceil(x: f64) -> f64 {
+    select_implementation! {
+        name: ceil,
+        use_arch: all(target_arch = "wasm32", intrinsics_enabled),
+        use_arch_required: all(target_arch = "x86", not(target_feature = "sse2")),
+        args: x,
+    }
+
+    super::generic::ceil(x)
+}
+
+/// Ceil (f128)
+///
+/// Finds the nearest integer greater than or equal to `x`.
+#[cfg(f128_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn ceilf128(x: f128) -> f128 {
+    super::generic::ceil(x)
+}
diff --git a/library/compiler-builtins/libm/src/math/copysign.rs b/library/compiler-builtins/libm/src/math/copysign.rs
new file mode 100644
index 00000000000..d2a86e7fd54
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/copysign.rs
@@ -0,0 +1,88 @@
+/// Sign of Y, magnitude of X (f16)
+///
+/// Constructs a number with the magnitude (absolute value) of its
+/// first argument, `x`, and the sign of its second argument, `y`.
+#[cfg(f16_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn copysignf16(x: f16, y: f16) -> f16 {
+    super::generic::copysign(x, y)
+}
+
+/// Sign of Y, magnitude of X (f32)
+///
+/// Constructs a number with the magnitude (absolute value) of its
+/// first argument, `x`, and the sign of its second argument, `y`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn copysignf(x: f32, y: f32) -> f32 {
+    super::generic::copysign(x, y)
+}
+
+/// Sign of Y, magnitude of X (f64)
+///
+/// Constructs a number with the magnitude (absolute value) of its
+/// first argument, `x`, and the sign of its second argument, `y`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn copysign(x: f64, y: f64) -> f64 {
+    super::generic::copysign(x, y)
+}
+
+/// Sign of Y, magnitude of X (f128)
+///
+/// Constructs a number with the magnitude (absolute value) of its
+/// first argument, `x`, and the sign of its second argument, `y`.
+#[cfg(f128_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn copysignf128(x: f128, y: f128) -> f128 {
+    super::generic::copysign(x, y)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::support::Float;
+
+    fn spec_test<F: Float>(f: impl Fn(F, F) -> F) {
+        assert_biteq!(f(F::ZERO, F::ZERO), F::ZERO);
+        assert_biteq!(f(F::NEG_ZERO, F::ZERO), F::ZERO);
+        assert_biteq!(f(F::ZERO, F::NEG_ZERO), F::NEG_ZERO);
+        assert_biteq!(f(F::NEG_ZERO, F::NEG_ZERO), F::NEG_ZERO);
+
+        assert_biteq!(f(F::ONE, F::ONE), F::ONE);
+        assert_biteq!(f(F::NEG_ONE, F::ONE), F::ONE);
+        assert_biteq!(f(F::ONE, F::NEG_ONE), F::NEG_ONE);
+        assert_biteq!(f(F::NEG_ONE, F::NEG_ONE), F::NEG_ONE);
+
+        assert_biteq!(f(F::INFINITY, F::INFINITY), F::INFINITY);
+        assert_biteq!(f(F::NEG_INFINITY, F::INFINITY), F::INFINITY);
+        assert_biteq!(f(F::INFINITY, F::NEG_INFINITY), F::NEG_INFINITY);
+        assert_biteq!(f(F::NEG_INFINITY, F::NEG_INFINITY), F::NEG_INFINITY);
+
+        // Not required but we expect it
+        assert_biteq!(f(F::NAN, F::NAN), F::NAN);
+        assert_biteq!(f(F::NEG_NAN, F::NAN), F::NAN);
+        assert_biteq!(f(F::NAN, F::NEG_NAN), F::NEG_NAN);
+        assert_biteq!(f(F::NEG_NAN, F::NEG_NAN), F::NEG_NAN);
+    }
+
+    #[test]
+    #[cfg(f16_enabled)]
+    fn spec_tests_f16() {
+        spec_test::<f16>(copysignf16);
+    }
+
+    #[test]
+    fn spec_tests_f32() {
+        spec_test::<f32>(copysignf);
+    }
+
+    #[test]
+    fn spec_tests_f64() {
+        spec_test::<f64>(copysign);
+    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn spec_tests_f128() {
+        spec_test::<f128>(copysignf128);
+    }
+}
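
All four widths above defer to `generic::copysign`. As a mental model (an assumption about the generic routine, written out for f64 only), the operation is a pure bit splice: keep the magnitude bits of `x` and take the sign bit from `y`, which is why the NaN cases in the tests behave as they do.

fn copysign_sketch(x: f64, y: f64) -> f64 {
    const SIGN_MASK: u64 = 1 << 63;
    f64::from_bits((x.to_bits() & !SIGN_MASK) | (y.to_bits() & SIGN_MASK))
}

fn main() {
    assert_eq!(copysign_sketch(3.0, -0.0), -3.0);
    assert_eq!(copysign_sketch(-2.5, 1.0), 2.5);
    assert!(copysign_sketch(f64::NAN, -1.0).is_sign_negative());
}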
diff --git a/library/compiler-builtins/libm/src/math/copysignf.rs b/library/compiler-builtins/libm/src/math/copysignf.rs
new file mode 100644
index 00000000000..8b9bed4c0c4
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/copysignf.rs
@@ -0,0 +1,8 @@
+/// Sign of Y, magnitude of X (f32)
+///
+/// Constructs a number with the magnitude (absolute value) of its
+/// first argument, `x`, and the sign of its second argument, `y`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn copysignf(x: f32, y: f32) -> f32 {
+    super::generic::copysign(x, y)
+}
diff --git a/library/compiler-builtins/libm/src/math/copysignf128.rs b/library/compiler-builtins/libm/src/math/copysignf128.rs
new file mode 100644
index 00000000000..7bd81d42b2e
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/copysignf128.rs
@@ -0,0 +1,8 @@
+/// Sign of Y, magnitude of X (f128)
+///
+/// Constructs a number with the magnitude (absolute value) of its
+/// first argument, `x`, and the sign of its second argument, `y`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn copysignf128(x: f128, y: f128) -> f128 {
+    super::generic::copysign(x, y)
+}
diff --git a/library/compiler-builtins/libm/src/math/copysignf16.rs b/library/compiler-builtins/libm/src/math/copysignf16.rs
new file mode 100644
index 00000000000..82065868601
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/copysignf16.rs
@@ -0,0 +1,8 @@
+/// Sign of Y, magnitude of X (f16)
+///
+/// Constructs a number with the magnitude (absolute value) of its
+/// first argument, `x`, and the sign of its second argument, `y`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn copysignf16(x: f16, y: f16) -> f16 {
+    super::generic::copysign(x, y)
+}
diff --git a/library/compiler-builtins/libm/src/math/cos.rs b/library/compiler-builtins/libm/src/math/cos.rs
new file mode 100644
index 00000000000..de99cd4c5e4
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/cos.rs
@@ -0,0 +1,77 @@
+// origin: FreeBSD /usr/src/lib/msun/src/s_cos.c
+//
+// ====================================================
+// Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+//
+// Developed at SunPro, a Sun Microsystems, Inc. business.
+// Permission to use, copy, modify, and distribute this
+// software is freely granted, provided that this notice
+// is preserved.
+// ====================================================
+
+use super::{k_cos, k_sin, rem_pio2};
+
+// cos(x)
+// Return cosine function of x.
+//
+// kernel function:
+//      k_sin           ... sine function on [-pi/4,pi/4]
+//      k_cos           ... cosine function on [-pi/4,pi/4]
+//      rem_pio2        ... argument reduction routine
+//
+// Method.
+//      Let S,C and T denote the sin, cos and tan respectively on
+//      [-PI/4, +PI/4]. Reduce the argument x to y1+y2 = x-k*pi/2
+//      in [-pi/4 , +pi/4], and let n = k mod 4.
+//      We have
+//
+//          n        sin(x)      cos(x)        tan(x)
+//     ----------------------------------------------------------
+//          0          S           C             T
+//          1          C          -S            -1/T
+//          2         -S          -C             T
+//          3         -C           S            -1/T
+//     ----------------------------------------------------------
+//
+// Special cases:
+//      Let trig be any of sin, cos, or tan.
+//      trig(+-INF)  is NaN, with signals;
+//      trig(NaN)    is that NaN;
+//
+// Accuracy:
+//      TRIG(x) returns trig(x) nearly rounded
+//
+
+/// The cosine of `x` (f64).
+///
+/// `x` is specified in radians.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn cos(x: f64) -> f64 {
+    let ix = (f64::to_bits(x) >> 32) as u32 & 0x7fffffff;
+
+    /* |x| ~< pi/4 */
+    if ix <= 0x3fe921fb {
+        if ix < 0x3e46a09e {
+            /* if x < 2**-27 * sqrt(2) */
+            /* raise inexact if x != 0 */
+            if x as i32 == 0 {
+                return 1.0;
+            }
+        }
+        return k_cos(x, 0.0);
+    }
+
+    /* cos(Inf or NaN) is NaN */
+    if ix >= 0x7ff00000 {
+        return x - x;
+    }
+
+    /* argument reduction needed */
+    let (n, y0, y1) = rem_pio2(x);
+    match n & 3 {
+        0 => k_cos(y0, y1),
+        1 => -k_sin(y0, y1, 1),
+        2 => -k_cos(y0, y1),
+        _ => k_sin(y0, y1, 1),
+    }
+}
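
The `n & 3` dispatch above follows from writing x = k*(pi/2) + y with |y| <= pi/4: cos(x) equals cos(y), -sin(y), -cos(y), or sin(y) as k mod 4 cycles through 0, 1, 2, 3. The std-based illustration below uses a naive reduction instead of `rem_pio2`, so it is only accurate for moderate |x|:

fn main() {
    let x = 10.0_f64;
    let k = (x / std::f64::consts::FRAC_PI_2).round();
    let y = x - k * std::f64::consts::FRAC_PI_2;
    let expected = match (k as i64).rem_euclid(4) {
        0 => y.cos(),
        1 => -y.sin(),
        2 => -y.cos(),
        _ => y.sin(),
    };
    assert!((x.cos() - expected).abs() < 1e-12);
}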
diff --git a/library/compiler-builtins/libm/src/math/cosf.rs b/library/compiler-builtins/libm/src/math/cosf.rs
new file mode 100644
index 00000000000..27c2fc3b994
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/cosf.rs
@@ -0,0 +1,86 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/s_cosf.c */
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
+ * Optimized by Bruce D. Evans.
+ */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+use core::f64::consts::FRAC_PI_2;
+
+use super::{k_cosf, k_sinf, rem_pio2f};
+
+/* Small multiples of pi/2 rounded to double precision. */
+const C1_PIO2: f64 = 1. * FRAC_PI_2; /* 0x3FF921FB, 0x54442D18 */
+const C2_PIO2: f64 = 2. * FRAC_PI_2; /* 0x400921FB, 0x54442D18 */
+const C3_PIO2: f64 = 3. * FRAC_PI_2; /* 0x4012D97C, 0x7F3321D2 */
+const C4_PIO2: f64 = 4. * FRAC_PI_2; /* 0x401921FB, 0x54442D18 */
+
+/// The cosine of `x` (f32).
+///
+/// `x` is specified in radians.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn cosf(x: f32) -> f32 {
+    let x64 = x as f64;
+
+    let x1p120 = f32::from_bits(0x7b800000); // 0x1p120f === 2 ^ 120
+
+    let mut ix = x.to_bits();
+    let sign = (ix >> 31) != 0;
+    ix &= 0x7fffffff;
+
+    if ix <= 0x3f490fda {
+        /* |x| ~<= pi/4 */
+        if ix < 0x39800000 {
+            /* |x| < 2**-12 */
+            /* raise inexact if x != 0 */
+            force_eval!(x + x1p120);
+            return 1.;
+        }
+        return k_cosf(x64);
+    }
+    if ix <= 0x407b53d1 {
+        /* |x| ~<= 5*pi/4 */
+        if ix > 0x4016cbe3 {
+            /* |x|  ~> 3*pi/4 */
+            return -k_cosf(if sign { x64 + C2_PIO2 } else { x64 - C2_PIO2 });
+        } else if sign {
+            return k_sinf(x64 + C1_PIO2);
+        } else {
+            return k_sinf(C1_PIO2 - x64);
+        }
+    }
+    if ix <= 0x40e231d5 {
+        /* |x| ~<= 9*pi/4 */
+        if ix > 0x40afeddf {
+            /* |x| ~> 7*pi/4 */
+            return k_cosf(if sign { x64 + C4_PIO2 } else { x64 - C4_PIO2 });
+        } else if sign {
+            return k_sinf(-x64 - C3_PIO2);
+        } else {
+            return k_sinf(x64 - C3_PIO2);
+        }
+    }
+
+    /* cos(Inf or NaN) is NaN */
+    if ix >= 0x7f800000 {
+        return x - x;
+    }
+
+    /* general argument reduction needed */
+    let (n, y) = rem_pio2f(x);
+    match n & 3 {
+        0 => k_cosf(y),
+        1 => k_sinf(-y),
+        2 => -k_cosf(y),
+        _ => k_sinf(y),
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/cosh.rs b/library/compiler-builtins/libm/src/math/cosh.rs
new file mode 100644
index 00000000000..d2e43fd6cb6
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/cosh.rs
@@ -0,0 +1,36 @@
+use super::{exp, expm1, k_expo2};
+
+/// Hyperbolic cosine (f64)
+///
+/// Computes the hyperbolic cosine of the argument x.
+/// Is defined as `(exp(x) + exp(-x))/2`
+/// Angles are specified in radians.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn cosh(mut x: f64) -> f64 {
+    /* |x| */
+    let mut ix = x.to_bits();
+    ix &= 0x7fffffffffffffff;
+    x = f64::from_bits(ix);
+    let w = ix >> 32;
+
+    /* |x| < log(2) */
+    if w < 0x3fe62e42 {
+        if w < 0x3ff00000 - (26 << 20) {
+            let x1p120 = f64::from_bits(0x4770000000000000);
+            force_eval!(x + x1p120);
+            return 1.;
+        }
+        let t = expm1(x); // exponential minus 1
+        return 1. + t * t / (2. * (1. + t));
+    }
+
+    /* |x| < log(DBL_MAX) */
+    if w < 0x40862e42 {
+        let t = exp(x);
+        /* note: if x>log(0x1p26) then the 1/t is not needed */
+        return 0.5 * (t + 1. / t);
+    }
+
+    /* |x| > log(DBL_MAX) or nan */
+    k_expo2(x)
+}
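
For |x| below ln 2 the routine avoids evaluating `(exp(x) + exp(-x))/2` directly and instead uses `t = expm1(x)`, since `cosh(x) = 1 + t*t/(2*(1 + t))`; this keeps precision when the result is close to 1. A small sketch of that identity using std's `exp_m1` (illustration only):

    fn main() {
        let x = 0.25_f64;
        let t = x.exp_m1(); // exp(x) - 1, accurate for small x
        let via_expm1 = 1.0 + t * t / (2.0 * (1.0 + t));
        let direct = 0.5 * (x.exp() + (-x).exp());
        assert!((via_expm1 - direct).abs() < 1e-12);
    }
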
diff --git a/library/compiler-builtins/libm/src/math/coshf.rs b/library/compiler-builtins/libm/src/math/coshf.rs
new file mode 100644
index 00000000000..567a24410e7
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/coshf.rs
@@ -0,0 +1,36 @@
+use super::{expf, expm1f, k_expo2f};
+
+/// Hyperbolic cosine (f32)
+///
+/// Computes the hyperbolic cosine of the argument `x`,
+/// defined as `(exp(x) + exp(-x))/2`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn coshf(mut x: f32) -> f32 {
+    let x1p120 = f32::from_bits(0x7b800000); // 0x1p120f === 2 ^ 120
+
+    /* |x| */
+    let mut ix = x.to_bits();
+    ix &= 0x7fffffff;
+    x = f32::from_bits(ix);
+    let w = ix;
+
+    /* |x| < log(2) */
+    if w < 0x3f317217 {
+        if w < (0x3f800000 - (12 << 23)) {
+            force_eval!(x + x1p120);
+            return 1.;
+        }
+        let t = expm1f(x);
+        return 1. + t * t / (2. * (1. + t));
+    }
+
+    /* |x| < log(FLT_MAX) */
+    if w < 0x42b17217 {
+        let t = expf(x);
+        return 0.5 * (t + 1. / t);
+    }
+
+    /* |x| > log(FLT_MAX) or nan */
+    k_expo2f(x)
+}
diff --git a/library/compiler-builtins/libm/src/math/erf.rs b/library/compiler-builtins/libm/src/math/erf.rs
new file mode 100644
index 00000000000..5d82228a05f
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/erf.rs
@@ -0,0 +1,314 @@
+use super::{exp, fabs, get_high_word, with_set_low_word};
+/* origin: FreeBSD /usr/src/lib/msun/src/s_erf.c */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+/* double erf(double x)
+ * double erfc(double x)
+ *                           x
+ *                    2      |\
+ *     erf(x)  =  ---------  | exp(-t*t)dt
+ *                 sqrt(pi) \|
+ *                           0
+ *
+ *     erfc(x) =  1-erf(x)
+ *  Note that
+ *              erf(-x) = -erf(x)
+ *              erfc(-x) = 2 - erfc(x)
+ *
+ * Method:
+ *      1. For |x| in [0, 0.84375]
+ *          erf(x)  = x + x*R(x^2)
+ *          erfc(x) = 1 - erf(x)           if x in [-.84375,0.25]
+ *                  = 0.5 + ((0.5-x)-x*R)  if x in [0.25,0.84375]
+ *         where R = P/Q where P is an odd poly of degree 8 and
+ *         Q is an odd poly of degree 10.
+ *                                               -57.90
+ *                      | R - (erf(x)-x)/x | <= 2
+ *
+ *
+ *         Remark. The formula is derived by noting
+ *          erf(x) = (2/sqrt(pi))*(x - x^3/3 + x^5/10 - x^7/42 + ....)
+ *         and that
+ *          2/sqrt(pi) = 1.128379167095512573896158903121545171688
+ *         is close to one. The interval is chosen because the fix
+ *         point of erf(x) is near 0.6174 (i.e., erf(x)=x when x is
+ *         near 0.6174), and by some experiment, 0.84375 is chosen to
+ *         guarantee the error is less than one ulp for erf.
+ *
+ *      2. For |x| in [0.84375,1.25], let s = |x| - 1, and
+ *         c = 0.84506291151 rounded to single (24 bits)
+ *              erf(x)  = sign(x) * (c  + P1(s)/Q1(s))
+ *              erfc(x) = (1-c)  - P1(s)/Q1(s) if x > 0
+ *                        1+(c+P1(s)/Q1(s))    if x < 0
+ *              |P1/Q1 - (erf(|x|)-c)| <= 2**-59.06
+ *         Remark: here we use the taylor series expansion at x=1.
+ *              erf(1+s) = erf(1) + s*Poly(s)
+ *                       = 0.845.. + P1(s)/Q1(s)
+ *         That is, we use rational approximation to approximate
+ *                      erf(1+s) - (c = (single)0.84506291151)
+ *         Note that |P1/Q1|< 0.078 for x in [0.84375,1.25]
+ *         where
+ *              P1(s) = degree 6 poly in s
+ *              Q1(s) = degree 6 poly in s
+ *
+ *      3. For x in [1.25,1/0.35(~2.857143)],
+ *              erfc(x) = (1/x)*exp(-x*x-0.5625+R1/S1)
+ *              erf(x)  = 1 - erfc(x)
+ *         where
+ *              R1(z) = degree 7 poly in z, (z=1/x^2)
+ *              S1(z) = degree 8 poly in z
+ *
+ *      4. For x in [1/0.35,28]
+ *              erfc(x) = (1/x)*exp(-x*x-0.5625+R2/S2) if x > 0
+ *                      = 2.0 - (1/x)*exp(-x*x-0.5625+R2/S2) if -6<x<0
+ *                      = 2.0 - tiny            (if x <= -6)
+ *              erf(x)  = sign(x)*(1.0 - erfc(x)) if x < 6, else
+ *              erf(x)  = sign(x)*(1.0 - tiny)
+ *         where
+ *              R2(z) = degree 6 poly in z, (z=1/x^2)
+ *              S2(z) = degree 7 poly in z
+ *
+ *      Note1:
+ *         To compute exp(-x*x-0.5625+R/S), let s be a single
+ *         precision number and s := x; then
+ *              -x*x = -s*s + (s-x)*(s+x)
+ *              exp(-x*x-0.5626+R/S) =
+ *                      exp(-s*s-0.5625)*exp((s-x)*(s+x)+R/S);
+ *      Note2:
+ *         Here 4 and 5 make use of the asymptotic series
+ *                        exp(-x*x)
+ *              erfc(x) ~ ---------- * ( 1 + Poly(1/x^2) )
+ *                        x*sqrt(pi)
+ *         We use rational approximation to approximate
+ *              g(s)=f(1/x^2) = log(erfc(x)*x) - x*x + 0.5625
+ *         Here is the error bound for R1/S1 and R2/S2
+ *              |R1/S1 - f(x)|  < 2**(-62.57)
+ *              |R2/S2 - f(x)|  < 2**(-61.52)
+ *
+ *      5. For inf > x >= 28
+ *              erf(x)  = sign(x) *(1 - tiny)  (raise inexact)
+ *              erfc(x) = tiny*tiny (raise underflow) if x > 0
+ *                      = 2 - tiny if x<0
+ *
+ *      6. Special cases:
+ *              erf(0)  = 0, erf(inf)  = 1, erf(-inf) = -1,
+ *              erfc(0) = 1, erfc(inf) = 0, erfc(-inf) = 2,
+ *              erfc/erf(NaN) is NaN
+ */
+
+const ERX: f64 = 8.45062911510467529297e-01; /* 0x3FEB0AC1, 0x60000000 */
+/*
+ * Coefficients for approximation to  erf on [0,0.84375]
+ */
+const EFX8: f64 = 1.02703333676410069053e+00; /* 0x3FF06EBA, 0x8214DB69 */
+const PP0: f64 = 1.28379167095512558561e-01; /* 0x3FC06EBA, 0x8214DB68 */
+const PP1: f64 = -3.25042107247001499370e-01; /* 0xBFD4CD7D, 0x691CB913 */
+const PP2: f64 = -2.84817495755985104766e-02; /* 0xBF9D2A51, 0xDBD7194F */
+const PP3: f64 = -5.77027029648944159157e-03; /* 0xBF77A291, 0x236668E4 */
+const PP4: f64 = -2.37630166566501626084e-05; /* 0xBEF8EAD6, 0x120016AC */
+const QQ1: f64 = 3.97917223959155352819e-01; /* 0x3FD97779, 0xCDDADC09 */
+const QQ2: f64 = 6.50222499887672944485e-02; /* 0x3FB0A54C, 0x5536CEBA */
+const QQ3: f64 = 5.08130628187576562776e-03; /* 0x3F74D022, 0xC4D36B0F */
+const QQ4: f64 = 1.32494738004321644526e-04; /* 0x3F215DC9, 0x221C1A10 */
+const QQ5: f64 = -3.96022827877536812320e-06; /* 0xBED09C43, 0x42A26120 */
+/*
+ * Coefficients for approximation to  erf  in [0.84375,1.25]
+ */
+const PA0: f64 = -2.36211856075265944077e-03; /* 0xBF6359B8, 0xBEF77538 */
+const PA1: f64 = 4.14856118683748331666e-01; /* 0x3FDA8D00, 0xAD92B34D */
+const PA2: f64 = -3.72207876035701323847e-01; /* 0xBFD7D240, 0xFBB8C3F1 */
+const PA3: f64 = 3.18346619901161753674e-01; /* 0x3FD45FCA, 0x805120E4 */
+const PA4: f64 = -1.10894694282396677476e-01; /* 0xBFBC6398, 0x3D3E28EC */
+const PA5: f64 = 3.54783043256182359371e-02; /* 0x3FA22A36, 0x599795EB */
+const PA6: f64 = -2.16637559486879084300e-03; /* 0xBF61BF38, 0x0A96073F */
+const QA1: f64 = 1.06420880400844228286e-01; /* 0x3FBB3E66, 0x18EEE323 */
+const QA2: f64 = 5.40397917702171048937e-01; /* 0x3FE14AF0, 0x92EB6F33 */
+const QA3: f64 = 7.18286544141962662868e-02; /* 0x3FB2635C, 0xD99FE9A7 */
+const QA4: f64 = 1.26171219808761642112e-01; /* 0x3FC02660, 0xE763351F */
+const QA5: f64 = 1.36370839120290507362e-02; /* 0x3F8BEDC2, 0x6B51DD1C */
+const QA6: f64 = 1.19844998467991074170e-02; /* 0x3F888B54, 0x5735151D */
+/*
+ * Coefficients for approximation to  erfc in [1.25,1/0.35]
+ */
+const RA0: f64 = -9.86494403484714822705e-03; /* 0xBF843412, 0x600D6435 */
+const RA1: f64 = -6.93858572707181764372e-01; /* 0xBFE63416, 0xE4BA7360 */
+const RA2: f64 = -1.05586262253232909814e+01; /* 0xC0251E04, 0x41B0E726 */
+const RA3: f64 = -6.23753324503260060396e+01; /* 0xC04F300A, 0xE4CBA38D */
+const RA4: f64 = -1.62396669462573470355e+02; /* 0xC0644CB1, 0x84282266 */
+const RA5: f64 = -1.84605092906711035994e+02; /* 0xC067135C, 0xEBCCABB2 */
+const RA6: f64 = -8.12874355063065934246e+01; /* 0xC0545265, 0x57E4D2F2 */
+const RA7: f64 = -9.81432934416914548592e+00; /* 0xC023A0EF, 0xC69AC25C */
+const SA1: f64 = 1.96512716674392571292e+01; /* 0x4033A6B9, 0xBD707687 */
+const SA2: f64 = 1.37657754143519042600e+02; /* 0x4061350C, 0x526AE721 */
+const SA3: f64 = 4.34565877475229228821e+02; /* 0x407B290D, 0xD58A1A71 */
+const SA4: f64 = 6.45387271733267880336e+02; /* 0x40842B19, 0x21EC2868 */
+const SA5: f64 = 4.29008140027567833386e+02; /* 0x407AD021, 0x57700314 */
+const SA6: f64 = 1.08635005541779435134e+02; /* 0x405B28A3, 0xEE48AE2C */
+const SA7: f64 = 6.57024977031928170135e+00; /* 0x401A47EF, 0x8E484A93 */
+const SA8: f64 = -6.04244152148580987438e-02; /* 0xBFAEEFF2, 0xEE749A62 */
+/*
+ * Coefficients for approximation to  erfc in [1/.35,28]
+ */
+const RB0: f64 = -9.86494292470009928597e-03; /* 0xBF843412, 0x39E86F4A */
+const RB1: f64 = -7.99283237680523006574e-01; /* 0xBFE993BA, 0x70C285DE */
+const RB2: f64 = -1.77579549177547519889e+01; /* 0xC031C209, 0x555F995A */
+const RB3: f64 = -1.60636384855821916062e+02; /* 0xC064145D, 0x43C5ED98 */
+const RB4: f64 = -6.37566443368389627722e+02; /* 0xC083EC88, 0x1375F228 */
+const RB5: f64 = -1.02509513161107724954e+03; /* 0xC0900461, 0x6A2E5992 */
+const RB6: f64 = -4.83519191608651397019e+02; /* 0xC07E384E, 0x9BDC383F */
+const SB1: f64 = 3.03380607434824582924e+01; /* 0x403E568B, 0x261D5190 */
+const SB2: f64 = 3.25792512996573918826e+02; /* 0x40745CAE, 0x221B9F0A */
+const SB3: f64 = 1.53672958608443695994e+03; /* 0x409802EB, 0x189D5118 */
+const SB4: f64 = 3.19985821950859553908e+03; /* 0x40A8FFB7, 0x688C246A */
+const SB5: f64 = 2.55305040643316442583e+03; /* 0x40A3F219, 0xCEDF3BE6 */
+const SB6: f64 = 4.74528541206955367215e+02; /* 0x407DA874, 0xE79FE763 */
+const SB7: f64 = -2.24409524465858183362e+01; /* 0xC03670E2, 0x42712D62 */
+
+fn erfc1(x: f64) -> f64 {
+    let s: f64;
+    let p: f64;
+    let q: f64;
+
+    s = fabs(x) - 1.0;
+    p = PA0 + s * (PA1 + s * (PA2 + s * (PA3 + s * (PA4 + s * (PA5 + s * PA6)))));
+    q = 1.0 + s * (QA1 + s * (QA2 + s * (QA3 + s * (QA4 + s * (QA5 + s * QA6)))));
+
+    1.0 - ERX - p / q
+}
+
+fn erfc2(ix: u32, mut x: f64) -> f64 {
+    let s: f64;
+    let r: f64;
+    let big_s: f64;
+    let z: f64;
+
+    if ix < 0x3ff40000 {
+        /* |x| < 1.25 */
+        return erfc1(x);
+    }
+
+    x = fabs(x);
+    s = 1.0 / (x * x);
+    if ix < 0x4006db6d {
+        /* |x| < 1/.35 ~ 2.85714 */
+        r = RA0 + s * (RA1 + s * (RA2 + s * (RA3 + s * (RA4 + s * (RA5 + s * (RA6 + s * RA7))))));
+        big_s = 1.0
+            + s * (SA1
+                + s * (SA2 + s * (SA3 + s * (SA4 + s * (SA5 + s * (SA6 + s * (SA7 + s * SA8)))))));
+    } else {
+        /* |x| > 1/.35 */
+        r = RB0 + s * (RB1 + s * (RB2 + s * (RB3 + s * (RB4 + s * (RB5 + s * RB6)))));
+        big_s =
+            1.0 + s * (SB1 + s * (SB2 + s * (SB3 + s * (SB4 + s * (SB5 + s * (SB6 + s * SB7))))));
+    }
+    z = with_set_low_word(x, 0);
+
+    exp(-z * z - 0.5625) * exp((z - x) * (z + x) + r / big_s) / x
+}
+
+/// Error function (f64)
+///
+/// Calculates an approximation to the “error function”: for a normally
+/// distributed variable, `erf(x)` is the probability that an observation
+/// falls within `x * sqrt(2)` standard deviations of the mean.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn erf(x: f64) -> f64 {
+    let r: f64;
+    let s: f64;
+    let z: f64;
+    let y: f64;
+    let mut ix: u32;
+    let sign: usize;
+
+    ix = get_high_word(x);
+    sign = (ix >> 31) as usize;
+    ix &= 0x7fffffff;
+    if ix >= 0x7ff00000 {
+        /* erf(nan)=nan, erf(+-inf)=+-1 */
+        return 1.0 - 2.0 * (sign as f64) + 1.0 / x;
+    }
+    if ix < 0x3feb0000 {
+        /* |x| < 0.84375 */
+        if ix < 0x3e300000 {
+            /* |x| < 2**-28 */
+            /* avoid underflow */
+            return 0.125 * (8.0 * x + EFX8 * x);
+        }
+        z = x * x;
+        r = PP0 + z * (PP1 + z * (PP2 + z * (PP3 + z * PP4)));
+        s = 1.0 + z * (QQ1 + z * (QQ2 + z * (QQ3 + z * (QQ4 + z * QQ5))));
+        y = r / s;
+        return x + x * y;
+    }
+    if ix < 0x40180000 {
+        /* 0.84375 <= |x| < 6 */
+        y = 1.0 - erfc2(ix, x);
+    } else {
+        let x1p_1022 = f64::from_bits(0x0010000000000000);
+        y = 1.0 - x1p_1022;
+    }
+
+    if sign != 0 { -y } else { y }
+}
+
+/// Complementary error function (f64)
+///
+/// Calculates the complementary probability, equal to `1 - erf(x)`.
+/// It is computed directly rather than via `erf`, so it can be used to
+/// avoid the loss of precision that would result from subtracting a
+/// probability very close to 1 (for large `x`) from 1.
+pub fn erfc(x: f64) -> f64 {
+    let r: f64;
+    let s: f64;
+    let z: f64;
+    let y: f64;
+    let mut ix: u32;
+    let sign: usize;
+
+    ix = get_high_word(x);
+    sign = (ix >> 31) as usize;
+    ix &= 0x7fffffff;
+    if ix >= 0x7ff00000 {
+        /* erfc(nan)=nan, erfc(+-inf)=0,2 */
+        return 2.0 * (sign as f64) + 1.0 / x;
+    }
+    if ix < 0x3feb0000 {
+        /* |x| < 0.84375 */
+        if ix < 0x3c700000 {
+            /* |x| < 2**-56 */
+            return 1.0 - x;
+        }
+        z = x * x;
+        r = PP0 + z * (PP1 + z * (PP2 + z * (PP3 + z * PP4)));
+        s = 1.0 + z * (QQ1 + z * (QQ2 + z * (QQ3 + z * (QQ4 + z * QQ5))));
+        y = r / s;
+        if sign != 0 || ix < 0x3fd00000 {
+            /* x < 1/4 */
+            return 1.0 - (x + x * y);
+        }
+        return 0.5 - (x - 0.5 + x * y);
+    }
+    if ix < 0x403c0000 {
+        /* 0.84375 <= |x| < 28 */
+        if sign != 0 {
+            return 2.0 - erfc2(ix, x);
+        } else {
+            return erfc2(ix, x);
+        }
+    }
+
+    let x1p_1022 = f64::from_bits(0x0010000000000000);
+    if sign != 0 {
+        2.0 - x1p_1022
+    } else {
+        x1p_1022 * x1p_1022
+    }
+}
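
The reason `erfc` is computed directly instead of as `1.0 - erf(x)`: for large `x`, `erf(x)` rounds to exactly 1.0 and the subtraction loses everything. A usage sketch, assuming these routines are consumed through the published `libm` crate (the re-export path is an assumption):

    fn main() {
        let x = 10.0_f64;
        let lossy = 1.0 - libm::erf(x); // erf(10) rounds to 1.0, so this is 0.0
        let direct = libm::erfc(x);     // ~2.1e-45, preserved by the dedicated path
        println!("1 - erf(x) = {lossy}, erfc(x) = {direct}");
    }
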
diff --git a/library/compiler-builtins/libm/src/math/erff.rs b/library/compiler-builtins/libm/src/math/erff.rs
new file mode 100644
index 00000000000..fe15f01082e
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/erff.rs
@@ -0,0 +1,226 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/s_erff.c */
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
+ */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+use super::{expf, fabsf};
+
+const ERX: f32 = 8.4506291151e-01; /* 0x3f58560b */
+/*
+ * Coefficients for approximation to  erf on [0,0.84375]
+ */
+const EFX8: f32 = 1.0270333290e+00; /* 0x3f8375d4 */
+const PP0: f32 = 1.2837916613e-01; /* 0x3e0375d4 */
+const PP1: f32 = -3.2504209876e-01; /* 0xbea66beb */
+const PP2: f32 = -2.8481749818e-02; /* 0xbce9528f */
+const PP3: f32 = -5.7702702470e-03; /* 0xbbbd1489 */
+const PP4: f32 = -2.3763017452e-05; /* 0xb7c756b1 */
+const QQ1: f32 = 3.9791721106e-01; /* 0x3ecbbbce */
+const QQ2: f32 = 6.5022252500e-02; /* 0x3d852a63 */
+const QQ3: f32 = 5.0813062117e-03; /* 0x3ba68116 */
+const QQ4: f32 = 1.3249473704e-04; /* 0x390aee49 */
+const QQ5: f32 = -3.9602282413e-06; /* 0xb684e21a */
+/*
+ * Coefficients for approximation to  erf  in [0.84375,1.25]
+ */
+const PA0: f32 = -2.3621185683e-03; /* 0xbb1acdc6 */
+const PA1: f32 = 4.1485610604e-01; /* 0x3ed46805 */
+const PA2: f32 = -3.7220788002e-01; /* 0xbebe9208 */
+const PA3: f32 = 3.1834661961e-01; /* 0x3ea2fe54 */
+const PA4: f32 = -1.1089469492e-01; /* 0xbde31cc2 */
+const PA5: f32 = 3.5478305072e-02; /* 0x3d1151b3 */
+const PA6: f32 = -2.1663755178e-03; /* 0xbb0df9c0 */
+const QA1: f32 = 1.0642088205e-01; /* 0x3dd9f331 */
+const QA2: f32 = 5.4039794207e-01; /* 0x3f0a5785 */
+const QA3: f32 = 7.1828655899e-02; /* 0x3d931ae7 */
+const QA4: f32 = 1.2617121637e-01; /* 0x3e013307 */
+const QA5: f32 = 1.3637083583e-02; /* 0x3c5f6e13 */
+const QA6: f32 = 1.1984500103e-02; /* 0x3c445aa3 */
+/*
+ * Coefficients for approximation to  erfc in [1.25,1/0.35]
+ */
+const RA0: f32 = -9.8649440333e-03; /* 0xbc21a093 */
+const RA1: f32 = -6.9385856390e-01; /* 0xbf31a0b7 */
+const RA2: f32 = -1.0558626175e+01; /* 0xc128f022 */
+const RA3: f32 = -6.2375331879e+01; /* 0xc2798057 */
+const RA4: f32 = -1.6239666748e+02; /* 0xc322658c */
+const RA5: f32 = -1.8460508728e+02; /* 0xc3389ae7 */
+const RA6: f32 = -8.1287437439e+01; /* 0xc2a2932b */
+const RA7: f32 = -9.8143291473e+00; /* 0xc11d077e */
+const SA1: f32 = 1.9651271820e+01; /* 0x419d35ce */
+const SA2: f32 = 1.3765776062e+02; /* 0x4309a863 */
+const SA3: f32 = 4.3456588745e+02; /* 0x43d9486f */
+const SA4: f32 = 6.4538726807e+02; /* 0x442158c9 */
+const SA5: f32 = 4.2900814819e+02; /* 0x43d6810b */
+const SA6: f32 = 1.0863500214e+02; /* 0x42d9451f */
+const SA7: f32 = 6.5702495575e+00; /* 0x40d23f7c */
+const SA8: f32 = -6.0424413532e-02; /* 0xbd777f97 */
+/*
+ * Coefficients for approximation to  erfc in [1/.35,28]
+ */
+const RB0: f32 = -9.8649431020e-03; /* 0xbc21a092 */
+const RB1: f32 = -7.9928326607e-01; /* 0xbf4c9dd4 */
+const RB2: f32 = -1.7757955551e+01; /* 0xc18e104b */
+const RB3: f32 = -1.6063638306e+02; /* 0xc320a2ea */
+const RB4: f32 = -6.3756646729e+02; /* 0xc41f6441 */
+const RB5: f32 = -1.0250950928e+03; /* 0xc480230b */
+const RB6: f32 = -4.8351919556e+02; /* 0xc3f1c275 */
+const SB1: f32 = 3.0338060379e+01; /* 0x41f2b459 */
+const SB2: f32 = 3.2579251099e+02; /* 0x43a2e571 */
+const SB3: f32 = 1.5367296143e+03; /* 0x44c01759 */
+const SB4: f32 = 3.1998581543e+03; /* 0x4547fdbb */
+const SB5: f32 = 2.5530502930e+03; /* 0x451f90ce */
+const SB6: f32 = 4.7452853394e+02; /* 0x43ed43a7 */
+const SB7: f32 = -2.2440952301e+01; /* 0xc1b38712 */
+
+fn erfc1(x: f32) -> f32 {
+    let s: f32;
+    let p: f32;
+    let q: f32;
+
+    s = fabsf(x) - 1.0;
+    p = PA0 + s * (PA1 + s * (PA2 + s * (PA3 + s * (PA4 + s * (PA5 + s * PA6)))));
+    q = 1.0 + s * (QA1 + s * (QA2 + s * (QA3 + s * (QA4 + s * (QA5 + s * QA6)))));
+    1.0 - ERX - p / q
+}
+
+fn erfc2(mut ix: u32, mut x: f32) -> f32 {
+    let s: f32;
+    let r: f32;
+    let big_s: f32;
+    let z: f32;
+
+    if ix < 0x3fa00000 {
+        /* |x| < 1.25 */
+        return erfc1(x);
+    }
+
+    x = fabsf(x);
+    s = 1.0 / (x * x);
+    if ix < 0x4036db6d {
+        /* |x| < 1/0.35 */
+        r = RA0 + s * (RA1 + s * (RA2 + s * (RA3 + s * (RA4 + s * (RA5 + s * (RA6 + s * RA7))))));
+        big_s = 1.0
+            + s * (SA1
+                + s * (SA2 + s * (SA3 + s * (SA4 + s * (SA5 + s * (SA6 + s * (SA7 + s * SA8)))))));
+    } else {
+        /* |x| >= 1/0.35 */
+        r = RB0 + s * (RB1 + s * (RB2 + s * (RB3 + s * (RB4 + s * (RB5 + s * RB6)))));
+        big_s =
+            1.0 + s * (SB1 + s * (SB2 + s * (SB3 + s * (SB4 + s * (SB5 + s * (SB6 + s * SB7))))));
+    }
+    ix = x.to_bits();
+    z = f32::from_bits(ix & 0xffffe000);
+
+    expf(-z * z - 0.5625) * expf((z - x) * (z + x) + r / big_s) / x
+}
+
+/// Error function (f32)
+///
+/// Calculates an approximation to the “error function”: for a normally
+/// distributed variable, `erf(x)` is the probability that an observation
+/// falls within `x * sqrt(2)` standard deviations of the mean.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn erff(x: f32) -> f32 {
+    let r: f32;
+    let s: f32;
+    let z: f32;
+    let y: f32;
+    let mut ix: u32;
+    let sign: usize;
+
+    ix = x.to_bits();
+    sign = (ix >> 31) as usize;
+    ix &= 0x7fffffff;
+    if ix >= 0x7f800000 {
+        /* erf(nan)=nan, erf(+-inf)=+-1 */
+        return 1.0 - 2.0 * (sign as f32) + 1.0 / x;
+    }
+    if ix < 0x3f580000 {
+        /* |x| < 0.84375 */
+        if ix < 0x31800000 {
+            /* |x| < 2**-28 */
+            /* avoid underflow */
+            return 0.125 * (8.0 * x + EFX8 * x);
+        }
+        z = x * x;
+        r = PP0 + z * (PP1 + z * (PP2 + z * (PP3 + z * PP4)));
+        s = 1.0 + z * (QQ1 + z * (QQ2 + z * (QQ3 + z * (QQ4 + z * QQ5))));
+        y = r / s;
+        return x + x * y;
+    }
+    if ix < 0x40c00000 {
+        /* |x| < 6 */
+        y = 1.0 - erfc2(ix, x);
+    } else {
+        let x1p_120 = f32::from_bits(0x03800000);
+        y = 1.0 - x1p_120;
+    }
+
+    if sign != 0 { -y } else { y }
+}
+
+/// Complementary error function (f32)
+///
+/// Calculates the complementary probability, equal to `1 - erf(x)`.
+/// It is computed directly rather than via `erf`, so it can be used to
+/// avoid the loss of precision that would result from subtracting a
+/// probability very close to 1 (for large `x`) from 1.
+pub fn erfcf(x: f32) -> f32 {
+    let r: f32;
+    let s: f32;
+    let z: f32;
+    let y: f32;
+    let mut ix: u32;
+    let sign: usize;
+
+    ix = x.to_bits();
+    sign = (ix >> 31) as usize;
+    ix &= 0x7fffffff;
+    if ix >= 0x7f800000 {
+        /* erfc(nan)=nan, erfc(+-inf)=0,2 */
+        return 2.0 * (sign as f32) + 1.0 / x;
+    }
+
+    if ix < 0x3f580000 {
+        /* |x| < 0.84375 */
+        if ix < 0x23800000 {
+            /* |x| < 2**-56 */
+            return 1.0 - x;
+        }
+        z = x * x;
+        r = PP0 + z * (PP1 + z * (PP2 + z * (PP3 + z * PP4)));
+        s = 1.0 + z * (QQ1 + z * (QQ2 + z * (QQ3 + z * (QQ4 + z * QQ5))));
+        y = r / s;
+        if sign != 0 || ix < 0x3e800000 {
+            /* x < 1/4 */
+            return 1.0 - (x + x * y);
+        }
+        return 0.5 - (x - 0.5 + x * y);
+    }
+    if ix < 0x41e00000 {
+        /* |x| < 28 */
+        if sign != 0 {
+            return 2.0 - erfc2(ix, x);
+        } else {
+            return erfc2(ix, x);
+        }
+    }
+
+    let x1p_120 = f32::from_bits(0x03800000);
+    if sign != 0 {
+        2.0 - x1p_120
+    } else {
+        x1p_120 * x1p_120
+    }
+}
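
In `erfc2` above, `z` is `x` with the low 13 mantissa bits cleared, so `z*z` is exact in `f32`; the discarded part is reintroduced through `(z - x)*(z + x)`, using `-x*x = -z*z + (z - x)*(z + x)`. A minimal sketch of why that split is lossless:

    fn main() {
        let x: f32 = 2.718_28;
        let z = f32::from_bits(x.to_bits() & 0xffff_e000); // keep ~11 significant bits
        // Squaring the truncated value introduces no rounding error in f32.
        assert_eq!((z * z) as f64, z as f64 * z as f64);
        // The algebraic identity behind the two-exponential form in `erfc2`.
        let (xd, zd) = (x as f64, z as f64);
        assert_eq!(-xd * xd, -zd * zd + (zd - xd) * (zd + xd));
    }
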
diff --git a/library/compiler-builtins/libm/src/math/exp.rs b/library/compiler-builtins/libm/src/math/exp.rs
new file mode 100644
index 00000000000..782042b62cd
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/exp.rs
@@ -0,0 +1,150 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/e_exp.c */
+/*
+ * ====================================================
+ * Copyright (C) 2004 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+/* exp(x)
+ * Returns the exponential of x.
+ *
+ * Method
+ *   1. Argument reduction:
+ *      Reduce x to an r so that |r| <= 0.5*ln2 ~ 0.34658.
+ *      Given x, find r and integer k such that
+ *
+ *               x = k*ln2 + r,  |r| <= 0.5*ln2.
+ *
+ *      Here r will be represented as r = hi-lo for better
+ *      accuracy.
+ *
+ *   2. Approximation of exp(r) by a special rational function on
+ *      the interval [0,0.34658]:
+ *      Write
+ *          R(r**2) = r*(exp(r)+1)/(exp(r)-1) = 2 + r*r/6 - r**4/360 + ...
+ *      We use a special Remez algorithm on [0,0.34658] to generate
+ *      a polynomial of degree 5 to approximate R. The maximum error
+ *      of this polynomial approximation is bounded by 2**-59. In
+ *      other words,
+ *          R(z) ~ 2.0 + P1*z + P2*z**2 + P3*z**3 + P4*z**4 + P5*z**5
+ *      (where z=r*r, and the values of P1 to P5 are listed below)
+ *      and
+ *          |                  5          |     -59
+ *          | 2.0+P1*z+...+P5*z   -  R(z) | <= 2
+ *          |                             |
+ *      The computation of exp(r) thus becomes
+ *                              2*r
+ *              exp(r) = 1 + ----------
+ *                            R(r) - r
+ *                                 r*c(r)
+ *                     = 1 + r + ----------- (for better accuracy)
+ *                                2 - c(r)
+ *      where
+ *                              2       4             10
+ *              c(r) = r - (P1*r  + P2*r  + ... + P5*r   ).
+ *
+ *   3. Scale back to obtain exp(x):
+ *      From step 1, we have
+ *         exp(x) = 2^k * exp(r)
+ *
+ * Special cases:
+ *      exp(INF) is INF, exp(NaN) is NaN;
+ *      exp(-INF) is 0, and
+ *      for finite argument, only exp(0)=1 is exact.
+ *
+ * Accuracy:
+ *      according to an error analysis, the error is always less than
+ *      1 ulp (unit in the last place).
+ *
+ * Misc. info.
+ *      For IEEE double
+ *          if x >  709.782712893383973096 then exp(x) overflows
+ *          if x < -745.133219101941108420 then exp(x) underflows
+ */
+
+use super::scalbn;
+
+const HALF: [f64; 2] = [0.5, -0.5];
+const LN2HI: f64 = 6.93147180369123816490e-01; /* 0x3fe62e42, 0xfee00000 */
+const LN2LO: f64 = 1.90821492927058770002e-10; /* 0x3dea39ef, 0x35793c76 */
+const INVLN2: f64 = 1.44269504088896338700e+00; /* 0x3ff71547, 0x652b82fe */
+const P1: f64 = 1.66666666666666019037e-01; /* 0x3FC55555, 0x5555553E */
+const P2: f64 = -2.77777777770155933842e-03; /* 0xBF66C16C, 0x16BEBD93 */
+const P3: f64 = 6.61375632143793436117e-05; /* 0x3F11566A, 0xAF25DE2C */
+const P4: f64 = -1.65339022054652515390e-06; /* 0xBEBBBD41, 0xC5D26BF1 */
+const P5: f64 = 4.13813679705723846039e-08; /* 0x3E663769, 0x72BEA4D0 */
+
+/// Exponential, base *e* (f64)
+///
+/// Calculate the exponential of `x`, that is, *e* raised to the power `x`
+/// (where *e* is the base of the natural system of logarithms, approximately 2.71828).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn exp(mut x: f64) -> f64 {
+    let x1p1023 = f64::from_bits(0x7fe0000000000000); // 0x1p1023 === 2 ^ 1023
+    let x1p_149 = f64::from_bits(0x36a0000000000000); // 0x1p-149 === 2 ^ -149
+
+    let hi: f64;
+    let lo: f64;
+    let c: f64;
+    let xx: f64;
+    let y: f64;
+    let k: i32;
+    let sign: i32;
+    let mut hx: u32;
+
+    hx = (x.to_bits() >> 32) as u32;
+    sign = (hx >> 31) as i32;
+    hx &= 0x7fffffff; /* high word of |x| */
+
+    /* special cases */
+    if hx >= 0x4086232b {
+        /* if |x| >= 708.39... */
+        if x.is_nan() {
+            return x;
+        }
+        if x > 709.782712893383973096 {
+            /* overflow if x!=inf */
+            x *= x1p1023;
+            return x;
+        }
+        if x < -708.39641853226410622 {
+            /* underflow if x!=-inf */
+            force_eval!((-x1p_149 / x) as f32);
+            if x < -745.13321910194110842 {
+                return 0.;
+            }
+        }
+    }
+
+    /* argument reduction */
+    if hx > 0x3fd62e42 {
+        /* if |x| > 0.5 ln2 */
+        if hx >= 0x3ff0a2b2 {
+            /* if |x| >= 1.5 ln2 */
+            k = (INVLN2 * x + i!(HALF, sign as usize)) as i32;
+        } else {
+            k = 1 - sign - sign;
+        }
+        hi = x - k as f64 * LN2HI; /* k*ln2hi is exact here */
+        lo = k as f64 * LN2LO;
+        x = hi - lo;
+    } else if hx > 0x3e300000 {
+        /* if |x| > 2**-28 */
+        k = 0;
+        hi = x;
+        lo = 0.;
+    } else {
+        /* inexact if x!=0 */
+        force_eval!(x1p1023 + x);
+        return 1. + x;
+    }
+
+    /* x is now in primary range */
+    xx = x * x;
+    c = x - xx * (P1 + xx * (P2 + xx * (P3 + xx * (P4 + xx * P5))));
+    y = 1. + (x * c / (2. - c) - lo + hi);
+    if k == 0 { y } else { scalbn(y, k) }
+}
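
The reduction in `exp` writes x = k*ln2 + r with |r| <= 0.5*ln2, evaluates a small polynomial for exp(r), and scales by 2^k via `scalbn`; the real code splits ln2 into `LN2HI`/`LN2LO` so that `k*LN2HI` is exact. A rough numeric sketch of the decomposition using std (illustration only, without the hi/lo split):

    fn main() {
        let x = 12.345_f64;
        let k = (x / std::f64::consts::LN_2).round();
        let r = x - k * std::f64::consts::LN_2;
        assert!(r.abs() <= 0.5 * std::f64::consts::LN_2 + 1e-12);
        let reconstructed = r.exp() * k.exp2(); // exp(r) * 2^k
        assert!((reconstructed - x.exp()).abs() <= 1e-11 * x.exp());
    }
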
diff --git a/library/compiler-builtins/libm/src/math/exp10.rs b/library/compiler-builtins/libm/src/math/exp10.rs
new file mode 100644
index 00000000000..7c33c92b603
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/exp10.rs
@@ -0,0 +1,23 @@
+use super::{exp2, modf, pow};
+
+const LN10: f64 = 3.32192809488736234787031942948939; /* log2(10): used below as exp2(log2(10) * y) = 10^y */
+const P10: &[f64] = &[
+    1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1,
+    1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15,
+];
+
+/// Calculates 10 raised to the power of `x` (f64).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn exp10(x: f64) -> f64 {
+    let (mut y, n) = modf(x);
+    let u: u64 = n.to_bits();
+    /* fabs(n) < 16 without raising invalid on nan */
+    if ((u >> 52) & 0x7ff) < 0x3ff + 4 {
+        if y == 0.0 {
+            return i!(P10, ((n as isize) + 15) as usize);
+        }
+        y = exp2(LN10 * y);
+        return y * i!(P10, ((n as isize) + 15) as usize);
+    }
+    pow(10.0, x)
+}
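
`exp10` splits x into integer and fractional parts with `modf`, looks the integer power up in `P10`, and computes the fractional part as `exp2(log2(10) * y)`; note that the constant named `LN10` actually holds log2(10). A sketch of the same decomposition using std (illustration only):

    fn main() {
        let x = 4.3_f64;
        let (y, n) = (x.fract(), x.trunc()); // same split as modf(x)
        let via_split = 10f64.powi(n as i32) * (std::f64::consts::LOG2_10 * y).exp2();
        assert!((via_split - 10f64.powf(x)).abs() <= 1e-11 * 10f64.powf(x));
    }
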
diff --git a/library/compiler-builtins/libm/src/math/exp10f.rs b/library/compiler-builtins/libm/src/math/exp10f.rs
new file mode 100644
index 00000000000..303045b3313
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/exp10f.rs
@@ -0,0 +1,23 @@
+use super::{exp2, exp2f, modff};
+
+const LN10_F32: f32 = 3.32192809488736234787031942948939; /* log2(10) */
+const LN10_F64: f64 = 3.32192809488736234787031942948939; /* log2(10) */
+const P10: &[f32] = &[
+    1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7,
+];
+
+/// Calculates 10 raised to the power of `x` (f32).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn exp10f(x: f32) -> f32 {
+    let (mut y, n) = modff(x);
+    let u = n.to_bits();
+    /* fabsf(n) < 8 without raising invalid on nan */
+    if ((u >> 23) & 0xff) < 0x7f + 3 {
+        if y == 0.0 {
+            return i!(P10, ((n as isize) + 7) as usize);
+        }
+        y = exp2f(LN10_F32 * y);
+        return y * i!(P10, ((n as isize) + 7) as usize);
+    }
+    exp2(LN10_F64 * (x as f64)) as f32
+}
diff --git a/library/compiler-builtins/libm/src/math/exp2.rs b/library/compiler-builtins/libm/src/math/exp2.rs
new file mode 100644
index 00000000000..6e98d066cbf
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/exp2.rs
@@ -0,0 +1,394 @@
+// origin: FreeBSD /usr/src/lib/msun/src/s_exp2.c
+//-
+// Copyright (c) 2005 David Schultz <das@FreeBSD.ORG>
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+use super::scalbn;
+
+const TBLSIZE: usize = 256;
+
+#[rustfmt::skip]
+static TBL: [u64; TBLSIZE * 2] = [
+    //  exp2(z + eps)          eps
+    0x3fe6a09e667f3d5d, 0x3d39880000000000,
+    0x3fe6b052fa751744, 0x3cd8000000000000,
+    0x3fe6c012750bd9fe, 0xbd28780000000000,
+    0x3fe6cfdcddd476bf, 0x3d1ec00000000000,
+    0x3fe6dfb23c651a29, 0xbcd8000000000000,
+    0x3fe6ef9298593ae3, 0xbcbc000000000000,
+    0x3fe6ff7df9519386, 0xbd2fd80000000000,
+    0x3fe70f7466f42da3, 0xbd2c880000000000,
+    0x3fe71f75e8ec5fc3, 0x3d13c00000000000,
+    0x3fe72f8286eacf05, 0xbd38300000000000,
+    0x3fe73f9a48a58152, 0xbd00c00000000000,
+    0x3fe74fbd35d7ccfc, 0x3d2f880000000000,
+    0x3fe75feb564267f1, 0x3d03e00000000000,
+    0x3fe77024b1ab6d48, 0xbd27d00000000000,
+    0x3fe780694fde5d38, 0xbcdd000000000000,
+    0x3fe790b938ac1d00, 0x3ce3000000000000,
+    0x3fe7a11473eb0178, 0xbced000000000000,
+    0x3fe7b17b0976d060, 0x3d20400000000000,
+    0x3fe7c1ed0130c133, 0x3ca0000000000000,
+    0x3fe7d26a62ff8636, 0xbd26900000000000,
+    0x3fe7e2f336cf4e3b, 0xbd02e00000000000,
+    0x3fe7f3878491c3e8, 0xbd24580000000000,
+    0x3fe80427543e1b4e, 0x3d33000000000000,
+    0x3fe814d2add1071a, 0x3d0f000000000000,
+    0x3fe82589994ccd7e, 0xbd21c00000000000,
+    0x3fe8364c1eb942d0, 0x3d29d00000000000,
+    0x3fe8471a4623cab5, 0x3d47100000000000,
+    0x3fe857f4179f5bbc, 0x3d22600000000000,
+    0x3fe868d99b4491af, 0xbd32c40000000000,
+    0x3fe879cad931a395, 0xbd23000000000000,
+    0x3fe88ac7d98a65b8, 0xbd2a800000000000,
+    0x3fe89bd0a4785800, 0xbced000000000000,
+    0x3fe8ace5422aa223, 0x3d33280000000000,
+    0x3fe8be05bad619fa, 0x3d42b40000000000,
+    0x3fe8cf3216b54383, 0xbd2ed00000000000,
+    0x3fe8e06a5e08664c, 0xbd20500000000000,
+    0x3fe8f1ae99157807, 0x3d28280000000000,
+    0x3fe902fed0282c0e, 0xbd1cb00000000000,
+    0x3fe9145b0b91ff96, 0xbd05e00000000000,
+    0x3fe925c353aa2ff9, 0x3cf5400000000000,
+    0x3fe93737b0cdc64a, 0x3d17200000000000,
+    0x3fe948b82b5f98ae, 0xbd09000000000000,
+    0x3fe95a44cbc852cb, 0x3d25680000000000,
+    0x3fe96bdd9a766f21, 0xbd36d00000000000,
+    0x3fe97d829fde4e2a, 0xbd01000000000000,
+    0x3fe98f33e47a23a3, 0x3d2d000000000000,
+    0x3fe9a0f170ca0604, 0xbd38a40000000000,
+    0x3fe9b2bb4d53ff89, 0x3d355c0000000000,
+    0x3fe9c49182a3f15b, 0x3d26b80000000000,
+    0x3fe9d674194bb8c5, 0xbcec000000000000,
+    0x3fe9e86319e3238e, 0x3d17d00000000000,
+    0x3fe9fa5e8d07f302, 0x3d16400000000000,
+    0x3fea0c667b5de54d, 0xbcf5000000000000,
+    0x3fea1e7aed8eb8f6, 0x3d09e00000000000,
+    0x3fea309bec4a2e27, 0x3d2ad80000000000,
+    0x3fea42c980460a5d, 0xbd1af00000000000,
+    0x3fea5503b23e259b, 0x3d0b600000000000,
+    0x3fea674a8af46213, 0x3d38880000000000,
+    0x3fea799e1330b3a7, 0x3d11200000000000,
+    0x3fea8bfe53c12e8d, 0x3d06c00000000000,
+    0x3fea9e6b5579fcd2, 0xbd29b80000000000,
+    0x3feab0e521356fb8, 0x3d2b700000000000,
+    0x3feac36bbfd3f381, 0x3cd9000000000000,
+    0x3fead5ff3a3c2780, 0x3ce4000000000000,
+    0x3feae89f995ad2a3, 0xbd2c900000000000,
+    0x3feafb4ce622f367, 0x3d16500000000000,
+    0x3feb0e07298db790, 0x3d2fd40000000000,
+    0x3feb20ce6c9a89a9, 0x3d12700000000000,
+    0x3feb33a2b84f1a4b, 0x3d4d470000000000,
+    0x3feb468415b747e7, 0xbd38380000000000,
+    0x3feb59728de5593a, 0x3c98000000000000,
+    0x3feb6c6e29f1c56a, 0x3d0ad00000000000,
+    0x3feb7f76f2fb5e50, 0x3cde800000000000,
+    0x3feb928cf22749b2, 0xbd04c00000000000,
+    0x3feba5b030a10603, 0xbd0d700000000000,
+    0x3febb8e0b79a6f66, 0x3d0d900000000000,
+    0x3febcc1e904bc1ff, 0x3d02a00000000000,
+    0x3febdf69c3f3a16f, 0xbd1f780000000000,
+    0x3febf2c25bd71db8, 0xbd10a00000000000,
+    0x3fec06286141b2e9, 0xbd11400000000000,
+    0x3fec199bdd8552e0, 0x3d0be00000000000,
+    0x3fec2d1cd9fa64ee, 0xbd09400000000000,
+    0x3fec40ab5fffd02f, 0xbd0ed00000000000,
+    0x3fec544778fafd15, 0x3d39660000000000,
+    0x3fec67f12e57d0cb, 0xbd1a100000000000,
+    0x3fec7ba88988c1b6, 0xbd58458000000000,
+    0x3fec8f6d9406e733, 0xbd1a480000000000,
+    0x3feca3405751c4df, 0x3ccb000000000000,
+    0x3fecb720dcef9094, 0x3d01400000000000,
+    0x3feccb0f2e6d1689, 0x3cf0200000000000,
+    0x3fecdf0b555dc412, 0x3cf3600000000000,
+    0x3fecf3155b5bab3b, 0xbd06900000000000,
+    0x3fed072d4a0789bc, 0x3d09a00000000000,
+    0x3fed1b532b08c8fa, 0xbd15e00000000000,
+    0x3fed2f87080d8a85, 0x3d1d280000000000,
+    0x3fed43c8eacaa203, 0x3d01a00000000000,
+    0x3fed5818dcfba491, 0x3cdf000000000000,
+    0x3fed6c76e862e6a1, 0xbd03a00000000000,
+    0x3fed80e316c9834e, 0xbd0cd80000000000,
+    0x3fed955d71ff6090, 0x3cf4c00000000000,
+    0x3feda9e603db32ae, 0x3cff900000000000,
+    0x3fedbe7cd63a8325, 0x3ce9800000000000,
+    0x3fedd321f301b445, 0xbcf5200000000000,
+    0x3fede7d5641c05bf, 0xbd1d700000000000,
+    0x3fedfc97337b9aec, 0xbd16140000000000,
+    0x3fee11676b197d5e, 0x3d0b480000000000,
+    0x3fee264614f5a3e7, 0x3d40ce0000000000,
+    0x3fee3b333b16ee5c, 0x3d0c680000000000,
+    0x3fee502ee78b3fb4, 0xbd09300000000000,
+    0x3fee653924676d68, 0xbce5000000000000,
+    0x3fee7a51fbc74c44, 0xbd07f80000000000,
+    0x3fee8f7977cdb726, 0xbcf3700000000000,
+    0x3feea4afa2a490e8, 0x3ce5d00000000000,
+    0x3feeb9f4867ccae4, 0x3d161a0000000000,
+    0x3feecf482d8e680d, 0x3cf5500000000000,
+    0x3feee4aaa2188514, 0x3cc6400000000000,
+    0x3feefa1bee615a13, 0xbcee800000000000,
+    0x3fef0f9c1cb64106, 0xbcfa880000000000,
+    0x3fef252b376bb963, 0xbd2c900000000000,
+    0x3fef3ac948dd7275, 0x3caa000000000000,
+    0x3fef50765b6e4524, 0xbcf4f00000000000,
+    0x3fef6632798844fd, 0x3cca800000000000,
+    0x3fef7bfdad9cbe38, 0x3cfabc0000000000,
+    0x3fef91d802243c82, 0xbcd4600000000000,
+    0x3fefa7c1819e908e, 0xbd0b0c0000000000,
+    0x3fefbdba3692d511, 0xbcc0e00000000000,
+    0x3fefd3c22b8f7194, 0xbd10de8000000000,
+    0x3fefe9d96b2a23ee, 0x3cee430000000000,
+    0x3ff0000000000000, 0x0,
+    0x3ff00b1afa5abcbe, 0xbcb3400000000000,
+    0x3ff0163da9fb3303, 0xbd12170000000000,
+    0x3ff02168143b0282, 0x3cba400000000000,
+    0x3ff02c9a3e77806c, 0x3cef980000000000,
+    0x3ff037d42e11bbca, 0xbcc7400000000000,
+    0x3ff04315e86e7f89, 0x3cd8300000000000,
+    0x3ff04e5f72f65467, 0xbd1a3f0000000000,
+    0x3ff059b0d315855a, 0xbd02840000000000,
+    0x3ff0650a0e3c1f95, 0x3cf1600000000000,
+    0x3ff0706b29ddf71a, 0x3d15240000000000,
+    0x3ff07bd42b72a82d, 0xbce9a00000000000,
+    0x3ff0874518759bd0, 0x3ce6400000000000,
+    0x3ff092bdf66607c8, 0xbd00780000000000,
+    0x3ff09e3ecac6f383, 0xbc98000000000000,
+    0x3ff0a9c79b1f3930, 0x3cffa00000000000,
+    0x3ff0b5586cf988fc, 0xbcfac80000000000,
+    0x3ff0c0f145e46c8a, 0x3cd9c00000000000,
+    0x3ff0cc922b724816, 0x3d05200000000000,
+    0x3ff0d83b23395dd8, 0xbcfad00000000000,
+    0x3ff0e3ec32d3d1f3, 0x3d1bac0000000000,
+    0x3ff0efa55fdfa9a6, 0xbd04e80000000000,
+    0x3ff0fb66affed2f0, 0xbd0d300000000000,
+    0x3ff1073028d7234b, 0x3cf1500000000000,
+    0x3ff11301d0125b5b, 0x3cec000000000000,
+    0x3ff11edbab5e2af9, 0x3d16bc0000000000,
+    0x3ff12abdc06c31d5, 0x3ce8400000000000,
+    0x3ff136a814f2047d, 0xbd0ed00000000000,
+    0x3ff1429aaea92de9, 0x3ce8e00000000000,
+    0x3ff14e95934f3138, 0x3ceb400000000000,
+    0x3ff15a98c8a58e71, 0x3d05300000000000,
+    0x3ff166a45471c3df, 0x3d03380000000000,
+    0x3ff172b83c7d5211, 0x3d28d40000000000,
+    0x3ff17ed48695bb9f, 0xbd05d00000000000,
+    0x3ff18af9388c8d93, 0xbd1c880000000000,
+    0x3ff1972658375d66, 0x3d11f00000000000,
+    0x3ff1a35beb6fcba7, 0x3d10480000000000,
+    0x3ff1af99f81387e3, 0xbd47390000000000,
+    0x3ff1bbe084045d54, 0x3d24e40000000000,
+    0x3ff1c82f95281c43, 0xbd0a200000000000,
+    0x3ff1d4873168b9b2, 0x3ce3800000000000,
+    0x3ff1e0e75eb44031, 0x3ceac00000000000,
+    0x3ff1ed5022fcd938, 0x3d01900000000000,
+    0x3ff1f9c18438cdf7, 0xbd1b780000000000,
+    0x3ff2063b88628d8f, 0x3d2d940000000000,
+    0x3ff212be3578a81e, 0x3cd8000000000000,
+    0x3ff21f49917ddd41, 0x3d2b340000000000,
+    0x3ff22bdda2791323, 0x3d19f80000000000,
+    0x3ff2387a6e7561e7, 0xbd19c80000000000,
+    0x3ff2451ffb821427, 0x3d02300000000000,
+    0x3ff251ce4fb2a602, 0xbd13480000000000,
+    0x3ff25e85711eceb0, 0x3d12700000000000,
+    0x3ff26b4565e27d16, 0x3d11d00000000000,
+    0x3ff2780e341de00f, 0x3d31ee0000000000,
+    0x3ff284dfe1f5633e, 0xbd14c00000000000,
+    0x3ff291ba7591bb30, 0xbd13d80000000000,
+    0x3ff29e9df51fdf09, 0x3d08b00000000000,
+    0x3ff2ab8a66d10e9b, 0xbd227c0000000000,
+    0x3ff2b87fd0dada3a, 0x3d2a340000000000,
+    0x3ff2c57e39771af9, 0xbd10800000000000,
+    0x3ff2d285a6e402d9, 0xbd0ed00000000000,
+    0x3ff2df961f641579, 0xbcf4200000000000,
+    0x3ff2ecafa93e2ecf, 0xbd24980000000000,
+    0x3ff2f9d24abd8822, 0xbd16300000000000,
+    0x3ff306fe0a31b625, 0xbd32360000000000,
+    0x3ff31432edeea50b, 0xbd70df8000000000,
+    0x3ff32170fc4cd7b8, 0xbd22480000000000,
+    0x3ff32eb83ba8e9a2, 0xbd25980000000000,
+    0x3ff33c08b2641766, 0x3d1ed00000000000,
+    0x3ff3496266e3fa27, 0xbcdc000000000000,
+    0x3ff356c55f929f0f, 0xbd30d80000000000,
+    0x3ff36431a2de88b9, 0x3d22c80000000000,
+    0x3ff371a7373aaa39, 0x3d20600000000000,
+    0x3ff37f26231e74fe, 0xbd16600000000000,
+    0x3ff38cae6d05d838, 0xbd0ae00000000000,
+    0x3ff39a401b713ec3, 0xbd44720000000000,
+    0x3ff3a7db34e5a020, 0x3d08200000000000,
+    0x3ff3b57fbfec6e95, 0x3d3e800000000000,
+    0x3ff3c32dc313a8f2, 0x3cef800000000000,
+    0x3ff3d0e544ede122, 0xbd17a00000000000,
+    0x3ff3dea64c1234bb, 0x3d26300000000000,
+    0x3ff3ec70df1c4ecc, 0xbd48a60000000000,
+    0x3ff3fa4504ac7e8c, 0xbd3cdc0000000000,
+    0x3ff40822c367a0bb, 0x3d25b80000000000,
+    0x3ff4160a21f72e95, 0x3d1ec00000000000,
+    0x3ff423fb27094646, 0xbd13600000000000,
+    0x3ff431f5d950a920, 0x3d23980000000000,
+    0x3ff43ffa3f84b9eb, 0x3cfa000000000000,
+    0x3ff44e0860618919, 0xbcf6c00000000000,
+    0x3ff45c2042a7d201, 0xbd0bc00000000000,
+    0x3ff46a41ed1d0016, 0xbd12800000000000,
+    0x3ff4786d668b3326, 0x3d30e00000000000,
+    0x3ff486a2b5c13c00, 0xbd2d400000000000,
+    0x3ff494e1e192af04, 0x3d0c200000000000,
+    0x3ff4a32af0d7d372, 0xbd1e500000000000,
+    0x3ff4b17dea6db801, 0x3d07800000000000,
+    0x3ff4bfdad53629e1, 0xbd13800000000000,
+    0x3ff4ce41b817c132, 0x3d00800000000000,
+    0x3ff4dcb299fddddb, 0x3d2c700000000000,
+    0x3ff4eb2d81d8ab96, 0xbd1ce00000000000,
+    0x3ff4f9b2769d2d02, 0x3d19200000000000,
+    0x3ff508417f4531c1, 0xbd08c00000000000,
+    0x3ff516daa2cf662a, 0xbcfa000000000000,
+    0x3ff5257de83f51ea, 0x3d4a080000000000,
+    0x3ff5342b569d4eda, 0xbd26d80000000000,
+    0x3ff542e2f4f6ac1a, 0xbd32440000000000,
+    0x3ff551a4ca5d94db, 0x3d483c0000000000,
+    0x3ff56070dde9116b, 0x3d24b00000000000,
+    0x3ff56f4736b529de, 0x3d415a0000000000,
+    0x3ff57e27dbe2c40e, 0xbd29e00000000000,
+    0x3ff58d12d497c76f, 0xbd23080000000000,
+    0x3ff59c0827ff0b4c, 0x3d4dec0000000000,
+    0x3ff5ab07dd485427, 0xbcc4000000000000,
+    0x3ff5ba11fba87af4, 0x3d30080000000000,
+    0x3ff5c9268a59460b, 0xbd26c80000000000,
+    0x3ff5d84590998e3f, 0x3d469a0000000000,
+    0x3ff5e76f15ad20e1, 0xbd1b400000000000,
+    0x3ff5f6a320dcebca, 0x3d17700000000000,
+    0x3ff605e1b976dcb8, 0x3d26f80000000000,
+    0x3ff6152ae6cdf715, 0x3d01000000000000,
+    0x3ff6247eb03a5531, 0xbd15d00000000000,
+    0x3ff633dd1d1929b5, 0xbd12d00000000000,
+    0x3ff6434634ccc313, 0xbcea800000000000,
+    0x3ff652b9febc8efa, 0xbd28600000000000,
+    0x3ff6623882553397, 0x3d71fe0000000000,
+    0x3ff671c1c708328e, 0xbd37200000000000,
+    0x3ff68155d44ca97e, 0x3ce6800000000000,
+    0x3ff690f4b19e9471, 0xbd29780000000000,
+];
+
+// exp2(x): compute the base 2 exponential of x
+//
+// Accuracy: Peak error < 0.503 ulp for normalized results.
+//
+// Method: (accurate tables)
+//
+//   Reduce x:
+//     x = k + y, for integer k and |y| <= 1/2.
+//     Thus we have exp2(x) = 2**k * exp2(y).
+//
+//   Reduce y:
+//     y = i/TBLSIZE + z - eps[i] for integer i near y * TBLSIZE.
+//     Thus we have exp2(y) = exp2(i/TBLSIZE) * exp2(z - eps[i]),
+//     with |z - eps[i]| <= 2**-9 + 2**-39 for the table used.
+//
+//   We compute exp2(i/TBLSIZE) via table lookup and exp2(z - eps[i]) via
+//   a degree-5 minimax polynomial with maximum error under 1.3 * 2**-61.
+//   The values in exp2t[] and eps[] are chosen such that
+//   exp2t[i] = exp2(i/TBLSIZE + eps[i]), and eps[i] is a small offset such
+//   that exp2t[i] is accurate to 2**-64.
+//
+//   Note that the range of i is +-TBLSIZE/2, so we actually index the tables
+//   by i0 = i + TBLSIZE/2.  For cache efficiency, exp2t[] and eps[] are
+//   virtual tables, interleaved in the real table tbl[].
+//
+//   This method is due to Gal, with many details due to Gal and Bachelis:
+//
+//      Gal, S. and Bachelis, B.  An Accurate Elementary Mathematical Library
+//      for the IEEE Floating Point Standard.  TOMS 17(1), 26-46 (1991).
+
+/// Exponential, base 2 (f64)
+///
+/// Calculate `2^x`, that is, 2 raised to the power `x`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn exp2(mut x: f64) -> f64 {
+    let redux = f64::from_bits(0x4338000000000000) / TBLSIZE as f64;
+    let p1 = f64::from_bits(0x3fe62e42fefa39ef);
+    let p2 = f64::from_bits(0x3fcebfbdff82c575);
+    let p3 = f64::from_bits(0x3fac6b08d704a0a6);
+    let p4 = f64::from_bits(0x3f83b2ab88f70400);
+    let p5 = f64::from_bits(0x3f55d88003875c74);
+
+    // double_t r, t, z;
+    // uint32_t ix, i0;
+    // union {double f; uint64_t i;} u = {x};
+    // union {uint32_t u; int32_t i;} k;
+    let x1p1023 = f64::from_bits(0x7fe0000000000000);
+    let x1p52 = f64::from_bits(0x4330000000000000);
+    let _0x1p_149 = f64::from_bits(0xb6a0000000000000);
+
+    /* Filter out exceptional cases. */
+    let ui = f64::to_bits(x);
+    let ix = (ui >> 32) & 0x7fffffff;
+    if ix >= 0x408ff000 {
+        /* |x| >= 1022 or nan */
+        if ix >= 0x40900000 && ui >> 63 == 0 {
+            /* x >= 1024 or nan */
+            /* overflow */
+            x *= x1p1023;
+            return x;
+        }
+        if ix >= 0x7ff00000 {
+            /* -inf or -nan */
+            return -1.0 / x;
+        }
+        if ui >> 63 != 0 {
+            /* x <= -1022 */
+            /* underflow */
+            if x <= -1075.0 || x - x1p52 + x1p52 != x {
+                force_eval!((_0x1p_149 / x) as f32);
+            }
+            if x <= -1075.0 {
+                return 0.0;
+            }
+        }
+    } else if ix < 0x3c900000 {
+        /* |x| < 0x1p-54 */
+        return 1.0 + x;
+    }
+
+    /* Reduce x, computing z, i0, and k. */
+    let ui = f64::to_bits(x + redux);
+    let mut i0 = ui as u32;
+    i0 = i0.wrapping_add(TBLSIZE as u32 / 2);
+    let ku = i0 / TBLSIZE as u32 * TBLSIZE as u32;
+    let ki = div!(ku as i32, TBLSIZE as i32);
+    i0 %= TBLSIZE as u32;
+    let uf = f64::from_bits(ui) - redux;
+    let mut z = x - uf;
+
+    /* Compute r = exp2(y) = exp2t[i0] * p(z - eps[i]). */
+    let t = f64::from_bits(i!(TBL, 2 * i0 as usize)); /* exp2t[i0] */
+    z -= f64::from_bits(i!(TBL, 2 * i0 as usize + 1)); /* eps[i0]   */
+    let r = t + t * z * (p1 + z * (p2 + z * (p3 + z * (p4 + z * p5))));
+
+    scalbn(r, ki)
+}
+
+#[test]
+fn i0_wrap_test() {
+    let x = -3.0 / 256.0;
+    assert_eq!(exp2(x), f64::from_bits(0x3fefbdba3692d514));
+}
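
The `redux` constant (1.5 * 2^52 divided by `TBLSIZE`) makes a single addition round x to the nearest multiple of 1/256: after `x + redux` the result's ulp is exactly 1/256, so the low significand bits hold the table index, and `f64::from_bits(ui) - redux` recovers the rounded value. A small sketch of that rounding trick (for inputs that are not exact ties):

    fn main() {
        let redux = f64::from_bits(0x4338000000000000) / 256.0; // 1.5 * 2^44
        let x = 0.3_f64;
        let rounded = (x + redux) - redux; // x rounded to the nearest 1/256
        assert_eq!(rounded, (x * 256.0).round() / 256.0);
    }
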
diff --git a/library/compiler-builtins/libm/src/math/exp2f.rs b/library/compiler-builtins/libm/src/math/exp2f.rs
new file mode 100644
index 00000000000..f452b6a20f8
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/exp2f.rs
@@ -0,0 +1,135 @@
+// origin: FreeBSD /usr/src/lib/msun/src/s_exp2f.c
+//-
+// Copyright (c) 2005 David Schultz <das@FreeBSD.ORG>
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+const TBLSIZE: usize = 16;
+
+static EXP2FT: [u64; TBLSIZE] = [
+    0x3fe6a09e667f3bcd,
+    0x3fe7a11473eb0187,
+    0x3fe8ace5422aa0db,
+    0x3fe9c49182a3f090,
+    0x3feae89f995ad3ad,
+    0x3fec199bdd85529c,
+    0x3fed5818dcfba487,
+    0x3feea4afa2a490da,
+    0x3ff0000000000000,
+    0x3ff0b5586cf9890f,
+    0x3ff172b83c7d517b,
+    0x3ff2387a6e756238,
+    0x3ff306fe0a31b715,
+    0x3ff3dea64c123422,
+    0x3ff4bfdad5362a27,
+    0x3ff5ab07dd485429,
+];
+
+// exp2f(x): compute the base 2 exponential of x
+//
+// Accuracy: Peak error < 0.501 ulp; location of peak: -0.030110927.
+//
+// Method: (equally-spaced tables)
+//
+//   Reduce x:
+//     x = k + y, for integer k and |y| <= 1/2.
+//     Thus we have exp2f(x) = 2**k * exp2(y).
+//
+//   Reduce y:
+//     y = i/TBLSIZE + z for integer i near y * TBLSIZE.
+//     Thus we have exp2(y) = exp2(i/TBLSIZE) * exp2(z),
+//     with |z| <= 2**-(TBLSIZE+1).
+//
+//   We compute exp2(i/TBLSIZE) via table lookup and exp2(z) via a
+//   degree-4 minimax polynomial with maximum error under 1.4 * 2**-33.
+//   Using double precision for everything except the reduction makes
+//   roundoff error insignificant and simplifies the scaling step.
+//
+//   This method is due to Tang, but I do not use his suggested parameters:
+//
+//      Tang, P.  Table-driven Implementation of the Exponential Function
+//      in IEEE Floating-Point Arithmetic.  TOMS 15(2), 144-157 (1989).
+
+/// Exponential, base 2 (f32)
+///
+/// Calculate `2^x`, that is, 2 raised to the power `x`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn exp2f(mut x: f32) -> f32 {
+    let redux = f32::from_bits(0x4b400000) / TBLSIZE as f32;
+    let p1 = f32::from_bits(0x3f317218);
+    let p2 = f32::from_bits(0x3e75fdf0);
+    let p3 = f32::from_bits(0x3d6359a4);
+    let p4 = f32::from_bits(0x3c1d964e);
+
+    // double_t t, r, z;
+    // uint32_t ix, i0, k;
+
+    let x1p127 = f32::from_bits(0x7f000000);
+
+    /* Filter out exceptional cases. */
+    let ui = f32::to_bits(x);
+    let ix = ui & 0x7fffffff;
+    if ix > 0x42fc0000 {
+        /* |x| > 126 */
+        if ix > 0x7f800000 {
+            /* NaN */
+            return x;
+        }
+        if (0x43000000..0x80000000).contains(&ui) {
+            /* x >= 128 */
+            x *= x1p127;
+            return x;
+        }
+        if ui >= 0x80000000 {
+            /* x < -126 */
+            if ui >= 0xc3160000 || (ui & 0x0000ffff != 0) {
+                force_eval!(f32::from_bits(0x80000001) / x);
+            }
+            if ui >= 0xc3160000 {
+                /* x <= -150 */
+                return 0.0;
+            }
+        }
+    } else if ix <= 0x33000000 {
+        /* |x| <= 0x1p-25 */
+        return 1.0 + x;
+    }
+
+    /* Reduce x, computing z, i0, and k. */
+    let ui = f32::to_bits(x + redux);
+    let mut i0 = ui;
+    i0 += TBLSIZE as u32 / 2;
+    let k = i0 / TBLSIZE as u32;
+    let ukf = f64::from_bits(((0x3ff + k) as u64) << 52);
+    i0 &= TBLSIZE as u32 - 1;
+    let mut uf = f32::from_bits(ui);
+    uf -= redux;
+    let z: f64 = (x - uf) as f64;
+    /* Compute r = exp2(y) = exp2ft[i0] * p(z). */
+    let r: f64 = f64::from_bits(i!(EXP2FT, i0 as usize));
+    let t: f64 = r * z;
+    let r: f64 = r + t * (p1 as f64 + z * p2 as f64) + t * (z * z) * (p3 as f64 + z * p4 as f64);
+
+    /* Scale by 2**k */
+    (r * ukf) as f32
+}
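
The final scaling in `exp2f` builds 2^k directly from its bit pattern: for k in the normal range, `((0x3ff + k) as u64) << 52` is exactly the `f64` representation of 2^k. A minimal check of that construction:

    fn main() {
        for k in [-10i32, 0, 1, 42, 127] {
            let two_pow_k = f64::from_bits(((0x3ff + k) as u64) << 52);
            assert_eq!(two_pow_k, 2f64.powi(k));
        }
    }
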
diff --git a/library/compiler-builtins/libm/src/math/expf.rs b/library/compiler-builtins/libm/src/math/expf.rs
new file mode 100644
index 00000000000..8dc067ab084
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/expf.rs
@@ -0,0 +1,97 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/e_expf.c */
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
+ */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+use super::scalbnf;
+
+const HALF: [f32; 2] = [0.5, -0.5];
+const LN2_HI: f32 = 6.9314575195e-01; /* 0x3f317200 */
+const LN2_LO: f32 = 1.4286067653e-06; /* 0x35bfbe8e */
+const INV_LN2: f32 = 1.4426950216e+00; /* 0x3fb8aa3b */
+/*
+ * Domain [-0.34568, 0.34568], range ~[-4.278e-9, 4.447e-9]:
+ * |x*(exp(x)+1)/(exp(x)-1) - p(x)| < 2**-27.74
+ */
+const P1: f32 = 1.6666625440e-1; /*  0xaaaa8f.0p-26 */
+const P2: f32 = -2.7667332906e-3; /* -0xb55215.0p-32 */
+
+/// Exponential, base *e* (f32)
+///
+/// Calculate the exponential of `x`, that is, *e* raised to the power `x`
+/// (where *e* is the base of the natural system of logarithms, approximately 2.71828).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn expf(mut x: f32) -> f32 {
+    let x1p127 = f32::from_bits(0x7f000000); // 0x1p127f === 2 ^ 127
+    let x1p_126 = f32::from_bits(0x800000); // 0x1p-126f === 2 ^ -126 (the original C source uses 0x1p-149f here)
+    let mut hx = x.to_bits();
+    let sign = (hx >> 31) as i32; /* sign bit of x */
+    let signb: bool = sign != 0;
+    hx &= 0x7fffffff; /* high word of |x| */
+
+    /* special cases */
+    if hx >= 0x42aeac50 {
+        /* if |x| >= 87.33655f or NaN */
+        if hx > 0x7f800000 {
+            /* NaN */
+            return x;
+        }
+        if (hx >= 0x42b17218) && (!signb) {
+            /* x >= 88.722839f */
+            /* overflow */
+            x *= x1p127;
+            return x;
+        }
+        if signb {
+            /* underflow */
+            force_eval!(-x1p_126 / x);
+            if hx >= 0x42cff1b5 {
+                /* x <= -103.972084f */
+                return 0.;
+            }
+        }
+    }
+
+    /* argument reduction */
+    let k: i32;
+    let hi: f32;
+    let lo: f32;
+    if hx > 0x3eb17218 {
+        /* if |x| > 0.5 ln2 */
+        if hx > 0x3f851592 {
+            /* if |x| > 1.5 ln2 */
+            k = (INV_LN2 * x + i!(HALF, sign as usize)) as i32;
+        } else {
+            k = 1 - sign - sign;
+        }
+        let kf = k as f32;
+        hi = x - kf * LN2_HI; /* k*ln2hi is exact here */
+        lo = kf * LN2_LO;
+        x = hi - lo;
+    } else if hx > 0x39000000 {
+        /* |x| > 2**-14 */
+        k = 0;
+        hi = x;
+        lo = 0.;
+    } else {
+        /* raise inexact */
+        force_eval!(x1p127 + x);
+        return 1. + x;
+    }
+
+    /* x is now in primary range */
+    let xx = x * x;
+    let c = x - xx * (P1 + xx * P2);
+    let y = 1. + (x * c / (2. - c) - lo + hi);
+    if k == 0 { y } else { scalbnf(y, k) }
+}
diff --git a/library/compiler-builtins/libm/src/math/expm1.rs b/library/compiler-builtins/libm/src/math/expm1.rs
new file mode 100644
index 00000000000..f25153f32a3
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/expm1.rs
@@ -0,0 +1,144 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/s_expm1.c */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+use core::f64;
+
+const O_THRESHOLD: f64 = 7.09782712893383973096e+02; /* 0x40862E42, 0xFEFA39EF */
+const LN2_HI: f64 = 6.93147180369123816490e-01; /* 0x3fe62e42, 0xfee00000 */
+const LN2_LO: f64 = 1.90821492927058770002e-10; /* 0x3dea39ef, 0x35793c76 */
+const INVLN2: f64 = 1.44269504088896338700e+00; /* 0x3ff71547, 0x652b82fe */
+/* Scaled Q's: Qn_here = 2**n * Qn_above, for R(2*z) where z = hxs = x*x/2: */
+const Q1: f64 = -3.33333333333331316428e-02; /* BFA11111 111110F4 */
+const Q2: f64 = 1.58730158725481460165e-03; /* 3F5A01A0 19FE5585 */
+const Q3: f64 = -7.93650757867487942473e-05; /* BF14CE19 9EAADBB7 */
+const Q4: f64 = 4.00821782732936239552e-06; /* 3ED0CFCA 86E65239 */
+const Q5: f64 = -2.01099218183624371326e-07; /* BE8AFDB7 6E09C32D */
+
+/// Exponential, base *e*, of x-1 (f64)
+///
+/// Calculates the exponential of `x` and subtracts 1, that is, *e* raised
+/// to the power `x` minus 1 (where *e* is the base of the natural
+/// system of logarithms, approximately 2.71828).
+/// The result is accurate even for small values of `x`,
+/// where using `exp(x)-1` would lose many significant digits.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn expm1(mut x: f64) -> f64 {
+    let hi: f64;
+    let lo: f64;
+    let k: i32;
+    let c: f64;
+    let mut t: f64;
+    let mut y: f64;
+
+    let mut ui = x.to_bits();
+    let hx = ((ui >> 32) & 0x7fffffff) as u32;
+    let sign = (ui >> 63) as i32;
+
+    /* filter out huge and non-finite argument */
+    if hx >= 0x4043687A {
+        /* if |x|>=56*ln2 */
+        if x.is_nan() {
+            return x;
+        }
+        if sign != 0 {
+            return -1.0;
+        }
+        if x > O_THRESHOLD {
+            x *= f64::from_bits(0x7fe0000000000000);
+            return x;
+        }
+    }
+
+    /* argument reduction */
+    if hx > 0x3fd62e42 {
+        /* if  |x| > 0.5 ln2 */
+        if hx < 0x3FF0A2B2 {
+            /* and |x| < 1.5 ln2 */
+            if sign == 0 {
+                hi = x - LN2_HI;
+                lo = LN2_LO;
+                k = 1;
+            } else {
+                hi = x + LN2_HI;
+                lo = -LN2_LO;
+                k = -1;
+            }
+        } else {
+            k = (INVLN2 * x + if sign != 0 { -0.5 } else { 0.5 }) as i32;
+            t = k as f64;
+            hi = x - t * LN2_HI; /* t*ln2_hi is exact here */
+            lo = t * LN2_LO;
+        }
+        x = hi - lo;
+        c = (hi - x) - lo;
+    } else if hx < 0x3c900000 {
+        /* |x| < 2**-54, return x */
+        if hx < 0x00100000 {
+            force_eval!(x);
+        }
+        return x;
+    } else {
+        c = 0.0;
+        k = 0;
+    }
+
+    /* x is now in primary range */
+    let hfx = 0.5 * x;
+    let hxs = x * hfx;
+    let r1 = 1.0 + hxs * (Q1 + hxs * (Q2 + hxs * (Q3 + hxs * (Q4 + hxs * Q5))));
+    t = 3.0 - r1 * hfx;
+    let mut e = hxs * ((r1 - t) / (6.0 - x * t));
+    if k == 0 {
+        /* c is 0 */
+        return x - (x * e - hxs);
+    }
+    e = x * (e - c) - c;
+    e -= hxs;
+    /* exp(x) ~ 2^k (x_reduced - e + 1) */
+    if k == -1 {
+        return 0.5 * (x - e) - 0.5;
+    }
+    if k == 1 {
+        if x < -0.25 {
+            return -2.0 * (e - (x + 0.5));
+        }
+        return 1.0 + 2.0 * (x - e);
+    }
+    ui = ((0x3ff + k) as u64) << 52; /* 2^k */
+    let twopk = f64::from_bits(ui);
+    if !(0..=56).contains(&k) {
+        /* suffice to return exp(x)-1 */
+        y = x - e + 1.0;
+        if k == 1024 {
+            y = y * 2.0 * f64::from_bits(0x7fe0000000000000);
+        } else {
+            y = y * twopk;
+        }
+        return y - 1.0;
+    }
+    ui = ((0x3ff - k) as u64) << 52; /* 2^-k */
+    let uf = f64::from_bits(ui);
+    if k < 20 {
+        y = (x - e + (1.0 - uf)) * twopk;
+    } else {
+        y = (x - (e + uf) + 1.0) * twopk;
+    }
+    y
+}
+
+#[cfg(test)]
+mod tests {
+    #[test]
+    fn sanity_check() {
+        assert_eq!(super::expm1(1.1), 2.0041660239464334);
+    }
+}
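Note (illustrative, not part of the patch): the reason for a dedicated expm1 is that computing `exp(x) - 1.0` directly cancels nearly all significant bits when `x` is tiny. A quick demonstration with `std`:

fn main() {
    let x = 1e-16f64;
    let naive = x.exp() - 1.0; // exp(1e-16) rounds to exactly 1.0, so this is 0.0
    let accurate = x.exp_m1(); // std's expm1 keeps the low-order bits, giving ~1e-16
    assert_eq!(naive, 0.0);
    assert!((accurate - 1e-16).abs() < 1e-32);
}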
diff --git a/library/compiler-builtins/libm/src/math/expm1f.rs b/library/compiler-builtins/libm/src/math/expm1f.rs
new file mode 100644
index 00000000000..63dc86e37c8
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/expm1f.rs
@@ -0,0 +1,134 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/s_expm1f.c */
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
+ */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+const O_THRESHOLD: f32 = 8.8721679688e+01; /* 0x42b17180 */
+const LN2_HI: f32 = 6.9313812256e-01; /* 0x3f317180 */
+const LN2_LO: f32 = 9.0580006145e-06; /* 0x3717f7d1 */
+const INV_LN2: f32 = 1.4426950216e+00; /* 0x3fb8aa3b */
+/*
+ * Domain [-0.34568, 0.34568], range ~[-6.694e-10, 6.696e-10]:
+ * |6 / x * (1 + 2 * (1 / (exp(x) - 1) - 1 / x)) - q(x)| < 2**-30.04
+ * Scaled coefficients: Qn_here = 2**n * Qn_for_q (see s_expm1.c):
+ */
+const Q1: f32 = -3.3333212137e-2; /* -0x888868.0p-28 */
+const Q2: f32 = 1.5807170421e-3; /*  0xcf3010.0p-33 */
+
+/// Exponential, base *e*, of x-1 (f32)
+///
+/// Calculates the exponential of `x` and subtracts 1, that is, *e* raised
+/// to the power `x` minus 1 (where *e* is the base of the natural
+/// system of logarithms, approximately 2.71828).
+/// The result is accurate even for small values of `x`,
+/// where using `exp(x)-1` would lose many significant digits.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn expm1f(mut x: f32) -> f32 {
+    let x1p127 = f32::from_bits(0x7f000000); // 0x1p127f === 2 ^ 127
+
+    let mut hx = x.to_bits();
+    let sign = (hx >> 31) != 0;
+    hx &= 0x7fffffff;
+
+    /* filter out huge and non-finite argument */
+    if hx >= 0x4195b844 {
+        /* if |x|>=27*ln2 */
+        if hx > 0x7f800000 {
+            /* NaN */
+            return x;
+        }
+        if sign {
+            return -1.;
+        }
+        if x > O_THRESHOLD {
+            x *= x1p127;
+            return x;
+        }
+    }
+
+    let k: i32;
+    let hi: f32;
+    let lo: f32;
+    let mut c = 0f32;
+    /* argument reduction */
+    if hx > 0x3eb17218 {
+        /* if  |x| > 0.5 ln2 */
+        if hx < 0x3F851592 {
+            /* and |x| < 1.5 ln2 */
+            if !sign {
+                hi = x - LN2_HI;
+                lo = LN2_LO;
+                k = 1;
+            } else {
+                hi = x + LN2_HI;
+                lo = -LN2_LO;
+                k = -1;
+            }
+        } else {
+            k = (INV_LN2 * x + (if sign { -0.5 } else { 0.5 })) as i32;
+            let t = k as f32;
+            hi = x - t * LN2_HI; /* t*ln2_hi is exact here */
+            lo = t * LN2_LO;
+        }
+        x = hi - lo;
+        c = (hi - x) - lo;
+    } else if hx < 0x33000000 {
+        /* when |x|<2**-25, return x */
+        if hx < 0x00800000 {
+            force_eval!(x * x);
+        }
+        return x;
+    } else {
+        k = 0;
+    }
+
+    /* x is now in primary range */
+    let hfx = 0.5 * x;
+    let hxs = x * hfx;
+    let r1 = 1. + hxs * (Q1 + hxs * Q2);
+    let t = 3. - r1 * hfx;
+    let mut e = hxs * ((r1 - t) / (6. - x * t));
+    if k == 0 {
+        /* c is 0 */
+        return x - (x * e - hxs);
+    }
+    e = x * (e - c) - c;
+    e -= hxs;
+    /* exp(x) ~ 2^k (x_reduced - e + 1) */
+    if k == -1 {
+        return 0.5 * (x - e) - 0.5;
+    }
+    if k == 1 {
+        if x < -0.25 {
+            return -2. * (e - (x + 0.5));
+        }
+        return 1. + 2. * (x - e);
+    }
+    let twopk = f32::from_bits(((0x7f + k) << 23) as u32); /* 2^k */
+    if !(0..=56).contains(&k) {
+        /* suffice to return exp(x)-1 */
+        let mut y = x - e + 1.;
+        if k == 128 {
+            y = y * 2. * x1p127;
+        } else {
+            y = y * twopk;
+        }
+        return y - 1.;
+    }
+    let uf = f32::from_bits(((0x7f - k) << 23) as u32); /* 2^-k */
+    if k < 23 {
+        (x - e + (1. - uf)) * twopk
+    } else {
+        (x - (e + uf) + 1.) * twopk
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/expo2.rs b/library/compiler-builtins/libm/src/math/expo2.rs
new file mode 100644
index 00000000000..82e9b360a76
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/expo2.rs
@@ -0,0 +1,14 @@
+use super::{combine_words, exp};
+
+/* exp(x)/2 for x >= log(DBL_MAX), slightly better than 0.5*exp(x/2)*exp(x/2) */
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub(crate) fn expo2(x: f64) -> f64 {
+    /* k is such that k*ln2 has minimal relative error and x - kln2 > log(DBL_MIN) */
+    const K: i32 = 2043;
+    let kln2 = f64::from_bits(0x40962066151add8b);
+
+    /* note that k is odd and scale*scale overflows */
+    let scale = combine_words(((0x3ff + K / 2) as u32) << 20, 0);
+    /* exp(x - k ln2) * 2**(k-1) */
+    exp(x - kln2) * scale * scale
+}
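Note (worked explanation, not part of the patch): with K = 2043 (odd) and scale = 2^((K-1)/2) = 2^1021, multiplying by `scale` twice contributes 2^(K-1) in total, so exp(x - K*ln2) * scale * scale = exp(x) * 2^-K * 2^(K-1) = exp(x)/2, without ever forming 2^(K-1) = 2^2042, which would overflow f64. A sketch of the same identity via `std`:

fn main() {
    const K: u64 = 2043;
    let kln2 = f64::from_bits(0x40962066151add8b); // K * ln(2), correctly rounded
    let scale = f64::from_bits((0x3ff + K / 2) << 52); // 2^1021
    let x = 709.9; // exp(x) itself would overflow, but exp(x)/2 is finite
    let half_exp = (x - kln2).exp() * scale * scale;
    let reference = (x - std::f64::consts::LN_2).exp(); // exp(x)/2 computed directly
    assert!(half_exp.is_finite());
    assert!((half_exp - reference).abs() <= 1e-11 * reference);
}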
diff --git a/library/compiler-builtins/libm/src/math/fabs.rs b/library/compiler-builtins/libm/src/math/fabs.rs
new file mode 100644
index 00000000000..0050a309fee
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/fabs.rs
@@ -0,0 +1,116 @@
+/// Absolute value (magnitude) (f16)
+///
+/// Calculates the absolute value (magnitude) of the argument `x`,
+/// by direct manipulation of the bit representation of `x`.
+#[cfg(f16_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fabsf16(x: f16) -> f16 {
+    super::generic::fabs(x)
+}
+
+/// Absolute value (magnitude) (f32)
+///
+/// Calculates the absolute value (magnitude) of the argument `x`,
+/// by direct manipulation of the bit representation of `x`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fabsf(x: f32) -> f32 {
+    select_implementation! {
+        name: fabsf,
+        use_arch: all(target_arch = "wasm32", intrinsics_enabled),
+        args: x,
+    }
+
+    super::generic::fabs(x)
+}
+
+/// Absolute value (magnitude) (f64)
+///
+/// Calculates the absolute value (magnitude) of the argument `x`,
+/// by direct manipulation of the bit representation of `x`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fabs(x: f64) -> f64 {
+    select_implementation! {
+        name: fabs,
+        use_arch: all(target_arch = "wasm32", intrinsics_enabled),
+        args: x,
+    }
+
+    super::generic::fabs(x)
+}
+
+/// Absolute value (magnitude) (f128)
+///
+/// Calculates the absolute value (magnitude) of the argument `x`,
+/// by direct manipulation of the bit representation of `x`.
+#[cfg(f128_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fabsf128(x: f128) -> f128 {
+    super::generic::fabs(x)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::support::Float;
+
+    /// Based on https://en.cppreference.com/w/cpp/numeric/math/fabs
+    fn spec_test<F: Float>(f: impl Fn(F) -> F) {
+        assert_biteq!(f(F::ZERO), F::ZERO);
+        assert_biteq!(f(F::NEG_ZERO), F::ZERO);
+        assert_biteq!(f(F::INFINITY), F::INFINITY);
+        assert_biteq!(f(F::NEG_INFINITY), F::INFINITY);
+        assert!(f(F::NAN).is_nan());
+
+        // Not spec required, but we expect it
+        assert!(f(F::NAN).is_sign_positive());
+        assert!(f(F::from_bits(F::NAN.to_bits() | F::SIGN_MASK)).is_sign_positive());
+    }
+
+    #[test]
+    #[cfg(f16_enabled)]
+    fn sanity_check_f16() {
+        assert_eq!(fabsf16(-1.0f16), 1.0);
+        assert_eq!(fabsf16(2.8f16), 2.8);
+    }
+
+    #[test]
+    #[cfg(f16_enabled)]
+    fn spec_tests_f16() {
+        spec_test::<f16>(fabsf16);
+    }
+
+    #[test]
+    fn sanity_check_f32() {
+        assert_eq!(fabsf(-1.0f32), 1.0);
+        assert_eq!(fabsf(2.8f32), 2.8);
+    }
+
+    #[test]
+    fn spec_tests_f32() {
+        spec_test::<f32>(fabsf);
+    }
+
+    #[test]
+    fn sanity_check_f64() {
+        assert_eq!(fabs(-1.0f64), 1.0);
+        assert_eq!(fabs(2.8f64), 2.8);
+    }
+
+    #[test]
+    fn spec_tests_f64() {
+        spec_test::<f64>(fabs);
+    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn sanity_check_f128() {
+        assert_eq!(fabsf128(-1.0f128), 1.0);
+        assert_eq!(fabsf128(2.8f128), 2.8);
+    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn spec_tests_f128() {
+        spec_test::<f128>(fabsf128);
+    }
+}
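Note (illustrative, not part of the patch): `generic::fabs` is nothing more than clearing the sign bit, which is why -0.0, infinities, and NaNs all behave as the tests above expect. The same operation spelled out for f32:

fn fabs_bits(x: f32) -> f32 {
    // Bit 31 of an IEEE 754 binary32 is the sign; masking it off yields |x|.
    f32::from_bits(x.to_bits() & 0x7fff_ffff)
}

fn main() {
    assert_eq!(fabs_bits(-1.5), 1.5);
    assert_eq!(fabs_bits(-0.0).to_bits(), 0.0f32.to_bits()); // -0.0 becomes +0.0
    assert!(fabs_bits(f32::NAN).is_nan()); // payload preserved, sign cleared
    assert_eq!(fabs_bits(f32::NEG_INFINITY), f32::INFINITY);
}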
diff --git a/library/compiler-builtins/libm/src/math/fabsf.rs b/library/compiler-builtins/libm/src/math/fabsf.rs
new file mode 100644
index 00000000000..e5820a26c52
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/fabsf.rs
@@ -0,0 +1,39 @@
+/// Absolute value (magnitude) (f32)
+///
+/// Calculates the absolute value (magnitude) of the argument `x`,
+/// by direct manipulation of the bit representation of `x`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fabsf(x: f32) -> f32 {
+    select_implementation! {
+        name: fabsf,
+        use_arch: all(target_arch = "wasm32", intrinsics_enabled),
+        args: x,
+    }
+
+    super::generic::fabs(x)
+}
+
+// PowerPC tests are failing on LLVM 13: https://github.com/rust-lang/rust/issues/88520
+#[cfg(not(target_arch = "powerpc64"))]
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn sanity_check() {
+        assert_eq!(fabsf(-1.0), 1.0);
+        assert_eq!(fabsf(2.8), 2.8);
+    }
+
+    /// The spec: https://en.cppreference.com/w/cpp/numeric/math/fabs
+    #[test]
+    fn spec_tests() {
+        assert!(fabsf(f32::NAN).is_nan());
+        for f in [0.0, -0.0].iter().copied() {
+            assert_eq!(fabsf(f), 0.0);
+        }
+        for f in [f32::INFINITY, f32::NEG_INFINITY].iter().copied() {
+            assert_eq!(fabsf(f), f32::INFINITY);
+        }
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/fabsf128.rs b/library/compiler-builtins/libm/src/math/fabsf128.rs
new file mode 100644
index 00000000000..46429ca4940
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/fabsf128.rs
@@ -0,0 +1,31 @@
+/// Absolute value (magnitude) (f128)
+///
+/// Calculates the absolute value (magnitude) of the argument `x`,
+/// by direct manipulation of the bit representation of `x`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fabsf128(x: f128) -> f128 {
+    super::generic::fabs(x)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn sanity_check() {
+        assert_eq!(fabsf128(-1.0), 1.0);
+        assert_eq!(fabsf128(2.8), 2.8);
+    }
+
+    /// The spec: https://en.cppreference.com/w/cpp/numeric/math/fabs
+    #[test]
+    fn spec_tests() {
+        assert!(fabsf128(f128::NAN).is_nan());
+        for f in [0.0, -0.0].iter().copied() {
+            assert_eq!(fabsf128(f), 0.0);
+        }
+        for f in [f128::INFINITY, f128::NEG_INFINITY].iter().copied() {
+            assert_eq!(fabsf128(f), f128::INFINITY);
+        }
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/fabsf16.rs b/library/compiler-builtins/libm/src/math/fabsf16.rs
new file mode 100644
index 00000000000..eee42ac6a3c
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/fabsf16.rs
@@ -0,0 +1,31 @@
+/// Absolute value (magnitude) (f16)
+///
+/// Calculates the absolute value (magnitude) of the argument `x`,
+/// by direct manipulation of the bit representation of `x`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fabsf16(x: f16) -> f16 {
+    super::generic::fabs(x)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn sanity_check() {
+        assert_eq!(fabsf16(-1.0), 1.0);
+        assert_eq!(fabsf16(2.8), 2.8);
+    }
+
+    /// The spec: https://en.cppreference.com/w/cpp/numeric/math/fabs
+    #[test]
+    fn spec_tests() {
+        assert!(fabsf16(f16::NAN).is_nan());
+        for f in [0.0, -0.0].iter().copied() {
+            assert_eq!(fabsf16(f), 0.0);
+        }
+        for f in [f16::INFINITY, f16::NEG_INFINITY].iter().copied() {
+            assert_eq!(fabsf16(f), f16::INFINITY);
+        }
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/fdim.rs b/library/compiler-builtins/libm/src/math/fdim.rs
new file mode 100644
index 00000000000..082c5478b2a
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/fdim.rs
@@ -0,0 +1,53 @@
+/// Positive difference (f16)
+///
+/// Determines the positive difference between arguments, returning:
+/// * x - y if x > y, or
+/// * +0    if x <= y, or
+/// * NAN   if either argument is NAN.
+///
+/// A range error may occur.
+#[cfg(f16_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fdimf16(x: f16, y: f16) -> f16 {
+    super::generic::fdim(x, y)
+}
+
+/// Positive difference (f32)
+///
+/// Determines the positive difference between arguments, returning:
+/// * x - y if x > y, or
+/// * +0    if x <= y, or
+/// * NAN   if either argument is NAN.
+///
+/// A range error may occur.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fdimf(x: f32, y: f32) -> f32 {
+    super::generic::fdim(x, y)
+}
+
+/// Positive difference (f64)
+///
+/// Determines the positive difference between arguments, returning:
+/// * x - y if x > y, or
+/// * +0    if x <= y, or
+/// * NAN   if either argument is NAN.
+///
+/// A range error may occur.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fdim(x: f64, y: f64) -> f64 {
+    super::generic::fdim(x, y)
+}
+
+/// Positive difference (f128)
+///
+/// Determines the positive difference between arguments, returning:
+/// * x - y if x > y, or
+/// * +0    if x <= y, or
+/// * NAN   if either argument is NAN.
+///
+/// A range error may occur.
+#[cfg(f128_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fdimf128(x: f128, y: f128) -> f128 {
+    super::generic::fdim(x, y)
+}
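Note (illustrative, not part of the patch): the positive difference is simply `max(x - y, 0)` with NaN passed through, which is how the generic implementation further down in this patch is written. A small f64 sketch:

fn fdim_demo(x: f64, y: f64) -> f64 {
    // x <= y (including x == y) yields +0.0; NaN makes the comparison false,
    // so the subtraction in the else branch propagates it.
    if x <= y { 0.0 } else { x - y }
}

fn main() {
    assert_eq!(fdim_demo(5.0, 3.0), 2.0);
    assert_eq!(fdim_demo(3.0, 5.0), 0.0);
    assert!(fdim_demo(f64::NAN, 1.0).is_nan());
}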
diff --git a/library/compiler-builtins/libm/src/math/fdimf.rs b/library/compiler-builtins/libm/src/math/fdimf.rs
new file mode 100644
index 00000000000..367ef517c63
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/fdimf.rs
@@ -0,0 +1,12 @@
+/// Positive difference (f32)
+///
+/// Determines the positive difference between arguments, returning:
+/// * x - y if x > y, or
+/// * +0    if x <= y, or
+/// * NAN   if either argument is NAN.
+///
+/// A range error may occur.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fdimf(x: f32, y: f32) -> f32 {
+    super::generic::fdim(x, y)
+}
diff --git a/library/compiler-builtins/libm/src/math/fdimf128.rs b/library/compiler-builtins/libm/src/math/fdimf128.rs
new file mode 100644
index 00000000000..6f3d1d0ff1d
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/fdimf128.rs
@@ -0,0 +1,12 @@
+/// Positive difference (f128)
+///
+/// Determines the positive difference between arguments, returning:
+/// * x - y if x > y, or
+/// * +0    if x <= y, or
+/// * NAN   if either argument is NAN.
+///
+/// A range error may occur.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fdimf128(x: f128, y: f128) -> f128 {
+    super::generic::fdim(x, y)
+}
diff --git a/library/compiler-builtins/libm/src/math/fdimf16.rs b/library/compiler-builtins/libm/src/math/fdimf16.rs
new file mode 100644
index 00000000000..37bd6885817
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/fdimf16.rs
@@ -0,0 +1,12 @@
+/// Positive difference (f16)
+///
+/// Determines the positive difference between arguments, returning:
+/// * x - y if x > y, or
+/// * +0    if x <= y, or
+/// * NAN   if either argument is NAN.
+///
+/// A range error may occur.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fdimf16(x: f16, y: f16) -> f16 {
+    super::generic::fdim(x, y)
+}
diff --git a/library/compiler-builtins/libm/src/math/floor.rs b/library/compiler-builtins/libm/src/math/floor.rs
new file mode 100644
index 00000000000..3c5eab101d1
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/floor.rs
@@ -0,0 +1,46 @@
+/// Floor (f16)
+///
+/// Finds the nearest integer less than or equal to `x`.
+#[cfg(f16_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn floorf16(x: f16) -> f16 {
+    return super::generic::floor(x);
+}
+
+/// Floor (f64)
+///
+/// Finds the nearest integer less than or equal to `x`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn floor(x: f64) -> f64 {
+    select_implementation! {
+        name: floor,
+        use_arch: all(target_arch = "wasm32", intrinsics_enabled),
+        use_arch_required: all(target_arch = "x86", not(target_feature = "sse2")),
+        args: x,
+    }
+
+    return super::generic::floor(x);
+}
+
+/// Floor (f32)
+///
+/// Finds the nearest integer less than or equal to `x`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn floorf(x: f32) -> f32 {
+    select_implementation! {
+        name: floorf,
+        use_arch: all(target_arch = "wasm32", intrinsics_enabled),
+        args: x,
+    }
+
+    return super::generic::floor(x);
+}
+
+/// Floor (f128)
+///
+/// Finds the nearest integer less than or equal to `x`.
+#[cfg(f128_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn floorf128(x: f128) -> f128 {
+    return super::generic::floor(x);
+}
diff --git a/library/compiler-builtins/libm/src/math/floorf.rs b/library/compiler-builtins/libm/src/math/floorf.rs
new file mode 100644
index 00000000000..16957b7f355
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/floorf.rs
@@ -0,0 +1,13 @@
+/// Floor (f32)
+///
+/// Finds the nearest integer less than or equal to `x`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn floorf(x: f32) -> f32 {
+    select_implementation! {
+        name: floorf,
+        use_arch: all(target_arch = "wasm32", intrinsics_enabled),
+        args: x,
+    }
+
+    return super::generic::floor(x);
+}
diff --git a/library/compiler-builtins/libm/src/math/floorf128.rs b/library/compiler-builtins/libm/src/math/floorf128.rs
new file mode 100644
index 00000000000..9a9fe415115
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/floorf128.rs
@@ -0,0 +1,7 @@
+/// Floor (f128)
+///
+/// Finds the nearest integer less than or equal to `x`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn floorf128(x: f128) -> f128 {
+    return super::generic::floor(x);
+}
diff --git a/library/compiler-builtins/libm/src/math/floorf16.rs b/library/compiler-builtins/libm/src/math/floorf16.rs
new file mode 100644
index 00000000000..f9b868e0410
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/floorf16.rs
@@ -0,0 +1,7 @@
+/// Floor (f16)
+///
+/// Finds the nearest integer less than or equal to `x`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn floorf16(x: f16) -> f16 {
+    return super::generic::floor(x);
+}
diff --git a/library/compiler-builtins/libm/src/math/fma.rs b/library/compiler-builtins/libm/src/math/fma.rs
new file mode 100644
index 00000000000..5bf473cfe06
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/fma.rs
@@ -0,0 +1,171 @@
+/* SPDX-License-Identifier: MIT */
+/* origin: musl src/math/fma.c, fmaf.c Ported to generic Rust algorithm in 2025, TG. */
+
+use super::generic;
+use crate::support::Round;
+
+// Placeholder so we can have `fmaf16` in the `Float` trait.
+#[allow(unused)]
+#[cfg(f16_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub(crate) fn fmaf16(_x: f16, _y: f16, _z: f16) -> f16 {
+    unimplemented!()
+}
+
+/// Fused multiply add (f32)
+///
+/// Computes `(x*y)+z`, rounded as one ternary operation (i.e. calculated with infinite precision).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fmaf(x: f32, y: f32, z: f32) -> f32 {
+    select_implementation! {
+        name: fmaf,
+        use_arch: any(
+            all(target_arch = "aarch64", target_feature = "neon"),
+            target_feature = "sse2",
+        ),
+        args: x, y, z,
+    }
+
+    generic::fma_wide_round(x, y, z, Round::Nearest).val
+}
+
+/// Fused multiply add (f64)
+///
+/// Computes `(x*y)+z`, rounded as one ternary operation (i.e. calculated with infinite precision).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fma(x: f64, y: f64, z: f64) -> f64 {
+    select_implementation! {
+        name: fma,
+        use_arch: any(
+            all(target_arch = "aarch64", target_feature = "neon"),
+            target_feature = "sse2",
+        ),
+        args: x, y, z,
+    }
+
+    generic::fma_round(x, y, z, Round::Nearest).val
+}
+
+/// Fused multiply add (f128)
+///
+/// Computes `(x*y)+z`, rounded as one ternary operation (i.e. calculated with infinite precision).
+#[cfg(f128_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fmaf128(x: f128, y: f128, z: f128) -> f128 {
+    generic::fma_round(x, y, z, Round::Nearest).val
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::support::{CastFrom, CastInto, Float, FpResult, HInt, MinInt, Round, Status};
+
+    /// Test the generic `fma_round` algorithm for a given float.
+    fn spec_test<F>(f: impl Fn(F, F, F) -> F)
+    where
+        F: Float,
+        F: CastFrom<F::SignedInt>,
+        F: CastFrom<i8>,
+        F::Int: HInt,
+        u32: CastInto<F::Int>,
+    {
+        let x = F::from_bits(F::Int::ONE);
+        let y = F::from_bits(F::Int::ONE);
+        let z = F::ZERO;
+
+        // 754-2019 says "When the exact result of (a × b) + c is non-zero yet the result of
+        // fusedMultiplyAdd is zero because of rounding, the zero result takes the sign of the
+        // exact result"
+        assert_biteq!(f(x, y, z), F::ZERO);
+        assert_biteq!(f(x, -y, z), F::NEG_ZERO);
+        assert_biteq!(f(-x, y, z), F::NEG_ZERO);
+        assert_biteq!(f(-x, -y, z), F::ZERO);
+    }
+
+    #[test]
+    fn spec_test_f32() {
+        spec_test::<f32>(fmaf);
+
+        // Also do a small check that the non-widening version works for f32 (this should ideally
+        // get tested some more).
+        spec_test::<f32>(|x, y, z| generic::fma_round(x, y, z, Round::Nearest).val);
+    }
+
+    #[test]
+    fn spec_test_f64() {
+        spec_test::<f64>(fma);
+
+        let expect_underflow = [
+            (
+                hf64!("0x1.0p-1070"),
+                hf64!("0x1.0p-1070"),
+                hf64!("0x1.ffffffffffffp-1023"),
+                hf64!("0x0.ffffffffffff8p-1022"),
+            ),
+            (
+                // FIXME: we raise underflow but this should only be inexact (based on C and
+                // `rustc_apfloat`).
+                hf64!("0x1.0p-1070"),
+                hf64!("0x1.0p-1070"),
+                hf64!("-0x1.0p-1022"),
+                hf64!("-0x1.0p-1022"),
+            ),
+        ];
+
+        for (x, y, z, res) in expect_underflow {
+            let FpResult { val, status } = generic::fma_round(x, y, z, Round::Nearest);
+            assert_biteq!(val, res);
+            assert_eq!(status, Status::UNDERFLOW);
+        }
+    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn spec_test_f128() {
+        spec_test::<f128>(fmaf128);
+    }
+
+    #[test]
+    fn issue_263() {
+        let a = f32::from_bits(1266679807);
+        let b = f32::from_bits(1300234242);
+        let c = f32::from_bits(1115553792);
+        let expected = f32::from_bits(1501560833);
+        assert_eq!(fmaf(a, b, c), expected);
+    }
+
+    #[test]
+    fn fma_segfault() {
+        // These two inputs cause fma to segfault on release due to overflow:
+        assert_eq!(
+            fma(
+                -0.0000000000000002220446049250313,
+                -0.0000000000000002220446049250313,
+                -0.0000000000000002220446049250313
+            ),
+            -0.00000000000000022204460492503126,
+        );
+
+        let result = fma(-0.992, -0.992, -0.992);
+        // Force rounding to storage format on x87 to prevent spurious errors.
+        #[cfg(all(target_arch = "x86", not(target_feature = "sse2")))]
+        let result = force_eval!(result);
+        assert_eq!(result, -0.007936000000000007,);
+    }
+
+    #[test]
+    fn fma_sbb() {
+        assert_eq!(
+            fma(-(1.0 - f64::EPSILON), f64::MIN, f64::MIN),
+            -3991680619069439e277
+        );
+    }
+
+    #[test]
+    fn fma_underflow() {
+        assert_eq!(
+            fma(1.1102230246251565e-16, -9.812526705433188e-305, 1.0894e-320),
+            0.0,
+        );
+    }
+}
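Note (illustrative, not part of the patch): the defining property of a fused multiply-add is that `x*y + z` is rounded once instead of twice. A case where the distinction is visible, using `std`'s `mul_add` (which documents the same single-rounding guarantee):

fn main() {
    // (1 + 2^-27)^2 = 1 + 2^-26 + 2^-54 exactly, which does not fit in an f64.
    let x = 1.0 + 2f64.powi(-27);
    let z = -(1.0 + 2f64.powi(-26));
    let separate = x * x + z;    // x*x rounds to 1 + 2^-26 first, so the 2^-54 term is lost
    let fused = x.mul_add(x, z); // single rounding keeps it
    assert_eq!(separate, 0.0);
    assert_eq!(fused, 2f64.powi(-54));
}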
diff --git a/library/compiler-builtins/libm/src/math/fmin_fmax.rs b/library/compiler-builtins/libm/src/math/fmin_fmax.rs
new file mode 100644
index 00000000000..2947b783e2f
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/fmin_fmax.rs
@@ -0,0 +1,167 @@
+/// Return the lesser of two arguments or, if either argument is NaN, the other argument.
+///
+/// This coincides with IEEE 754-2011 `minNum`. The result disregards signed zero (meaning if
+/// the inputs are -0.0 and +0.0, either may be returned).
+#[cfg(f16_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fminf16(x: f16, y: f16) -> f16 {
+    super::generic::fmin(x, y)
+}
+
+/// Return the lesser of two arguments or, if either argument is NaN, the other argument.
+///
+/// This coincides with IEEE 754-2011 `minNum`. The result disregards signed zero (meaning if
+/// the inputs are -0.0 and +0.0, either may be returned).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fminf(x: f32, y: f32) -> f32 {
+    super::generic::fmin(x, y)
+}
+
+/// Return the lesser of two arguments or, if either argument is NaN, the other argument.
+///
+/// This coincides with IEEE 754-2011 `minNum`. The result disregards signed zero (meaning if
+/// the inputs are -0.0 and +0.0, either may be returned).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fmin(x: f64, y: f64) -> f64 {
+    super::generic::fmin(x, y)
+}
+
+/// Return the lesser of two arguments or, if either argument is NaN, the other argument.
+///
+/// This coincides with IEEE 754-2011 `minNum`. The result disregards signed zero (meaning if
+/// the inputs are -0.0 and +0.0, either may be returned).
+#[cfg(f128_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fminf128(x: f128, y: f128) -> f128 {
+    super::generic::fmin(x, y)
+}
+
+/// Return the greater of two arguments or, if either argument is NaN, the other argument.
+///
+/// This coincides with IEEE 754-2011 `maxNum`. The result disregards signed zero (meaning if
+/// the inputs are -0.0 and +0.0, either may be returned).
+#[cfg(f16_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fmaxf16(x: f16, y: f16) -> f16 {
+    super::generic::fmax(x, y)
+}
+
+/// Return the greater of two arguments or, if either argument is NaN, the other argument.
+///
+/// This coincides with IEEE 754-2011 `maxNum`. The result disregards signed zero (meaning if
+/// the inputs are -0.0 and +0.0, either may be returned).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fmaxf(x: f32, y: f32) -> f32 {
+    super::generic::fmax(x, y)
+}
+
+/// Return the greater of two arguments or, if either argument is NaN, the other argument.
+///
+/// This coincides with IEEE 754-2011 `maxNum`. The result disregards signed zero (meaning if
+/// the inputs are -0.0 and +0.0, either may be returned).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fmax(x: f64, y: f64) -> f64 {
+    super::generic::fmax(x, y)
+}
+
+/// Return the greater of two arguments or, if either argument is NaN, the other argument.
+///
+/// This coincides with IEEE 754-2011 `maxNum`. The result disregards signed zero (meaning if
+/// the inputs are -0.0 and +0.0, either may be returned).
+#[cfg(f128_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fmaxf128(x: f128, y: f128) -> f128 {
+    super::generic::fmax(x, y)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::support::{Float, Hexf};
+
+    fn fmin_spec_test<F: Float>(f: impl Fn(F, F) -> F) {
+        let cases = [
+            (F::ZERO, F::ZERO, F::ZERO),
+            (F::ONE, F::ONE, F::ONE),
+            (F::ZERO, F::ONE, F::ZERO),
+            (F::ONE, F::ZERO, F::ZERO),
+            (F::ZERO, F::NEG_ONE, F::NEG_ONE),
+            (F::NEG_ONE, F::ZERO, F::NEG_ONE),
+            (F::INFINITY, F::ZERO, F::ZERO),
+            (F::NEG_INFINITY, F::ZERO, F::NEG_INFINITY),
+            (F::NAN, F::ZERO, F::ZERO),
+            (F::ZERO, F::NAN, F::ZERO),
+            (F::NAN, F::NAN, F::NAN),
+        ];
+
+        for (x, y, res) in cases {
+            let val = f(x, y);
+            assert_biteq!(val, res, "fmin({}, {})", Hexf(x), Hexf(y));
+        }
+    }
+
+    #[test]
+    #[cfg(f16_enabled)]
+    fn fmin_spec_tests_f16() {
+        fmin_spec_test::<f16>(fminf16);
+    }
+
+    #[test]
+    fn fmin_spec_tests_f32() {
+        fmin_spec_test::<f32>(fminf);
+    }
+
+    #[test]
+    fn fmin_spec_tests_f64() {
+        fmin_spec_test::<f64>(fmin);
+    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn fmin_spec_tests_f128() {
+        fmin_spec_test::<f128>(fminf128);
+    }
+
+    fn fmax_spec_test<F: Float>(f: impl Fn(F, F) -> F) {
+        let cases = [
+            (F::ZERO, F::ZERO, F::ZERO),
+            (F::ONE, F::ONE, F::ONE),
+            (F::ZERO, F::ONE, F::ONE),
+            (F::ONE, F::ZERO, F::ONE),
+            (F::ZERO, F::NEG_ONE, F::ZERO),
+            (F::NEG_ONE, F::ZERO, F::ZERO),
+            (F::INFINITY, F::ZERO, F::INFINITY),
+            (F::NEG_INFINITY, F::ZERO, F::ZERO),
+            (F::NAN, F::ZERO, F::ZERO),
+            (F::ZERO, F::NAN, F::ZERO),
+            (F::NAN, F::NAN, F::NAN),
+        ];
+
+        for (x, y, res) in cases {
+            let val = f(x, y);
+            assert_biteq!(val, res, "fmax({}, {})", Hexf(x), Hexf(y));
+        }
+    }
+
+    #[test]
+    #[cfg(f16_enabled)]
+    fn fmax_spec_tests_f16() {
+        fmax_spec_test::<f16>(fmaxf16);
+    }
+
+    #[test]
+    fn fmax_spec_tests_f32() {
+        fmax_spec_test::<f32>(fmaxf);
+    }
+
+    #[test]
+    fn fmax_spec_tests_f64() {
+        fmax_spec_test::<f64>(fmax);
+    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn fmax_spec_tests_f128() {
+        fmax_spec_test::<f128>(fmaxf128);
+    }
+}
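Note (illustrative, not part of the patch): `fmin`/`fmax` treat NaN as missing data rather than propagating it, which is the minNum/maxNum behaviour the test cases above encode. `std`'s `f64::min`/`f64::max` follow the same rules:

fn main() {
    assert_eq!(f64::min(f64::NAN, 1.0), 1.0); // the NaN is ignored
    assert_eq!(f64::max(1.0, f64::NAN), 1.0);
    assert!(f64::min(f64::NAN, f64::NAN).is_nan()); // only NaN left to return
    // Signed zero is unordered here: either -0.0 or +0.0 may come back,
    // and both compare equal to 0.0.
    assert_eq!(f64::min(0.0, -0.0), 0.0);
}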
diff --git a/library/compiler-builtins/libm/src/math/fminimum_fmaximum.rs b/library/compiler-builtins/libm/src/math/fminimum_fmaximum.rs
new file mode 100644
index 00000000000..b7999e27392
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/fminimum_fmaximum.rs
@@ -0,0 +1,163 @@
+/// Return the lesser of two arguments or, if either argument is NaN, NaN.
+///
+/// This coincides with IEEE 754-2019 `minimum`. The result orders -0.0 < 0.0.
+#[cfg(f16_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fminimumf16(x: f16, y: f16) -> f16 {
+    super::generic::fminimum(x, y)
+}
+
+/// Return the lesser of two arguments or, if either argument is NaN, NaN.
+///
+/// This coincides with IEEE 754-2019 `minimum`. The result orders -0.0 < 0.0.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fminimum(x: f64, y: f64) -> f64 {
+    super::generic::fminimum(x, y)
+}
+
+/// Return the lesser of two arguments or, if either argument is NaN, NaN.
+///
+/// This coincides with IEEE 754-2019 `minimum`. The result orders -0.0 < 0.0.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fminimumf(x: f32, y: f32) -> f32 {
+    super::generic::fminimum(x, y)
+}
+
+/// Return the lesser of two arguments or, if either argument is NaN, NaN.
+///
+/// This coincides with IEEE 754-2019 `minimum`. The result orders -0.0 < 0.0.
+#[cfg(f128_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fminimumf128(x: f128, y: f128) -> f128 {
+    super::generic::fminimum(x, y)
+}
+
+/// Return the greater of two arguments or, if either argument is NaN, NaN.
+///
+/// This coincides with IEEE 754-2019 `maximum`. The result orders -0.0 < 0.0.
+#[cfg(f16_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fmaximumf16(x: f16, y: f16) -> f16 {
+    super::generic::fmaximum(x, y)
+}
+
+/// Return the greater of two arguments or, if either argument is NaN, NaN.
+///
+/// This coincides with IEEE 754-2019 `maximum`. The result orders -0.0 < 0.0.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fmaximumf(x: f32, y: f32) -> f32 {
+    super::generic::fmaximum(x, y)
+}
+
+/// Return the greater of two arguments or, if either argument is NaN, NaN.
+///
+/// This coincides with IEEE 754-2019 `maximum`. The result orders -0.0 < 0.0.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fmaximum(x: f64, y: f64) -> f64 {
+    super::generic::fmaximum(x, y)
+}
+
+/// Return the greater of two arguments or, if either argument is NaN, NaN.
+///
+/// This coincides with IEEE 754-2019 `maximum`. The result orders -0.0 < 0.0.
+#[cfg(f128_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fmaximumf128(x: f128, y: f128) -> f128 {
+    super::generic::fmaximum(x, y)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::support::{Float, Hexf};
+
+    fn fminimum_spec_test<F: Float>(f: impl Fn(F, F) -> F) {
+        let cases = [
+            (F::ZERO, F::ZERO, F::ZERO),
+            (F::ONE, F::ONE, F::ONE),
+            (F::ZERO, F::ONE, F::ZERO),
+            (F::ONE, F::ZERO, F::ZERO),
+            (F::ZERO, F::NEG_ONE, F::NEG_ONE),
+            (F::NEG_ONE, F::ZERO, F::NEG_ONE),
+            (F::INFINITY, F::ZERO, F::ZERO),
+            (F::NEG_INFINITY, F::ZERO, F::NEG_INFINITY),
+            (F::NAN, F::ZERO, F::NAN),
+            (F::ZERO, F::NAN, F::NAN),
+            (F::NAN, F::NAN, F::NAN),
+            (F::ZERO, F::NEG_ZERO, F::NEG_ZERO),
+            (F::NEG_ZERO, F::ZERO, F::NEG_ZERO),
+        ];
+
+        for (x, y, res) in cases {
+            let val = f(x, y);
+            assert_biteq!(val, res, "fminimum({}, {})", Hexf(x), Hexf(y));
+        }
+    }
+
+    #[test]
+    #[cfg(f16_enabled)]
+    fn fminimum_spec_tests_f16() {
+        fminimum_spec_test::<f16>(fminimumf16);
+    }
+
+    #[test]
+    fn fminimum_spec_tests_f32() {
+        fminimum_spec_test::<f32>(fminimumf);
+    }
+
+    #[test]
+    fn fminimum_spec_tests_f64() {
+        fminimum_spec_test::<f64>(fminimum);
+    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn fminimum_spec_tests_f128() {
+        fminimum_spec_test::<f128>(fminimumf128);
+    }
+
+    fn fmaximum_spec_test<F: Float>(f: impl Fn(F, F) -> F) {
+        let cases = [
+            (F::ZERO, F::ZERO, F::ZERO),
+            (F::ONE, F::ONE, F::ONE),
+            (F::ZERO, F::ONE, F::ONE),
+            (F::ONE, F::ZERO, F::ONE),
+            (F::ZERO, F::NEG_ONE, F::ZERO),
+            (F::NEG_ONE, F::ZERO, F::ZERO),
+            (F::INFINITY, F::ZERO, F::INFINITY),
+            (F::NEG_INFINITY, F::ZERO, F::ZERO),
+            (F::NAN, F::ZERO, F::NAN),
+            (F::ZERO, F::NAN, F::NAN),
+            (F::NAN, F::NAN, F::NAN),
+            (F::ZERO, F::NEG_ZERO, F::ZERO),
+            (F::NEG_ZERO, F::ZERO, F::ZERO),
+        ];
+
+        for (x, y, res) in cases {
+            let val = f(x, y);
+            assert_biteq!(val, res, "fmaximum({}, {})", Hexf(x), Hexf(y));
+        }
+    }
+
+    #[test]
+    #[cfg(f16_enabled)]
+    fn fmaximum_spec_tests_f16() {
+        fmaximum_spec_test::<f16>(fmaximumf16);
+    }
+
+    #[test]
+    fn fmaximum_spec_tests_f32() {
+        fmaximum_spec_test::<f32>(fmaximumf);
+    }
+
+    #[test]
+    fn fmaximum_spec_tests_f64() {
+        fmaximum_spec_test::<f64>(fmaximum);
+    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn fmaximum_spec_tests_f128() {
+        fmaximum_spec_test::<f128>(fmaximumf128);
+    }
+}
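Note (illustrative, not part of the patch): the 2019 `minimum`/`maximum` operations differ from `fmin`/`fmax` above in two ways, both visible in the test tables: any NaN input produces NaN, and -0.0 is ordered strictly below +0.0. The helper below is a hypothetical reference formulation of `minimum`, not an API from this patch:

fn minimum_2019(x: f64, y: f64) -> f64 {
    if x.is_nan() || y.is_nan() {
        f64::NAN // NaN propagates
    } else if x == y {
        // x == y also covers -0.0 vs +0.0; pick the negatively signed one.
        if x.is_sign_negative() { x } else { y }
    } else if x < y {
        x
    } else {
        y
    }
}

fn main() {
    assert_eq!(f64::min(f64::NAN, 1.0), 1.0); // 2011 minNum ignores the NaN
    assert!(minimum_2019(f64::NAN, 1.0).is_nan()); // 2019 minimum does not
    assert!(minimum_2019(0.0, -0.0).is_sign_negative()); // -0.0 < +0.0
}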
diff --git a/library/compiler-builtins/libm/src/math/fminimum_fmaximum_num.rs b/library/compiler-builtins/libm/src/math/fminimum_fmaximum_num.rs
new file mode 100644
index 00000000000..180d21f72b7
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/fminimum_fmaximum_num.rs
@@ -0,0 +1,163 @@
+/// Return the lesser of two arguments or, if either argument is NaN, the other argument.
+///
+/// This coincides with IEEE 754-2019 `minimumNumber`. The result orders -0.0 < 0.0.
+#[cfg(f16_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fminimum_numf16(x: f16, y: f16) -> f16 {
+    super::generic::fminimum_num(x, y)
+}
+
+/// Return the lesser of two arguments or, if either argument is NaN, the other argument.
+///
+/// This coincides with IEEE 754-2019 `minimumNumber`. The result orders -0.0 < 0.0.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fminimum_numf(x: f32, y: f32) -> f32 {
+    super::generic::fminimum_num(x, y)
+}
+
+/// Return the lesser of two arguments or, if either argument is NaN, the other argument.
+///
+/// This coincides with IEEE 754-2019 `minimumNumber`. The result orders -0.0 < 0.0.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fminimum_num(x: f64, y: f64) -> f64 {
+    super::generic::fminimum_num(x, y)
+}
+
+/// Return the lesser of two arguments or, if either argument is NaN, the other argument.
+///
+/// This coincides with IEEE 754-2019 `minimumNumber`. The result orders -0.0 < 0.0.
+#[cfg(f128_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fminimum_numf128(x: f128, y: f128) -> f128 {
+    super::generic::fminimum_num(x, y)
+}
+
+/// Return the greater of two arguments or, if either argument is NaN, the other argument.
+///
+/// This coincides with IEEE 754-2019 `maximumNumber`. The result orders -0.0 < 0.0.
+#[cfg(f16_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fmaximum_numf16(x: f16, y: f16) -> f16 {
+    super::generic::fmaximum_num(x, y)
+}
+
+/// Return the greater of two arguments or, if either argument is NaN, the other argument.
+///
+/// This coincides with IEEE 754-2019 `maximumNumber`. The result orders -0.0 < 0.0.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fmaximum_numf(x: f32, y: f32) -> f32 {
+    super::generic::fmaximum_num(x, y)
+}
+
+/// Return the greater of two arguments or, if either argument is NaN, the other argument.
+///
+/// This coincides with IEEE 754-2019 `maximumNumber`. The result orders -0.0 < 0.0.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fmaximum_num(x: f64, y: f64) -> f64 {
+    super::generic::fmaximum_num(x, y)
+}
+
+/// Return the greater of two arguments or, if either argument is NaN, the other argument.
+///
+/// This coincides with IEEE 754-2019 `maximumNumber`. The result orders -0.0 < 0.0.
+#[cfg(f128_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fmaximum_numf128(x: f128, y: f128) -> f128 {
+    super::generic::fmaximum_num(x, y)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::support::{Float, Hexf};
+
+    fn fminimum_num_spec_test<F: Float>(f: impl Fn(F, F) -> F) {
+        let cases = [
+            (F::ZERO, F::ZERO, F::ZERO),
+            (F::ONE, F::ONE, F::ONE),
+            (F::ZERO, F::ONE, F::ZERO),
+            (F::ONE, F::ZERO, F::ZERO),
+            (F::ZERO, F::NEG_ONE, F::NEG_ONE),
+            (F::NEG_ONE, F::ZERO, F::NEG_ONE),
+            (F::INFINITY, F::ZERO, F::ZERO),
+            (F::NEG_INFINITY, F::ZERO, F::NEG_INFINITY),
+            (F::NAN, F::ZERO, F::ZERO),
+            (F::ZERO, F::NAN, F::ZERO),
+            (F::NAN, F::NAN, F::NAN),
+            (F::ZERO, F::NEG_ZERO, F::NEG_ZERO),
+            (F::NEG_ZERO, F::ZERO, F::NEG_ZERO),
+        ];
+
+        for (x, y, res) in cases {
+            let val = f(x, y);
+            assert_biteq!(val, res, "fminimum_num({}, {})", Hexf(x), Hexf(y));
+        }
+    }
+
+    #[test]
+    #[cfg(f16_enabled)]
+    fn fminimum_num_spec_tests_f16() {
+        fminimum_num_spec_test::<f16>(fminimum_numf16);
+    }
+
+    #[test]
+    fn fminimum_num_spec_tests_f32() {
+        fminimum_num_spec_test::<f32>(fminimum_numf);
+    }
+
+    #[test]
+    fn fminimum_num_spec_tests_f64() {
+        fminimum_num_spec_test::<f64>(fminimum_num);
+    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn fminimum_num_spec_tests_f128() {
+        fminimum_num_spec_test::<f128>(fminimum_numf128);
+    }
+
+    fn fmaximum_num_spec_test<F: Float>(f: impl Fn(F, F) -> F) {
+        let cases = [
+            (F::ZERO, F::ZERO, F::ZERO),
+            (F::ONE, F::ONE, F::ONE),
+            (F::ZERO, F::ONE, F::ONE),
+            (F::ONE, F::ZERO, F::ONE),
+            (F::ZERO, F::NEG_ONE, F::ZERO),
+            (F::NEG_ONE, F::ZERO, F::ZERO),
+            (F::INFINITY, F::ZERO, F::INFINITY),
+            (F::NEG_INFINITY, F::ZERO, F::ZERO),
+            (F::NAN, F::ZERO, F::ZERO),
+            (F::ZERO, F::NAN, F::ZERO),
+            (F::NAN, F::NAN, F::NAN),
+            (F::ZERO, F::NEG_ZERO, F::ZERO),
+            (F::NEG_ZERO, F::ZERO, F::ZERO),
+        ];
+
+        for (x, y, res) in cases {
+            let val = f(x, y);
+            assert_biteq!(val, res, "fmaximum_num({}, {})", Hexf(x), Hexf(y));
+        }
+    }
+
+    #[test]
+    #[cfg(f16_enabled)]
+    fn fmaximum_num_spec_tests_f16() {
+        fmaximum_num_spec_test::<f16>(fmaximum_numf16);
+    }
+
+    #[test]
+    fn fmaximum_num_spec_tests_f32() {
+        fmaximum_num_spec_test::<f32>(fmaximum_numf);
+    }
+
+    #[test]
+    fn fmaximum_num_spec_tests_f64() {
+        fmaximum_num_spec_test::<f64>(fmaximum_num);
+    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn fmaximum_num_spec_tests_f128() {
+        fmaximum_num_spec_test::<f128>(fmaximum_numf128);
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/fmod.rs b/library/compiler-builtins/libm/src/math/fmod.rs
new file mode 100644
index 00000000000..c4752b92578
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/fmod.rs
@@ -0,0 +1,25 @@
+/// Calculate the remainder of `x / y`, the precise result of `x - trunc(x / y) * y`.
+#[cfg(f16_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fmodf16(x: f16, y: f16) -> f16 {
+    super::generic::fmod(x, y)
+}
+
+/// Calculate the remainder of `x / y`, the precise result of `x - trunc(x / y) * y`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fmodf(x: f32, y: f32) -> f32 {
+    super::generic::fmod(x, y)
+}
+
+/// Calculate the remainder of `x / y`, the precise result of `x - trunc(x / y) * y`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fmod(x: f64, y: f64) -> f64 {
+    super::generic::fmod(x, y)
+}
+
+/// Calculate the remainder of `x / y`, the precise result of `x - trunc(x / y) * y`.
+#[cfg(f128_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fmodf128(x: f128, y: f128) -> f128 {
+    super::generic::fmod(x, y)
+}
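Note (illustrative, not part of the patch): because the quotient is truncated toward zero, the result of `fmod` keeps the sign of the dividend `x`. Rust's `%` operator on floats has the same semantics, which makes the behaviour easy to see:

fn main() {
    assert_eq!(5.5f64 % 2.0, 1.5);   // 5.5 - trunc(5.5 / 2.0) * 2.0
    assert_eq!(-5.5f64 % 2.0, -1.5); // sign follows the dividend
    assert_eq!(5.5f64 % -2.0, 1.5);  // ... not the divisor
    assert!((f64::INFINITY % 2.0).is_nan()); // infinite dividend: no remainder
    assert_eq!(2.0f64 % f64::INFINITY, 2.0); // finite x, infinite y: x unchanged
}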
diff --git a/library/compiler-builtins/libm/src/math/fmodf.rs b/library/compiler-builtins/libm/src/math/fmodf.rs
new file mode 100644
index 00000000000..4e95696e20d
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/fmodf.rs
@@ -0,0 +1,5 @@
+/// Calculate the remainder of `x / y`, the precise result of `x - trunc(x / y) * y`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fmodf(x: f32, y: f32) -> f32 {
+    super::generic::fmod(x, y)
+}
diff --git a/library/compiler-builtins/libm/src/math/fmodf128.rs b/library/compiler-builtins/libm/src/math/fmodf128.rs
new file mode 100644
index 00000000000..ff0e0493e26
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/fmodf128.rs
@@ -0,0 +1,5 @@
+/// Calculate the remainder of `x / y`, the precise result of `x - trunc(x / y) * y`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fmodf128(x: f128, y: f128) -> f128 {
+    super::generic::fmod(x, y)
+}
diff --git a/library/compiler-builtins/libm/src/math/fmodf16.rs b/library/compiler-builtins/libm/src/math/fmodf16.rs
new file mode 100644
index 00000000000..11972a7de4f
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/fmodf16.rs
@@ -0,0 +1,5 @@
+/// Calculate the remainder of `x / y`, the precise result of `x - trunc(x / y) * y`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn fmodf16(x: f16, y: f16) -> f16 {
+    super::generic::fmod(x, y)
+}
diff --git a/library/compiler-builtins/libm/src/math/frexp.rs b/library/compiler-builtins/libm/src/math/frexp.rs
new file mode 100644
index 00000000000..de7a64fdae1
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/frexp.rs
@@ -0,0 +1,21 @@
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn frexp(x: f64) -> (f64, i32) {
+    let mut y = x.to_bits();
+    let ee = ((y >> 52) & 0x7ff) as i32;
+
+    if ee == 0 {
+        if x != 0.0 {
+            let x1p64 = f64::from_bits(0x43f0000000000000);
+            let (x, e) = frexp(x * x1p64);
+            return (x, e - 64);
+        }
+        return (x, 0);
+    } else if ee == 0x7ff {
+        return (x, 0);
+    }
+
+    let e = ee - 0x3fe;
+    y &= 0x800fffffffffffff;
+    y |= 0x3fe0000000000000;
+    return (f64::from_bits(y), e);
+}
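Note (illustrative, not part of the patch): `frexp` decomposes a finite non-zero value as x = m * 2^e with |m| in [0.5, 1), which the code achieves by rewriting the exponent field to that of 0.5 (0x3fe). A simplified sketch of the normal-number path and a couple of worked values (hypothetical `frexp_demo`, not an exported API):

fn frexp_demo(x: f64) -> (f64, i32) {
    let bits = x.to_bits();
    let ee = ((bits >> 52) & 0x7ff) as i32;
    if ee == 0 || ee == 0x7ff {
        return (x, 0); // zero/subnormal (not rescaled here) and inf/NaN pass through
    }
    // Keep sign and fraction, force the exponent field to 0x3fe so |m| is in [0.5, 1).
    let m = f64::from_bits((bits & 0x800f_ffff_ffff_ffff) | 0x3fe0_0000_0000_0000);
    (m, ee - 0x3fe)
}

fn main() {
    assert_eq!(frexp_demo(8.0), (0.5, 4));     // 8 = 0.5 * 2^4
    assert_eq!(frexp_demo(-0.75), (-0.75, 0)); // already in the target range
    let (m, e) = frexp_demo(6.02e23);
    assert!((0.5..1.0).contains(&m));
    assert_eq!(m * 2f64.powi(e), 6.02e23); // reconstruction is exact
}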
diff --git a/library/compiler-builtins/libm/src/math/frexpf.rs b/library/compiler-builtins/libm/src/math/frexpf.rs
new file mode 100644
index 00000000000..0ec91c2d350
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/frexpf.rs
@@ -0,0 +1,22 @@
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn frexpf(x: f32) -> (f32, i32) {
+    let mut y = x.to_bits();
+    let ee: i32 = ((y >> 23) & 0xff) as i32;
+
+    if ee == 0 {
+        if x != 0.0 {
+            let x1p64 = f32::from_bits(0x5f800000);
+            let (x, e) = frexpf(x * x1p64);
+            return (x, e - 64);
+        } else {
+            return (x, 0);
+        }
+    } else if ee == 0xff {
+        return (x, 0);
+    }
+
+    let e = ee - 0x7e;
+    y &= 0x807fffff;
+    y |= 0x3f000000;
+    (f32::from_bits(y), e)
+}
diff --git a/library/compiler-builtins/libm/src/math/generic/ceil.rs b/library/compiler-builtins/libm/src/math/generic/ceil.rs
new file mode 100644
index 00000000000..1072ba7c29b
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/generic/ceil.rs
@@ -0,0 +1,174 @@
+/* SPDX-License-Identifier: MIT */
+/* origin: musl src/math/ceilf.c */
+
+//! Generic `ceil` algorithm.
+//!
+//! Note that this uses the algorithm from musl's `ceilf` rather than `ceil` or `ceill` because
+//! performance seems to be better (based on icount) and it does not seem to experience rounding
+//! errors on i386.
+
+use crate::support::{Float, FpResult, Int, IntTy, MinInt, Status};
+
+#[inline]
+pub fn ceil<F: Float>(x: F) -> F {
+    ceil_status(x).val
+}
+
+#[inline]
+pub fn ceil_status<F: Float>(x: F) -> FpResult<F> {
+    let zero = IntTy::<F>::ZERO;
+
+    let mut ix = x.to_bits();
+    let e = x.exp_unbiased();
+
+    // If the represented value has no fractional part, no truncation is needed.
+    if e >= F::SIG_BITS as i32 {
+        return FpResult::ok(x);
+    }
+
+    let status;
+    let res = if e >= 0 {
+        // |x| >= 1.0
+        let m = F::SIG_MASK >> e.unsigned();
+        if (ix & m) == zero {
+            // Portion to be masked is already zero; no adjustment needed.
+            return FpResult::ok(x);
+        }
+
+        // Otherwise, raise an inexact exception.
+        status = Status::INEXACT;
+
+        if x.is_sign_positive() {
+            ix += m;
+        }
+
+        ix &= !m;
+        F::from_bits(ix)
+    } else {
+        // |x| < 1.0, raise an inexact exception since truncation will happen (unless x == 0).
+        if ix & F::SIG_MASK == F::Int::ZERO {
+            status = Status::OK;
+        } else {
+            status = Status::INEXACT;
+        }
+
+        if x.is_sign_negative() {
+            // -1.0 < x <= -0.0; rounding up goes toward -0.0.
+            F::NEG_ZERO
+        } else if ix << 1 != zero {
+            // 0.0 < x < 1.0; rounding up goes toward +1.0.
+            F::ONE
+        } else {
+            // +0.0 remains unchanged
+            x
+        }
+    };
+
+    FpResult::new(res, status)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::support::Hexf;
+
+    /// Test against https://en.cppreference.com/w/cpp/numeric/math/ceil
+    fn spec_test<F: Float>(cases: &[(F, F, Status)]) {
+        let roundtrip = [
+            F::ZERO,
+            F::ONE,
+            F::NEG_ONE,
+            F::NEG_ZERO,
+            F::INFINITY,
+            F::NEG_INFINITY,
+        ];
+
+        for x in roundtrip {
+            let FpResult { val, status } = ceil_status(x);
+            assert_biteq!(val, x, "{}", Hexf(x));
+            assert_eq!(status, Status::OK, "{}", Hexf(x));
+        }
+
+        for &(x, res, res_stat) in cases {
+            let FpResult { val, status } = ceil_status(x);
+            assert_biteq!(val, res, "{}", Hexf(x));
+            assert_eq!(status, res_stat, "{}", Hexf(x));
+        }
+    }
+
+    /* Skipping f16 / f128 "sanity_check"s due to rejected literal lexing at MSRV */
+
+    #[test]
+    #[cfg(f16_enabled)]
+    fn spec_tests_f16() {
+        let cases = [
+            (0.1, 1.0, Status::INEXACT),
+            (-0.1, -0.0, Status::INEXACT),
+            (0.9, 1.0, Status::INEXACT),
+            (-0.9, -0.0, Status::INEXACT),
+            (1.1, 2.0, Status::INEXACT),
+            (-1.1, -1.0, Status::INEXACT),
+            (1.9, 2.0, Status::INEXACT),
+            (-1.9, -1.0, Status::INEXACT),
+        ];
+        spec_test::<f16>(&cases);
+    }
+
+    #[test]
+    fn sanity_check_f32() {
+        assert_eq!(ceil(1.1f32), 2.0);
+        assert_eq!(ceil(2.9f32), 3.0);
+    }
+
+    #[test]
+    fn spec_tests_f32() {
+        let cases = [
+            (0.1, 1.0, Status::INEXACT),
+            (-0.1, -0.0, Status::INEXACT),
+            (0.9, 1.0, Status::INEXACT),
+            (-0.9, -0.0, Status::INEXACT),
+            (1.1, 2.0, Status::INEXACT),
+            (-1.1, -1.0, Status::INEXACT),
+            (1.9, 2.0, Status::INEXACT),
+            (-1.9, -1.0, Status::INEXACT),
+        ];
+        spec_test::<f32>(&cases);
+    }
+
+    #[test]
+    fn sanity_check_f64() {
+        assert_eq!(ceil(1.1f64), 2.0);
+        assert_eq!(ceil(2.9f64), 3.0);
+    }
+
+    #[test]
+    fn spec_tests_f64() {
+        let cases = [
+            (0.1, 1.0, Status::INEXACT),
+            (-0.1, -0.0, Status::INEXACT),
+            (0.9, 1.0, Status::INEXACT),
+            (-0.9, -0.0, Status::INEXACT),
+            (1.1, 2.0, Status::INEXACT),
+            (-1.1, -1.0, Status::INEXACT),
+            (1.9, 2.0, Status::INEXACT),
+            (-1.9, -1.0, Status::INEXACT),
+        ];
+        spec_test::<f64>(&cases);
+    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn spec_tests_f128() {
+        let cases = [
+            (0.1, 1.0, Status::INEXACT),
+            (-0.1, -0.0, Status::INEXACT),
+            (0.9, 1.0, Status::INEXACT),
+            (-0.9, -0.0, Status::INEXACT),
+            (1.1, 2.0, Status::INEXACT),
+            (-1.1, -1.0, Status::INEXACT),
+            (1.9, 2.0, Status::INEXACT),
+            (-1.9, -1.0, Status::INEXACT),
+        ];
+        spec_test::<f128>(&cases);
+    }
+}
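Note (illustrative, not part of the patch): the mask `m = SIG_MASK >> e` above covers exactly the fraction bits that represent values below 1 at unbiased exponent `e`; for positive inputs, adding `m` before clearing those bits carries into the integer part, which is the rounding-up step. The same walk-through on a concrete f32:

fn main() {
    let x = 2.5f32; // exponent field 128 (e = 1), fraction 0x20_0000
    let bits = x.to_bits();
    let e = ((bits >> 23) & 0xff) as i32 - 127;
    let m = 0x007f_ffffu32 >> e; // fraction bits that would be truncated
    assert_ne!(bits & m, 0);     // 2.5 has a fractional part, so ceil must adjust
    let up = (bits + m) & !m;    // add-then-mask rounds the magnitude up
    assert_eq!(f32::from_bits(up), 3.0);
}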
diff --git a/library/compiler-builtins/libm/src/math/generic/copysign.rs b/library/compiler-builtins/libm/src/math/generic/copysign.rs
new file mode 100644
index 00000000000..da9ce387885
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/generic/copysign.rs
@@ -0,0 +1,11 @@
+use crate::support::Float;
+
+/// Copy the sign of `y` to `x`.
+#[inline]
+pub fn copysign<F: Float>(x: F, y: F) -> F {
+    let mut ux = x.to_bits();
+    let uy = y.to_bits();
+    ux &= !F::SIGN_MASK;
+    ux |= uy & F::SIGN_MASK;
+    F::from_bits(ux)
+}
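Note (illustrative, not part of the patch): copying a sign is one AND and one OR on the bit patterns, so it also applies to zeros and NaNs. Spelled out for f64:

fn copysign_bits(x: f64, y: f64) -> f64 {
    const SIGN: u64 = 1u64 << 63;
    f64::from_bits((x.to_bits() & !SIGN) | (y.to_bits() & SIGN))
}

fn main() {
    assert_eq!(copysign_bits(3.0, -1.0), -3.0);
    assert_eq!(copysign_bits(-3.0, 1.0), 3.0);
    assert_eq!(copysign_bits(0.0, -2.5).to_bits(), (-0.0f64).to_bits()); // sign of zero too
    assert!(copysign_bits(f64::NAN, -1.0).is_sign_negative());           // and of NaN
}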
diff --git a/library/compiler-builtins/libm/src/math/generic/fabs.rs b/library/compiler-builtins/libm/src/math/generic/fabs.rs
new file mode 100644
index 00000000000..0adfa57d91b
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/generic/fabs.rs
@@ -0,0 +1,8 @@
+use crate::support::Float;
+
+/// Absolute value.
+#[inline]
+pub fn fabs<F: Float>(x: F) -> F {
+    let abs_mask = !F::SIGN_MASK;
+    F::from_bits(x.to_bits() & abs_mask)
+}
diff --git a/library/compiler-builtins/libm/src/math/generic/fdim.rs b/library/compiler-builtins/libm/src/math/generic/fdim.rs
new file mode 100644
index 00000000000..289e5fd96f8
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/generic/fdim.rs
@@ -0,0 +1,6 @@
+use crate::support::Float;
+
+#[inline]
+pub fn fdim<F: Float>(x: F, y: F) -> F {
+    if x <= y { F::ZERO } else { x - y }
+}
diff --git a/library/compiler-builtins/libm/src/math/generic/floor.rs b/library/compiler-builtins/libm/src/math/generic/floor.rs
new file mode 100644
index 00000000000..e6dfd8866a4
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/generic/floor.rs
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: MIT
+ * origin: musl src/math/floor.c */
+
+//! Generic `floor` algorithm.
+//!
+//! Note that this uses the algorithm from musl's `floorf` rather than `floor` or `floorl` because
+//! performance seems to be better (based on icount) and it does not seem to experience rounding
+//! errors on i386.
+
+use crate::support::{Float, FpResult, Int, IntTy, MinInt, Status};
+
+#[inline]
+pub fn floor<F: Float>(x: F) -> F {
+    floor_status(x).val
+}
+
+#[inline]
+pub fn floor_status<F: Float>(x: F) -> FpResult<F> {
+    let zero = IntTy::<F>::ZERO;
+
+    let mut ix = x.to_bits();
+    let e = x.exp_unbiased();
+
+    // If the represented value has no fractional part, no truncation is needed.
+    if e >= F::SIG_BITS as i32 {
+        return FpResult::ok(x);
+    }
+
+    let status;
+    let res = if e >= 0 {
+        // |x| >= 1.0
+        let m = F::SIG_MASK >> e.unsigned();
+        if ix & m == zero {
+            // Portion to be masked is already zero; no adjustment needed.
+            return FpResult::ok(x);
+        }
+
+        // Otherwise, raise an inexact exception.
+        status = Status::INEXACT;
+
+        if x.is_sign_negative() {
+            ix += m;
+        }
+
+        ix &= !m;
+        F::from_bits(ix)
+    } else {
+        // |x| < 1.0; raise an inexact exception if any significand bits are set.
+        if ix & F::SIG_MASK == F::Int::ZERO {
+            status = Status::OK;
+        } else {
+            status = Status::INEXACT;
+        }
+
+        if x.is_sign_positive() {
+            // 0.0 <= x < 1.0; rounding down goes toward +0.0.
+            F::ZERO
+        } else if ix << 1 != zero {
+            // -1.0 < x < 0.0; rounding down goes toward -1.0.
+            F::NEG_ONE
+        } else {
+            // -0.0 remains unchanged
+            x
+        }
+    };
+
+    FpResult::new(res, status)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::support::Hexf;
+
+    /// Test against https://en.cppreference.com/w/cpp/numeric/math/floor
+    fn spec_test<F: Float>(cases: &[(F, F, Status)]) {
+        let roundtrip = [
+            F::ZERO,
+            F::ONE,
+            F::NEG_ONE,
+            F::NEG_ZERO,
+            F::INFINITY,
+            F::NEG_INFINITY,
+        ];
+
+        for x in roundtrip {
+            let FpResult { val, status } = floor_status(x);
+            assert_biteq!(val, x, "{}", Hexf(x));
+            assert_eq!(status, Status::OK, "{}", Hexf(x));
+        }
+
+        for &(x, res, res_stat) in cases {
+            let FpResult { val, status } = floor_status(x);
+            assert_biteq!(val, res, "{}", Hexf(x));
+            assert_eq!(status, res_stat, "{}", Hexf(x));
+        }
+    }
+
+    /* Skipping f16 / f128 "sanity_check"s and spec cases due to rejected literal lexing at MSRV */
+
+    #[test]
+    #[cfg(f16_enabled)]
+    fn spec_tests_f16() {
+        let cases = [];
+        spec_test::<f16>(&cases);
+    }
+
+    #[test]
+    fn sanity_check_f32() {
+        assert_eq!(floor(0.5f32), 0.0);
+        assert_eq!(floor(1.1f32), 1.0);
+        assert_eq!(floor(2.9f32), 2.0);
+    }
+
+    #[test]
+    fn spec_tests_f32() {
+        let cases = [
+            (0.1, 0.0, Status::INEXACT),
+            (-0.1, -1.0, Status::INEXACT),
+            (0.9, 0.0, Status::INEXACT),
+            (-0.9, -1.0, Status::INEXACT),
+            (1.1, 1.0, Status::INEXACT),
+            (-1.1, -2.0, Status::INEXACT),
+            (1.9, 1.0, Status::INEXACT),
+            (-1.9, -2.0, Status::INEXACT),
+        ];
+        spec_test::<f32>(&cases);
+    }
+
+    #[test]
+    fn sanity_check_f64() {
+        assert_eq!(floor(1.1f64), 1.0);
+        assert_eq!(floor(2.9f64), 2.0);
+    }
+
+    #[test]
+    fn spec_tests_f64() {
+        let cases = [
+            (0.1, 0.0, Status::INEXACT),
+            (-0.1, -1.0, Status::INEXACT),
+            (0.9, 0.0, Status::INEXACT),
+            (-0.9, -1.0, Status::INEXACT),
+            (1.1, 1.0, Status::INEXACT),
+            (-1.1, -2.0, Status::INEXACT),
+            (1.9, 1.0, Status::INEXACT),
+            (-1.9, -2.0, Status::INEXACT),
+        ];
+        spec_test::<f64>(&cases);
+    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn spec_tests_f128() {
+        let cases = [];
+        spec_test::<f128>(&cases);
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/generic/fma.rs b/library/compiler-builtins/libm/src/math/generic/fma.rs
new file mode 100644
index 00000000000..aaf459d1b61
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/generic/fma.rs
@@ -0,0 +1,278 @@
+/* SPDX-License-Identifier: MIT */
+/* origin: musl src/math/fma.c. Ported to generic Rust algorithm in 2025, TG. */
+
+use crate::support::{
+    CastFrom, CastInto, DInt, Float, FpResult, HInt, Int, IntTy, MinInt, Round, Status,
+};
+
+/// Fused multiply-add that works when there is not a larger float size available. Computes
+/// `(x * y) + z`.
+#[inline]
+pub fn fma_round<F>(x: F, y: F, z: F, _round: Round) -> FpResult<F>
+where
+    F: Float,
+    F: CastFrom<F::SignedInt>,
+    F: CastFrom<i8>,
+    F::Int: HInt,
+    u32: CastInto<F::Int>,
+{
+    let one = IntTy::<F>::ONE;
+    let zero = IntTy::<F>::ZERO;
+
+    // Normalize such that the top of the mantissa is zero and we have a guard bit.
+    let nx = Norm::from_float(x);
+    let ny = Norm::from_float(y);
+    let nz = Norm::from_float(z);
+
+    if nx.is_zero_nan_inf() || ny.is_zero_nan_inf() {
+        // `x` or `y` is zero, NaN, or infinite; the non-fused expression gives the correct result.
+        return FpResult::ok(x * y + z);
+    }
+
+    if nz.is_zero_nan_inf() {
+        if nz.is_zero() {
+            // Empty add component means we only need to multiply.
+            return FpResult::ok(x * y);
+        }
+        // `z` is NaN or infinity, which sets the result.
+        return FpResult::ok(z);
+    }
+
+    // multiply: r = x * y
+    let zhi: F::Int;
+    let zlo: F::Int;
+    let (mut rlo, mut rhi) = nx.m.widen_mul(ny.m).lo_hi();
+
+    // Exponent result of multiplication
+    let mut e: i32 = nx.e + ny.e;
+    // Needed shift to align `z` to the multiplication result
+    let mut d: i32 = nz.e - e;
+    let sbits = F::BITS as i32;
+
+    // Scale `z`. Shift `z <<= kz`, `r >>= kr`, so `kz+kr == d`, set `e = e+kr` (== ez-kz)
+    if d > 0 {
+        // The magnitude of `z` is larger than `x * y`
+        if d < sbits {
+            // Maximum shift of one `F::BITS` means shifted `z` will fit into `2 * F::BITS`. Shift
+            // it into `(zhi, zlo)`. No exponent adjustment necessary.
+            zlo = nz.m << d;
+            zhi = nz.m >> (sbits - d);
+        } else {
+            // Shift larger than `sbits`, `z` only needs the top half `zhi`. Place it there (acts
+            // as a shift by `sbits`).
+            zlo = zero;
+            zhi = nz.m;
+            d -= sbits;
+
+            // `z`'s exponent is large enough that it now needs to be taken into account.
+            e = nz.e - sbits;
+
+            if d == 0 {
+                // Exactly `sbits`, nothing to do
+            } else if d < sbits {
+                // Remaining shift fits within `sbits`. Leave `z` in place, shift `x * y`
+                rlo = (rhi << (sbits - d)) | (rlo >> d);
+                // Set the sticky bit
+                rlo |= IntTy::<F>::from((rlo << (sbits - d)) != zero);
+                rhi = rhi >> d;
+            } else {
+                // `z`'s magnitude is enough that `x * y` is irrelevant. It was nonzero, so set
+                // the sticky bit.
+                rlo = one;
+                rhi = zero;
+            }
+        }
+    } else {
+        // `z`'s magnitude once shifted fits entirely within `zlo`
+        zhi = zero;
+        d = -d;
+        if d == 0 {
+            // No shift needed
+            zlo = nz.m;
+        } else if d < sbits {
+            // Shift s.t. `nz.m` fits into `zlo`
+            let sticky = IntTy::<F>::from((nz.m << (sbits - d)) != zero);
+            zlo = (nz.m >> d) | sticky;
+        } else {
+            // Would be entirely shifted out, only set the sticky bit
+            zlo = one;
+        }
+    }
+
+    /* addition */
+
+    let mut neg = nx.neg ^ ny.neg;
+    let samesign: bool = !neg ^ nz.neg;
+    let mut rhi_nonzero = true;
+
+    if samesign {
+        // r += z
+        rlo = rlo.wrapping_add(zlo);
+        rhi += zhi + IntTy::<F>::from(rlo < zlo);
+    } else {
+        // r -= z
+        let (res, borrow) = rlo.overflowing_sub(zlo);
+        rlo = res;
+        rhi = rhi.wrapping_sub(zhi.wrapping_add(IntTy::<F>::from(borrow)));
+        if (rhi >> (F::BITS - 1)) != zero {
+            rlo = rlo.signed().wrapping_neg().unsigned();
+            rhi = rhi.signed().wrapping_neg().unsigned() - IntTy::<F>::from(rlo != zero);
+            neg = !neg;
+        }
+        rhi_nonzero = rhi != zero;
+    }
+
+    /* Construct result */
+
+    // Shift result into `rhi`, left-aligned. Last bit is sticky
+    if rhi_nonzero {
+        // `rhi` is nonzero, so `d` below is > 0; shift both `rhi` and `rlo` into the result
+        e += sbits;
+        d = rhi.leading_zeros() as i32 - 1;
+        rhi = (rhi << d) | (rlo >> (sbits - d));
+        // Update sticky
+        rhi |= IntTy::<F>::from((rlo << d) != zero);
+    } else if rlo != zero {
+        // `rhi` is zero, `rlo` is the entire result and needs to be shifted
+        d = rlo.leading_zeros() as i32 - 1;
+        if d < 0 {
+            // Shift and set sticky
+            rhi = (rlo >> 1) | (rlo & one);
+        } else {
+            rhi = rlo << d;
+        }
+    } else {
+        // exact +/- 0.0
+        return FpResult::ok(x * y + z);
+    }
+
+    e -= d;
+
+    // Use int->float conversion to populate the significand.
+    // i is in [1 << (BITS - 2), (1 << (BITS - 1)) - 1]
+    let mut i: F::SignedInt = rhi.signed();
+
+    if neg {
+        i = -i;
+    }
+
+    // `|r|` is in `[0x1p62,0x1p63]` for `f64`
+    let mut r: F = F::cast_from_lossy(i);
+
+    /* Account for subnormal and rounding */
+
+    // Unbiased exponent for the maximum value of `r`
+    let max_pow = F::BITS - 1 + F::EXP_BIAS;
+
+    let mut status = Status::OK;
+
+    if e < -(max_pow as i32 - 2) {
+        // Result is subnormal before rounding
+        if e == -(max_pow as i32 - 1) {
+            let mut c = F::from_parts(false, max_pow, zero);
+            if neg {
+                c = -c;
+            }
+
+            if r == c {
+                // Min normal after rounding.
+                status.set_underflow(true);
+                r = F::MIN_POSITIVE_NORMAL.copysign(r);
+                return FpResult::new(r, status);
+            }
+
+            if (rhi << (F::SIG_BITS + 1)) != zero {
+                // Account for truncated bits. One bit will be lost in the `scalbn` call, add
+                // another top bit to avoid double rounding if inexact.
+                let iu: F::Int = (rhi >> 1) | (rhi & one) | (one << (F::BITS - 2));
+                i = iu.signed();
+
+                if neg {
+                    i = -i;
+                }
+
+                r = F::cast_from_lossy(i);
+
+                // Remove the top bit
+                r = F::cast_from(2i8) * r - c;
+                status.set_underflow(true);
+            }
+        } else {
+            // Only round once when scaled
+            d = F::EXP_BITS as i32 - 1;
+            let sticky = IntTy::<F>::from(rhi << (F::BITS as i32 - d) != zero);
+            i = (((rhi >> d) | sticky) << d).signed();
+
+            if neg {
+                i = -i;
+            }
+
+            r = F::cast_from_lossy(i);
+        }
+    }
+
+    // Use our exponent to scale the final value.
+    FpResult::new(super::scalbn(r, e), status)
+}
+
+/// Representation of `F` that has handled subnormals.
+#[derive(Clone, Copy, Debug)]
+struct Norm<F: Float> {
+    /// Normalized significand with one guard bit, unsigned.
+    m: F::Int,
+    /// Exponent of the mantissa such that `m * 2^e = x`. Accounts for the shift in the mantissa
+    /// and the guard bit; that is, 1.0 will normalize as `m = 1 << 53` and `e = -53`.
+    e: i32,
+    neg: bool,
+}
+
+impl<F: Float> Norm<F> {
+    /// Unbias the exponent and account for the mantissa's precision, including the guard bit.
+    const EXP_UNBIAS: u32 = F::EXP_BIAS + F::SIG_BITS + 1;
+
+    /// Values greater than this had a saturated exponent (infinity or NaN), OR were zero and we
+    /// adjusted the exponent such that it exceeds this threshold.
+    const ZERO_INF_NAN: u32 = F::EXP_SAT - Self::EXP_UNBIAS;
+
+    fn from_float(x: F) -> Self {
+        let mut ix = x.to_bits();
+        let mut e = x.ex() as i32;
+        let neg = x.is_sign_negative();
+        if e == 0 {
+            // Normalize subnormals by multiplication
+            let scale_i = F::BITS - 1;
+            let scale_f = F::from_parts(false, scale_i + F::EXP_BIAS, F::Int::ZERO);
+            let scaled = x * scale_f;
+            ix = scaled.to_bits();
+            e = scaled.ex() as i32;
+            e = if e == 0 {
+                // If the exponent is still zero, the input was zero. Artificially set this value
+                // such that the final `e` will exceed `ZERO_INF_NAN`.
+                1 << F::EXP_BITS
+            } else {
+                // Otherwise, account for the scaling we just did.
+                e - scale_i as i32
+            };
+        }
+
+        e -= Self::EXP_UNBIAS as i32;
+
+        // Absolute value, set the implicit bit, and shift to create a guard bit
+        ix &= F::SIG_MASK;
+        ix |= F::IMPLICIT_BIT;
+        ix <<= 1;
+
+        Self { m: ix, e, neg }
+    }
+
+    /// True if the value was zero, infinity, or NaN.
+    fn is_zero_nan_inf(self) -> bool {
+        self.e >= Self::ZERO_INF_NAN as i32
+    }
+
+    /// True if the value was zero.
+    fn is_zero(self) -> bool {
+        // The only exponent that strictly exceeds this value is our sentinel value for zero.
+        self.e > Self::ZERO_INF_NAN as i32
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/generic/fma_wide.rs b/library/compiler-builtins/libm/src/math/generic/fma_wide.rs
new file mode 100644
index 00000000000..a2ef59d3e3d
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/generic/fma_wide.rs
@@ -0,0 +1,73 @@
+use crate::support::{
+    CastFrom, CastInto, DFloat, Float, FpResult, HFloat, IntTy, MinInt, Round, Status,
+};
+
+/// Fma implementation when a hardware-backed larger float type is available. For example, `f64`
+/// has enough precision to represent an `f32` product exactly, so an `f32` fma can be computed
+/// via `f64`, leaving double rounding as the only concern.
+#[inline]
+pub fn fma_wide_round<F, B>(x: F, y: F, z: F, round: Round) -> FpResult<F>
+where
+    F: Float + HFloat<D = B>,
+    B: Float + DFloat<H = F>,
+    B::Int: CastInto<i32>,
+    i32: CastFrom<i32>,
+{
+    let one = IntTy::<B>::ONE;
+
+    let xy: B = x.widen() * y.widen();
+    let mut result: B = xy + z.widen();
+    let mut ui: B::Int = result.to_bits();
+    let re = result.ex();
+    let zb: B = z.widen();
+
+    let prec_diff = B::SIG_BITS - F::SIG_BITS;
+    let excess_prec = ui & ((one << prec_diff) - one);
+    let halfway = one << (prec_diff - 1);
+
+    // Common case: the larger precision is fine if...
+    // This is not a halfway case
+    if excess_prec != halfway
+        // Or the result is NaN
+        || re == B::EXP_SAT
+        // Or the result is exact
+        || (result - xy == zb && result - zb == xy)
+        // Or the mode is something other than round to nearest
+        || round != Round::Nearest
+    {
+        let min_inexact_exp = (B::EXP_BIAS as i32 + F::EXP_MIN_SUBNORM) as u32;
+        let max_inexact_exp = (B::EXP_BIAS as i32 + F::EXP_MIN) as u32;
+
+        let mut status = Status::OK;
+
+        if (min_inexact_exp..max_inexact_exp).contains(&re) && status.inexact() {
+            // This branch is never hit; requires previous operations to set a status
+            status.set_inexact(false);
+
+            result = xy + z.widen();
+            if status.inexact() {
+                status.set_underflow(true);
+            } else {
+                status.set_inexact(true);
+            }
+        }
+
+        return FpResult {
+            val: result.narrow(),
+            status,
+        };
+    }
+
+    let neg = ui >> (B::BITS - 1) != IntTy::<B>::ZERO;
+    let err = if neg == (zb > xy) {
+        xy - result + zb
+    } else {
+        zb - result + xy
+    };
+    if neg == (err < B::ZERO) {
+        ui += one;
+    } else {
+        ui -= one;
+    }
+
+    FpResult::ok(B::from_bits(ui).narrow())
+}
diff --git a/library/compiler-builtins/libm/src/math/generic/fmax.rs b/library/compiler-builtins/libm/src/math/generic/fmax.rs
new file mode 100644
index 00000000000..54207e4b328
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/generic/fmax.rs
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: MIT OR Apache-2.0 */
+//! IEEE 754-2008 `maxNum`. This has been superseded by IEEE 754-2019 `maximumNumber`.
+//!
+//! Per the spec, returns the canonicalized result of:
+//! - `x` if `x > y`
+//! - `y` if `y > x`
+//! - The other number if one is NaN
+//! - Otherwise, either `x` or `y`, canonicalized
+//! - -0.0 and +0.0 may be disregarded (unlike newer operations)
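+//!
+//! For example, `fmax(1.0, f64::NAN)` is `1.0`, while `fmax(-0.0, 0.0)` may return either zero.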
+//!
+//! Excluded from our implementation is sNaN handling.
+//!
+//! More on the differences: [link].
+//!
+//! [link]: https://grouper.ieee.org/groups/msc/ANSI_IEEE-Std-754-2019/background/minNum_maxNum_Removal_Demotion_v3.pdf
+
+use crate::support::Float;
+
+#[inline]
+pub fn fmax<F: Float>(x: F, y: F) -> F {
+    let res = if x.is_nan() || x < y { y } else { x };
+    // Canonicalize
+    res * F::ONE
+}
diff --git a/library/compiler-builtins/libm/src/math/generic/fmaximum.rs b/library/compiler-builtins/libm/src/math/generic/fmaximum.rs
new file mode 100644
index 00000000000..4b6295bc0c6
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/generic/fmaximum.rs
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: MIT OR Apache-2.0 */
+//! IEEE 754-2019 `maximum`.
+//!
+//! Per the spec, returns the canonicalized result of:
+//! - `x` if `x > y`
+//! - `y` if `y > x`
+//! - qNaN if either operand is NaN
+//! - Logic following +0.0 > -0.0
+//!
+//! Excluded from our implementation is sNaN handling.
+
+use crate::support::Float;
+
+#[inline]
+pub fn fmaximum<F: Float>(x: F, y: F) -> F {
+    let res = if x.is_nan() {
+        x
+    } else if y.is_nan() {
+        y
+    } else if x > y || (y.to_bits() == F::NEG_ZERO.to_bits() && x.is_sign_positive()) {
+        x
+    } else {
+        y
+    };
+
+    // Canonicalize
+    res * F::ONE
+}
diff --git a/library/compiler-builtins/libm/src/math/generic/fmaximum_num.rs b/library/compiler-builtins/libm/src/math/generic/fmaximum_num.rs
new file mode 100644
index 00000000000..2e97ff6d369
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/generic/fmaximum_num.rs
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: MIT OR Apache-2.0 */
+//! IEEE 754-2019 `maximumNumber`.
+//!
+//! Per the spec, returns:
+//! - `x` if `x > y`
+//! - `y` if `y > x`
+//! - Non-NaN if one operand is NaN
+//! - Logic following +0.0 > -0.0
+//! - Either `x` or `y` if `x == y` and the signs are the same
+//! - qNaN if both operands are NaN
+//!
+//! Excluded from our implementation is sNaN handling.
+
+use crate::support::Float;
+
+#[inline]
+pub fn fmaximum_num<F: Float>(x: F, y: F) -> F {
+    let res =
+        if x.is_nan() || x < y || (x.to_bits() == F::NEG_ZERO.to_bits() && y.is_sign_positive()) {
+            y
+        } else {
+            x
+        };
+
+    // Canonicalize
+    res * F::ONE
+}
diff --git a/library/compiler-builtins/libm/src/math/generic/fmin.rs b/library/compiler-builtins/libm/src/math/generic/fmin.rs
new file mode 100644
index 00000000000..0f86364d230
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/generic/fmin.rs
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: MIT OR Apache-2.0 */
+//! IEEE 754-2008 `minNum`. This has been superseded by IEEE 754-2019 `minimumNumber`.
+//!
+//! Per the spec, returns the canonicalized result of:
+//! - `x` if `x < y`
+//! - `y` if `y < x`
+//! - The other number if one is NaN
+//! - Otherwise, either `x` or `y`, canonicalized
+//! - -0.0 and +0.0 may be disregarded (unlike newer operations)
+//!
+//! Excluded from our implementation is sNaN handling.
+//!
+//! More on the differences: [link].
+//!
+//! [link]: https://grouper.ieee.org/groups/msc/ANSI_IEEE-Std-754-2019/background/minNum_maxNum_Removal_Demotion_v3.pdf
+
+use crate::support::Float;
+
+#[inline]
+pub fn fmin<F: Float>(x: F, y: F) -> F {
+    let res = if y.is_nan() || x < y { x } else { y };
+    // Canonicalize
+    res * F::ONE
+}
diff --git a/library/compiler-builtins/libm/src/math/generic/fminimum.rs b/library/compiler-builtins/libm/src/math/generic/fminimum.rs
new file mode 100644
index 00000000000..9dc0b64be3f
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/generic/fminimum.rs
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: MIT OR Apache-2.0 */
+//! IEEE 754-2019 `minimum`.
+//!
+//! Per the spec, returns the canonicalized result of:
+//! - `x` if `x < y`
+//! - `y` if `y < x`
+//! - qNaN if either operand is NaN
+//! - Logic following +0.0 > -0.0
+//!
+//! Excluded from our implementation is sNaN handling.
+
+use crate::support::Float;
+
+#[inline]
+pub fn fminimum<F: Float>(x: F, y: F) -> F {
+    let res = if x.is_nan() {
+        x
+    } else if y.is_nan() {
+        y
+    } else if x < y || (x.to_bits() == F::NEG_ZERO.to_bits() && y.is_sign_positive()) {
+        x
+    } else {
+        y
+    };
+
+    // Canonicalize
+    res * F::ONE
+}
diff --git a/library/compiler-builtins/libm/src/math/generic/fminimum_num.rs b/library/compiler-builtins/libm/src/math/generic/fminimum_num.rs
new file mode 100644
index 00000000000..40db8b18957
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/generic/fminimum_num.rs
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: MIT OR Apache-2.0 */
+//! IEEE 754-2019 `minimumNumber`.
+//!
+//! Per the spec, returns:
+//! - `x` if `x < y`
+//! - `y` if `y < x`
+//! - Non-NaN if one operand is NaN
+//! - Logic following +0.0 > -0.0
+//! - Either `x` or `y` if `x == y` and the signs are the same
+//! - qNaN if both operands are NaN
+//!
+//! Excluded from our implementation is sNaN handling.
+
+use crate::support::Float;
+
+#[inline]
+pub fn fminimum_num<F: Float>(x: F, y: F) -> F {
+    let res =
+        if y.is_nan() || x < y || (x.to_bits() == F::NEG_ZERO.to_bits() && y.is_sign_positive()) {
+            x
+        } else {
+            y
+        };
+
+    // Canonicalize
+    res * F::ONE
+}
diff --git a/library/compiler-builtins/libm/src/math/generic/fmod.rs b/library/compiler-builtins/libm/src/math/generic/fmod.rs
new file mode 100644
index 00000000000..29acc8a4d5d
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/generic/fmod.rs
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: MIT OR Apache-2.0 */
+use crate::support::{CastFrom, Float, Int, MinInt};
+
+#[inline]
+pub fn fmod<F: Float>(x: F, y: F) -> F {
+    let _1 = F::Int::ONE;
+    let sx = x.to_bits() & F::SIGN_MASK;
+    let ux = x.to_bits() & !F::SIGN_MASK;
+    let uy = y.to_bits() & !F::SIGN_MASK;
+
+    // Cases that return NaN:
+    //   NaN % _
+    //   Inf % _
+    //     _ % NaN
+    //     _ % 0
+    let x_nan_or_inf = ux & F::EXP_MASK == F::EXP_MASK;
+    let y_nan_or_zero = uy.wrapping_sub(_1) & F::EXP_MASK == F::EXP_MASK;
+    if x_nan_or_inf | y_nan_or_zero {
+        return (x * y) / (x * y);
+    }
+
+    if ux < uy {
+        // |x| < |y|
+        return x;
+    }
+
+    let (num, ex) = into_sig_exp::<F>(ux);
+    let (div, ey) = into_sig_exp::<F>(uy);
+
+    // To compute `(num << ex) % (div << ey)`, first
+    // evaluate `rem = (num << (ex - ey)) % div` ...
+    let rem = reduction(num, ex - ey, div);
+    // ... so the result will be `rem << ey`
+
+    if rem.is_zero() {
+        // Return zero with the sign of `x`
+        return F::from_bits(sx);
+    };
+
+    // We would shift `rem` up by `ey`, but have to stop at `F::SIG_BITS`
+    let shift = ey.min(F::SIG_BITS - rem.ilog2());
+    // Anything past that is added to the exponent field
+    let bits = (rem << shift) + (F::Int::cast_from(ey - shift) << F::SIG_BITS);
+    F::from_bits(sx + bits)
+}
+
+/// Given the bits of a finite float, return a tuple of
+///  - the mantissa with the implicit bit (0 if subnormal, 1 otherwise)
+///  - the additional exponent past 1 (0 for subnormal, 0 or more otherwise)
+fn into_sig_exp<F: Float>(mut bits: F::Int) -> (F::Int, u32) {
+    bits &= !F::SIGN_MASK;
+    // Subtract 1 from the exponent, clamping at 0
+    let sat = bits.checked_sub(F::IMPLICIT_BIT).unwrap_or(F::Int::ZERO);
+    (
+        bits - (sat & F::EXP_MASK),
+        u32::cast_from(sat >> F::SIG_BITS),
+    )
+}
+
+/// Compute the remainder `(x * 2.pow(e)) % y` without overflow.
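+///
+/// After the initial `x %= y` the invariant `x < y` holds, so each `x <<= 1` stays below `2 * y`
+/// and a single conditional subtraction restores `x < y`. Since `y` comes from a float
+/// significand it occupies far fewer bits than `I`, so the shift cannot overflow.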
+fn reduction<I: Int>(mut x: I, e: u32, y: I) -> I {
+    x %= y;
+    for _ in 0..e {
+        x <<= 1;
+        x = x.checked_sub(y).unwrap_or(x);
+    }
+    x
+}
diff --git a/library/compiler-builtins/libm/src/math/generic/mod.rs b/library/compiler-builtins/libm/src/math/generic/mod.rs
new file mode 100644
index 00000000000..9d497a03f54
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/generic/mod.rs
@@ -0,0 +1,42 @@
+// Note: generic functions are marked `#[inline]` because, even though generic functions are
+// typically inlined, this does not seem to always be the case.
+
+mod ceil;
+mod copysign;
+mod fabs;
+mod fdim;
+mod floor;
+mod fma;
+mod fma_wide;
+mod fmax;
+mod fmaximum;
+mod fmaximum_num;
+mod fmin;
+mod fminimum;
+mod fminimum_num;
+mod fmod;
+mod rint;
+mod round;
+mod scalbn;
+mod sqrt;
+mod trunc;
+
+pub use ceil::ceil;
+pub use copysign::copysign;
+pub use fabs::fabs;
+pub use fdim::fdim;
+pub use floor::floor;
+pub use fma::fma_round;
+pub use fma_wide::fma_wide_round;
+pub use fmax::fmax;
+pub use fmaximum::fmaximum;
+pub use fmaximum_num::fmaximum_num;
+pub use fmin::fmin;
+pub use fminimum::fminimum;
+pub use fminimum_num::fminimum_num;
+pub use fmod::fmod;
+pub use rint::rint_round;
+pub use round::round;
+pub use scalbn::scalbn;
+pub use sqrt::sqrt;
+pub use trunc::trunc;
diff --git a/library/compiler-builtins/libm/src/math/generic/rint.rs b/library/compiler-builtins/libm/src/math/generic/rint.rs
new file mode 100644
index 00000000000..c5bc27d3de6
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/generic/rint.rs
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: MIT */
+/* origin: musl src/math/rint.c */
+
+use crate::support::{Float, FpResult, Round};
+
+/// IEEE 754-2019 `roundToIntegralExact`, which respects rounding mode and raises inexact if
+/// applicable.
+#[inline]
+pub fn rint_round<F: Float>(x: F, _round: Round) -> FpResult<F> {
+    let toint = F::ONE / F::EPSILON;
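+    // `toint` is `2^SIG_BITS`: for finite `x` below that magnitude, adding it (or subtracting it
+    // for negative inputs) makes the floating-point addition round away the fractional part, and
+    // undoing the offset yields `x` rounded to an integer under the current rounding mode.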
+    let e = x.ex();
+    let positive = x.is_sign_positive();
+
+    // On i386 `force_eval!` must be used to force rounding via storage to memory. Otherwise,
+    // the excess precision from x87 would cause an incorrect final result.
+    let force = |x| {
+        if cfg!(x86_no_sse) && (F::BITS == 32 || F::BITS == 64) {
+            force_eval!(x)
+        } else {
+            x
+        }
+    };
+
+    let res = if e >= F::EXP_BIAS + F::SIG_BITS {
+        // No fractional part; exact result can be returned.
+        x
+    } else {
+        // Apply a net-zero adjustment that nudges `y` in the direction of the rounding mode. For
+        // Rust this is always nearest, but ideally it would take `round` into account.
+        let y = if positive {
+            force(force(x) + toint) - toint
+        } else {
+            force(force(x) - toint) + toint
+        };
+
+        if y == F::ZERO {
+            // A zero result takes the sign of the input.
+            if positive { F::ZERO } else { F::NEG_ZERO }
+        } else {
+            y
+        }
+    };
+
+    FpResult::ok(res)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::support::{Hexf, Status};
+
+    fn spec_test<F: Float>(cases: &[(F, F, Status)]) {
+        let roundtrip = [
+            F::ZERO,
+            F::ONE,
+            F::NEG_ONE,
+            F::NEG_ZERO,
+            F::INFINITY,
+            F::NEG_INFINITY,
+        ];
+
+        for x in roundtrip {
+            let FpResult { val, status } = rint_round(x, Round::Nearest);
+            assert_biteq!(val, x, "rint_round({})", Hexf(x));
+            assert_eq!(status, Status::OK, "{}", Hexf(x));
+        }
+
+        for &(x, res, res_stat) in cases {
+            let FpResult { val, status } = rint_round(x, Round::Nearest);
+            assert_biteq!(val, res, "rint_round({})", Hexf(x));
+            assert_eq!(status, res_stat, "{}", Hexf(x));
+        }
+    }
+
+    #[test]
+    #[cfg(f16_enabled)]
+    fn spec_tests_f16() {
+        let cases = [];
+        spec_test::<f16>(&cases);
+    }
+
+    #[test]
+    fn spec_tests_f32() {
+        let cases = [
+            (0.1, 0.0, Status::OK),
+            (-0.1, -0.0, Status::OK),
+            (0.5, 0.0, Status::OK),
+            (-0.5, -0.0, Status::OK),
+            (0.9, 1.0, Status::OK),
+            (-0.9, -1.0, Status::OK),
+            (1.1, 1.0, Status::OK),
+            (-1.1, -1.0, Status::OK),
+            (1.5, 2.0, Status::OK),
+            (-1.5, -2.0, Status::OK),
+            (1.9, 2.0, Status::OK),
+            (-1.9, -2.0, Status::OK),
+            (2.8, 3.0, Status::OK),
+            (-2.8, -3.0, Status::OK),
+        ];
+        spec_test::<f32>(&cases);
+    }
+
+    #[test]
+    fn spec_tests_f64() {
+        let cases = [
+            (0.1, 0.0, Status::OK),
+            (-0.1, -0.0, Status::OK),
+            (0.5, 0.0, Status::OK),
+            (-0.5, -0.0, Status::OK),
+            (0.9, 1.0, Status::OK),
+            (-0.9, -1.0, Status::OK),
+            (1.1, 1.0, Status::OK),
+            (-1.1, -1.0, Status::OK),
+            (1.5, 2.0, Status::OK),
+            (-1.5, -2.0, Status::OK),
+            (1.9, 2.0, Status::OK),
+            (-1.9, -2.0, Status::OK),
+            (2.8, 3.0, Status::OK),
+            (-2.8, -3.0, Status::OK),
+        ];
+        spec_test::<f64>(&cases);
+    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn spec_tests_f128() {
+        let cases = [];
+        spec_test::<f128>(&cases);
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/generic/round.rs b/library/compiler-builtins/libm/src/math/generic/round.rs
new file mode 100644
index 00000000000..16739f01d87
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/generic/round.rs
@@ -0,0 +1,83 @@
+use super::{copysign, trunc};
+use crate::support::{Float, MinInt};
+
+#[inline]
+pub fn round<F: Float>(x: F) -> F {
+    let f0p5 = F::from_parts(false, F::EXP_BIAS - 1, F::Int::ZERO); // 0.5
+    let f0p25 = F::from_parts(false, F::EXP_BIAS - 2, F::Int::ZERO); // 0.25
+
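+    // The offset is the largest representable value strictly below 0.5 (`0.5 - 0.25 * EPSILON`).
+    // Adding exactly 0.5 would misround inputs just under 0.5, where `x + 0.5` itself rounds up
+    // to 1.0 before `trunc` runs; the slightly smaller offset avoids that.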
+    trunc(x + copysign(f0p5 - f0p25 * F::EPSILON, x))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    #[cfg(f16_enabled)]
+    fn zeroes_f16() {
+        assert_biteq!(round(0.0_f16), 0.0_f16);
+        assert_biteq!(round(-0.0_f16), -0.0_f16);
+    }
+
+    #[test]
+    #[cfg(f16_enabled)]
+    fn sanity_check_f16() {
+        assert_eq!(round(-1.0_f16), -1.0);
+        assert_eq!(round(2.8_f16), 3.0);
+        assert_eq!(round(-0.5_f16), -1.0);
+        assert_eq!(round(0.5_f16), 1.0);
+        assert_eq!(round(-1.5_f16), -2.0);
+        assert_eq!(round(1.5_f16), 2.0);
+    }
+
+    #[test]
+    fn zeroes_f32() {
+        assert_biteq!(round(0.0_f32), 0.0_f32);
+        assert_biteq!(round(-0.0_f32), -0.0_f32);
+    }
+
+    #[test]
+    fn sanity_check_f32() {
+        assert_eq!(round(-1.0_f32), -1.0);
+        assert_eq!(round(2.8_f32), 3.0);
+        assert_eq!(round(-0.5_f32), -1.0);
+        assert_eq!(round(0.5_f32), 1.0);
+        assert_eq!(round(-1.5_f32), -2.0);
+        assert_eq!(round(1.5_f32), 2.0);
+    }
+
+    #[test]
+    fn zeroes_f64() {
+        assert_biteq!(round(0.0_f64), 0.0_f64);
+        assert_biteq!(round(-0.0_f64), -0.0_f64);
+    }
+
+    #[test]
+    fn sanity_check_f64() {
+        assert_eq!(round(-1.0_f64), -1.0);
+        assert_eq!(round(2.8_f64), 3.0);
+        assert_eq!(round(-0.5_f64), -1.0);
+        assert_eq!(round(0.5_f64), 1.0);
+        assert_eq!(round(-1.5_f64), -2.0);
+        assert_eq!(round(1.5_f64), 2.0);
+    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn zeroes_f128() {
+        assert_biteq!(round(0.0_f128), 0.0_f128);
+        assert_biteq!(round(-0.0_f128), -0.0_f128);
+    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn sanity_check_f128() {
+        assert_eq!(round(-1.0_f128), -1.0);
+        assert_eq!(round(2.8_f128), 3.0);
+        assert_eq!(round(-0.5_f128), -1.0);
+        assert_eq!(round(0.5_f128), 1.0);
+        assert_eq!(round(-1.5_f128), -2.0);
+        assert_eq!(round(1.5_f128), 2.0);
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/generic/scalbn.rs b/library/compiler-builtins/libm/src/math/generic/scalbn.rs
new file mode 100644
index 00000000000..6dd9b1a9b84
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/generic/scalbn.rs
@@ -0,0 +1,121 @@
+use crate::support::{CastFrom, CastInto, Float, IntTy, MinInt};
+
+/// Scale the exponent.
+///
+/// From N3220:
+///
+/// > The scalbn and scalbln functions compute `x * b^n`, where `b = FLT_RADIX` if the return type
+/// > of the function is a standard floating type, or `b = 10` if the return type of the function
+/// > is a decimal floating type. A range error occurs for some finite x, depending on n.
+/// >
+/// > [...]
+/// >
+/// > * `scalbn(±0, n)` returns `±0`.
+/// > * `scalbn(x, 0)` returns `x`.
+/// > * `scalbn(±∞, n)` returns `±∞`.
+/// >
+/// > If the calculation does not overflow or underflow, the returned value is exact and
+/// > independent of the current rounding direction mode.
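+///
+/// For example, `scalbn(1.5, 4)` is `24.0` (`1.5 * 2^4`), and `scalbn(f64::MAX, 1)` overflows to
+/// infinity.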
+#[inline]
+pub fn scalbn<F: Float>(mut x: F, mut n: i32) -> F
+where
+    u32: CastInto<F::Int>,
+    F::Int: CastFrom<i32>,
+    F::Int: CastFrom<u32>,
+{
+    let zero = IntTy::<F>::ZERO;
+
+    // Bits including the implicit bit
+    let sig_total_bits = F::SIG_BITS + 1;
+
+    // Maximum and minimum values when biased
+    let exp_max = F::EXP_MAX;
+    let exp_min = F::EXP_MIN;
+
+    // 2 ^ Emax, maximum positive with null significand (0x1p1023 for f64)
+    let f_exp_max = F::from_parts(false, F::EXP_BIAS << 1, zero);
+
+    // 2 ^ Emin, minimum positive normal with null significand (0x1p-1022 for f64)
+    let f_exp_min = F::from_parts(false, 1, zero);
+
+    // 2 ^ sig_total_bits, multiplier to normalize subnormals (0x1p53 for f64)
+    let f_pow_subnorm = F::from_parts(false, sig_total_bits + F::EXP_BIAS, zero);
+
+    /*
+     * The goal is to multiply `x` by a scale factor that applies `n`. However, there are cases
+     * where `2^n` is not representable by `F` but the result should be, e.g. `x = 2^Emin` with
+     * `n = -EMin + 2` (one out of range of 2^Emax). To get around this, reduce the magnitude of
+     * the final scale operation by prescaling by the max/min power representable by `F`.
+     */
+
+    if n > exp_max {
+        // Worst case positive `n`: `x` is the minimum subnormal value, the result is `F::MAX`.
+        // This can be reached by three scaling multiplications (two here and one final).
+        debug_assert!(-exp_min + F::SIG_BITS as i32 + exp_max <= exp_max * 3);
+
+        x *= f_exp_max;
+        n -= exp_max;
+        if n > exp_max {
+            x *= f_exp_max;
+            n -= exp_max;
+            if n > exp_max {
+                n = exp_max;
+            }
+        }
+    } else if n < exp_min {
+        // When scaling toward 0, the prescaling is limited to a value that does not allow `x` to
+        // go subnormal. This avoids double rounding.
+        if F::BITS > 16 {
+            // `mul` s.t. `!(x * mul).is_subnormal() ∀ x`
+            let mul = f_exp_min * f_pow_subnorm;
+            let add = -exp_min - sig_total_bits as i32;
+
+            // Worst case negative `n`: `x` is `F::MAX`, the result is the minimum positive subnormal.
+            // This must be reachable by three scaling multiplications (two here and one final).
+            debug_assert!(-exp_min + F::SIG_BITS as i32 + exp_max <= add * 2 + -exp_min);
+
+            x *= mul;
+            n += add;
+
+            if n < exp_min {
+                x *= mul;
+                n += add;
+
+                if n < exp_min {
+                    n = exp_min;
+                }
+            }
+        } else {
+            // `f16` is unique compared to other float types in that the difference between the
+            // minimum exponent and the significand bits (`add = -exp_min - sig_total_bits`) is
+            // small, only three. The above method depends on decrementing `n` by `add` two times;
+            // for other float types this works out because `add` is a substantial fraction of
+            // the exponent range. For `f16`, however, 3 is relatively small compared to the
+            // exponent range (which is 39), so that requires ~10 prescale rounds rather than two.
+            //
+            // Work around this by using a different algorithm that calculates the prescale
+            // dynamically based on the maximum possible value. This adds more operations per round
+            // since it needs to construct the scale, but works better in the general case.
+            let add = -(n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32);
+            let mul = F::from_parts(false, (F::EXP_BIAS as i32 - add) as u32, zero);
+
+            x *= mul;
+            n += add;
+
+            if n < exp_min {
+                let add = -(n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32);
+                let mul = F::from_parts(false, (F::EXP_BIAS as i32 - add) as u32, zero);
+
+                x *= mul;
+                n += add;
+
+                if n < exp_min {
+                    n = exp_min;
+                }
+            }
+        }
+    }
+
+    let scale = F::from_parts(false, (F::EXP_BIAS as i32 + n) as u32, zero);
+    x * scale
+}
diff --git a/library/compiler-builtins/libm/src/math/generic/sqrt.rs b/library/compiler-builtins/libm/src/math/generic/sqrt.rs
new file mode 100644
index 00000000000..9481c4cdb7b
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/generic/sqrt.rs
@@ -0,0 +1,541 @@
+/* SPDX-License-Identifier: MIT */
+/* origin: musl src/math/sqrt.c. Ported to generic Rust algorithm in 2025, TG. */
+
+//! Generic square root algorithm.
+//!
+//! This routine operates around `m_u2`, a U.2 (fixed point with two integral bits) mantissa
+//! within the range [1, 4). A table lookup provides an initial estimate, then Goldschmidt
+//! iterations at various widths are used to approach the real values.
+//!
+//! For the iterations, `r` is a U0 number that approaches `1/sqrt(m_u2)`, and `s` is a U2 number
+//! that approaches `sqrt(m_u2)`. Recall that m_u2 ∈ [1, 4).
+//!
+//! With Newton-Raphson iterations, this would be:
+//!
+//! - `w = r * r           w ~ 1 / m`
+//! - `u = 3 - m * w       u ~ 3 - m * w = 3 - m / m = 2`
+//! - `r = r * u / 2       r ~ r`
+//!
+//! (Note that the right-hand column is not analytically meaningful on its own (e.g. `r ~ r`);
+//! the point of each iteration is to reduce the error hidden behind the `~`.)
+//!
+//! Instead of Newton-Raphson iterations, Goldschmidt iterations are used to calculate
+//! `s = m * r`:
+//!
+//! - `s = m * r           s ~ m / sqrt(m)`
+//! - `u = 3 - s * r       u ~ 3 - (m / sqrt(m)) * (1 / sqrt(m)) = 3 - m / m = 2`
+//! - `r = r * u / 2       r ~ r`
+//! - `s = s * u / 2       s ~ s`
+//!
+//! The above is precise because it uses the original value `m`. There is also a faster version
+//! that performs fewer steps but does not use `m`:
+//!
+//! - `u = 3 - s * r       u ~ 3 - 1`
+//! - `r = r * u / 2       r ~ r`
+//! - `s = s * u / 2       s ~ s`
+//!
+//! Rounding errors accumulate faster with the second version, so it is only used for subsequent
+//! iterations within the same width integer. The first version is always used for the first
+//! iteration at a new width in order to avoid this accumulation.
+//!
+//! Goldschmidt has the advantage over Newton-Raphson that `sqrt(x)` and `1/sqrt(x)` are
+//! computed at the same time, i.e. there is no need to calculate `1/sqrt(x)` and invert it.
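+//!
+//! The number of iterations at each integer width is configured per float type via `SqrtHelper`:
+//! for example, `f64` performs two rounds at `u32` followed by two rounds at `u64`, while `f32`
+//! performs all three of its rounds directly at `u32`.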
+
+use crate::support::{
+    CastFrom, CastInto, DInt, Float, FpResult, HInt, Int, IntTy, MinInt, Round, Status, cold_path,
+};
+
+#[inline]
+pub fn sqrt<F>(x: F) -> F
+where
+    F: Float + SqrtHelper,
+    F::Int: HInt,
+    F::Int: From<u8>,
+    F::Int: From<F::ISet2>,
+    F::Int: CastInto<F::ISet1>,
+    F::Int: CastInto<F::ISet2>,
+    u32: CastInto<F::Int>,
+{
+    sqrt_round(x, Round::Nearest).val
+}
+
+#[inline]
+pub fn sqrt_round<F>(x: F, _round: Round) -> FpResult<F>
+where
+    F: Float + SqrtHelper,
+    F::Int: HInt,
+    F::Int: From<u8>,
+    F::Int: From<F::ISet2>,
+    F::Int: CastInto<F::ISet1>,
+    F::Int: CastInto<F::ISet2>,
+    u32: CastInto<F::Int>,
+{
+    let zero = IntTy::<F>::ZERO;
+    let one = IntTy::<F>::ONE;
+
+    let mut ix = x.to_bits();
+
+    // Top is the exponent and sign, which may or may not be shifted. If the float fits into a
+    // `u32`, we can get by without paying shifting costs.
+    let noshift = F::BITS <= u32::BITS;
+    let (mut top, special_case) = if noshift {
+        let exp_lsb = one << F::SIG_BITS;
+        let special_case = ix.wrapping_sub(exp_lsb) >= F::EXP_MASK - exp_lsb;
+        (Exp::NoShift(()), special_case)
+    } else {
+        let top = u32::cast_from(ix >> F::SIG_BITS);
+        let special_case = top.wrapping_sub(1) >= F::EXP_SAT - 1;
+        (Exp::Shifted(top), special_case)
+    };
+
+    // Handle NaN, zero, and out of domain (<= 0)
+    if special_case {
+        cold_path();
+
+        // +/-0
+        if ix << 1 == zero {
+            return FpResult::ok(x);
+        }
+
+        // Positive infinity
+        if ix == F::EXP_MASK {
+            return FpResult::ok(x);
+        }
+
+        // NaN or negative
+        if ix > F::EXP_MASK {
+            return FpResult::new(F::NAN, Status::INVALID);
+        }
+
+        // Normalize subnormals by multiplying by 1.0 << SIG_BITS (e.g. 0x1p52 for doubles).
+        let scaled = x * F::from_parts(false, F::SIG_BITS + F::EXP_BIAS, zero);
+        ix = scaled.to_bits();
+        match top {
+            Exp::Shifted(ref mut v) => {
+                *v = scaled.ex();
+                *v = (*v).wrapping_sub(F::SIG_BITS);
+            }
+            Exp::NoShift(()) => {
+                ix = ix.wrapping_sub((F::SIG_BITS << F::SIG_BITS).cast());
+            }
+        }
+    }
+
+    // Reduce arguments such that `x = 4^e * m`:
+    //
+    // - m_u2 ∈ [1, 4), a fixed point U2.BITS number
+    // - 2^e is the exponent part of the result
+    let (m_u2, exp) = match top {
+        Exp::Shifted(top) => {
+            // We now know `x` is positive, so `top` is just its (biased) exponent
+            let mut e = top;
+            // Construct a fixed point representation of the mantissa.
+            let mut m_u2 = (ix | F::IMPLICIT_BIT) << F::EXP_BITS;
+            let even = (e & 1) != 0;
+            if even {
+                m_u2 >>= 1;
+            }
+            e = (e.wrapping_add(F::EXP_SAT >> 1)) >> 1;
+            (m_u2, Exp::Shifted(e))
+        }
+        Exp::NoShift(()) => {
+            let even = ix & (one << F::SIG_BITS) != zero;
+
+            // Exponent part of the return value
+            let mut e_noshift = ix >> 1;
+            // ey &= (F::EXP_MASK << 2) >> 2; // clear the top exponent bit (result = 1.0)
+            e_noshift += (F::EXP_MASK ^ (F::SIGN_MASK >> 1)) >> 1;
+            e_noshift &= F::EXP_MASK;
+
+            let m1 = (ix << F::EXP_BITS) | F::SIGN_MASK;
+            let m0 = (ix << (F::EXP_BITS - 1)) & !F::SIGN_MASK;
+            let m_u2 = if even { m0 } else { m1 };
+
+            (m_u2, Exp::NoShift(e_noshift))
+        }
+    };
+
+    // Extract the top 6 bits of the significand with the lowest bit of the exponent.
+    let i = usize::cast_from(ix >> (F::SIG_BITS - 6)) & 0b1111111;
+
+    // Start with an initial guess for `r = 1 / sqrt(m)` from the table, and shift `m` as an
+    // initial value for `s = sqrt(m)`. See the module documentation for details.
+    let r1_u0: F::ISet1 = F::ISet1::cast_from(RSQRT_TAB[i]) << (F::ISet1::BITS - 16);
+    let s1_u2: F::ISet1 = ((m_u2) >> (F::BITS - F::ISet1::BITS)).cast();
+
+    // Perform iterations, if any, at quarter width (used for `f128`).
+    let (r1_u0, _s1_u2) = goldschmidt::<F, F::ISet1>(r1_u0, s1_u2, F::SET1_ROUNDS, false);
+
+    // Widen values and perform iterations at half width (used for `f64` and `f128`).
+    let r2_u0: F::ISet2 = F::ISet2::from(r1_u0) << (F::ISet2::BITS - F::ISet1::BITS);
+    let s2_u2: F::ISet2 = ((m_u2) >> (F::BITS - F::ISet2::BITS)).cast();
+    let (r2_u0, _s2_u2) = goldschmidt::<F, F::ISet2>(r2_u0, s2_u2, F::SET2_ROUNDS, false);
+
+    // Perform final iterations at full width (used for all float types).
+    let r_u0: F::Int = F::Int::from(r2_u0) << (F::BITS - F::ISet2::BITS);
+    let s_u2: F::Int = m_u2;
+    let (_r_u0, s_u2) = goldschmidt::<F, F::Int>(r_u0, s_u2, F::FINAL_ROUNDS, true);
+
+    // Shift back to mantissa position.
+    let mut m = s_u2 >> (F::EXP_BITS - 2);
+
+    // The musl source includes the following comment (with literals replaced):
+    //
+    // > s < sqrt(m) < s + 0x1.09p-SIG_BITS
+    // > compute nearest rounded result: the nearest result to SIG_BITS bits is either s or
+    // > s+0x1p-SIG_BITS, we can decide by comparing (2^SIG_BITS s + 0.5)^2 to 2^(2*SIG_BITS) m.
+    //
+    // Expanding this out, with `SIG_BITS = p` and adjusting based on the operations done to
+    // `d0` and `d1`:
+    //
+    // - `2^(2p)m ≟ ((2^p)m + 0.5)^2`
+    // - `2^(2p)m ≟ 2^(2p)m^2 + (2^p)m + 0.25`
+    // - `2^(2p)m - m^2 ≟ (2^(2p) - 1)m^2 + (2^p)m + 0.25`
+    // - `(1 - 2^(2p))m + m^2 ≟ (1 - 2^(2p))m^2 + (1 - 2^p)m + 0.25` (?)
+    //
+    // I do not follow how the rounding bit is extracted from this comparison with the below
+    // operations. In any case, the algorithm is well tested.
+
+    // The value needed to shift `m_u2` by to create `m*2^(2p)`. `2p = 2 * F::SIG_BITS`,
+    // `F::BITS - 2` accounts for the offset that `m_u2` already has.
+    let shift = 2 * F::SIG_BITS - (F::BITS - 2);
+
+    // `2^(2p)m - m^2`
+    let d0 = (m_u2 << shift).wrapping_sub(m.wrapping_mul(m));
+    // `m - 2^(2p)m + m^2`
+    let d1 = m.wrapping_sub(d0);
+    m += d1 >> (F::BITS - 1);
+    m &= F::SIG_MASK;
+
+    match exp {
+        Exp::Shifted(e) => m |= IntTy::<F>::cast_from(e) << F::SIG_BITS,
+        Exp::NoShift(e) => m |= e,
+    };
+
+    let mut y = F::from_bits(m);
+
+    // FIXME(f16): the fenv math does not work for `f16`
+    if F::BITS > 16 {
+        // Handle rounding and inexact. `(m + 1)^2 == 2^shift m` is exact; for all other cases, add
+        // a tiny value to cause fenv effects.
+        let d2 = d1.wrapping_add(m).wrapping_add(one);
+        let mut tiny = if d2 == zero {
+            cold_path();
+            zero
+        } else {
+            F::IMPLICIT_BIT
+        };
+
+        tiny |= (d1 ^ d2) & F::SIGN_MASK;
+        let t = F::from_bits(tiny);
+        y = y + t;
+    }
+
+    FpResult::ok(y)
+}
+
+/// Multiply at the wider integer size, returning the high half.
+fn wmulh<I: HInt>(a: I, b: I) -> I {
+    a.widen_mul(b).hi()
+}
+
+/// Perform `count` Goldschmidt iterations, returning `(r_u0, s_u?)`.
+///
+/// - `r_u0` is the reciprocal `r ~ 1 / sqrt(m)`, as U0.
+/// - `s_u2` is the square root, `s ~ sqrt(m)`, as U2.
+/// - `count` is the number of iterations to perform.
+/// - `final_set` should be true if this is the last round (same-sized integer). If so, the
+///   returned `s` will be U3, for later shifting. Otherwise, the returned `s` is U2.
+///
+/// Note that performance relies on the optimizer being able to unroll these loops (reasonably
+/// trivial, `count` is a constant when called).
+#[inline]
+fn goldschmidt<F, I>(mut r_u0: I, mut s_u2: I, count: u32, final_set: bool) -> (I, I)
+where
+    F: SqrtHelper,
+    I: HInt + From<u8>,
+{
+    let three_u2 = I::from(0b11u8) << (I::BITS - 2);
+    let mut u_u0 = r_u0;
+
+    for i in 0..count {
+        // First iteration: `s = m*r` (`u_u0 = r_u0` set above)
+        // Subsequent iterations: `s=s*u/2`
+        s_u2 = wmulh(s_u2, u_u0);
+
+        // Perform `s /= 2` if:
+        //
+        // 1. This is not the first iteration (the first iteration is `s = m*r`)...
+        // 2. ... and this is not the last set of iterations
+        // 3. ... or, if this is the last set, it is not the last iteration
+        //
+        // This step is not performed for the final iteration because the shift is combined with
+        // a later shift (moving `s` into the mantissa).
+        if i > 0 && (!final_set || i + 1 < count) {
+            s_u2 <<= 1;
+        }
+
+        // u = 3 - s*r
+        let d_u2 = wmulh(s_u2, r_u0);
+        u_u0 = three_u2.wrapping_sub(d_u2);
+
+        // r = r*u/2
+        r_u0 = wmulh(r_u0, u_u0) << 1;
+    }
+
+    (r_u0, s_u2)
+}
+
+/// Representation of whether we shift the exponent into a `u32`, or modify it in place to save
+/// the shift operations.
+enum Exp<T> {
+    /// The exponent has been shifted to a `u32` and is LSB-aligned.
+    Shifted(u32),
+    /// The exponent is in its natural position in integer repr.
+    NoShift(T),
+}
+
+/// Size-specific constants related to the square root routine.
+pub trait SqrtHelper: Float {
+    /// Integer for the first set of rounds. If unused, set to the same type as the next set.
+    type ISet1: HInt + Into<Self::ISet2> + CastFrom<Self::Int> + From<u8>;
+    /// Integer for the second set of rounds. If unused, set to the same type as the next set.
+    type ISet2: HInt + From<Self::ISet1> + From<u8>;
+
+    /// Number of rounds at `ISet1`.
+    const SET1_ROUNDS: u32 = 0;
+    /// Number of rounds at `ISet2`.
+    const SET2_ROUNDS: u32 = 0;
+    /// Number of rounds at `Self::Int`.
+    const FINAL_ROUNDS: u32;
+}
+
+#[cfg(f16_enabled)]
+impl SqrtHelper for f16 {
+    type ISet1 = u16; // unused
+    type ISet2 = u16; // unused
+
+    const FINAL_ROUNDS: u32 = 2;
+}
+
+impl SqrtHelper for f32 {
+    type ISet1 = u32; // unused
+    type ISet2 = u32; // unused
+
+    const FINAL_ROUNDS: u32 = 3;
+}
+
+impl SqrtHelper for f64 {
+    type ISet1 = u32; // unused
+    type ISet2 = u32;
+
+    const SET2_ROUNDS: u32 = 2;
+    const FINAL_ROUNDS: u32 = 2;
+}
+
+#[cfg(f128_enabled)]
+impl SqrtHelper for f128 {
+    type ISet1 = u32;
+    type ISet2 = u64;
+
+    const SET1_ROUNDS: u32 = 1;
+    const SET2_ROUNDS: u32 = 2;
+    const FINAL_ROUNDS: u32 = 2;
+}
+
+/// A U0.16 representation of `1/sqrt(x)`.
+///
+/// The index is a 7-bit number consisting of a single exponent bit and 6 bits of significand.
+#[rustfmt::skip]
+static RSQRT_TAB: [u16; 128] = [
+    0xb451, 0xb2f0, 0xb196, 0xb044, 0xaef9, 0xadb6, 0xac79, 0xab43,
+    0xaa14, 0xa8eb, 0xa7c8, 0xa6aa, 0xa592, 0xa480, 0xa373, 0xa26b,
+    0xa168, 0xa06a, 0x9f70, 0x9e7b, 0x9d8a, 0x9c9d, 0x9bb5, 0x9ad1,
+    0x99f0, 0x9913, 0x983a, 0x9765, 0x9693, 0x95c4, 0x94f8, 0x9430,
+    0x936b, 0x92a9, 0x91ea, 0x912e, 0x9075, 0x8fbe, 0x8f0a, 0x8e59,
+    0x8daa, 0x8cfe, 0x8c54, 0x8bac, 0x8b07, 0x8a64, 0x89c4, 0x8925,
+    0x8889, 0x87ee, 0x8756, 0x86c0, 0x862b, 0x8599, 0x8508, 0x8479,
+    0x83ec, 0x8361, 0x82d8, 0x8250, 0x81c9, 0x8145, 0x80c2, 0x8040,
+    0xff02, 0xfd0e, 0xfb25, 0xf947, 0xf773, 0xf5aa, 0xf3ea, 0xf234,
+    0xf087, 0xeee3, 0xed47, 0xebb3, 0xea27, 0xe8a3, 0xe727, 0xe5b2,
+    0xe443, 0xe2dc, 0xe17a, 0xe020, 0xdecb, 0xdd7d, 0xdc34, 0xdaf1,
+    0xd9b3, 0xd87b, 0xd748, 0xd61a, 0xd4f1, 0xd3cd, 0xd2ad, 0xd192,
+    0xd07b, 0xcf69, 0xce5b, 0xcd51, 0xcc4a, 0xcb48, 0xca4a, 0xc94f,
+    0xc858, 0xc764, 0xc674, 0xc587, 0xc49d, 0xc3b7, 0xc2d4, 0xc1f4,
+    0xc116, 0xc03c, 0xbf65, 0xbe90, 0xbdbe, 0xbcef, 0xbc23, 0xbb59,
+    0xba91, 0xb9cc, 0xb90a, 0xb84a, 0xb78c, 0xb6d0, 0xb617, 0xb560,
+];
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Test behavior specified in IEEE 754 `squareRoot`.
+    fn spec_test<F>()
+    where
+        F: Float + SqrtHelper,
+        F::Int: HInt,
+        F::Int: From<u8>,
+        F::Int: From<F::ISet2>,
+        F::Int: CastInto<F::ISet1>,
+        F::Int: CastInto<F::ISet2>,
+        u32: CastInto<F::Int>,
+    {
+        // Values that should return a NaN and raise invalid
+        let nan = [F::NEG_INFINITY, F::NEG_ONE, F::NAN, F::MIN];
+
+        // Values that return unaltered
+        let roundtrip = [F::ZERO, F::NEG_ZERO, F::INFINITY];
+
+        for x in nan {
+            let FpResult { val, status } = sqrt_round(x, Round::Nearest);
+            assert!(val.is_nan());
+            assert!(status == Status::INVALID);
+        }
+
+        for x in roundtrip {
+            let FpResult { val, status } = sqrt_round(x, Round::Nearest);
+            assert_biteq!(val, x);
+            assert!(status == Status::OK);
+        }
+    }
+
+    #[test]
+    #[cfg(f16_enabled)]
+    fn sanity_check_f16() {
+        assert_biteq!(sqrt(100.0f16), 10.0);
+        assert_biteq!(sqrt(4.0f16), 2.0);
+    }
+
+    #[test]
+    #[cfg(f16_enabled)]
+    fn spec_tests_f16() {
+        spec_test::<f16>();
+    }
+
+    #[test]
+    #[cfg(f16_enabled)]
+    #[allow(clippy::approx_constant)]
+    fn conformance_tests_f16() {
+        let cases = [
+            (f16::PI, 0x3f17_u16),
+            // 10_000.0, using a hex literal for MSRV hack (Rust < 1.67 checks literal widths as
+            // part of the AST, so the `cfg` is irrelevant here).
+            (f16::from_bits(0x70e2), 0x5640_u16),
+            (f16::from_bits(0x0000000f), 0x13bf_u16),
+            (f16::INFINITY, f16::INFINITY.to_bits()),
+        ];
+
+        for (input, output) in cases {
+            assert_biteq!(
+                sqrt(input),
+                f16::from_bits(output),
+                "input: {input:?} ({:#018x})",
+                input.to_bits()
+            );
+        }
+    }
+
+    #[test]
+    fn sanity_check_f32() {
+        assert_biteq!(sqrt(100.0f32), 10.0);
+        assert_biteq!(sqrt(4.0f32), 2.0);
+    }
+
+    #[test]
+    fn spec_tests_f32() {
+        spec_test::<f32>();
+    }
+
+    #[test]
+    #[allow(clippy::approx_constant)]
+    fn conformance_tests_f32() {
+        let cases = [
+            (f32::PI, 0x3fe2dfc5_u32),
+            (10000.0f32, 0x42c80000_u32),
+            (f32::from_bits(0x0000000f), 0x1b2f456f_u32),
+            (f32::INFINITY, f32::INFINITY.to_bits()),
+        ];
+
+        for (input, output) in cases {
+            assert_biteq!(
+                sqrt(input),
+                f32::from_bits(output),
+                "input: {input:?} ({:#018x})",
+                input.to_bits()
+            );
+        }
+    }
+
+    #[test]
+    fn sanity_check_f64() {
+        assert_biteq!(sqrt(100.0f64), 10.0);
+        assert_biteq!(sqrt(4.0f64), 2.0);
+    }
+
+    #[test]
+    fn spec_tests_f64() {
+        spec_test::<f64>();
+    }
+
+    #[test]
+    #[allow(clippy::approx_constant)]
+    fn conformance_tests_f64() {
+        let cases = [
+            (f64::PI, 0x3ffc5bf891b4ef6a_u64),
+            (10000.0, 0x4059000000000000_u64),
+            (f64::from_bits(0x0000000f), 0x1e7efbdeb14f4eda_u64),
+            (f64::INFINITY, f64::INFINITY.to_bits()),
+        ];
+
+        for (input, output) in cases {
+            assert_biteq!(
+                sqrt(input),
+                f64::from_bits(output),
+                "input: {input:?} ({:#018x})",
+                input.to_bits()
+            );
+        }
+    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn sanity_check_f128() {
+        assert_biteq!(sqrt(100.0f128), 10.0);
+        assert_biteq!(sqrt(4.0f128), 2.0);
+    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn spec_tests_f128() {
+        spec_test::<f128>();
+    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    #[allow(clippy::approx_constant)]
+    fn conformance_tests_f128() {
+        let cases = [
+            (f128::PI, 0x3fffc5bf891b4ef6aa79c3b0520d5db9_u128),
+            // 10_000.0, see `f16` for reasoning.
+            (
+                f128::from_bits(0x400c3880000000000000000000000000),
+                0x40059000000000000000000000000000_u128,
+            ),
+            (
+                f128::from_bits(0x0000000f),
+                0x1fc9efbdeb14f4ed9b17ae807907e1e9_u128,
+            ),
+            (f128::INFINITY, f128::INFINITY.to_bits()),
+        ];
+
+        for (input, output) in cases {
+            assert_biteq!(
+                sqrt(input),
+                f128::from_bits(output),
+                "input: {input:?} ({:#018x})",
+                input.to_bits()
+            );
+        }
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/generic/trunc.rs b/library/compiler-builtins/libm/src/math/generic/trunc.rs
new file mode 100644
index 00000000000..d5b444d15df
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/generic/trunc.rs
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: MIT
+ * origin: musl src/math/trunc.c */
+
+use crate::support::{Float, FpResult, Int, IntTy, MinInt, Status};
+
+#[inline]
+pub fn trunc<F: Float>(x: F) -> F {
+    trunc_status(x).val
+}
+
+#[inline]
+pub fn trunc_status<F: Float>(x: F) -> FpResult<F> {
+    let mut xi: F::Int = x.to_bits();
+    let e: i32 = x.exp_unbiased();
+
+    // C1: The represented value has no fractional part, so no truncation is needed
+    if e >= F::SIG_BITS as i32 {
+        return FpResult::ok(x);
+    }
+
+    let mask = if e < 0 {
+        // C2: If the exponent is negative, the result will be zero so we mask out everything
+        // except the sign.
+        F::SIGN_MASK
+    } else {
+        // C3: Otherwise, we mask out the last `e` bits of the significand.
+        !(F::SIG_MASK >> e.unsigned())
+    };
+
+    // C4: If the to-be-masked-out portion is already zero, we have an exact result
+    if (xi & !mask) == IntTy::<F>::ZERO {
+        return FpResult::ok(x);
+    }
+
+    // C5: Otherwise we truncate. Raise `FE_INEXACT` when significand bits are discarded, mask
+    // the result, and return.
+
+    let status = if xi & F::SIG_MASK == F::Int::ZERO {
+        Status::OK
+    } else {
+        Status::INEXACT
+    };
+    xi &= mask;
+    FpResult::new(F::from_bits(xi), status)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::support::Hexf;
+
+    fn spec_test<F: Float>(cases: &[(F, F, Status)]) {
+        let roundtrip = [
+            F::ZERO,
+            F::ONE,
+            F::NEG_ONE,
+            F::NEG_ZERO,
+            F::INFINITY,
+            F::NEG_INFINITY,
+        ];
+
+        for x in roundtrip {
+            let FpResult { val, status } = trunc_status(x);
+            assert_biteq!(val, x, "{}", Hexf(x));
+            assert_eq!(status, Status::OK, "{}", Hexf(x));
+        }
+
+        for &(x, res, res_stat) in cases {
+            let FpResult { val, status } = trunc_status(x);
+            assert_biteq!(val, res, "{}", Hexf(x));
+            assert_eq!(status, res_stat, "{}", Hexf(x));
+        }
+    }
+
+    /* The f16 / f128 "sanity_check" tests and their spec cases are skipped because the
+     * literals are rejected by the lexer on the MSRV. */
+
+    #[test]
+    #[cfg(f16_enabled)]
+    fn spec_tests_f16() {
+        let cases = [];
+        spec_test::<f16>(&cases);
+    }
+
+    #[test]
+    fn sanity_check_f32() {
+        assert_eq!(trunc(0.5f32), 0.0);
+        assert_eq!(trunc(1.1f32), 1.0);
+        assert_eq!(trunc(2.9f32), 2.0);
+    }
+
+    #[test]
+    fn spec_tests_f32() {
+        let cases = [
+            (0.1, 0.0, Status::INEXACT),
+            (-0.1, -0.0, Status::INEXACT),
+            (0.9, 0.0, Status::INEXACT),
+            (-0.9, -0.0, Status::INEXACT),
+            (1.1, 1.0, Status::INEXACT),
+            (-1.1, -1.0, Status::INEXACT),
+            (1.9, 1.0, Status::INEXACT),
+            (-1.9, -1.0, Status::INEXACT),
+        ];
+        spec_test::<f32>(&cases);
+
+        assert_biteq!(trunc(1.1f32), 1.0);
+        assert_biteq!(trunc(1.1f64), 1.0);
+
+        // C1
+        assert_biteq!(trunc(hf32!("0x1p23")), hf32!("0x1p23"));
+        assert_biteq!(trunc(hf64!("0x1p52")), hf64!("0x1p52"));
+        assert_biteq!(trunc(hf32!("-0x1p23")), hf32!("-0x1p23"));
+        assert_biteq!(trunc(hf64!("-0x1p52")), hf64!("-0x1p52"));
+
+        // C2
+        assert_biteq!(trunc(hf32!("0x1p-1")), 0.0);
+        assert_biteq!(trunc(hf64!("0x1p-1")), 0.0);
+        assert_biteq!(trunc(hf32!("-0x1p-1")), -0.0);
+        assert_biteq!(trunc(hf64!("-0x1p-1")), -0.0);
+    }
+
+    #[test]
+    fn sanity_check_f64() {
+        assert_eq!(trunc(1.1f64), 1.0);
+        assert_eq!(trunc(2.9f64), 2.0);
+    }
+
+    #[test]
+    fn spec_tests_f64() {
+        let cases = [
+            (0.1, 0.0, Status::INEXACT),
+            (-0.1, -0.0, Status::INEXACT),
+            (0.9, 0.0, Status::INEXACT),
+            (-0.9, -0.0, Status::INEXACT),
+            (1.1, 1.0, Status::INEXACT),
+            (-1.1, -1.0, Status::INEXACT),
+            (1.9, 1.0, Status::INEXACT),
+            (-1.9, -1.0, Status::INEXACT),
+        ];
+        spec_test::<f64>(&cases);
+    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn spec_tests_f128() {
+        let cases = [];
+        spec_test::<f128>(&cases);
+    }
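+
+    /// Editorial sketch, not part of the original port: a worked example of the C3 masking in
+    /// `trunc_status`, assuming the crate's `Float` impl for `f32` (`SIG_MASK`, `exp_unbiased`).
+    /// For 2.5 the unbiased exponent is 1, so all but the top fraction bit is cleared.
+    #[test]
+    fn worked_example_f32_masking() {
+        let x = 2.5f32; // 0x4020_0000
+        let e = x.exp_unbiased();
+        assert_eq!(e, 1);
+        // Case C3: drop the low `SIG_BITS - e` fraction bits.
+        let mask = !(f32::SIG_MASK >> e.unsigned());
+        assert_biteq!(f32::from_bits(x.to_bits() & mask), 2.0f32);
+        // The full routine agrees and reports the dropped fraction as inexact.
+        let FpResult { val, status } = trunc_status(x);
+        assert_biteq!(val, 2.0f32);
+        assert_eq!(status, Status::INEXACT);
+    }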
+}
diff --git a/library/compiler-builtins/libm/src/math/hypot.rs b/library/compiler-builtins/libm/src/math/hypot.rs
new file mode 100644
index 00000000000..da458ea1d05
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/hypot.rs
@@ -0,0 +1,74 @@
+use core::f64;
+
+use super::sqrt;
+
+const SPLIT: f64 = 134217728. + 1.; // 0x1p27 + 1 === (2 ^ 27) + 1
+
+fn sq(x: f64) -> (f64, f64) {
+    let xh: f64;
+    let xl: f64;
+    let xc: f64;
+
+    xc = x * SPLIT;
+    xh = x - xc + xc;
+    xl = x - xh;
+    let hi = x * x;
+    let lo = xh * xh - hi + 2. * xh * xl + xl * xl;
+    (hi, lo)
+}
+
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn hypot(mut x: f64, mut y: f64) -> f64 {
+    let x1p700 = f64::from_bits(0x6bb0000000000000); // 0x1p700 === 2 ^ 700
+    let x1p_700 = f64::from_bits(0x1430000000000000); // 0x1p-700 === 2 ^ -700
+
+    let mut uxi = x.to_bits();
+    let mut uyi = y.to_bits();
+    let uti;
+    let ex: i64;
+    let ey: i64;
+    let mut z: f64;
+
+    /* arrange |x| >= |y| */
+    uxi &= -1i64 as u64 >> 1;
+    uyi &= -1i64 as u64 >> 1;
+    if uxi < uyi {
+        uti = uxi;
+        uxi = uyi;
+        uyi = uti;
+    }
+
+    /* special cases */
+    ex = (uxi >> 52) as i64;
+    ey = (uyi >> 52) as i64;
+    x = f64::from_bits(uxi);
+    y = f64::from_bits(uyi);
+    /* note: hypot(inf,nan) == inf */
+    if ey == 0x7ff {
+        return y;
+    }
+    if ex == 0x7ff || uyi == 0 {
+        return x;
+    }
+    /* note: hypot(x,y) ~= x + y*y/x/2 with inexact for small y/x */
+    /* 64 difference is enough for ld80 double_t */
+    if ex - ey > 64 {
+        return x + y;
+    }
+
+    /* precise sqrt argument in nearest rounding mode without overflow */
+    /* xh*xh must not overflow and xl*xl must not underflow in sq */
+    z = 1.;
+    if ex > 0x3ff + 510 {
+        z = x1p700;
+        x *= x1p_700;
+        y *= x1p_700;
+    } else if ey < 0x3ff - 450 {
+        z = x1p_700;
+        x *= x1p700;
+        y *= x1p700;
+    }
+    let (hx, lx) = sq(x);
+    let (hy, ly) = sq(y);
+    z * sqrt(ly + lx + hy + hx)
+}
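+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Editorial sketch, not part of the original port: `SPLIT` (2^27 + 1) drives a
+    /// Veltkamp/Dekker-style split, so `sq` writes `x` as `xh + xl` exactly and `hi + lo`
+    /// carries `x * x` to roughly twice the working precision.
+    #[test]
+    fn split_is_exact() {
+        let x = 1.0000001f64;
+        let xc = x * SPLIT;
+        let xh = x - xc + xc;
+        let xl = x - xh;
+        assert_eq!(xh + xl, x);
+    }
+
+    /// The pre-scaling by 2^700 / 2^-700 keeps the squared terms representable, so `hypot`
+    /// stays finite where the naive sum of squares overflows.
+    #[test]
+    fn scaling_avoids_overflow() {
+        let x = 1e200f64;
+        assert!((x * x).is_infinite());
+        assert!(hypot(x, x).is_finite());
+    }
+}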
diff --git a/library/compiler-builtins/libm/src/math/hypotf.rs b/library/compiler-builtins/libm/src/math/hypotf.rs
new file mode 100644
index 00000000000..576eebb3343
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/hypotf.rs
@@ -0,0 +1,43 @@
+use core::f32;
+
+use super::sqrtf;
+
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn hypotf(mut x: f32, mut y: f32) -> f32 {
+    let x1p90 = f32::from_bits(0x6c800000); // 0x1p90f === 2 ^ 90
+    let x1p_90 = f32::from_bits(0x12800000); // 0x1p-90f === 2 ^ -90
+
+    let mut uxi = x.to_bits();
+    let mut uyi = y.to_bits();
+    let uti;
+    let mut z: f32;
+
+    uxi &= -1i32 as u32 >> 1;
+    uyi &= -1i32 as u32 >> 1;
+    if uxi < uyi {
+        uti = uxi;
+        uxi = uyi;
+        uyi = uti;
+    }
+
+    x = f32::from_bits(uxi);
+    y = f32::from_bits(uyi);
+    if uyi == 0xff << 23 {
+        return y;
+    }
+    if uxi >= 0xff << 23 || uyi == 0 || uxi - uyi >= 25 << 23 {
+        return x + y;
+    }
+
+    z = 1.;
+    if uxi >= (0x7f + 60) << 23 {
+        z = x1p90;
+        x *= x1p_90;
+        y *= x1p_90;
+    } else if uyi < (0x7f - 60) << 23 {
+        z = x1p_90;
+        x *= x1p90;
+        y *= x1p90;
+    }
+    z * sqrtf((x as f64 * x as f64 + y as f64 * y as f64) as f32)
+}
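+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Editorial sketch, not part of the original port: the squares are accumulated in `f64`
+    /// and large inputs are pre-scaled by 2^-90, so `hypotf` stays finite where the naive
+    /// `x * x` already overflows in `f32`.
+    #[test]
+    fn widening_avoids_overflow() {
+        let x = 1e30f32;
+        assert!((x * x).is_infinite());
+        assert!(hypotf(x, x).is_finite());
+    }
+}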
diff --git a/library/compiler-builtins/libm/src/math/ilogb.rs b/library/compiler-builtins/libm/src/math/ilogb.rs
new file mode 100644
index 00000000000..5b41f7b1dc0
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/ilogb.rs
@@ -0,0 +1,32 @@
+const FP_ILOGBNAN: i32 = -1 - 0x7fffffff;
+const FP_ILOGB0: i32 = FP_ILOGBNAN;
+
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn ilogb(x: f64) -> i32 {
+    let mut i: u64 = x.to_bits();
+    let e = ((i >> 52) & 0x7ff) as i32;
+
+    if e == 0 {
+        i <<= 12;
+        if i == 0 {
+            force_eval!(0.0 / 0.0);
+            return FP_ILOGB0;
+        }
+        /* subnormal x */
+        let mut e = -0x3ff;
+        while (i >> 63) == 0 {
+            e -= 1;
+            i <<= 1;
+        }
+        e
+    } else if e == 0x7ff {
+        force_eval!(0.0 / 0.0);
+        if (i << 12) != 0 {
+            FP_ILOGBNAN
+        } else {
+            i32::MAX
+        }
+    } else {
+        e - 0x3ff
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/ilogbf.rs b/library/compiler-builtins/libm/src/math/ilogbf.rs
new file mode 100644
index 00000000000..3585d6d36f1
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/ilogbf.rs
@@ -0,0 +1,28 @@
+const FP_ILOGBNAN: i32 = -1 - 0x7fffffff;
+const FP_ILOGB0: i32 = FP_ILOGBNAN;
+
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn ilogbf(x: f32) -> i32 {
+    let mut i = x.to_bits();
+    let e = ((i >> 23) & 0xff) as i32;
+
+    if e == 0 {
+        i <<= 9;
+        if i == 0 {
+            force_eval!(0.0 / 0.0);
+            return FP_ILOGB0;
+        }
+        /* subnormal x */
+        let mut e = -0x7f;
+        while (i >> 31) == 0 {
+            e -= 1;
+            i <<= 1;
+        }
+        e
+    } else if e == 0xff {
+        force_eval!(0.0 / 0.0);
+        if (i << 9) != 0 { FP_ILOGBNAN } else { i32::MAX }
+    } else {
+        e - 0x7f
+    }
+}
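+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Editorial sketch, not part of the original port: `ilogbf` extracts the unbiased binary
+    /// exponent, counts subnormals down bit by bit, and returns the sentinels for zero and
+    /// non-finite inputs.
+    #[test]
+    fn exponent_extraction() {
+        assert_eq!(ilogbf(1.0), 0);
+        assert_eq!(ilogbf(0.5), -1);
+        assert_eq!(ilogbf(f32::MIN_POSITIVE), -126);
+        // Smallest subnormal: the shift loop walks down to -149.
+        assert_eq!(ilogbf(f32::from_bits(1)), -149);
+        assert_eq!(ilogbf(0.0), FP_ILOGB0);
+        assert_eq!(ilogbf(f32::INFINITY), i32::MAX);
+    }
+}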
diff --git a/library/compiler-builtins/libm/src/math/j0.rs b/library/compiler-builtins/libm/src/math/j0.rs
new file mode 100644
index 00000000000..99d656f0d08
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/j0.rs
@@ -0,0 +1,426 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/e_j0.c */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunSoft, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+/* j0(x), y0(x)
+ * Bessel function of the first and second kinds of order zero.
+ * Method -- j0(x):
+ *      1. For tiny x, we use j0(x) = 1 - x^2/4 + x^4/64 - ...
+ *      2. Reduce x to |x| since j0(x)=j0(-x),  and
+ *         for x in (0,2)
+ *              j0(x) = 1-z/4+ z^2*R0/S0,  where z = x*x;
+ *         (precision:  |j0-1+z/4-z^2R0/S0 |<2**-63.67 )
+ *         for x in (2,inf)
+ *              j0(x) = sqrt(2/(pi*x))*(p0(x)*cos(x0)-q0(x)*sin(x0))
+ *         where x0 = x-pi/4. It is better to compute sin(x0),cos(x0)
+ *         as follows:
+ *              cos(x0) = cos(x)cos(pi/4)+sin(x)sin(pi/4)
+ *                      = 1/sqrt(2) * (cos(x) + sin(x))
+ *              sin(x0) = sin(x)cos(pi/4)-cos(x)sin(pi/4)
+ *                      = 1/sqrt(2) * (sin(x) - cos(x))
+ *         (To avoid cancellation, use
+ *              sin(x) +- cos(x) = -cos(2x)/(sin(x) -+ cos(x))
+ *          to compute the worse one.)
+ *
+ *      3. Special cases
+ *              j0(nan)= nan
+ *              j0(0) = 1
+ *              j0(inf) = 0
+ *
+ * Method -- y0(x):
+ *      1. For x<2.
+ *         Since
+ *              y0(x) = 2/pi*(j0(x)*(ln(x/2)+Euler) + x^2/4 - ...)
+ *         therefore y0(x)-2/pi*j0(x)*ln(x) is an even function.
+ *         We use the following function to approximate y0,
+ *              y0(x) = U(z)/V(z) + (2/pi)*(j0(x)*ln(x)), z= x^2
+ *         where
+ *              U(z) = u00 + u01*z + ... + u06*z^6
+ *              V(z) = 1  + v01*z + ... + v04*z^4
+ *         with absolute approximation error bounded by 2**-72.
+ *         Note: For tiny x, U/V = u0 and j0(x)~1, hence
+ *              y0(tiny) = u0 + (2/pi)*ln(tiny), (choose tiny<2**-27)
+ *      2. For x>=2.
+ *              y0(x) = sqrt(2/(pi*x))*(p0(x)*cos(x0)+q0(x)*sin(x0))
+ *         where x0 = x-pi/4. It is better to compute sin(x0),cos(x0)
+ *         by the method mentioned above.
+ *      3. Special cases: y0(0)=-inf, y0(x<0)=NaN, y0(inf)=0.
+ */
+
+use super::{cos, fabs, get_high_word, get_low_word, log, sin, sqrt};
+const INVSQRTPI: f64 = 5.64189583547756279280e-01; /* 0x3FE20DD7, 0x50429B6D */
+const TPI: f64 = 6.36619772367581382433e-01; /* 0x3FE45F30, 0x6DC9C883 */
+
+/* common method when |x|>=2 */
+fn common(ix: u32, x: f64, y0: bool) -> f64 {
+    let s: f64;
+    let mut c: f64;
+    let mut ss: f64;
+    let mut cc: f64;
+    let z: f64;
+
+    /*
+     * j0(x) = sqrt(2/(pi*x))*(p0(x)*cos(x-pi/4)-q0(x)*sin(x-pi/4))
+     * y0(x) = sqrt(2/(pi*x))*(p0(x)*sin(x-pi/4)+q0(x)*cos(x-pi/4))
+     *
+     * sin(x-pi/4) = (sin(x) - cos(x))/sqrt(2)
+     * cos(x-pi/4) = (sin(x) + cos(x))/sqrt(2)
+     * sin(x) +- cos(x) = -cos(2x)/(sin(x) -+ cos(x))
+     */
+    s = sin(x);
+    c = cos(x);
+    if y0 {
+        c = -c;
+    }
+    cc = s + c;
+    /* avoid overflow in 2*x, big ulp error when x>=0x1p1023 */
+    if ix < 0x7fe00000 {
+        ss = s - c;
+        z = -cos(2.0 * x);
+        if s * c < 0.0 {
+            cc = z / ss;
+        } else {
+            ss = z / cc;
+        }
+        if ix < 0x48000000 {
+            if y0 {
+                ss = -ss;
+            }
+            cc = pzero(x) * cc - qzero(x) * ss;
+        }
+    }
+    return INVSQRTPI * cc / sqrt(x);
+}
+
+/* R0/S0 on [0, 2.00] */
+const R02: f64 = 1.56249999999999947958e-02; /* 0x3F8FFFFF, 0xFFFFFFFD */
+const R03: f64 = -1.89979294238854721751e-04; /* 0xBF28E6A5, 0xB61AC6E9 */
+const R04: f64 = 1.82954049532700665670e-06; /* 0x3EBEB1D1, 0x0C503919 */
+const R05: f64 = -4.61832688532103189199e-09; /* 0xBE33D5E7, 0x73D63FCE */
+const S01: f64 = 1.56191029464890010492e-02; /* 0x3F8FFCE8, 0x82C8C2A4 */
+const S02: f64 = 1.16926784663337450260e-04; /* 0x3F1EA6D2, 0xDD57DBF4 */
+const S03: f64 = 5.13546550207318111446e-07; /* 0x3EA13B54, 0xCE84D5A9 */
+const S04: f64 = 1.16614003333790000205e-09; /* 0x3E1408BC, 0xF4745D8F */
+
+/// Zeroth order of the [Bessel function](https://en.wikipedia.org/wiki/Bessel_function) of the first kind (f64).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn j0(mut x: f64) -> f64 {
+    let z: f64;
+    let r: f64;
+    let s: f64;
+    let mut ix: u32;
+
+    ix = get_high_word(x);
+    ix &= 0x7fffffff;
+
+    /* j0(+-inf)=0, j0(nan)=nan */
+    if ix >= 0x7ff00000 {
+        return 1.0 / (x * x);
+    }
+    x = fabs(x);
+
+    if ix >= 0x40000000 {
+        /* |x| >= 2 */
+        /* large ulp error near zeros: 2.4, 5.52, 8.6537,.. */
+        return common(ix, x, false);
+    }
+
+    /* 1 - x*x/4 + x*x*R(x^2)/S(x^2) */
+    if ix >= 0x3f200000 {
+        /* |x| >= 2**-13 */
+        /* up to 4ulp error close to 2 */
+        z = x * x;
+        r = z * (R02 + z * (R03 + z * (R04 + z * R05)));
+        s = 1.0 + z * (S01 + z * (S02 + z * (S03 + z * S04)));
+        return (1.0 + x / 2.0) * (1.0 - x / 2.0) + z * (r / s);
+    }
+
+    /* 1 - x*x/4 */
+    /* prevent underflow */
+    /* inexact should be raised when x!=0, this is not done correctly */
+    if ix >= 0x38000000 {
+        /* |x| >= 2**-127 */
+        x = 0.25 * x * x;
+    }
+    return 1.0 - x;
+}
+
+const U00: f64 = -7.38042951086872317523e-02; /* 0xBFB2E4D6, 0x99CBD01F */
+const U01: f64 = 1.76666452509181115538e-01; /* 0x3FC69D01, 0x9DE9E3FC */
+const U02: f64 = -1.38185671945596898896e-02; /* 0xBF8C4CE8, 0xB16CFA97 */
+const U03: f64 = 3.47453432093683650238e-04; /* 0x3F36C54D, 0x20B29B6B */
+const U04: f64 = -3.81407053724364161125e-06; /* 0xBECFFEA7, 0x73D25CAD */
+const U05: f64 = 1.95590137035022920206e-08; /* 0x3E550057, 0x3B4EABD4 */
+const U06: f64 = -3.98205194132103398453e-11; /* 0xBDC5E43D, 0x693FB3C8 */
+const V01: f64 = 1.27304834834123699328e-02; /* 0x3F8A1270, 0x91C9C71A */
+const V02: f64 = 7.60068627350353253702e-05; /* 0x3F13ECBB, 0xF578C6C1 */
+const V03: f64 = 2.59150851840457805467e-07; /* 0x3E91642D, 0x7FF202FD */
+const V04: f64 = 4.41110311332675467403e-10; /* 0x3DFE5018, 0x3BD6D9EF */
+
+/// Zeroth order of the [Bessel function](https://en.wikipedia.org/wiki/Bessel_function) of the second kind (f64).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn y0(x: f64) -> f64 {
+    let z: f64;
+    let u: f64;
+    let v: f64;
+    let ix: u32;
+    let lx: u32;
+
+    ix = get_high_word(x);
+    lx = get_low_word(x);
+
+    /* y0(nan)=nan, y0(<0)=nan, y0(0)=-inf, y0(inf)=0 */
+    if ((ix << 1) | lx) == 0 {
+        return -1.0 / 0.0;
+    }
+    if (ix >> 31) != 0 {
+        return 0.0 / 0.0;
+    }
+    if ix >= 0x7ff00000 {
+        return 1.0 / x;
+    }
+
+    if ix >= 0x40000000 {
+        /* x >= 2 */
+        /* large ulp errors near zeros: 3.958, 7.086,.. */
+        return common(ix, x, true);
+    }
+
+    /* U(x^2)/V(x^2) + (2/pi)*j0(x)*log(x) */
+    if ix >= 0x3e400000 {
+        /* x >= 2**-27 */
+        /* large ulp error near the first zero, x ~= 0.89 */
+        z = x * x;
+        u = U00 + z * (U01 + z * (U02 + z * (U03 + z * (U04 + z * (U05 + z * U06)))));
+        v = 1.0 + z * (V01 + z * (V02 + z * (V03 + z * V04)));
+        return u / v + TPI * (j0(x) * log(x));
+    }
+    return U00 + TPI * log(x);
+}
+
+/* The asymptotic expansion of pzero is
+ *      1 - 9/128 s^2 + 11025/98304 s^4 - ...,  where s = 1/x.
+ * For x >= 2, we approximate pzero by
+ *      pzero(x) = 1 + (R/S)
+ * where  R = pR0 + pR1*s^2 + pR2*s^4 + ... + pR5*s^10
+ *        S = 1 + pS0*s^2 + ... + pS4*s^10
+ * and
+ *      | pzero(x)-1-R/S | <= 2  ** ( -60.26)
+ */
+const PR8: [f64; 6] = [
+    /* for x in [inf, 8]=1/[0,0.125] */
+    0.00000000000000000000e+00,  /* 0x00000000, 0x00000000 */
+    -7.03124999999900357484e-02, /* 0xBFB1FFFF, 0xFFFFFD32 */
+    -8.08167041275349795626e+00, /* 0xC02029D0, 0xB44FA779 */
+    -2.57063105679704847262e+02, /* 0xC0701102, 0x7B19E863 */
+    -2.48521641009428822144e+03, /* 0xC0A36A6E, 0xCD4DCAFC */
+    -5.25304380490729545272e+03, /* 0xC0B4850B, 0x36CC643D */
+];
+const PS8: [f64; 5] = [
+    1.16534364619668181717e+02, /* 0x405D2233, 0x07A96751 */
+    3.83374475364121826715e+03, /* 0x40ADF37D, 0x50596938 */
+    4.05978572648472545552e+04, /* 0x40E3D2BB, 0x6EB6B05F */
+    1.16752972564375915681e+05, /* 0x40FC810F, 0x8F9FA9BD */
+    4.76277284146730962675e+04, /* 0x40E74177, 0x4F2C49DC */
+];
+
+const PR5: [f64; 6] = [
+    /* for x in [8,4.5454]=1/[0.125,0.22001] */
+    -1.14125464691894502584e-11, /* 0xBDA918B1, 0x47E495CC */
+    -7.03124940873599280078e-02, /* 0xBFB1FFFF, 0xE69AFBC6 */
+    -4.15961064470587782438e+00, /* 0xC010A370, 0xF90C6BBF */
+    -6.76747652265167261021e+01, /* 0xC050EB2F, 0x5A7D1783 */
+    -3.31231299649172967747e+02, /* 0xC074B3B3, 0x6742CC63 */
+    -3.46433388365604912451e+02, /* 0xC075A6EF, 0x28A38BD7 */
+];
+const PS5: [f64; 5] = [
+    6.07539382692300335975e+01, /* 0x404E6081, 0x0C98C5DE */
+    1.05125230595704579173e+03, /* 0x40906D02, 0x5C7E2864 */
+    5.97897094333855784498e+03, /* 0x40B75AF8, 0x8FBE1D60 */
+    9.62544514357774460223e+03, /* 0x40C2CCB8, 0xFA76FA38 */
+    2.40605815922939109441e+03, /* 0x40A2CC1D, 0xC70BE864 */
+];
+
+const PR3: [f64; 6] = [
+    /* for x in [4.547,2.8571]=1/[0.2199,0.35001] */
+    -2.54704601771951915620e-09, /* 0xBE25E103, 0x6FE1AA86 */
+    -7.03119616381481654654e-02, /* 0xBFB1FFF6, 0xF7C0E24B */
+    -2.40903221549529611423e+00, /* 0xC00345B2, 0xAEA48074 */
+    -2.19659774734883086467e+01, /* 0xC035F74A, 0x4CB94E14 */
+    -5.80791704701737572236e+01, /* 0xC04D0A22, 0x420A1A45 */
+    -3.14479470594888503854e+01, /* 0xC03F72AC, 0xA892D80F */
+];
+const PS3: [f64; 5] = [
+    3.58560338055209726349e+01, /* 0x4041ED92, 0x84077DD3 */
+    3.61513983050303863820e+02, /* 0x40769839, 0x464A7C0E */
+    1.19360783792111533330e+03, /* 0x4092A66E, 0x6D1061D6 */
+    1.12799679856907414432e+03, /* 0x40919FFC, 0xB8C39B7E */
+    1.73580930813335754692e+02, /* 0x4065B296, 0xFC379081 */
+];
+
+const PR2: [f64; 6] = [
+    /* for x in [2.8570,2]=1/[0.3499,0.5] */
+    -8.87534333032526411254e-08, /* 0xBE77D316, 0xE927026D */
+    -7.03030995483624743247e-02, /* 0xBFB1FF62, 0x495E1E42 */
+    -1.45073846780952986357e+00, /* 0xBFF73639, 0x8A24A843 */
+    -7.63569613823527770791e+00, /* 0xC01E8AF3, 0xEDAFA7F3 */
+    -1.11931668860356747786e+01, /* 0xC02662E6, 0xC5246303 */
+    -3.23364579351335335033e+00, /* 0xC009DE81, 0xAF8FE70F */
+];
+const PS2: [f64; 5] = [
+    2.22202997532088808441e+01, /* 0x40363865, 0x908B5959 */
+    1.36206794218215208048e+02, /* 0x4061069E, 0x0EE8878F */
+    2.70470278658083486789e+02, /* 0x4070E786, 0x42EA079B */
+    1.53875394208320329881e+02, /* 0x40633C03, 0x3AB6FAFF */
+    1.46576176948256193810e+01, /* 0x402D50B3, 0x44391809 */
+];
+
+fn pzero(x: f64) -> f64 {
+    let p: &[f64; 6];
+    let q: &[f64; 5];
+    let z: f64;
+    let r: f64;
+    let s: f64;
+    let mut ix: u32;
+
+    ix = get_high_word(x);
+    ix &= 0x7fffffff;
+    if ix >= 0x40200000 {
+        p = &PR8;
+        q = &PS8;
+    } else if ix >= 0x40122E8B {
+        p = &PR5;
+        q = &PS5;
+    } else if ix >= 0x4006DB6D {
+        p = &PR3;
+        q = &PS3;
+    } else
+    /*ix >= 0x40000000*/
+    {
+        p = &PR2;
+        q = &PS2;
+    }
+    z = 1.0 / (x * x);
+    r = p[0] + z * (p[1] + z * (p[2] + z * (p[3] + z * (p[4] + z * p[5]))));
+    s = 1.0 + z * (q[0] + z * (q[1] + z * (q[2] + z * (q[3] + z * q[4]))));
+    return 1.0 + r / s;
+}
+
+/* For x >= 8, the asymptotic expansion of qzero is
+ *      -1/8 s + 75/1024 s^3 - ..., where s = 1/x.
+ * We approximate qzero by
+ *      qzero(x) = s*(-1.25 + (R/S))
+ * where  R = qR0 + qR1*s^2 + qR2*s^4 + ... + qR5*s^10
+ *        S = 1 + qS0*s^2 + ... + qS5*s^12
+ * and
+ *      | qzero(x)/s +1.25-R/S | <= 2  ** ( -61.22)
+ */
+const QR8: [f64; 6] = [
+    /* for x in [inf, 8]=1/[0,0.125] */
+    0.00000000000000000000e+00, /* 0x00000000, 0x00000000 */
+    7.32421874999935051953e-02, /* 0x3FB2BFFF, 0xFFFFFE2C */
+    1.17682064682252693899e+01, /* 0x40278952, 0x5BB334D6 */
+    5.57673380256401856059e+02, /* 0x40816D63, 0x15301825 */
+    8.85919720756468632317e+03, /* 0x40C14D99, 0x3E18F46D */
+    3.70146267776887834771e+04, /* 0x40E212D4, 0x0E901566 */
+];
+const QS8: [f64; 6] = [
+    1.63776026895689824414e+02,  /* 0x406478D5, 0x365B39BC */
+    8.09834494656449805916e+03,  /* 0x40BFA258, 0x4E6B0563 */
+    1.42538291419120476348e+05,  /* 0x41016652, 0x54D38C3F */
+    8.03309257119514397345e+05,  /* 0x412883DA, 0x83A52B43 */
+    8.40501579819060512818e+05,  /* 0x4129A66B, 0x28DE0B3D */
+    -3.43899293537866615225e+05, /* 0xC114FD6D, 0x2C9530C5 */
+];
+
+const QR5: [f64; 6] = [
+    /* for x in [8,4.5454]=1/[0.125,0.22001] */
+    1.84085963594515531381e-11, /* 0x3DB43D8F, 0x29CC8CD9 */
+    7.32421766612684765896e-02, /* 0x3FB2BFFF, 0xD172B04C */
+    5.83563508962056953777e+00, /* 0x401757B0, 0xB9953DD3 */
+    1.35111577286449829671e+02, /* 0x4060E392, 0x0A8788E9 */
+    1.02724376596164097464e+03, /* 0x40900CF9, 0x9DC8C481 */
+    1.98997785864605384631e+03, /* 0x409F17E9, 0x53C6E3A6 */
+];
+const QS5: [f64; 6] = [
+    8.27766102236537761883e+01,  /* 0x4054B1B3, 0xFB5E1543 */
+    2.07781416421392987104e+03,  /* 0x40A03BA0, 0xDA21C0CE */
+    1.88472887785718085070e+04,  /* 0x40D267D2, 0x7B591E6D */
+    5.67511122894947329769e+04,  /* 0x40EBB5E3, 0x97E02372 */
+    3.59767538425114471465e+04,  /* 0x40E19118, 0x1F7A54A0 */
+    -5.35434275601944773371e+03, /* 0xC0B4EA57, 0xBEDBC609 */
+];
+
+const QR3: [f64; 6] = [
+    /* for x in [4.547,2.8571]=1/[0.2199,0.35001] */
+    4.37741014089738620906e-09, /* 0x3E32CD03, 0x6ADECB82 */
+    7.32411180042911447163e-02, /* 0x3FB2BFEE, 0x0E8D0842 */
+    3.34423137516170720929e+00, /* 0x400AC0FC, 0x61149CF5 */
+    4.26218440745412650017e+01, /* 0x40454F98, 0x962DAEDD */
+    1.70808091340565596283e+02, /* 0x406559DB, 0xE25EFD1F */
+    1.66733948696651168575e+02, /* 0x4064D77C, 0x81FA21E0 */
+];
+const QS3: [f64; 6] = [
+    4.87588729724587182091e+01,  /* 0x40486122, 0xBFE343A6 */
+    7.09689221056606015736e+02,  /* 0x40862D83, 0x86544EB3 */
+    3.70414822620111362994e+03,  /* 0x40ACF04B, 0xE44DFC63 */
+    6.46042516752568917582e+03,  /* 0x40B93C6C, 0xD7C76A28 */
+    2.51633368920368957333e+03,  /* 0x40A3A8AA, 0xD94FB1C0 */
+    -1.49247451836156386662e+02, /* 0xC062A7EB, 0x201CF40F */
+];
+
+const QR2: [f64; 6] = [
+    /* for x in [2.8570,2]=1/[0.3499,0.5] */
+    1.50444444886983272379e-07, /* 0x3E84313B, 0x54F76BDB */
+    7.32234265963079278272e-02, /* 0x3FB2BEC5, 0x3E883E34 */
+    1.99819174093815998816e+00, /* 0x3FFFF897, 0xE727779C */
+    1.44956029347885735348e+01, /* 0x402CFDBF, 0xAAF96FE5 */
+    3.16662317504781540833e+01, /* 0x403FAA8E, 0x29FBDC4A */
+    1.62527075710929267416e+01, /* 0x403040B1, 0x71814BB4 */
+];
+const QS2: [f64; 6] = [
+    3.03655848355219184498e+01,  /* 0x403E5D96, 0xF7C07AED */
+    2.69348118608049844624e+02,  /* 0x4070D591, 0xE4D14B40 */
+    8.44783757595320139444e+02,  /* 0x408A6645, 0x22B3BF22 */
+    8.82935845112488550512e+02,  /* 0x408B977C, 0x9C5CC214 */
+    2.12666388511798828631e+02,  /* 0x406A9553, 0x0E001365 */
+    -5.31095493882666946917e+00, /* 0xC0153E6A, 0xF8B32931 */
+];
+
+fn qzero(x: f64) -> f64 {
+    let p: &[f64; 6];
+    let q: &[f64; 6];
+    let s: f64;
+    let r: f64;
+    let z: f64;
+    let mut ix: u32;
+
+    ix = get_high_word(x);
+    ix &= 0x7fffffff;
+    if ix >= 0x40200000 {
+        p = &QR8;
+        q = &QS8;
+    } else if ix >= 0x40122E8B {
+        p = &QR5;
+        q = &QS5;
+    } else if ix >= 0x4006DB6D {
+        p = &QR3;
+        q = &QS3;
+    } else
+    /*ix >= 0x40000000*/
+    {
+        p = &QR2;
+        q = &QS2;
+    }
+    z = 1.0 / (x * x);
+    r = p[0] + z * (p[1] + z * (p[2] + z * (p[3] + z * (p[4] + z * p[5]))));
+    s = 1.0 + z * (q[0] + z * (q[1] + z * (q[2] + z * (q[3] + z * (q[4] + z * q[5])))));
+    return (-0.125 + r / s) / x;
+}
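+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Editorial sketch, not part of the original port: the special cases listed in the header
+    /// comment: j0(nan)=nan, j0(0)=1, j0(inf)=0 and y0(0)=-inf, y0(x<0)=nan, y0(inf)=0.
+    #[test]
+    fn documented_special_cases() {
+        assert!(j0(f64::NAN).is_nan());
+        assert_eq!(j0(0.0), 1.0);
+        assert_eq!(j0(f64::INFINITY), 0.0);
+        assert_eq!(y0(0.0), f64::NEG_INFINITY);
+        assert!(y0(-1.0).is_nan());
+        assert_eq!(y0(f64::INFINITY), 0.0);
+    }
+}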
diff --git a/library/compiler-builtins/libm/src/math/j0f.rs b/library/compiler-builtins/libm/src/math/j0f.rs
new file mode 100644
index 00000000000..25e5b325c8c
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/j0f.rs
@@ -0,0 +1,363 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/e_j0f.c */
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
+ */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+use super::{cosf, fabsf, logf, sinf, sqrtf};
+
+const INVSQRTPI: f32 = 5.6418961287e-01; /* 0x3f106ebb */
+const TPI: f32 = 6.3661974669e-01; /* 0x3f22f983 */
+
+fn common(ix: u32, x: f32, y0: bool) -> f32 {
+    let z: f32;
+    let s: f32;
+    let mut c: f32;
+    let mut ss: f32;
+    let mut cc: f32;
+    /*
+     * j0(x) = 1/sqrt(pi) * (P(0,x)*cc - Q(0,x)*ss) / sqrt(x)
+     * y0(x) = 1/sqrt(pi) * (P(0,x)*ss + Q(0,x)*cc) / sqrt(x)
+     */
+    s = sinf(x);
+    c = cosf(x);
+    if y0 {
+        c = -c;
+    }
+    cc = s + c;
+    if ix < 0x7f000000 {
+        ss = s - c;
+        z = -cosf(2.0 * x);
+        if s * c < 0.0 {
+            cc = z / ss;
+        } else {
+            ss = z / cc;
+        }
+        if ix < 0x58800000 {
+            if y0 {
+                ss = -ss;
+            }
+            cc = pzerof(x) * cc - qzerof(x) * ss;
+        }
+    }
+    return INVSQRTPI * cc / sqrtf(x);
+}
+
+/* R0/S0 on [0, 2.00] */
+const R02: f32 = 1.5625000000e-02; /* 0x3c800000 */
+const R03: f32 = -1.8997929874e-04; /* 0xb947352e */
+const R04: f32 = 1.8295404516e-06; /* 0x35f58e88 */
+const R05: f32 = -4.6183270541e-09; /* 0xb19eaf3c */
+const S01: f32 = 1.5619102865e-02; /* 0x3c7fe744 */
+const S02: f32 = 1.1692678527e-04; /* 0x38f53697 */
+const S03: f32 = 5.1354652442e-07; /* 0x3509daa6 */
+const S04: f32 = 1.1661400734e-09; /* 0x30a045e8 */
+
+/// Zeroth order of the [Bessel function](https://en.wikipedia.org/wiki/Bessel_function) of the first kind (f32).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn j0f(mut x: f32) -> f32 {
+    let z: f32;
+    let r: f32;
+    let s: f32;
+    let mut ix: u32;
+
+    ix = x.to_bits();
+    ix &= 0x7fffffff;
+    if ix >= 0x7f800000 {
+        return 1.0 / (x * x);
+    }
+    x = fabsf(x);
+
+    if ix >= 0x40000000 {
+        /* |x| >= 2 */
+        /* large ulp error near zeros */
+        return common(ix, x, false);
+    }
+    if ix >= 0x3a000000 {
+        /* |x| >= 2**-11 */
+        /* up to 4ulp error near 2 */
+        z = x * x;
+        r = z * (R02 + z * (R03 + z * (R04 + z * R05)));
+        s = 1.0 + z * (S01 + z * (S02 + z * (S03 + z * S04)));
+        return (1.0 + x / 2.0) * (1.0 - x / 2.0) + z * (r / s);
+    }
+    if ix >= 0x21800000 {
+        /* |x| >= 2**-60 */
+        x = 0.25 * x * x;
+    }
+    return 1.0 - x;
+}
+
+const U00: f32 = -7.3804296553e-02; /* 0xbd9726b5 */
+const U01: f32 = 1.7666645348e-01; /* 0x3e34e80d */
+const U02: f32 = -1.3818567619e-02; /* 0xbc626746 */
+const U03: f32 = 3.4745343146e-04; /* 0x39b62a69 */
+const U04: f32 = -3.8140706238e-06; /* 0xb67ff53c */
+const U05: f32 = 1.9559013964e-08; /* 0x32a802ba */
+const U06: f32 = -3.9820518410e-11; /* 0xae2f21eb */
+const V01: f32 = 1.2730483897e-02; /* 0x3c509385 */
+const V02: f32 = 7.6006865129e-05; /* 0x389f65e0 */
+const V03: f32 = 2.5915085189e-07; /* 0x348b216c */
+const V04: f32 = 4.4111031494e-10; /* 0x2ff280c2 */
+
+/// Zeroth order of the [Bessel function](https://en.wikipedia.org/wiki/Bessel_function) of the second kind (f32).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn y0f(x: f32) -> f32 {
+    let z: f32;
+    let u: f32;
+    let v: f32;
+    let ix: u32;
+
+    ix = x.to_bits();
+    if (ix & 0x7fffffff) == 0 {
+        return -1.0 / 0.0;
+    }
+    if (ix >> 31) != 0 {
+        return 0.0 / 0.0;
+    }
+    if ix >= 0x7f800000 {
+        return 1.0 / x;
+    }
+    if ix >= 0x40000000 {
+        /* |x| >= 2.0 */
+        /* large ulp error near zeros */
+        return common(ix, x, true);
+    }
+    if ix >= 0x39000000 {
+        /* x >= 2**-13 */
+        /* large ulp error at x ~= 0.89 */
+        z = x * x;
+        u = U00 + z * (U01 + z * (U02 + z * (U03 + z * (U04 + z * (U05 + z * U06)))));
+        v = 1.0 + z * (V01 + z * (V02 + z * (V03 + z * V04)));
+        return u / v + TPI * (j0f(x) * logf(x));
+    }
+    return U00 + TPI * logf(x);
+}
+
+/* The asymptotic expansion of pzero is
+ *      1 - 9/128 s^2 + 11025/98304 s^4 - ...,  where s = 1/x.
+ * For x >= 2, we approximate pzero by
+ *      pzero(x) = 1 + (R/S)
+ * where  R = pR0 + pR1*s^2 + pR2*s^4 + ... + pR5*s^10
+ *        S = 1 + pS0*s^2 + ... + pS4*s^10
+ * and
+ *      | pzero(x)-1-R/S | <= 2  ** ( -60.26)
+ */
+const PR8: [f32; 6] = [
+    /* for x in [inf, 8]=1/[0,0.125] */
+    0.0000000000e+00,  /* 0x00000000 */
+    -7.0312500000e-02, /* 0xbd900000 */
+    -8.0816707611e+00, /* 0xc1014e86 */
+    -2.5706311035e+02, /* 0xc3808814 */
+    -2.4852163086e+03, /* 0xc51b5376 */
+    -5.2530439453e+03, /* 0xc5a4285a */
+];
+const PS8: [f32; 5] = [
+    1.1653436279e+02, /* 0x42e91198 */
+    3.8337448730e+03, /* 0x456f9beb */
+    4.0597855469e+04, /* 0x471e95db */
+    1.1675296875e+05, /* 0x47e4087c */
+    4.7627726562e+04, /* 0x473a0bba */
+];
+const PR5: [f32; 6] = [
+    /* for x in [8,4.5454]=1/[0.125,0.22001] */
+    -1.1412546255e-11, /* 0xad48c58a */
+    -7.0312492549e-02, /* 0xbd8fffff */
+    -4.1596107483e+00, /* 0xc0851b88 */
+    -6.7674766541e+01, /* 0xc287597b */
+    -3.3123129272e+02, /* 0xc3a59d9b */
+    -3.4643338013e+02, /* 0xc3ad3779 */
+];
+const PS5: [f32; 5] = [
+    6.0753936768e+01, /* 0x42730408 */
+    1.0512523193e+03, /* 0x44836813 */
+    5.9789707031e+03, /* 0x45bad7c4 */
+    9.6254453125e+03, /* 0x461665c8 */
+    2.4060581055e+03, /* 0x451660ee */
+];
+
+const PR3: [f32; 6] = [
+    /* for x in [4.547,2.8571]=1/[0.2199,0.35001] */
+    -2.5470459075e-09, /* 0xb12f081b */
+    -7.0311963558e-02, /* 0xbd8fffb8 */
+    -2.4090321064e+00, /* 0xc01a2d95 */
+    -2.1965976715e+01, /* 0xc1afba52 */
+    -5.8079170227e+01, /* 0xc2685112 */
+    -3.1447946548e+01, /* 0xc1fb9565 */
+];
+const PS3: [f32; 5] = [
+    3.5856033325e+01, /* 0x420f6c94 */
+    3.6151397705e+02, /* 0x43b4c1ca */
+    1.1936077881e+03, /* 0x44953373 */
+    1.1279968262e+03, /* 0x448cffe6 */
+    1.7358093262e+02, /* 0x432d94b8 */
+];
+
+const PR2: [f32; 6] = [
+    /* for x in [2.8570,2]=1/[0.3499,0.5] */
+    -8.8753431271e-08, /* 0xb3be98b7 */
+    -7.0303097367e-02, /* 0xbd8ffb12 */
+    -1.4507384300e+00, /* 0xbfb9b1cc */
+    -7.6356959343e+00, /* 0xc0f4579f */
+    -1.1193166733e+01, /* 0xc1331736 */
+    -3.2336456776e+00, /* 0xc04ef40d */
+];
+const PS2: [f32; 5] = [
+    2.2220300674e+01, /* 0x41b1c32d */
+    1.3620678711e+02, /* 0x430834f0 */
+    2.7047027588e+02, /* 0x43873c32 */
+    1.5387539673e+02, /* 0x4319e01a */
+    1.4657617569e+01, /* 0x416a859a */
+];
+
+fn pzerof(x: f32) -> f32 {
+    let p: &[f32; 6];
+    let q: &[f32; 5];
+    let z: f32;
+    let r: f32;
+    let s: f32;
+    let mut ix: u32;
+
+    ix = x.to_bits();
+    ix &= 0x7fffffff;
+    if ix >= 0x41000000 {
+        p = &PR8;
+        q = &PS8;
+    } else if ix >= 0x409173eb {
+        p = &PR5;
+        q = &PS5;
+    } else if ix >= 0x4036d917 {
+        p = &PR3;
+        q = &PS3;
+    } else
+    /*ix >= 0x40000000*/
+    {
+        p = &PR2;
+        q = &PS2;
+    }
+    z = 1.0 / (x * x);
+    r = p[0] + z * (p[1] + z * (p[2] + z * (p[3] + z * (p[4] + z * p[5]))));
+    s = 1.0 + z * (q[0] + z * (q[1] + z * (q[2] + z * (q[3] + z * q[4]))));
+    return 1.0 + r / s;
+}
+
+/* For x >= 8, the asymptotic expansion of qzero is
+ *      -1/8 s + 75/1024 s^3 - ..., where s = 1/x.
+ * We approximate qzero by
+ *      qzero(x) = s*(-1.25 + (R/S))
+ * where  R = qR0 + qR1*s^2 + qR2*s^4 + ... + qR5*s^10
+ *        S = 1 + qS0*s^2 + ... + qS5*s^12
+ * and
+ *      | qzero(x)/s +1.25-R/S | <= 2  ** ( -61.22)
+ */
+const QR8: [f32; 6] = [
+    /* for x in [inf, 8]=1/[0,0.125] */
+    0.0000000000e+00, /* 0x00000000 */
+    7.3242187500e-02, /* 0x3d960000 */
+    1.1768206596e+01, /* 0x413c4a93 */
+    5.5767340088e+02, /* 0x440b6b19 */
+    8.8591972656e+03, /* 0x460a6cca */
+    3.7014625000e+04, /* 0x471096a0 */
+];
+const QS8: [f32; 6] = [
+    1.6377603149e+02,  /* 0x4323c6aa */
+    8.0983447266e+03,  /* 0x45fd12c2 */
+    1.4253829688e+05,  /* 0x480b3293 */
+    8.0330925000e+05,  /* 0x49441ed4 */
+    8.4050156250e+05,  /* 0x494d3359 */
+    -3.4389928125e+05, /* 0xc8a7eb69 */
+];
+
+const QR5: [f32; 6] = [
+    /* for x in [8,4.5454]=1/[0.125,0.22001] */
+    1.8408595828e-11, /* 0x2da1ec79 */
+    7.3242180049e-02, /* 0x3d95ffff */
+    5.8356351852e+00, /* 0x40babd86 */
+    1.3511157227e+02, /* 0x43071c90 */
+    1.0272437744e+03, /* 0x448067cd */
+    1.9899779053e+03, /* 0x44f8bf4b */
+];
+const QS5: [f32; 6] = [
+    8.2776611328e+01,  /* 0x42a58da0 */
+    2.0778142090e+03,  /* 0x4501dd07 */
+    1.8847289062e+04,  /* 0x46933e94 */
+    5.6751113281e+04,  /* 0x475daf1d */
+    3.5976753906e+04,  /* 0x470c88c1 */
+    -5.3543427734e+03, /* 0xc5a752be */
+];
+
+const QR3: [f32; 6] = [
+    /* for x in [4.547,2.8571]=1/[0.2199,0.35001] */
+    4.3774099900e-09, /* 0x3196681b */
+    7.3241114616e-02, /* 0x3d95ff70 */
+    3.3442313671e+00, /* 0x405607e3 */
+    4.2621845245e+01, /* 0x422a7cc5 */
+    1.7080809021e+02, /* 0x432acedf */
+    1.6673394775e+02, /* 0x4326bbe4 */
+];
+const QS3: [f32; 6] = [
+    4.8758872986e+01,  /* 0x42430916 */
+    7.0968920898e+02,  /* 0x44316c1c */
+    3.7041481934e+03,  /* 0x4567825f */
+    6.4604252930e+03,  /* 0x45c9e367 */
+    2.5163337402e+03,  /* 0x451d4557 */
+    -1.4924745178e+02, /* 0xc3153f59 */
+];
+
+const QR2: [f32; 6] = [
+    /* for x in [2.8570,2]=1/[0.3499,0.5] */
+    1.5044444979e-07, /* 0x342189db */
+    7.3223426938e-02, /* 0x3d95f62a */
+    1.9981917143e+00, /* 0x3fffc4bf */
+    1.4495602608e+01, /* 0x4167edfd */
+    3.1666231155e+01, /* 0x41fd5471 */
+    1.6252708435e+01, /* 0x4182058c */
+];
+const QS2: [f32; 6] = [
+    3.0365585327e+01,  /* 0x41f2ecb8 */
+    2.6934811401e+02,  /* 0x4386ac8f */
+    8.4478375244e+02,  /* 0x44533229 */
+    8.8293585205e+02,  /* 0x445cbbe5 */
+    2.1266638184e+02,  /* 0x4354aa98 */
+    -5.3109550476e+00, /* 0xc0a9f358 */
+];
+
+fn qzerof(x: f32) -> f32 {
+    let p: &[f32; 6];
+    let q: &[f32; 6];
+    let s: f32;
+    let r: f32;
+    let z: f32;
+    let mut ix: u32;
+
+    ix = x.to_bits();
+    ix &= 0x7fffffff;
+    if ix >= 0x41000000 {
+        p = &QR8;
+        q = &QS8;
+    } else if ix >= 0x409173eb {
+        p = &QR5;
+        q = &QS5;
+    } else if ix >= 0x4036d917 {
+        p = &QR3;
+        q = &QS3;
+    } else
+    /*ix >= 0x40000000*/
+    {
+        p = &QR2;
+        q = &QS2;
+    }
+    z = 1.0 / (x * x);
+    r = p[0] + z * (p[1] + z * (p[2] + z * (p[3] + z * (p[4] + z * p[5]))));
+    s = 1.0 + z * (q[0] + z * (q[1] + z * (q[2] + z * (q[3] + z * (q[4] + z * q[5])))));
+    return (-0.125 + r / s) / x;
+}
diff --git a/library/compiler-builtins/libm/src/math/j1.rs b/library/compiler-builtins/libm/src/math/j1.rs
new file mode 100644
index 00000000000..9b604d9e46e
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/j1.rs
@@ -0,0 +1,418 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/e_j1.c */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunSoft, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+/* j1(x), y1(x)
+ * Bessel function of the first and second kinds of order one.
+ * Method -- j1(x):
+ *      1. For tiny x, we use j1(x) = x/2 - x^3/16 + x^5/384 - ...
+ *      2. Reduce x to |x| since j1(x)=-j1(-x),  and
+ *         for x in (0,2)
+ *              j1(x) = x/2 + x*z*R0/S0,  where z = x*x;
+ *         (precision:  |j1/x - 1/2 - R0/S0 |<2**-61.51 )
+ *         for x in (2,inf)
+ *              j1(x) = sqrt(2/(pi*x))*(p1(x)*cos(x1)-q1(x)*sin(x1))
+ *              y1(x) = sqrt(2/(pi*x))*(p1(x)*sin(x1)+q1(x)*cos(x1))
+ *         where x1 = x-3*pi/4. It is better to compute sin(x1),cos(x1)
+ *         as follows:
+ *              cos(x1) =  cos(x)cos(3pi/4)+sin(x)sin(3pi/4)
+ *                      =  1/sqrt(2) * (sin(x) - cos(x))
+ *              sin(x1) =  sin(x)cos(3pi/4)-cos(x)sin(3pi/4)
+ *                      = -1/sqrt(2) * (sin(x) + cos(x))
+ *         (To avoid cancellation, use
+ *              sin(x) +- cos(x) = -cos(2x)/(sin(x) -+ cos(x))
+ *          to compute the worse one.)
+ *
+ *      3. Special cases
+ *              j1(nan)= nan
+ *              j1(0) = 0
+ *              j1(inf) = 0
+ *
+ * Method -- y1(x):
+ *      1. screen out x<=0 cases: y1(0)=-inf, y1(x<0)=NaN
+ *      2. For x<2.
+ *         Since
+ *              y1(x) = 2/pi*(j1(x)*(ln(x/2)+Euler)-1/x-x/2+5/64*x^3-...)
+ *         therefore y1(x)-2/pi*j1(x)*ln(x)-1/x is an odd function.
+ *         We use the following function to approximate y1,
+ *              y1(x) = x*U(z)/V(z) + (2/pi)*(j1(x)*ln(x)-1/x), z= x^2
+ *         where for x in [0,2] (abs err less than 2**-65.89)
+ *              U(z) = U0[0] + U0[1]*z + ... + U0[4]*z^4
+ *              V(z) = 1  + v0[0]*z + ... + v0[4]*z^5
+ *         Note: For tiny x, 1/x dominates y1 and hence
+ *              y1(tiny) = -2/pi/tiny, (choose tiny<2**-54)
+ *      3. For x>=2.
+ *              y1(x) = sqrt(2/(pi*x))*(p1(x)*sin(x1)+q1(x)*cos(x1))
+ *         where x1 = x-3*pi/4. It is better to compute sin(x1),cos(x1)
+ *         by the method mentioned above.
+ */
+
+use super::{cos, fabs, get_high_word, get_low_word, log, sin, sqrt};
+
+const INVSQRTPI: f64 = 5.64189583547756279280e-01; /* 0x3FE20DD7, 0x50429B6D */
+const TPI: f64 = 6.36619772367581382433e-01; /* 0x3FE45F30, 0x6DC9C883 */
+
+fn common(ix: u32, x: f64, y1: bool, sign: bool) -> f64 {
+    let z: f64;
+    let mut s: f64;
+    let c: f64;
+    let mut ss: f64;
+    let mut cc: f64;
+
+    /*
+     * j1(x) = sqrt(2/(pi*x))*(p1(x)*cos(x-3pi/4)-q1(x)*sin(x-3pi/4))
+     * y1(x) = sqrt(2/(pi*x))*(p1(x)*sin(x-3pi/4)+q1(x)*cos(x-3pi/4))
+     *
+     * sin(x-3pi/4) = -(sin(x) + cos(x))/sqrt(2)
+     * cos(x-3pi/4) = (sin(x) - cos(x))/sqrt(2)
+     * sin(x) +- cos(x) = -cos(2x)/(sin(x) -+ cos(x))
+     */
+    s = sin(x);
+    if y1 {
+        s = -s;
+    }
+    c = cos(x);
+    cc = s - c;
+    if ix < 0x7fe00000 {
+        /* avoid overflow in 2*x */
+        ss = -s - c;
+        z = cos(2.0 * x);
+        if s * c > 0.0 {
+            cc = z / ss;
+        } else {
+            ss = z / cc;
+        }
+        if ix < 0x48000000 {
+            if y1 {
+                ss = -ss;
+            }
+            cc = pone(x) * cc - qone(x) * ss;
+        }
+    }
+    if sign {
+        cc = -cc;
+    }
+    return INVSQRTPI * cc / sqrt(x);
+}
+
+/* R0/S0 on [0,2] */
+const R00: f64 = -6.25000000000000000000e-02; /* 0xBFB00000, 0x00000000 */
+const R01: f64 = 1.40705666955189706048e-03; /* 0x3F570D9F, 0x98472C61 */
+const R02: f64 = -1.59955631084035597520e-05; /* 0xBEF0C5C6, 0xBA169668 */
+const R03: f64 = 4.96727999609584448412e-08; /* 0x3E6AAAFA, 0x46CA0BD9 */
+const S01: f64 = 1.91537599538363460805e-02; /* 0x3F939D0B, 0x12637E53 */
+const S02: f64 = 1.85946785588630915560e-04; /* 0x3F285F56, 0xB9CDF664 */
+const S03: f64 = 1.17718464042623683263e-06; /* 0x3EB3BFF8, 0x333F8498 */
+const S04: f64 = 5.04636257076217042715e-09; /* 0x3E35AC88, 0xC97DFF2C */
+const S05: f64 = 1.23542274426137913908e-11; /* 0x3DAB2ACF, 0xCFB97ED8 */
+
+/// First order of the [Bessel function](https://en.wikipedia.org/wiki/Bessel_function) of the first kind (f64).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn j1(x: f64) -> f64 {
+    let mut z: f64;
+    let r: f64;
+    let s: f64;
+    let mut ix: u32;
+    let sign: bool;
+
+    ix = get_high_word(x);
+    sign = (ix >> 31) != 0;
+    ix &= 0x7fffffff;
+    if ix >= 0x7ff00000 {
+        return 1.0 / (x * x);
+    }
+    if ix >= 0x40000000 {
+        /* |x| >= 2 */
+        return common(ix, fabs(x), false, sign);
+    }
+    if ix >= 0x38000000 {
+        /* |x| >= 2**-127 */
+        z = x * x;
+        r = z * (R00 + z * (R01 + z * (R02 + z * R03)));
+        s = 1.0 + z * (S01 + z * (S02 + z * (S03 + z * (S04 + z * S05))));
+        z = r / s;
+    } else {
+        /* avoid underflow, raise inexact if x!=0 */
+        z = x;
+    }
+    return (0.5 + z) * x;
+}
+
+const U0: [f64; 5] = [
+    -1.96057090646238940668e-01, /* 0xBFC91866, 0x143CBC8A */
+    5.04438716639811282616e-02,  /* 0x3FA9D3C7, 0x76292CD1 */
+    -1.91256895875763547298e-03, /* 0xBF5F55E5, 0x4844F50F */
+    2.35252600561610495928e-05,  /* 0x3EF8AB03, 0x8FA6B88E */
+    -9.19099158039878874504e-08, /* 0xBE78AC00, 0x569105B8 */
+];
+const V0: [f64; 5] = [
+    1.99167318236649903973e-02, /* 0x3F94650D, 0x3F4DA9F0 */
+    2.02552581025135171496e-04, /* 0x3F2A8C89, 0x6C257764 */
+    1.35608801097516229404e-06, /* 0x3EB6C05A, 0x894E8CA6 */
+    6.22741452364621501295e-09, /* 0x3E3ABF1D, 0x5BA69A86 */
+    1.66559246207992079114e-11, /* 0x3DB25039, 0xDACA772A */
+];
+
+/// First order of the [Bessel function](https://en.wikipedia.org/wiki/Bessel_function) of the second kind (f64).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn y1(x: f64) -> f64 {
+    let z: f64;
+    let u: f64;
+    let v: f64;
+    let ix: u32;
+    let lx: u32;
+
+    ix = get_high_word(x);
+    lx = get_low_word(x);
+
+    /* y1(nan)=nan, y1(<0)=nan, y1(0)=-inf, y1(inf)=0 */
+    if (ix << 1) | lx == 0 {
+        return -1.0 / 0.0;
+    }
+    if ix >> 31 != 0 {
+        return 0.0 / 0.0;
+    }
+    if ix >= 0x7ff00000 {
+        return 1.0 / x;
+    }
+
+    if ix >= 0x40000000 {
+        /* x >= 2 */
+        return common(ix, x, true, false);
+    }
+    if ix < 0x3c900000 {
+        /* x < 2**-54 */
+        return -TPI / x;
+    }
+    z = x * x;
+    u = U0[0] + z * (U0[1] + z * (U0[2] + z * (U0[3] + z * U0[4])));
+    v = 1.0 + z * (V0[0] + z * (V0[1] + z * (V0[2] + z * (V0[3] + z * V0[4]))));
+    return x * (u / v) + TPI * (j1(x) * log(x) - 1.0 / x);
+}
+
+/* For x >= 8, the asymptotic expansion of pone is
+ *      1 + 15/128 s^2 - 4725/2^15 s^4 - ...,   where s = 1/x.
+ * We approximate pone by
+ *      pone(x) = 1 + (R/S)
+ * where  R = pr0 + pr1*s^2 + pr2*s^4 + ... + pr5*s^10
+ *        S = 1 + ps0*s^2 + ... + ps4*s^10
+ * and
+ *      | pone(x)-1-R/S | <= 2  ** ( -60.06)
+ */
+
+const PR8: [f64; 6] = [
+    /* for x in [inf, 8]=1/[0,0.125] */
+    0.00000000000000000000e+00, /* 0x00000000, 0x00000000 */
+    1.17187499999988647970e-01, /* 0x3FBDFFFF, 0xFFFFFCCE */
+    1.32394806593073575129e+01, /* 0x402A7A9D, 0x357F7FCE */
+    4.12051854307378562225e+02, /* 0x4079C0D4, 0x652EA590 */
+    3.87474538913960532227e+03, /* 0x40AE457D, 0xA3A532CC */
+    7.91447954031891731574e+03, /* 0x40BEEA7A, 0xC32782DD */
+];
+const PS8: [f64; 5] = [
+    1.14207370375678408436e+02, /* 0x405C8D45, 0x8E656CAC */
+    3.65093083420853463394e+03, /* 0x40AC85DC, 0x964D274F */
+    3.69562060269033463555e+04, /* 0x40E20B86, 0x97C5BB7F */
+    9.76027935934950801311e+04, /* 0x40F7D42C, 0xB28F17BB */
+    3.08042720627888811578e+04, /* 0x40DE1511, 0x697A0B2D */
+];
+
+const PR5: [f64; 6] = [
+    /* for x in [8,4.5454]=1/[0.125,0.22001] */
+    1.31990519556243522749e-11, /* 0x3DAD0667, 0xDAE1CA7D */
+    1.17187493190614097638e-01, /* 0x3FBDFFFF, 0xE2C10043 */
+    6.80275127868432871736e+00, /* 0x401B3604, 0x6E6315E3 */
+    1.08308182990189109773e+02, /* 0x405B13B9, 0x452602ED */
+    5.17636139533199752805e+02, /* 0x40802D16, 0xD052D649 */
+    5.28715201363337541807e+02, /* 0x408085B8, 0xBB7E0CB7 */
+];
+const PS5: [f64; 5] = [
+    5.92805987221131331921e+01, /* 0x404DA3EA, 0xA8AF633D */
+    9.91401418733614377743e+02, /* 0x408EFB36, 0x1B066701 */
+    5.35326695291487976647e+03, /* 0x40B4E944, 0x5706B6FB */
+    7.84469031749551231769e+03, /* 0x40BEA4B0, 0xB8A5BB15 */
+    1.50404688810361062679e+03, /* 0x40978030, 0x036F5E51 */
+];
+
+const PR3: [f64; 6] = [
+    3.02503916137373618024e-09, /* 0x3E29FC21, 0xA7AD9EDD */
+    1.17186865567253592491e-01, /* 0x3FBDFFF5, 0x5B21D17B */
+    3.93297750033315640650e+00, /* 0x400F76BC, 0xE85EAD8A */
+    3.51194035591636932736e+01, /* 0x40418F48, 0x9DA6D129 */
+    9.10550110750781271918e+01, /* 0x4056C385, 0x4D2C1837 */
+    4.85590685197364919645e+01, /* 0x4048478F, 0x8EA83EE5 */
+];
+const PS3: [f64; 5] = [
+    3.47913095001251519989e+01, /* 0x40416549, 0xA134069C */
+    3.36762458747825746741e+02, /* 0x40750C33, 0x07F1A75F */
+    1.04687139975775130551e+03, /* 0x40905B7C, 0x5037D523 */
+    8.90811346398256432622e+02, /* 0x408BD67D, 0xA32E31E9 */
+    1.03787932439639277504e+02, /* 0x4059F26D, 0x7C2EED53 */
+];
+
+const PR2: [f64; 6] = [
+    /* for x in [2.8570,2]=1/[0.3499,0.5] */
+    1.07710830106873743082e-07, /* 0x3E7CE9D4, 0xF65544F4 */
+    1.17176219462683348094e-01, /* 0x3FBDFF42, 0xBE760D83 */
+    2.36851496667608785174e+00, /* 0x4002F2B7, 0xF98FAEC0 */
+    1.22426109148261232917e+01, /* 0x40287C37, 0x7F71A964 */
+    1.76939711271687727390e+01, /* 0x4031B1A8, 0x177F8EE2 */
+    5.07352312588818499250e+00, /* 0x40144B49, 0xA574C1FE */
+];
+const PS2: [f64; 5] = [
+    2.14364859363821409488e+01, /* 0x40356FBD, 0x8AD5ECDC */
+    1.25290227168402751090e+02, /* 0x405F5293, 0x14F92CD5 */
+    2.32276469057162813669e+02, /* 0x406D08D8, 0xD5A2DBD9 */
+    1.17679373287147100768e+02, /* 0x405D6B7A, 0xDA1884A9 */
+    8.36463893371618283368e+00, /* 0x4020BAB1, 0xF44E5192 */
+];
+
+fn pone(x: f64) -> f64 {
+    let p: &[f64; 6];
+    let q: &[f64; 5];
+    let z: f64;
+    let r: f64;
+    let s: f64;
+    let mut ix: u32;
+
+    ix = get_high_word(x);
+    ix &= 0x7fffffff;
+    if ix >= 0x40200000 {
+        p = &PR8;
+        q = &PS8;
+    } else if ix >= 0x40122E8B {
+        p = &PR5;
+        q = &PS5;
+    } else if ix >= 0x4006DB6D {
+        p = &PR3;
+        q = &PS3;
+    } else
+    /*ix >= 0x40000000*/
+    {
+        p = &PR2;
+        q = &PS2;
+    }
+    z = 1.0 / (x * x);
+    r = p[0] + z * (p[1] + z * (p[2] + z * (p[3] + z * (p[4] + z * p[5]))));
+    s = 1.0 + z * (q[0] + z * (q[1] + z * (q[2] + z * (q[3] + z * q[4]))));
+    return 1.0 + r / s;
+}
+
+/* For x >= 8, the asymptotic expansion of qone is
+ *      3/8 s - 105/1024 s^3 - ..., where s = 1/x.
+ * We approximate qone by
+ *      qone(x) = s*(0.375 + (R/S))
+ * where  R = qr1*s^2 + qr2*s^4 + ... + qr5*s^10
+ *        S = 1 + qs1*s^2 + ... + qs6*s^12
+ * and
+ *      | qone(x)/s -0.375-R/S | <= 2  ** ( -61.13)
+ */
+
+const QR8: [f64; 6] = [
+    /* for x in [inf, 8]=1/[0,0.125] */
+    0.00000000000000000000e+00,  /* 0x00000000, 0x00000000 */
+    -1.02539062499992714161e-01, /* 0xBFBA3FFF, 0xFFFFFDF3 */
+    -1.62717534544589987888e+01, /* 0xC0304591, 0xA26779F7 */
+    -7.59601722513950107896e+02, /* 0xC087BCD0, 0x53E4B576 */
+    -1.18498066702429587167e+04, /* 0xC0C724E7, 0x40F87415 */
+    -4.84385124285750353010e+04, /* 0xC0E7A6D0, 0x65D09C6A */
+];
+const QS8: [f64; 6] = [
+    1.61395369700722909556e+02,  /* 0x40642CA6, 0xDE5BCDE5 */
+    7.82538599923348465381e+03,  /* 0x40BE9162, 0xD0D88419 */
+    1.33875336287249578163e+05,  /* 0x4100579A, 0xB0B75E98 */
+    7.19657723683240939863e+05,  /* 0x4125F653, 0x72869C19 */
+    6.66601232617776375264e+05,  /* 0x412457D2, 0x7719AD5C */
+    -2.94490264303834643215e+05, /* 0xC111F969, 0x0EA5AA18 */
+];
+
+const QR5: [f64; 6] = [
+    /* for x in [8,4.5454]=1/[0.125,0.22001] */
+    -2.08979931141764104297e-11, /* 0xBDB6FA43, 0x1AA1A098 */
+    -1.02539050241375426231e-01, /* 0xBFBA3FFF, 0xCB597FEF */
+    -8.05644828123936029840e+00, /* 0xC0201CE6, 0xCA03AD4B */
+    -1.83669607474888380239e+02, /* 0xC066F56D, 0x6CA7B9B0 */
+    -1.37319376065508163265e+03, /* 0xC09574C6, 0x6931734F */
+    -2.61244440453215656817e+03, /* 0xC0A468E3, 0x88FDA79D */
+];
+const QS5: [f64; 6] = [
+    8.12765501384335777857e+01,  /* 0x405451B2, 0xFF5A11B2 */
+    1.99179873460485964642e+03,  /* 0x409F1F31, 0xE77BF839 */
+    1.74684851924908907677e+04,  /* 0x40D10F1F, 0x0D64CE29 */
+    4.98514270910352279316e+04,  /* 0x40E8576D, 0xAABAD197 */
+    2.79480751638918118260e+04,  /* 0x40DB4B04, 0xCF7C364B */
+    -4.71918354795128470869e+03, /* 0xC0B26F2E, 0xFCFFA004 */
+];
+
+const QR3: [f64; 6] = [
+    -5.07831226461766561369e-09, /* 0xBE35CFA9, 0xD38FC84F */
+    -1.02537829820837089745e-01, /* 0xBFBA3FEB, 0x51AEED54 */
+    -4.61011581139473403113e+00, /* 0xC01270C2, 0x3302D9FF */
+    -5.78472216562783643212e+01, /* 0xC04CEC71, 0xC25D16DA */
+    -2.28244540737631695038e+02, /* 0xC06C87D3, 0x4718D55F */
+    -2.19210128478909325622e+02, /* 0xC06B66B9, 0x5F5C1BF6 */
+];
+const QS3: [f64; 6] = [
+    4.76651550323729509273e+01,  /* 0x4047D523, 0xCCD367E4 */
+    6.73865112676699709482e+02,  /* 0x40850EEB, 0xC031EE3E */
+    3.38015286679526343505e+03,  /* 0x40AA684E, 0x448E7C9A */
+    5.54772909720722782367e+03,  /* 0x40B5ABBA, 0xA61D54A6 */
+    1.90311919338810798763e+03,  /* 0x409DBC7A, 0x0DD4DF4B */
+    -1.35201191444307340817e+02, /* 0xC060E670, 0x290A311F */
+];
+
+const QR2: [f64; 6] = [
+    /* for x in [2.8570,2]=1/[0.3499,0.5] */
+    -1.78381727510958865572e-07, /* 0xBE87F126, 0x44C626D2 */
+    -1.02517042607985553460e-01, /* 0xBFBA3E8E, 0x9148B010 */
+    -2.75220568278187460720e+00, /* 0xC0060484, 0x69BB4EDA */
+    -1.96636162643703720221e+01, /* 0xC033A9E2, 0xC168907F */
+    -4.23253133372830490089e+01, /* 0xC04529A3, 0xDE104AAA */
+    -2.13719211703704061733e+01, /* 0xC0355F36, 0x39CF6E52 */
+];
+const QS2: [f64; 6] = [
+    2.95333629060523854548e+01,  /* 0x403D888A, 0x78AE64FF */
+    2.52981549982190529136e+02,  /* 0x406F9F68, 0xDB821CBA */
+    7.57502834868645436472e+02,  /* 0x4087AC05, 0xCE49A0F7 */
+    7.39393205320467245656e+02,  /* 0x40871B25, 0x48D4C029 */
+    1.55949003336666123687e+02,  /* 0x40637E5E, 0x3C3ED8D4 */
+    -4.95949898822628210127e+00, /* 0xC013D686, 0xE71BE86B */
+];
+
+fn qone(x: f64) -> f64 {
+    let p: &[f64; 6];
+    let q: &[f64; 6];
+    let s: f64;
+    let r: f64;
+    let z: f64;
+    let mut ix: u32;
+
+    ix = get_high_word(x);
+    ix &= 0x7fffffff;
+    if ix >= 0x40200000 {
+        p = &QR8;
+        q = &QS8;
+    } else if ix >= 0x40122E8B {
+        p = &QR5;
+        q = &QS5;
+    } else if ix >= 0x4006DB6D {
+        p = &QR3;
+        q = &QS3;
+    } else
+    /*ix >= 0x40000000*/
+    {
+        p = &QR2;
+        q = &QS2;
+    }
+    z = 1.0 / (x * x);
+    r = p[0] + z * (p[1] + z * (p[2] + z * (p[3] + z * (p[4] + z * p[5]))));
+    s = 1.0 + z * (q[0] + z * (q[1] + z * (q[2] + z * (q[3] + z * (q[4] + z * q[5])))));
+    return (0.375 + r / s) / x;
+}
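+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Editorial sketch, not part of the original port: the special cases from the header
+    /// comment: j1(nan)=nan, j1(0)=0, j1(inf)=0 and the y1 screening of x<=0.
+    #[test]
+    fn documented_special_cases() {
+        assert!(j1(f64::NAN).is_nan());
+        assert_eq!(j1(0.0), 0.0);
+        assert_eq!(j1(f64::INFINITY), 0.0);
+        assert_eq!(y1(0.0), f64::NEG_INFINITY);
+        assert!(y1(-1.0).is_nan());
+        assert_eq!(y1(f64::INFINITY), 0.0);
+    }
+}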
diff --git a/library/compiler-builtins/libm/src/math/j1f.rs b/library/compiler-builtins/libm/src/math/j1f.rs
new file mode 100644
index 00000000000..a47472401ee
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/j1f.rs
@@ -0,0 +1,384 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/e_j1f.c */
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
+ */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+use super::{cosf, fabsf, logf, sinf, sqrtf};
+
+const INVSQRTPI: f32 = 5.6418961287e-01; /* 0x3f106ebb */
+const TPI: f32 = 6.3661974669e-01; /* 0x3f22f983 */
+
+fn common(ix: u32, x: f32, y1: bool, sign: bool) -> f32 {
+    let z: f64;
+    let mut s: f64;
+    let c: f64;
+    let mut ss: f64;
+    let mut cc: f64;
+
+    s = sinf(x) as f64;
+    if y1 {
+        s = -s;
+    }
+    c = cosf(x) as f64;
+    cc = s - c;
+    if ix < 0x7f000000 {
+        ss = -s - c;
+        z = cosf(2.0 * x) as f64;
+        if s * c > 0.0 {
+            cc = z / ss;
+        } else {
+            ss = z / cc;
+        }
+        if ix < 0x58800000 {
+            if y1 {
+                ss = -ss;
+            }
+            cc = (ponef(x) as f64) * cc - (qonef(x) as f64) * ss;
+        }
+    }
+    if sign {
+        cc = -cc;
+    }
+    return (((INVSQRTPI as f64) * cc) / (sqrtf(x) as f64)) as f32;
+}
+
+/* R0/S0 on [0,2] */
+const R00: f32 = -6.2500000000e-02; /* 0xbd800000 */
+const R01: f32 = 1.4070566976e-03; /* 0x3ab86cfd */
+const R02: f32 = -1.5995563444e-05; /* 0xb7862e36 */
+const R03: f32 = 4.9672799207e-08; /* 0x335557d2 */
+const S01: f32 = 1.9153760746e-02; /* 0x3c9ce859 */
+const S02: f32 = 1.8594678841e-04; /* 0x3942fab6 */
+const S03: f32 = 1.1771846857e-06; /* 0x359dffc2 */
+const S04: f32 = 5.0463624390e-09; /* 0x31ad6446 */
+const S05: f32 = 1.2354227016e-11; /* 0x2d59567e */
+
+/// First order of the [Bessel function](https://en.wikipedia.org/wiki/Bessel_function) of the first kind (f32).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn j1f(x: f32) -> f32 {
+    let mut z: f32;
+    let r: f32;
+    let s: f32;
+    let mut ix: u32;
+    let sign: bool;
+
+    ix = x.to_bits();
+    sign = (ix >> 31) != 0;
+    ix &= 0x7fffffff;
+    if ix >= 0x7f800000 {
+        return 1.0 / (x * x);
+    }
+    if ix >= 0x40000000 {
+        /* |x| >= 2 */
+        return common(ix, fabsf(x), false, sign);
+    }
+    if ix >= 0x39000000 {
+        /* |x| >= 2**-13 */
+        z = x * x;
+        r = z * (R00 + z * (R01 + z * (R02 + z * R03)));
+        s = 1.0 + z * (S01 + z * (S02 + z * (S03 + z * (S04 + z * S05))));
+        z = 0.5 + r / s;
+    } else {
+        z = 0.5;
+    }
+    return z * x;
+}
+
+const U0: [f32; 5] = [
+    -1.9605709612e-01, /* 0xbe48c331 */
+    5.0443872809e-02,  /* 0x3d4e9e3c */
+    -1.9125689287e-03, /* 0xbafaaf2a */
+    2.3525259166e-05,  /* 0x37c5581c */
+    -9.1909917899e-08, /* 0xb3c56003 */
+];
+const V0: [f32; 5] = [
+    1.9916731864e-02, /* 0x3ca3286a */
+    2.0255257550e-04, /* 0x3954644b */
+    1.3560879779e-06, /* 0x35b602d4 */
+    6.2274145840e-09, /* 0x31d5f8eb */
+    1.6655924903e-11, /* 0x2d9281cf */
+];
+
+/// First order of the [Bessel function](https://en.wikipedia.org/wiki/Bessel_function) of the second kind (f32).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn y1f(x: f32) -> f32 {
+    let z: f32;
+    let u: f32;
+    let v: f32;
+    let ix: u32;
+
+    ix = x.to_bits();
+    if (ix & 0x7fffffff) == 0 {
+        return -1.0 / 0.0;
+    }
+    if (ix >> 31) != 0 {
+        return 0.0 / 0.0;
+    }
+    if ix >= 0x7f800000 {
+        return 1.0 / x;
+    }
+    if ix >= 0x40000000 {
+        /* |x| >= 2.0 */
+        return common(ix, x, true, false);
+    }
+    if ix < 0x33000000 {
+        /* x < 2**-25 */
+        return -TPI / x;
+    }
+    z = x * x;
+    u = U0[0] + z * (U0[1] + z * (U0[2] + z * (U0[3] + z * U0[4])));
+    v = 1.0 + z * (V0[0] + z * (V0[1] + z * (V0[2] + z * (V0[3] + z * V0[4]))));
+    return x * (u / v) + TPI * (j1f(x) * logf(x) - 1.0 / x);
+}
+
+/* For x >= 8, the asymptotic expansion of pone is
+ *      1 + 15/128 s^2 - 4725/2^15 s^4 - ...,   where s = 1/x.
+ * We approximate pone by
+ *      pone(x) = 1 + (R/S)
+ * where  R = pr0 + pr1*s^2 + pr2*s^4 + ... + pr5*s^10
+ *        S = 1 + ps0*s^2 + ... + ps4*s^10
+ * and
+ *      | pone(x)-1-R/S | <= 2**(-60.06)
+ */
+
+const PR8: [f32; 6] = [
+    /* for x in [inf, 8]=1/[0,0.125] */
+    0.0000000000e+00, /* 0x00000000 */
+    1.1718750000e-01, /* 0x3df00000 */
+    1.3239480972e+01, /* 0x4153d4ea */
+    4.1205184937e+02, /* 0x43ce06a3 */
+    3.8747453613e+03, /* 0x45722bed */
+    7.9144794922e+03, /* 0x45f753d6 */
+];
+const PS8: [f32; 5] = [
+    1.1420736694e+02, /* 0x42e46a2c */
+    3.6509309082e+03, /* 0x45642ee5 */
+    3.6956207031e+04, /* 0x47105c35 */
+    9.7602796875e+04, /* 0x47bea166 */
+    3.0804271484e+04, /* 0x46f0a88b */
+];
+
+const PR5: [f32; 6] = [
+    /* for x in [8,4.5454]=1/[0.125,0.22001] */
+    1.3199052094e-11, /* 0x2d68333f */
+    1.1718749255e-01, /* 0x3defffff */
+    6.8027510643e+00, /* 0x40d9b023 */
+    1.0830818176e+02, /* 0x42d89dca */
+    5.1763616943e+02, /* 0x440168b7 */
+    5.2871520996e+02, /* 0x44042dc6 */
+];
+const PS5: [f32; 5] = [
+    5.9280597687e+01, /* 0x426d1f55 */
+    9.9140142822e+02, /* 0x4477d9b1 */
+    5.3532670898e+03, /* 0x45a74a23 */
+    7.8446904297e+03, /* 0x45f52586 */
+    1.5040468750e+03, /* 0x44bc0180 */
+];
+
+const PR3: [f32; 6] = [
+    3.0250391081e-09, /* 0x314fe10d */
+    1.1718686670e-01, /* 0x3defffab */
+    3.9329774380e+00, /* 0x407bb5e7 */
+    3.5119403839e+01, /* 0x420c7a45 */
+    9.1055007935e+01, /* 0x42b61c2a */
+    4.8559066772e+01, /* 0x42423c7c */
+];
+const PS3: [f32; 5] = [
+    3.4791309357e+01, /* 0x420b2a4d */
+    3.3676245117e+02, /* 0x43a86198 */
+    1.0468714600e+03, /* 0x4482dbe3 */
+    8.9081134033e+02, /* 0x445eb3ed */
+    1.0378793335e+02, /* 0x42cf936c */
+];
+
+const PR2: [f32; 6] = [
+    /* for x in [2.8570,2]=1/[0.3499,0.5] */
+    1.0771083225e-07, /* 0x33e74ea8 */
+    1.1717621982e-01, /* 0x3deffa16 */
+    2.3685150146e+00, /* 0x401795c0 */
+    1.2242610931e+01, /* 0x4143e1bc */
+    1.7693971634e+01, /* 0x418d8d41 */
+    5.0735230446e+00, /* 0x40a25a4d */
+];
+const PS2: [f32; 5] = [
+    2.1436485291e+01, /* 0x41ab7dec */
+    1.2529022980e+02, /* 0x42fa9499 */
+    2.3227647400e+02, /* 0x436846c7 */
+    1.1767937469e+02, /* 0x42eb5bd7 */
+    8.3646392822e+00, /* 0x4105d590 */
+];
+
+fn ponef(x: f32) -> f32 {
+    let p: &[f32; 6];
+    let q: &[f32; 5];
+    let z: f32;
+    let r: f32;
+    let s: f32;
+    let mut ix: u32;
+
+    ix = x.to_bits();
+    ix &= 0x7fffffff;
+    if ix >= 0x41000000 {
+        p = &PR8;
+        q = &PS8;
+    } else if ix >= 0x409173eb {
+        p = &PR5;
+        q = &PS5;
+    } else if ix >= 0x4036d917 {
+        p = &PR3;
+        q = &PS3;
+    } else
+    /*ix >= 0x40000000*/
+    {
+        p = &PR2;
+        q = &PS2;
+    }
+    z = 1.0 / (x * x);
+    r = p[0] + z * (p[1] + z * (p[2] + z * (p[3] + z * (p[4] + z * p[5]))));
+    s = 1.0 + z * (q[0] + z * (q[1] + z * (q[2] + z * (q[3] + z * q[4]))));
+    return 1.0 + r / s;
+}
+
+/* For x >= 8, the asymptotic expansion of qone is
+ *      3/8 s - 105/1024 s^3 - ..., where s = 1/x.
+ * We approximate qone by
+ *      qone(x) = s*(0.375 + (R/S))
+ * where  R = qr1*s^2 + qr2*s^4 + ... + qr5*s^10
+ *        S = 1 + qs1*s^2 + ... + qs6*s^12
+ * and
+ *      | qone(x)/s -0.375-R/S | <= 2**(-61.13)
+ */
+
+const QR8: [f32; 6] = [
+    /* for x in [inf, 8]=1/[0,0.125] */
+    0.0000000000e+00,  /* 0x00000000 */
+    -1.0253906250e-01, /* 0xbdd20000 */
+    -1.6271753311e+01, /* 0xc1822c8d */
+    -7.5960174561e+02, /* 0xc43de683 */
+    -1.1849806641e+04, /* 0xc639273a */
+    -4.8438511719e+04, /* 0xc73d3683 */
+];
+const QS8: [f32; 6] = [
+    1.6139537048e+02,  /* 0x43216537 */
+    7.8253862305e+03,  /* 0x45f48b17 */
+    1.3387534375e+05,  /* 0x4802bcd6 */
+    7.1965775000e+05,  /* 0x492fb29c */
+    6.6660125000e+05,  /* 0x4922be94 */
+    -2.9449025000e+05, /* 0xc88fcb48 */
+];
+
+const QR5: [f32; 6] = [
+    /* for x in [8,4.5454]=1/[0.125,0.22001] */
+    -2.0897993405e-11, /* 0xadb7d219 */
+    -1.0253904760e-01, /* 0xbdd1fffe */
+    -8.0564479828e+00, /* 0xc100e736 */
+    -1.8366960144e+02, /* 0xc337ab6b */
+    -1.3731937256e+03, /* 0xc4aba633 */
+    -2.6124443359e+03, /* 0xc523471c */
+];
+const QS5: [f32; 6] = [
+    8.1276550293e+01,  /* 0x42a28d98 */
+    1.9917987061e+03,  /* 0x44f8f98f */
+    1.7468484375e+04,  /* 0x468878f8 */
+    4.9851425781e+04,  /* 0x4742bb6d */
+    2.7948074219e+04,  /* 0x46da5826 */
+    -4.7191835938e+03, /* 0xc5937978 */
+];
+
+const QR3: [f32; 6] = [
+    -5.0783124372e-09, /* 0xb1ae7d4f */
+    -1.0253783315e-01, /* 0xbdd1ff5b */
+    -4.6101160049e+00, /* 0xc0938612 */
+    -5.7847221375e+01, /* 0xc267638e */
+    -2.2824453735e+02, /* 0xc3643e9a */
+    -2.1921012878e+02, /* 0xc35b35cb */
+];
+const QS3: [f32; 6] = [
+    4.7665153503e+01,  /* 0x423ea91e */
+    6.7386511230e+02,  /* 0x4428775e */
+    3.3801528320e+03,  /* 0x45534272 */
+    5.5477290039e+03,  /* 0x45ad5dd5 */
+    1.9031191406e+03,  /* 0x44ede3d0 */
+    -1.3520118713e+02, /* 0xc3073381 */
+];
+
+const QR2: [f32; 6] = [
+    /* for x in [2.8570,2]=1/[0.3499,0.5] */
+    -1.7838172539e-07, /* 0xb43f8932 */
+    -1.0251704603e-01, /* 0xbdd1f475 */
+    -2.7522056103e+00, /* 0xc0302423 */
+    -1.9663616180e+01, /* 0xc19d4f16 */
+    -4.2325313568e+01, /* 0xc2294d1f */
+    -2.1371921539e+01, /* 0xc1aaf9b2 */
+];
+const QS2: [f32; 6] = [
+    2.9533363342e+01,  /* 0x41ec4454 */
+    2.5298155212e+02,  /* 0x437cfb47 */
+    7.5750280762e+02,  /* 0x443d602e */
+    7.3939318848e+02,  /* 0x4438d92a */
+    1.5594900513e+02,  /* 0x431bf2f2 */
+    -4.9594988823e+00, /* 0xc09eb437 */
+];
+
+fn qonef(x: f32) -> f32 {
+    let p: &[f32; 6];
+    let q: &[f32; 6];
+    let s: f32;
+    let r: f32;
+    let z: f32;
+    let mut ix: u32;
+
+    ix = x.to_bits();
+    ix &= 0x7fffffff;
+    if ix >= 0x41000000 {
+        p = &QR8;
+        q = &QS8;
+    } else if ix >= 0x409173eb {
+        p = &QR5;
+        q = &QS5;
+    } else if ix >= 0x4036d917 {
+        p = &QR3;
+        q = &QS3;
+    } else
+    /*ix >= 0x40000000*/
+    {
+        p = &QR2;
+        q = &QS2;
+    }
+    z = 1.0 / (x * x);
+    r = p[0] + z * (p[1] + z * (p[2] + z * (p[3] + z * (p[4] + z * p[5]))));
+    s = 1.0 + z * (q[0] + z * (q[1] + z * (q[2] + z * (q[3] + z * (q[4] + z * q[5])))));
+    return (0.375 + r / s) / x;
+}
+
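Both ponef and qonef evaluate a rational approximation R(z)/S(z) in z = 1/x^2 by Horner's rule, with the leading 1 of S kept implicit in the coefficient tables. A generic sketch of that pattern (illustration only; the coefficients below are the first few PR8/PS8 entries, truncated):

    /// Horner evaluation of c[0] + c[1]*z + c[2]*z^2 + ...
    fn horner(z: f64, c: &[f64]) -> f64 {
        c.iter().rev().fold(0.0, |acc, &k| acc * z + k)
    }

    fn main() {
        let x = 10.0_f64;
        let z = 1.0 / (x * x);
        let r = horner(z, &[0.0, 0.117, 13.2]); // truncated PR8
        let s = 1.0 + z * horner(z, &[114.2, 3650.9]); // truncated PS8, leading 1 made explicit
        println!("pone-style value: {}", 1.0 + r / s);
    }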
+// PowerPC tests are failing on LLVM 13: https://github.com/rust-lang/rust/issues/88520
+#[cfg(not(target_arch = "powerpc64"))]
+#[cfg(test)]
+mod tests {
+    use super::{j1f, y1f};
+    #[test]
+    fn test_j1f_2488() {
+        // 0x401F3E49
+        assert_eq!(j1f(2.4881766_f32), 0.49999475_f32);
+    }
+    #[test]
+    fn test_y1f_2002() {
+        // allow a slightly different result on x87
+        let res = y1f(2.0000002_f32);
+        if cfg!(all(target_arch = "x86", not(target_feature = "sse2"))) && (res == -0.10703231_f32)
+        {
+            return;
+        }
+        assert_eq!(res, -0.10703229_f32);
+    }
+}
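A minimal usage sketch, tied to the value checked in the test above (assumes these functions are re-exported at the crate root, as in the published libm crate):

    fn main() {
        let v = libm::j1f(2.4881766_f32);
        assert!((v - 0.49999475_f32).abs() < 1e-6);
        println!("y1f(2.0) = {}", libm::y1f(2.0));
    }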
diff --git a/library/compiler-builtins/libm/src/math/jn.rs b/library/compiler-builtins/libm/src/math/jn.rs
new file mode 100644
index 00000000000..31f8d9c5382
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/jn.rs
@@ -0,0 +1,339 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/e_jn.c */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunSoft, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+/*
+ * jn(n, x), yn(n, x)
+ * floating point Bessel's function of the 1st and 2nd kind
+ * of order n
+ *
+ * Special cases:
+ *      y0(0)=y1(0)=yn(n,0) = -inf with division by zero signal;
+ *      y0(-ve)=y1(-ve)=yn(n,-ve) are NaN with invalid signal.
+ * Note 2. About jn(n,x), yn(n,x)
+ *      For n=0, j0(x) is called,
+ *      for n=1, j1(x) is called,
+ *      for n<=x, forward recursion is used starting
+ *      from values of j0(x) and j1(x).
+ *      for n>x, a continued fraction approximation to
+ *      j(n,x)/j(n-1,x) is evaluated and then backward
+ *      recursion is used starting from a supposed value
+ *      for j(n,x). The resulting value of j(0,x) is
+ *      compared with the actual value to correct the
+ *      supposed value of j(n,x).
+ *
+ *      yn(n,x) is similar in all respects, except
+ *      that forward recursion is used for all
+ *      values of n>1.
+ */
+
+use super::{cos, fabs, get_high_word, get_low_word, j0, j1, log, sin, sqrt, y0, y1};
+
+const INVSQRTPI: f64 = 5.64189583547756279280e-01; /* 0x3FE20DD7, 0x50429B6D */
+
+/// Integer order of the [Bessel function](https://en.wikipedia.org/wiki/Bessel_function) of the first kind (f64).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn jn(n: i32, mut x: f64) -> f64 {
+    let mut ix: u32;
+    let lx: u32;
+    let nm1: i32;
+    let mut i: i32;
+    let mut sign: bool;
+    let mut a: f64;
+    let mut b: f64;
+    let mut temp: f64;
+
+    ix = get_high_word(x);
+    lx = get_low_word(x);
+    sign = (ix >> 31) != 0;
+    ix &= 0x7fffffff;
+
+    // -lx == !lx + 1
+    if ix | ((lx | (!lx).wrapping_add(1)) >> 31) > 0x7ff00000 {
+        /* nan */
+        return x;
+    }
+
+    /* J(-n,x) = (-1)^n * J(n, x), J(n, -x) = (-1)^n * J(n, x)
+     * Thus, J(-n,x) = J(n,-x)
+     */
+    /* nm1 = |n|-1 is used instead of |n| to handle n==INT_MIN */
+    if n == 0 {
+        return j0(x);
+    }
+    if n < 0 {
+        nm1 = -(n + 1);
+        x = -x;
+        sign = !sign;
+    } else {
+        nm1 = n - 1;
+    }
+    if nm1 == 0 {
+        return j1(x);
+    }
+
+    sign &= (n & 1) != 0; /* even n: 0, odd n: signbit(x) */
+    x = fabs(x);
+    if (ix | lx) == 0 || ix == 0x7ff00000 {
+        /* if x is 0 or inf */
+        b = 0.0;
+    } else if (nm1 as f64) < x {
+        /* Safe to use J(n+1,x)=2n/x *J(n,x)-J(n-1,x) */
+        if ix >= 0x52d00000 {
+            /* x > 2**302 */
+            /* (x >> n**2)
+             *      Jn(x) = cos(x-(2n+1)*pi/4)*sqrt(2/x*pi)
+             *      Yn(x) = sin(x-(2n+1)*pi/4)*sqrt(2/x*pi)
+             *      Let s=sin(x), c=cos(x),
+             *          xn=x-(2n+1)*pi/4, sqt2 = sqrt(2),then
+             *
+             *             n    sin(xn)*sqt2    cos(xn)*sqt2
+             *          ----------------------------------
+             *             0     s-c             c+s
+             *             1    -s-c            -c+s
+             *             2    -s+c            -c-s
+             *             3     s+c             c-s
+             */
+            temp = match nm1 & 3 {
+                0 => -cos(x) + sin(x),
+                1 => -cos(x) - sin(x),
+                2 => cos(x) - sin(x),
+                // 3
+                _ => cos(x) + sin(x),
+            };
+            b = INVSQRTPI * temp / sqrt(x);
+        } else {
+            a = j0(x);
+            b = j1(x);
+            i = 0;
+            while i < nm1 {
+                i += 1;
+                temp = b;
+                b = b * (2.0 * (i as f64) / x) - a; /* avoid underflow */
+                a = temp;
+            }
+        }
+    } else if ix < 0x3e100000 {
+        /* x < 2**-29 */
+        /* x is tiny, return the first Taylor expansion of J(n,x)
+         * J(n,x) = 1/n!*(x/2)^n  - ...
+         */
+        if nm1 > 32 {
+            /* underflow */
+            b = 0.0;
+        } else {
+            temp = x * 0.5;
+            b = temp;
+            a = 1.0;
+            i = 2;
+            while i <= nm1 + 1 {
+                a *= i as f64; /* a = n! */
+                b *= temp; /* b = (x/2)^n */
+                i += 1;
+            }
+            b = b / a;
+        }
+    } else {
+        /* use backward recurrence */
+        /*                      x      x^2      x^2
+         *  J(n,x)/J(n-1,x) =  ----   ------   ------   .....
+         *                      2n  - 2(n+1) - 2(n+2)
+         *
+         *                      1      1        1
+         *  (for large x)   =  ----  ------   ------   .....
+         *                      2n   2(n+1)   2(n+2)
+         *                      -- - ------ - ------ -
+         *                       x     x         x
+         *
+         * Let w = 2n/x and h=2/x, then the above quotient
+         * is equal to the continued fraction:
+         *                  1
+         *      = -----------------------
+         *                     1
+         *         w - -----------------
+         *                        1
+         *              w+h - ---------
+         *                     w+2h - ...
+         *
+         * To determine how many terms needed, let
+         * Q(0) = w, Q(1) = w(w+h) - 1,
+         * Q(k) = (w+k*h)*Q(k-1) - Q(k-2),
+         * When Q(k) > 1e4      good for single
+         * When Q(k) > 1e9      good for double
+         * When Q(k) > 1e17     good for quadruple
+         */
+        /* determine k */
+        let mut t: f64;
+        let mut q0: f64;
+        let mut q1: f64;
+        let mut w: f64;
+        let h: f64;
+        let mut z: f64;
+        let mut tmp: f64;
+        let nf: f64;
+
+        let mut k: i32;
+
+        nf = (nm1 as f64) + 1.0;
+        w = 2.0 * nf / x;
+        h = 2.0 / x;
+        z = w + h;
+        q0 = w;
+        q1 = w * z - 1.0;
+        k = 1;
+        while q1 < 1.0e9 {
+            k += 1;
+            z += h;
+            tmp = z * q1 - q0;
+            q0 = q1;
+            q1 = tmp;
+        }
+        t = 0.0;
+        i = k;
+        while i >= 0 {
+            t = 1.0 / (2.0 * ((i as f64) + nf) / x - t);
+            i -= 1;
+        }
+        a = t;
+        b = 1.0;
+        /*  estimate log((2/x)^n*n!) = n*log(2/x)+n*ln(n)
+         *  Hence, if n*(log(2n/x)) > ...
+         *  single 8.8722839355e+01
+         *  double 7.09782712893383973096e+02
+         *  long double 1.1356523406294143949491931077970765006170e+04
+         *  then recurrent value may overflow and the result is
+         *  likely underflow to zero
+         */
+        tmp = nf * log(fabs(w));
+        if tmp < 7.09782712893383973096e+02 {
+            i = nm1;
+            while i > 0 {
+                temp = b;
+                b = b * (2.0 * (i as f64)) / x - a;
+                a = temp;
+                i -= 1;
+            }
+        } else {
+            i = nm1;
+            while i > 0 {
+                temp = b;
+                b = b * (2.0 * (i as f64)) / x - a;
+                a = temp;
+                /* scale b to avoid spurious overflow */
+                let x1p500 = f64::from_bits(0x5f30000000000000); // 0x1p500 == 2^500
+                if b > x1p500 {
+                    a /= b;
+                    t /= b;
+                    b = 1.0;
+                }
+                i -= 1;
+            }
+        }
+        z = j0(x);
+        w = j1(x);
+        if fabs(z) >= fabs(w) {
+            b = t * z / b;
+        } else {
+            b = t * w / a;
+        }
+    }
+
+    if sign { -b } else { b }
+}
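Two of the schemes described in the comments above, checked against the function itself (illustration only; assumes the published libm crate as a dependency). Forward recursion J(i+1,x) = (2i/x)*J(i,x) - J(i-1,x) is used while the order stays below x; otherwise the continued fraction supplies the ratio J(n,x)/J(n-1,x) that seeds the backward recursion:

    fn jn_forward(n: u32, x: f64) -> f64 {
        let (mut a, mut b) = (libm::j0(x), libm::j1(x));
        for i in 1..n {
            let next = b * (2.0 * i as f64 / x) - a;
            a = b;
            b = next;
        }
        if n == 0 { a } else { b }
    }

    /// J(n,x)/J(n-1,x) via the continued fraction from the comment, evaluated
    /// bottom-up with a fixed number of terms instead of the Q(k) stopping test.
    fn ratio_cf(n: u32, x: f64, terms: u32) -> f64 {
        let (w, h) = (2.0 * n as f64 / x, 2.0 / x);
        let mut t = 0.0;
        for i in (0..=terms).rev() {
            t = 1.0 / (w + i as f64 * h - t);
        }
        t
    }

    fn main() {
        let x = 25.0;
        for n in 2..8u32 {
            assert!((jn_forward(n, x) - libm::jn(n as i32, x)).abs() < 1e-9);
        }
        let (n, x) = (10u32, 2.0_f64);
        let direct = libm::jn(n as i32, x) / libm::jn(n as i32 - 1, x);
        assert!((ratio_cf(n, x, 60) - direct).abs() < 1e-9);
    }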
+
+/// Integer order of the [Bessel function](https://en.wikipedia.org/wiki/Bessel_function) of the second kind (f64).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn yn(n: i32, x: f64) -> f64 {
+    let mut ix: u32;
+    let lx: u32;
+    let mut ib: u32;
+    let nm1: i32;
+    let mut sign: bool;
+    let mut i: i32;
+    let mut a: f64;
+    let mut b: f64;
+    let mut temp: f64;
+
+    ix = get_high_word(x);
+    lx = get_low_word(x);
+    sign = (ix >> 31) != 0;
+    ix &= 0x7fffffff;
+
+    // -lx == !lx + 1
+    if ix | ((lx | (!lx).wrapping_add(1)) >> 31) > 0x7ff00000 {
+        /* nan */
+        return x;
+    }
+    if sign && (ix | lx) != 0 {
+        /* x < 0 */
+        return 0.0 / 0.0;
+    }
+    if ix == 0x7ff00000 {
+        return 0.0;
+    }
+
+    if n == 0 {
+        return y0(x);
+    }
+    if n < 0 {
+        nm1 = -(n + 1);
+        sign = (n & 1) != 0;
+    } else {
+        nm1 = n - 1;
+        sign = false;
+    }
+    if nm1 == 0 {
+        if sign {
+            return -y1(x);
+        } else {
+            return y1(x);
+        }
+    }
+
+    if ix >= 0x52d00000 {
+        /* x > 2**302 */
+        /* (x >> n**2)
+         *      Jn(x) = cos(x-(2n+1)*pi/4)*sqrt(2/x*pi)
+         *      Yn(x) = sin(x-(2n+1)*pi/4)*sqrt(2/x*pi)
+         *      Let s=sin(x), c=cos(x),
+         *          xn=x-(2n+1)*pi/4, sqt2 = sqrt(2),then
+         *
+         *             n    sin(xn)*sqt2    cos(xn)*sqt2
+         *          ----------------------------------
+         *             0     s-c             c+s
+         *             1    -s-c            -c+s
+         *             2    -s+c            -c-s
+         *             3     s+c             c-s
+         */
+        temp = match nm1 & 3 {
+            0 => -sin(x) - cos(x),
+            1 => -sin(x) + cos(x),
+            2 => sin(x) + cos(x),
+            // 3
+            _ => sin(x) - cos(x),
+        };
+        b = INVSQRTPI * temp / sqrt(x);
+    } else {
+        a = y0(x);
+        b = y1(x);
+        /* quit if b is -inf */
+        ib = get_high_word(b);
+        i = 0;
+        while i < nm1 && ib != 0xfff00000 {
+            i += 1;
+            temp = b;
+            b = (2.0 * (i as f64) / x) * b - a;
+            ib = get_high_word(b);
+            a = temp;
+        }
+    }
+
+    if sign { -b } else { b }
+}
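The quadrant table used for huge arguments (in both jn and yn) is the asymptotic form Jn(x) ~ sqrt(2/(pi*x)) * cos(x - (2n+1)*pi/4) with the cosine expanded by angle addition. A standalone check of the first row (nm1 & 3 == 0, where the phase is 3*pi/4 modulo 2*pi):

    fn main() {
        let x = 37.0_f64;
        let (s, c) = (x.sin(), x.cos());
        // sqrt(2) * cos(x - 3*pi/4) == sin(x) - cos(x), the "s-c" table entry.
        let lhs = 2.0_f64.sqrt() * (x - 3.0 * std::f64::consts::FRAC_PI_4).cos();
        assert!((lhs - (s - c)).abs() < 1e-12);
    }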
diff --git a/library/compiler-builtins/libm/src/math/jnf.rs b/library/compiler-builtins/libm/src/math/jnf.rs
new file mode 100644
index 00000000000..52cf7d8a8bd
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/jnf.rs
@@ -0,0 +1,253 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/e_jnf.c */
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
+ */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+use super::{fabsf, j0f, j1f, logf, y0f, y1f};
+
+/// Integer order of the [Bessel function](https://en.wikipedia.org/wiki/Bessel_function) of the first kind (f32).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn jnf(n: i32, mut x: f32) -> f32 {
+    let mut ix: u32;
+    let mut nm1: i32;
+    let mut sign: bool;
+    let mut i: i32;
+    let mut a: f32;
+    let mut b: f32;
+    let mut temp: f32;
+
+    ix = x.to_bits();
+    sign = (ix >> 31) != 0;
+    ix &= 0x7fffffff;
+    if ix > 0x7f800000 {
+        /* nan */
+        return x;
+    }
+
+    /* J(-n,x) = J(n,-x), use |n|-1 to avoid overflow in -n */
+    if n == 0 {
+        return j0f(x);
+    }
+    if n < 0 {
+        nm1 = -(n + 1);
+        x = -x;
+        sign = !sign;
+    } else {
+        nm1 = n - 1;
+    }
+    if nm1 == 0 {
+        return j1f(x);
+    }
+
+    sign &= (n & 1) != 0; /* even n: 0, odd n: signbit(x) */
+    x = fabsf(x);
+    if ix == 0 || ix == 0x7f800000 {
+        /* if x is 0 or inf */
+        b = 0.0;
+    } else if (nm1 as f32) < x {
+        /* Safe to use J(n+1,x)=2n/x *J(n,x)-J(n-1,x) */
+        a = j0f(x);
+        b = j1f(x);
+        i = 0;
+        while i < nm1 {
+            i += 1;
+            temp = b;
+            b = b * (2.0 * (i as f32) / x) - a;
+            a = temp;
+        }
+    } else if ix < 0x35800000 {
+        /* x < 2**-20 */
+        /* x is tiny, return the first Taylor expansion of J(n,x)
+         * J(n,x) = 1/n!*(x/2)^n  - ...
+         */
+        if nm1 > 8 {
+            /* underflow */
+            nm1 = 8;
+        }
+        temp = 0.5 * x;
+        b = temp;
+        a = 1.0;
+        i = 2;
+        while i <= nm1 + 1 {
+            a *= i as f32; /* a = n! */
+            b *= temp; /* b = (x/2)^n */
+            i += 1;
+        }
+        b = b / a;
+    } else {
+        /* use backward recurrence */
+        /*                      x      x^2      x^2
+         *  J(n,x)/J(n-1,x) =  ----   ------   ------   .....
+         *                      2n  - 2(n+1) - 2(n+2)
+         *
+         *                      1      1        1
+         *  (for large x)   =  ----  ------   ------   .....
+         *                      2n   2(n+1)   2(n+2)
+         *                      -- - ------ - ------ -
+         *                       x     x         x
+         *
+         * Let w = 2n/x and h=2/x, then the above quotient
+         * is equal to the continued fraction:
+         *                  1
+         *      = -----------------------
+         *                     1
+         *         w - -----------------
+         *                        1
+         *              w+h - ---------
+         *                     w+2h - ...
+         *
+         * To determine how many terms needed, let
+         * Q(0) = w, Q(1) = w(w+h) - 1,
+         * Q(k) = (w+k*h)*Q(k-1) - Q(k-2),
+         * When Q(k) > 1e4      good for single
+         * When Q(k) > 1e9      good for double
+         * When Q(k) > 1e17     good for quadruple
+         */
+        /* determine k */
+        let mut t: f32;
+        let mut q0: f32;
+        let mut q1: f32;
+        let mut w: f32;
+        let h: f32;
+        let mut z: f32;
+        let mut tmp: f32;
+        let nf: f32;
+        let mut k: i32;
+
+        nf = (nm1 as f32) + 1.0;
+        w = 2.0 * nf / x;
+        h = 2.0 / x;
+        z = w + h;
+        q0 = w;
+        q1 = w * z - 1.0;
+        k = 1;
+        while q1 < 1.0e4 {
+            k += 1;
+            z += h;
+            tmp = z * q1 - q0;
+            q0 = q1;
+            q1 = tmp;
+        }
+        t = 0.0;
+        i = k;
+        while i >= 0 {
+            t = 1.0 / (2.0 * ((i as f32) + nf) / x - t);
+            i -= 1;
+        }
+        a = t;
+        b = 1.0;
+        /*  estimate log((2/x)^n*n!) = n*log(2/x)+n*ln(n)
+         *  Hence, if n*(log(2n/x)) > ...
+         *  single 8.8722839355e+01
+         *  double 7.09782712893383973096e+02
+         *  long double 1.1356523406294143949491931077970765006170e+04
+         *  then recurrent value may overflow and the result is
+         *  likely underflow to zero
+         */
+        tmp = nf * logf(fabsf(w));
+        if tmp < 88.721679688 {
+            i = nm1;
+            while i > 0 {
+                temp = b;
+                b = 2.0 * (i as f32) * b / x - a;
+                a = temp;
+                i -= 1;
+            }
+        } else {
+            i = nm1;
+            while i > 0 {
+                temp = b;
+                b = 2.0 * (i as f32) * b / x - a;
+                a = temp;
+                /* scale b to avoid spurious overflow */
+                let x1p60 = f32::from_bits(0x5d800000); // 0x1p60 == 2^60
+                if b > x1p60 {
+                    a /= b;
+                    t /= b;
+                    b = 1.0;
+                }
+                i -= 1;
+            }
+        }
+        z = j0f(x);
+        w = j1f(x);
+        if fabsf(z) >= fabsf(w) {
+            b = t * z / b;
+        } else {
+            b = t * w / a;
+        }
+    }
+
+    if sign { -b } else { b }
+}
+
+/// Integer order of the [Bessel function](https://en.wikipedia.org/wiki/Bessel_function) of the second kind (f32).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn ynf(n: i32, x: f32) -> f32 {
+    let mut ix: u32;
+    let mut ib: u32;
+    let nm1: i32;
+    let mut sign: bool;
+    let mut i: i32;
+    let mut a: f32;
+    let mut b: f32;
+    let mut temp: f32;
+
+    ix = x.to_bits();
+    sign = (ix >> 31) != 0;
+    ix &= 0x7fffffff;
+    if ix > 0x7f800000 {
+        /* nan */
+        return x;
+    }
+    if sign && ix != 0 {
+        /* x < 0 */
+        return 0.0 / 0.0;
+    }
+    if ix == 0x7f800000 {
+        return 0.0;
+    }
+
+    if n == 0 {
+        return y0f(x);
+    }
+    if n < 0 {
+        nm1 = -(n + 1);
+        sign = (n & 1) != 0;
+    } else {
+        nm1 = n - 1;
+        sign = false;
+    }
+    if nm1 == 0 {
+        if sign {
+            return -y1f(x);
+        } else {
+            return y1f(x);
+        }
+    }
+
+    a = y0f(x);
+    b = y1f(x);
+    /* quit if b is -inf */
+    ib = b.to_bits();
+    i = 0;
+    while i < nm1 && ib != 0xff800000 {
+        i += 1;
+        temp = b;
+        b = (2.0 * (i as f32) / x) * b - a;
+        ib = b.to_bits();
+        a = temp;
+    }
+
+    if sign { -b } else { b }
+}
diff --git a/library/compiler-builtins/libm/src/math/k_cos.rs b/library/compiler-builtins/libm/src/math/k_cos.rs
new file mode 100644
index 00000000000..49b2fc64d86
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/k_cos.rs
@@ -0,0 +1,62 @@
+// origin: FreeBSD /usr/src/lib/msun/src/k_cos.c
+//
+// ====================================================
+// Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+//
+// Developed at SunSoft, a Sun Microsystems, Inc. business.
+// Permission to use, copy, modify, and distribute this
+// software is freely granted, provided that this notice
+// is preserved.
+// ====================================================
+
+const C1: f64 = 4.16666666666666019037e-02; /* 0x3FA55555, 0x5555554C */
+const C2: f64 = -1.38888888888741095749e-03; /* 0xBF56C16C, 0x16C15177 */
+const C3: f64 = 2.48015872894767294178e-05; /* 0x3EFA01A0, 0x19CB1590 */
+const C4: f64 = -2.75573143513906633035e-07; /* 0xBE927E4F, 0x809C52AD */
+const C5: f64 = 2.08757232129817482790e-09; /* 0x3E21EE9E, 0xBDB4B1C4 */
+const C6: f64 = -1.13596475577881948265e-11; /* 0xBDA8FAE9, 0xBE8838D4 */
+
+// kernel cos function on [-pi/4, pi/4], pi/4 ~ 0.785398164
+// Input x is assumed to be bounded by ~pi/4 in magnitude.
+// Input y is the tail of x.
+//
+// Algorithm
+//      1. Since cos(-x) = cos(x), we need only to consider positive x.
+//      2. if x < 2^-27 (hx < 0x3e400000), return 1 with inexact if x != 0.
+//      3. cos(x) is approximated by a polynomial of degree 14 on
+//         [0,pi/4]
+//                                       4            14
+//              cos(x) ~ 1 - x*x/2 + C1*x + ... + C6*x
+//         where the remez error is
+//
+//      |              2     4     6     8     10    12     14 |     -58
+//      |cos(x)-(1-.5*x +C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  )| <= 2
+//      |                                                      |
+//
+//                     4     6     8     10    12     14
+//      4. let r = C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  , then
+//             cos(x) ~ 1 - x*x/2 + r
+//         since cos(x+y) ~ cos(x) - sin(x)*y
+//                        ~ cos(x) - x*y,
+//         a correction term is necessary in cos(x) and hence
+//              cos(x+y) = 1 - (x*x/2 - (r - x*y))
+//         For better accuracy, rearrange to
+//              cos(x+y) ~ w + (tmp + (r-x*y))
+//         where w = 1 - x*x/2 and tmp is a tiny correction term
+//         (1 - x*x/2 == w + tmp exactly in infinite precision).
+//         The exactness of w + tmp in infinite precision depends on w
+//         and tmp having the same precision as x.  If they have extra
+//         precision due to compiler bugs, then the extra precision is
+//         only good provided it is retained in all terms of the final
+//         expression for cos().  Retention happens in all cases tested
+//         under FreeBSD, so don't pessimize things by forcibly clipping
+//         any extra precision in w.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub(crate) fn k_cos(x: f64, y: f64) -> f64 {
+    let z = x * x;
+    let w = z * z;
+    let r = z * (C1 + z * (C2 + z * C3)) + w * w * (C4 + z * (C5 + z * C6));
+    let hz = 0.5 * z;
+    let w = 1.0 - hz;
+    w + (((1.0 - w) - hz) + (z * r - x * y))
+}
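A standalone sketch (plain Rust, not part of the patch): the degree-14 polynomial from the comment, evaluated naively with the constants above, already tracks the platform cosine to near machine precision on [0, pi/4]; the extra bookkeeping in k_cos exists to recover the rounding error of the 1 - x*x/2 term, as the note explains.

    const C1: f64 = 4.16666666666666019037e-02;
    const C2: f64 = -1.38888888888741095749e-03;
    const C3: f64 = 2.48015872894767294178e-05;
    const C4: f64 = -2.75573143513906633035e-07;
    const C5: f64 = 2.08757232129817482790e-09;
    const C6: f64 = -1.13596475577881948265e-11;

    fn cos_poly(x: f64) -> f64 {
        let z = x * x;
        1.0 - 0.5 * z + z * z * (C1 + z * (C2 + z * (C3 + z * (C4 + z * (C5 + z * C6)))))
    }

    fn main() {
        for i in 0..=8 {
            let x = i as f64 * std::f64::consts::FRAC_PI_4 / 8.0;
            assert!((cos_poly(x) - x.cos()).abs() < 1e-14);
        }
    }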
diff --git a/library/compiler-builtins/libm/src/math/k_cosf.rs b/library/compiler-builtins/libm/src/math/k_cosf.rs
new file mode 100644
index 00000000000..e99f2348c00
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/k_cosf.rs
@@ -0,0 +1,29 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/k_cosf.c */
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
+ * Debugged and optimized by Bruce D. Evans.
+ */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+/* |cos(x) - c(x)| < 2**-34.1 (~[-5.37e-11, 5.295e-11]). */
+const C0: f64 = -0.499999997251031003120; /* -0x1ffffffd0c5e81.0p-54 */
+const C1: f64 = 0.0416666233237390631894; /*  0x155553e1053a42.0p-57 */
+const C2: f64 = -0.00138867637746099294692; /* -0x16c087e80f1e27.0p-62 */
+const C3: f64 = 0.0000243904487962774090654; /*  0x199342e0ee5069.0p-68 */
+
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub(crate) fn k_cosf(x: f64) -> f32 {
+    let z = x * x;
+    let w = z * z;
+    let r = C2 + z * C3;
+    (((1.0 + z * C0) + w * C1) + (w * z) * r) as f32
+}
diff --git a/library/compiler-builtins/libm/src/math/k_expo2.rs b/library/compiler-builtins/libm/src/math/k_expo2.rs
new file mode 100644
index 00000000000..7345075f376
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/k_expo2.rs
@@ -0,0 +1,14 @@
+use super::exp;
+
+/* k is such that k*ln2 has minimal relative error and x - k*ln2 > log(DBL_MIN) */
+const K: i32 = 2043;
+
+/* exp(x)/2 for x >= log(DBL_MAX), slightly better than 0.5*exp(x/2)*exp(x/2) */
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub(crate) fn k_expo2(x: f64) -> f64 {
+    let k_ln2 = f64::from_bits(0x40962066151add8b);
+    /* note that k is odd and scale*scale overflows */
+    let scale = f64::from_bits(((((0x3ff + K / 2) as u32) << 20) as u64) << 32);
+    /* exp(x - k ln2) * 2**(k-1) */
+    exp(x - k_ln2) * scale * scale
+}
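A standalone sketch of the scaling trick: `scale` is 2^(K/2) written straight into the exponent field, and the 2^(K-1) factor is applied in two multiplies because it is not representable on its own:

    fn main() {
        const K: i32 = 2043;
        let scale = f64::from_bits(((((0x3ff + K / 2) as u32) << 20) as u64) << 32);
        assert_eq!(scale, 2.0_f64.powi(K / 2)); // 2^1021, finite
        assert!((scale * scale).is_infinite()); // 2^2042 overflows on its own
        let tiny = 1e-307_f64; // stand-in for exp(x - k_ln2), which is far below 1
        assert!((tiny * scale * scale).is_finite());
    }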
diff --git a/library/compiler-builtins/libm/src/math/k_expo2f.rs b/library/compiler-builtins/libm/src/math/k_expo2f.rs
new file mode 100644
index 00000000000..fbd7b27d583
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/k_expo2f.rs
@@ -0,0 +1,14 @@
+use super::expf;
+
+/* k is such that k*ln2 has minimal relative error and x - kln2 > log(FLT_MIN) */
+const K: i32 = 235;
+
+/* expf(x)/2 for x >= log(FLT_MAX), slightly better than 0.5f*expf(x/2)*expf(x/2) */
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub(crate) fn k_expo2f(x: f32) -> f32 {
+    let k_ln2 = f32::from_bits(0x4322e3bc);
+    /* note that k is odd and scale*scale overflows */
+    let scale = f32::from_bits(((0x7f + K / 2) as u32) << 23);
+    /* exp(x - k ln2) * 2**(k-1) */
+    expf(x - k_ln2) * scale * scale
+}
diff --git a/library/compiler-builtins/libm/src/math/k_sin.rs b/library/compiler-builtins/libm/src/math/k_sin.rs
new file mode 100644
index 00000000000..9dd96c94474
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/k_sin.rs
@@ -0,0 +1,57 @@
+// origin: FreeBSD /usr/src/lib/msun/src/k_sin.c
+//
+// ====================================================
+// Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+//
+// Developed at SunSoft, a Sun Microsystems, Inc. business.
+// Permission to use, copy, modify, and distribute this
+// software is freely granted, provided that this notice
+// is preserved.
+// ====================================================
+
+const S1: f64 = -1.66666666666666324348e-01; /* 0xBFC55555, 0x55555549 */
+const S2: f64 = 8.33333333332248946124e-03; /* 0x3F811111, 0x1110F8A6 */
+const S3: f64 = -1.98412698298579493134e-04; /* 0xBF2A01A0, 0x19C161D5 */
+const S4: f64 = 2.75573137070700676789e-06; /* 0x3EC71DE3, 0x57B1FE7D */
+const S5: f64 = -2.50507602534068634195e-08; /* 0xBE5AE5E6, 0x8A2B9CEB */
+const S6: f64 = 1.58969099521155010221e-10; /* 0x3DE5D93A, 0x5ACFD57C */
+
+// kernel sin function on ~[-pi/4, pi/4] (except on -0), pi/4 ~ 0.7854
+// Input x is assumed to be bounded by ~pi/4 in magnitude.
+// Input y is the tail of x.
+// Input iy indicates whether y is 0 (if iy = 0, y is assumed to be 0).
+//
+// Algorithm
+//      1. Since sin(-x) = -sin(x), we need only to consider positive x.
+//      2. Callers must return sin(-0) = -0 without calling here since our
+//         odd polynomial is not evaluated in a way that preserves -0.
+//         Callers may do the optimization sin(x) ~ x for tiny x.
+//      3. sin(x) is approximated by a polynomial of degree 13 on
+//         [0,pi/4]
+//                               3            13
+//              sin(x) ~ x + S1*x + ... + S6*x
+//         where
+//
+//      |sin(x)         2     4     6     8     10     12  |     -58
+//      |----- - (1+S1*x +S2*x +S3*x +S4*x +S5*x  +S6*x   )| <= 2
+//      |  x                                               |
+//
+//      4. sin(x+y) = sin(x) + sin'(x')*y
+//                  ~ sin(x) + (1-x*x/2)*y
+//         For better accuracy, let
+//                   3      2      2      2      2
+//              r = x *(S2+x *(S3+x *(S4+x *(S5+x *S6))))
+//         then                   3    2
+//              sin(x) = x + (S1*x + (x *(r-y/2)+y))
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub(crate) fn k_sin(x: f64, y: f64, iy: i32) -> f64 {
+    let z = x * x;
+    let w = z * z;
+    let r = S2 + z * (S3 + z * S4) + z * w * (S5 + z * S6);
+    let v = z * x;
+    if iy == 0 {
+        x + v * (S1 + z * r)
+    } else {
+        x - ((z * (0.5 * y - v * r) - y) - v * S1)
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/k_sinf.rs b/library/compiler-builtins/libm/src/math/k_sinf.rs
new file mode 100644
index 00000000000..88d10cababc
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/k_sinf.rs
@@ -0,0 +1,30 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/k_sinf.c */
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
+ * Optimized by Bruce D. Evans.
+ */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+/* |sin(x)/x - s(x)| < 2**-37.5 (~[-4.89e-12, 4.824e-12]). */
+const S1: f64 = -0.166666666416265235595; /* -0x15555554cbac77.0p-55 */
+const S2: f64 = 0.0083333293858894631756; /*  0x111110896efbb2.0p-59 */
+const S3: f64 = -0.000198393348360966317347; /* -0x1a00f9e2cae774.0p-65 */
+const S4: f64 = 0.0000027183114939898219064; /*  0x16cd878c3b46a7.0p-71 */
+
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub(crate) fn k_sinf(x: f64) -> f32 {
+    let z = x * x;
+    let w = z * z;
+    let r = S3 + z * S4;
+    let s = z * x;
+    ((x + s * (S1 + z * S2)) + s * w * r) as f32
+}
diff --git a/library/compiler-builtins/libm/src/math/k_tan.rs b/library/compiler-builtins/libm/src/math/k_tan.rs
new file mode 100644
index 00000000000..d177010bb0a
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/k_tan.rs
@@ -0,0 +1,105 @@
+// origin: FreeBSD /usr/src/lib/msun/src/k_tan.c
+//
+// ====================================================
+// Copyright 2004 Sun Microsystems, Inc.  All Rights Reserved.
+//
+// Permission to use, copy, modify, and distribute this
+// software is freely granted, provided that this notice
+// is preserved.
+// ====================================================
+
+// kernel tan function on ~[-pi/4, pi/4] (except on -0), pi/4 ~ 0.7854
+// Input x is assumed to be bounded by ~pi/4 in magnitude.
+// Input y is the tail of x.
+// Input odd indicates whether tan (if odd = 0) or -1/tan (if odd = 1) is returned.
+//
+// Algorithm
+//      1. Since tan(-x) = -tan(x), we need only to consider positive x.
+//      2. Callers must return tan(-0) = -0 without calling here since our
+//         odd polynomial is not evaluated in a way that preserves -0.
+//         Callers may do the optimization tan(x) ~ x for tiny x.
+//      3. tan(x) is approximated by an odd polynomial of degree 27 on
+//         [0,0.67434]
+//                               3             27
+//              tan(x) ~ x + T1*x + ... + T13*x
+//         where
+//
+//              |tan(x)         2     4            26   |     -59.2
+//              |----- - (1+T1*x +T2*x +.... +T13*x    )| <= 2
+//              |  x                                    |
+//
+//         Note: tan(x+y) = tan(x) + tan'(x)*y
+//                        ~ tan(x) + (1+x*x)*y
+//         Therefore, for better accuracy in computing tan(x+y), let
+//                   3      2      2       2       2
+//              r = x *(T2+x *(T3+x *(...+x *(T12+x *T13))))
+//         then
+//                                  3    2
+//              tan(x+y) = x + (T1*x + (x *(r+y)+y))
+//
+//      4. For x in [0.67434,pi/4],  let y = pi/4 - x, then
+//              tan(x) = tan(pi/4-y) = (1-tan(y))/(1+tan(y))
+//                     = 1 - 2*(tan(y) - (tan(y)^2)/(1+tan(y)))
+static T: [f64; 13] = [
+    3.33333333333334091986e-01,  /* 3FD55555, 55555563 */
+    1.33333333333201242699e-01,  /* 3FC11111, 1110FE7A */
+    5.39682539762260521377e-02,  /* 3FABA1BA, 1BB341FE */
+    2.18694882948595424599e-02,  /* 3F9664F4, 8406D637 */
+    8.86323982359930005737e-03,  /* 3F8226E3, E96E8493 */
+    3.59207910759131235356e-03,  /* 3F6D6D22, C9560328 */
+    1.45620945432529025516e-03,  /* 3F57DBC8, FEE08315 */
+    5.88041240820264096874e-04,  /* 3F4344D8, F2F26501 */
+    2.46463134818469906812e-04,  /* 3F3026F7, 1A8D1068 */
+    7.81794442939557092300e-05,  /* 3F147E88, A03792A6 */
+    7.14072491382608190305e-05,  /* 3F12B80F, 32F0A7E9 */
+    -1.85586374855275456654e-05, /* BEF375CB, DB605373 */
+    2.59073051863633712884e-05,  /* 3EFB2A70, 74BF7AD4 */
+];
+const PIO4: f64 = 7.85398163397448278999e-01; /* 3FE921FB, 54442D18 */
+const PIO4_LO: f64 = 3.06161699786838301793e-17; /* 3C81A626, 33145C07 */
+
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub(crate) fn k_tan(mut x: f64, mut y: f64, odd: i32) -> f64 {
+    let hx = (f64::to_bits(x) >> 32) as u32;
+    let big = (hx & 0x7fffffff) >= 0x3FE59428; /* |x| >= 0.6744 */
+    if big {
+        let sign = hx >> 31;
+        if sign != 0 {
+            x = -x;
+            y = -y;
+        }
+        x = (PIO4 - x) + (PIO4_LO - y);
+        y = 0.0;
+    }
+    let z = x * x;
+    let w = z * z;
+    /*
+     * Break x^5*(T[1]+x^2*T[2]+...) into
+     * x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
+     * x^5(x^2*(T[2]+x^4*T[4]+...+x^22*T[12]))
+     */
+    let r = T[1] + w * (T[3] + w * (T[5] + w * (T[7] + w * (T[9] + w * T[11]))));
+    let v = z * (T[2] + w * (T[4] + w * (T[6] + w * (T[8] + w * (T[10] + w * T[12])))));
+    let s = z * x;
+    let r = y + z * (s * (r + v) + y) + s * T[0];
+    let w = x + r;
+    if big {
+        let sign = hx >> 31;
+        let s = 1.0 - 2.0 * odd as f64;
+        let v = s - 2.0 * (x + (r - w * w / (w + s)));
+        return if sign != 0 { -v } else { v };
+    }
+    if odd == 0 {
+        return w;
+    }
+    /* -1.0/(x+r) has up to 2ulp error, so compute it accurately */
+    let w0 = zero_low_word(w);
+    let v = r - (w0 - x); /* w0+v = r+x */
+    let a = -1.0 / w;
+    let a0 = zero_low_word(a);
+    a0 + a * (1.0 + a0 * w0 + a0 * v)
+}
+
+fn zero_low_word(x: f64) -> f64 {
+    f64::from_bits(f64::to_bits(x) & 0xFFFF_FFFF_0000_0000)
+}
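Step 4 of the comment relies on tan(pi/4 - y) = (1 - tan y) / (1 + tan y); a standalone numeric check (plain Rust, not part of the patch):

    fn main() {
        let y = 0.05_f64;
        let lhs = (std::f64::consts::FRAC_PI_4 - y).tan();
        let rhs = (1.0 - y.tan()) / (1.0 + y.tan());
        assert!((lhs - rhs).abs() < 1e-14);
    }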
diff --git a/library/compiler-builtins/libm/src/math/k_tanf.rs b/library/compiler-builtins/libm/src/math/k_tanf.rs
new file mode 100644
index 00000000000..af8db539dad
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/k_tanf.rs
@@ -0,0 +1,46 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/k_tan.c */
+/*
+ * ====================================================
+ * Copyright 2004 Sun Microsystems, Inc.  All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+/* |tan(x)/x - t(x)| < 2**-25.5 (~[-2e-08, 2e-08]). */
+const T: [f64; 6] = [
+    0.333331395030791399758,   /* 0x15554d3418c99f.0p-54 */
+    0.133392002712976742718,   /* 0x1112fd38999f72.0p-55 */
+    0.0533812378445670393523,  /* 0x1b54c91d865afe.0p-57 */
+    0.0245283181166547278873,  /* 0x191df3908c33ce.0p-58 */
+    0.00297435743359967304927, /* 0x185dadfcecf44e.0p-61 */
+    0.00946564784943673166728, /* 0x1362b9bf971bcd.0p-59 */
+];
+
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub(crate) fn k_tanf(x: f64, odd: bool) -> f32 {
+    let z = x * x;
+    /*
+     * Split up the polynomial into small independent terms to give
+     * opportunities for parallel evaluation.  The chosen splitting is
+     * micro-optimized for Athlons (XP, X64).  It costs 2 multiplications
+     * relative to Horner's method on sequential machines.
+     *
+     * We add the small terms from lowest degree up for efficiency on
+     * non-sequential machines (the lowest degree terms tend to be ready
+     * earlier).  Apart from this, we don't care about order of
+     * operations, and don't need to care since we have precision to
+     * spare.  However, the chosen splitting is good for accuracy too,
+     * and would give results as accurate as Horner's method if the
+     * small terms were added from highest degree down.
+     */
+    let mut r = T[4] + z * T[5];
+    let t = T[2] + z * T[3];
+    let w = z * z;
+    let s = z * x;
+    let u = T[0] + z * T[1];
+    r = (x + s * u) + (s * w) * (t + w * r);
+    (if odd { -1. / r } else { r }) as f32
+}
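The split evaluation above computes the same odd degree-13 polynomial as a plain Horner form, just reassociated for instruction-level parallelism as the comment says. A standalone check that the two orderings agree to rounding error (constants copied from the array above):

    const T: [f64; 6] = [
        0.333331395030791399758,
        0.133392002712976742718,
        0.0533812378445670393523,
        0.0245283181166547278873,
        0.00297435743359967304927,
        0.00946564784943673166728,
    ];

    fn split(x: f64) -> f64 {
        // Same grouping as k_tanf above.
        let z = x * x;
        let r = T[4] + z * T[5];
        let t = T[2] + z * T[3];
        let w = z * z;
        let s = z * x;
        let u = T[0] + z * T[1];
        (x + s * u) + (s * w) * (t + w * r)
    }

    fn horner(x: f64) -> f64 {
        let z = x * x;
        let s = z * x;
        x + s * (T[0] + z * (T[1] + z * (T[2] + z * (T[3] + z * (T[4] + z * T[5])))))
    }

    fn main() {
        for &x in &[0.1_f64, 0.3, 0.6] {
            assert!((split(x) - horner(x)).abs() < 1e-15);
        }
    }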
diff --git a/library/compiler-builtins/libm/src/math/ldexp.rs b/library/compiler-builtins/libm/src/math/ldexp.rs
new file mode 100644
index 00000000000..24899ba306a
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/ldexp.rs
@@ -0,0 +1,21 @@
+#[cfg(f16_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn ldexpf16(x: f16, n: i32) -> f16 {
+    super::scalbnf16(x, n)
+}
+
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn ldexpf(x: f32, n: i32) -> f32 {
+    super::scalbnf(x, n)
+}
+
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn ldexp(x: f64, n: i32) -> f64 {
+    super::scalbn(x, n)
+}
+
+#[cfg(f128_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn ldexpf128(x: f128, n: i32) -> f128 {
+    super::scalbnf128(x, n)
+}
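All four ldexp* entry points defer to scalbn*, i.e. they scale by a power of two exactly. A small usage sketch (assumes the published libm crate):

    fn main() {
        assert_eq!(libm::ldexp(0.75, 4), 12.0); // 0.75 * 2^4
        assert_eq!(libm::ldexpf(3.0, -1), 1.5);
        // Scaling reaches the subnormal range exactly as well:
        assert_eq!(libm::ldexp(1.0, -1074), f64::from_bits(1));
    }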
diff --git a/library/compiler-builtins/libm/src/math/ldexpf.rs b/library/compiler-builtins/libm/src/math/ldexpf.rs
new file mode 100644
index 00000000000..95b27fc49d2
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/ldexpf.rs
@@ -0,0 +1,4 @@
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn ldexpf(x: f32, n: i32) -> f32 {
+    super::scalbnf(x, n)
+}
diff --git a/library/compiler-builtins/libm/src/math/ldexpf128.rs b/library/compiler-builtins/libm/src/math/ldexpf128.rs
new file mode 100644
index 00000000000..b35277d15fb
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/ldexpf128.rs
@@ -0,0 +1,4 @@
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn ldexpf128(x: f128, n: i32) -> f128 {
+    super::scalbnf128(x, n)
+}
diff --git a/library/compiler-builtins/libm/src/math/ldexpf16.rs b/library/compiler-builtins/libm/src/math/ldexpf16.rs
new file mode 100644
index 00000000000..8de6cffd699
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/ldexpf16.rs
@@ -0,0 +1,4 @@
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn ldexpf16(x: f16, n: i32) -> f16 {
+    super::scalbnf16(x, n)
+}
diff --git a/library/compiler-builtins/libm/src/math/lgamma.rs b/library/compiler-builtins/libm/src/math/lgamma.rs
new file mode 100644
index 00000000000..8312dc18648
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/lgamma.rs
@@ -0,0 +1,8 @@
+use super::lgamma_r;
+
+/// The natural logarithm of the
+/// [Gamma function](https://en.wikipedia.org/wiki/Gamma_function) (f64).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn lgamma(x: f64) -> f64 {
+    lgamma_r(x).0
+}
diff --git a/library/compiler-builtins/libm/src/math/lgamma_r.rs b/library/compiler-builtins/libm/src/math/lgamma_r.rs
new file mode 100644
index 00000000000..6becaad2ce9
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/lgamma_r.rs
@@ -0,0 +1,321 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/e_lgamma_r.c */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunSoft, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ *
+ */
+/* lgamma_r(x, signgamp)
+ * Reentrant version of the logarithm of the Gamma function
+ * with a user-provided pointer for the sign of Gamma(x).
+ *
+ * Method:
+ *   1. Argument Reduction for 0 < x <= 8
+ *      Since gamma(1+s)=s*gamma(s), for x in [0,8], we may
+ *      reduce x to a number in [1.5,2.5] by
+ *              lgamma(1+s) = log(s) + lgamma(s)
+ *      for example,
+ *              lgamma(7.3) = log(6.3) + lgamma(6.3)
+ *                          = log(6.3*5.3) + lgamma(5.3)
+ *                          = log(6.3*5.3*4.3*3.3*2.3) + lgamma(2.3)
+ *   2. Polynomial approximation of lgamma around its
+ *      minimum ymin=1.461632144968362245 to maintain monotonicity.
+ *      On [ymin-0.23, ymin+0.27] (i.e., [1.23164,1.73163]), use
+ *              Let z = x-ymin;
+ *              lgamma(x) = -1.214862905358496078218 + z^2*poly(z)
+ *      where
+ *              poly(z) is a 14 degree polynomial.
+ *   3. Rational approximation in the primary interval [2,3]
+ *      We use the following approximation:
+ *              s = x-2.0;
+ *              lgamma(x) = 0.5*s + s*P(s)/Q(s)
+ *      with accuracy
+ *              |P/Q - (lgamma(x)-0.5s)| < 2**-61.71
+ *      Our algorithms are based on the following observation
+ *
+ *                             zeta(2)-1    2    zeta(3)-1    3
+ * lgamma(2+s) = s*(1-Euler) + --------- * s  -  --------- * s  + ...
+ *                                 2                 3
+ *
+ *      where Euler = 0.5772... is the Euler constant, which is very
+ *      close to 0.5.
+ *
+ *   4. For x>=8, we have
+ *      lgamma(x)~(x-0.5)log(x)-x+0.5*log(2pi)+1/(12x)-1/(360x**3)+....
+ *      (better formula:
+ *         lgamma(x)~(x-0.5)*(log(x)-1)-.5*(log(2pi)-1) + ...)
+ *      Let z = 1/x; then we approximate
+ *              f(z) = lgamma(x) - (x-0.5)(log(x)-1)
+ *      by
+ *                                  3       5             11
+ *              w = w0 + w1*z + w2*z  + w3*z  + ... + w6*z
+ *      where
+ *              |w - f(z)| < 2**-58.74
+ *
+ *   5. For negative x, since (G is gamma function)
+ *              -x*G(-x)*G(x) = PI/sin(PI*x),
+ *      we have
+ *              G(x) = PI/(sin(PI*x)*(-x)*G(-x))
+ *      since G(-x) is positive, sign(G(x)) = sign(sin(PI*x)) for x<0
+ *      Hence, for x<0, signgam = sign(sin(PI*x)) and
+ *              lgamma(x) = log(|Gamma(x)|)
+ *                        = log(PI/(|x*sin(PI*x)|)) - lgamma(-x);
+ *      Note: one should avoid computing PI*(-x) directly in the
+ *            computation of sin(PI*(-x)).
+ *
+ *   6. Special Cases
+ *              lgamma(2+s) ~ s*(1-Euler) for tiny s
+ *              lgamma(1) = lgamma(2) = 0
+ *              lgamma(x) ~ -log(|x|) for tiny x
+ *              lgamma(0) = lgamma(neg.integer) = inf and raise divide-by-zero
+ *              lgamma(inf) = inf
+ *              lgamma(-inf) = inf (bug for bug compatible with C99!?)
+ *
+ */
+
+use super::{floor, k_cos, k_sin, log};
+
+const PI: f64 = 3.14159265358979311600e+00; /* 0x400921FB, 0x54442D18 */
+const A0: f64 = 7.72156649015328655494e-02; /* 0x3FB3C467, 0xE37DB0C8 */
+const A1: f64 = 3.22467033424113591611e-01; /* 0x3FD4A34C, 0xC4A60FAD */
+const A2: f64 = 6.73523010531292681824e-02; /* 0x3FB13E00, 0x1A5562A7 */
+const A3: f64 = 2.05808084325167332806e-02; /* 0x3F951322, 0xAC92547B */
+const A4: f64 = 7.38555086081402883957e-03; /* 0x3F7E404F, 0xB68FEFE8 */
+const A5: f64 = 2.89051383673415629091e-03; /* 0x3F67ADD8, 0xCCB7926B */
+const A6: f64 = 1.19270763183362067845e-03; /* 0x3F538A94, 0x116F3F5D */
+const A7: f64 = 5.10069792153511336608e-04; /* 0x3F40B6C6, 0x89B99C00 */
+const A8: f64 = 2.20862790713908385557e-04; /* 0x3F2CF2EC, 0xED10E54D */
+const A9: f64 = 1.08011567247583939954e-04; /* 0x3F1C5088, 0x987DFB07 */
+const A10: f64 = 2.52144565451257326939e-05; /* 0x3EFA7074, 0x428CFA52 */
+const A11: f64 = 4.48640949618915160150e-05; /* 0x3F07858E, 0x90A45837 */
+const TC: f64 = 1.46163214496836224576e+00; /* 0x3FF762D8, 0x6356BE3F */
+const TF: f64 = -1.21486290535849611461e-01; /* 0xBFBF19B9, 0xBCC38A42 */
+/* tt = -(tail of TF) */
+const TT: f64 = -3.63867699703950536541e-18; /* 0xBC50C7CA, 0xA48A971F */
+const T0: f64 = 4.83836122723810047042e-01; /* 0x3FDEF72B, 0xC8EE38A2 */
+const T1: f64 = -1.47587722994593911752e-01; /* 0xBFC2E427, 0x8DC6C509 */
+const T2: f64 = 6.46249402391333854778e-02; /* 0x3FB08B42, 0x94D5419B */
+const T3: f64 = -3.27885410759859649565e-02; /* 0xBFA0C9A8, 0xDF35B713 */
+const T4: f64 = 1.79706750811820387126e-02; /* 0x3F9266E7, 0x970AF9EC */
+const T5: f64 = -1.03142241298341437450e-02; /* 0xBF851F9F, 0xBA91EC6A */
+const T6: f64 = 6.10053870246291332635e-03; /* 0x3F78FCE0, 0xE370E344 */
+const T7: f64 = -3.68452016781138256760e-03; /* 0xBF6E2EFF, 0xB3E914D7 */
+const T8: f64 = 2.25964780900612472250e-03; /* 0x3F6282D3, 0x2E15C915 */
+const T9: f64 = -1.40346469989232843813e-03; /* 0xBF56FE8E, 0xBF2D1AF1 */
+const T10: f64 = 8.81081882437654011382e-04; /* 0x3F4CDF0C, 0xEF61A8E9 */
+const T11: f64 = -5.38595305356740546715e-04; /* 0xBF41A610, 0x9C73E0EC */
+const T12: f64 = 3.15632070903625950361e-04; /* 0x3F34AF6D, 0x6C0EBBF7 */
+const T13: f64 = -3.12754168375120860518e-04; /* 0xBF347F24, 0xECC38C38 */
+const T14: f64 = 3.35529192635519073543e-04; /* 0x3F35FD3E, 0xE8C2D3F4 */
+const U0: f64 = -7.72156649015328655494e-02; /* 0xBFB3C467, 0xE37DB0C8 */
+const U1: f64 = 6.32827064025093366517e-01; /* 0x3FE4401E, 0x8B005DFF */
+const U2: f64 = 1.45492250137234768737e+00; /* 0x3FF7475C, 0xD119BD6F */
+const U3: f64 = 9.77717527963372745603e-01; /* 0x3FEF4976, 0x44EA8450 */
+const U4: f64 = 2.28963728064692451092e-01; /* 0x3FCD4EAE, 0xF6010924 */
+const U5: f64 = 1.33810918536787660377e-02; /* 0x3F8B678B, 0xBF2BAB09 */
+const V1: f64 = 2.45597793713041134822e+00; /* 0x4003A5D7, 0xC2BD619C */
+const V2: f64 = 2.12848976379893395361e+00; /* 0x40010725, 0xA42B18F5 */
+const V3: f64 = 7.69285150456672783825e-01; /* 0x3FE89DFB, 0xE45050AF */
+const V4: f64 = 1.04222645593369134254e-01; /* 0x3FBAAE55, 0xD6537C88 */
+const V5: f64 = 3.21709242282423911810e-03; /* 0x3F6A5ABB, 0x57D0CF61 */
+const S0: f64 = -7.72156649015328655494e-02; /* 0xBFB3C467, 0xE37DB0C8 */
+const S1: f64 = 2.14982415960608852501e-01; /* 0x3FCB848B, 0x36E20878 */
+const S2: f64 = 3.25778796408930981787e-01; /* 0x3FD4D98F, 0x4F139F59 */
+const S3: f64 = 1.46350472652464452805e-01; /* 0x3FC2BB9C, 0xBEE5F2F7 */
+const S4: f64 = 2.66422703033638609560e-02; /* 0x3F9B481C, 0x7E939961 */
+const S5: f64 = 1.84028451407337715652e-03; /* 0x3F5E26B6, 0x7368F239 */
+const S6: f64 = 3.19475326584100867617e-05; /* 0x3F00BFEC, 0xDD17E945 */
+const R1: f64 = 1.39200533467621045958e+00; /* 0x3FF645A7, 0x62C4AB74 */
+const R2: f64 = 7.21935547567138069525e-01; /* 0x3FE71A18, 0x93D3DCDC */
+const R3: f64 = 1.71933865632803078993e-01; /* 0x3FC601ED, 0xCCFBDF27 */
+const R4: f64 = 1.86459191715652901344e-02; /* 0x3F9317EA, 0x742ED475 */
+const R5: f64 = 7.77942496381893596434e-04; /* 0x3F497DDA, 0xCA41A95B */
+const R6: f64 = 7.32668430744625636189e-06; /* 0x3EDEBAF7, 0xA5B38140 */
+const W0: f64 = 4.18938533204672725052e-01; /* 0x3FDACFE3, 0x90C97D69 */
+const W1: f64 = 8.33333333333329678849e-02; /* 0x3FB55555, 0x5555553B */
+const W2: f64 = -2.77777777728775536470e-03; /* 0xBF66C16C, 0x16B02E5C */
+const W3: f64 = 7.93650558643019558500e-04; /* 0x3F4A019F, 0x98CF38B6 */
+const W4: f64 = -5.95187557450339963135e-04; /* 0xBF4380CB, 0x8C0FE741 */
+const W5: f64 = 8.36339918996282139126e-04; /* 0x3F4B67BA, 0x4CDAD5D1 */
+const W6: f64 = -1.63092934096575273989e-03; /* 0xBF5AB89D, 0x0B9E43E4 */
+
+/* sin(PI*x) assuming x > 2^-100, if sin(PI*x)==0 the sign is arbitrary */
+fn sin_pi(mut x: f64) -> f64 {
+    let mut n: i32;
+
+    /* spurious inexact if odd int */
+    x = 2.0 * (x * 0.5 - floor(x * 0.5)); /* x mod 2.0 */
+
+    n = (x * 4.0) as i32;
+    n = div!(n + 1, 2);
+    x -= (n as f64) * 0.5;
+    x *= PI;
+
+    match n {
+        1 => k_cos(x, 0.0),
+        2 => k_sin(-x, 0.0, 0),
+        3 => -k_cos(x, 0.0),
+        // 0
+        _ => k_sin(x, 0.0, 0),
+    }
+}
+
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn lgamma_r(mut x: f64) -> (f64, i32) {
+    let u: u64 = x.to_bits();
+    let mut t: f64;
+    let y: f64;
+    let mut z: f64;
+    let nadj: f64;
+    let p: f64;
+    let p1: f64;
+    let p2: f64;
+    let p3: f64;
+    let q: f64;
+    let mut r: f64;
+    let w: f64;
+    let ix: u32;
+    let sign: bool;
+    let i: i32;
+    let mut signgam: i32;
+
+    /* purge off +-inf, NaN, +-0, tiny and negative arguments */
+    signgam = 1;
+    sign = (u >> 63) != 0;
+    ix = ((u >> 32) as u32) & 0x7fffffff;
+    if ix >= 0x7ff00000 {
+        return (x * x, signgam);
+    }
+    if ix < (0x3ff - 70) << 20 {
+        /* |x|<2**-70, return -log(|x|) */
+        if sign {
+            x = -x;
+            signgam = -1;
+        }
+        return (-log(x), signgam);
+    }
+    if sign {
+        x = -x;
+        t = sin_pi(x);
+        if t == 0.0 {
+            /* -integer */
+            return (1.0 / (x - x), signgam);
+        }
+        if t > 0.0 {
+            signgam = -1;
+        } else {
+            t = -t;
+        }
+        nadj = log(PI / (t * x));
+    } else {
+        nadj = 0.0;
+    }
+
+    /* purge off 1 and 2 */
+    if (ix == 0x3ff00000 || ix == 0x40000000) && (u & 0xffffffff) == 0 {
+        r = 0.0;
+    }
+    /* for x < 2.0 */
+    else if ix < 0x40000000 {
+        if ix <= 0x3feccccc {
+            /* lgamma(x) = lgamma(x+1)-log(x) */
+            r = -log(x);
+            if ix >= 0x3FE76944 {
+                y = 1.0 - x;
+                i = 0;
+            } else if ix >= 0x3FCDA661 {
+                y = x - (TC - 1.0);
+                i = 1;
+            } else {
+                y = x;
+                i = 2;
+            }
+        } else {
+            r = 0.0;
+            if ix >= 0x3FFBB4C3 {
+                /* [1.7316,2] */
+                y = 2.0 - x;
+                i = 0;
+            } else if ix >= 0x3FF3B4C4 {
+                /* [1.23,1.73] */
+                y = x - TC;
+                i = 1;
+            } else {
+                y = x - 1.0;
+                i = 2;
+            }
+        }
+        match i {
+            0 => {
+                z = y * y;
+                p1 = A0 + z * (A2 + z * (A4 + z * (A6 + z * (A8 + z * A10))));
+                p2 = z * (A1 + z * (A3 + z * (A5 + z * (A7 + z * (A9 + z * A11)))));
+                p = y * p1 + p2;
+                r += p - 0.5 * y;
+            }
+            1 => {
+                z = y * y;
+                w = z * y;
+                p1 = T0 + w * (T3 + w * (T6 + w * (T9 + w * T12))); /* parallel comp */
+                p2 = T1 + w * (T4 + w * (T7 + w * (T10 + w * T13)));
+                p3 = T2 + w * (T5 + w * (T8 + w * (T11 + w * T14)));
+                p = z * p1 - (TT - w * (p2 + y * p3));
+                r += TF + p;
+            }
+            2 => {
+                p1 = y * (U0 + y * (U1 + y * (U2 + y * (U3 + y * (U4 + y * U5)))));
+                p2 = 1.0 + y * (V1 + y * (V2 + y * (V3 + y * (V4 + y * V5))));
+                r += -0.5 * y + p1 / p2;
+            }
+            #[cfg(debug_assertions)]
+            _ => unreachable!(),
+            #[cfg(not(debug_assertions))]
+            _ => {}
+        }
+    } else if ix < 0x40200000 {
+        /* x < 8.0 */
+        i = x as i32;
+        y = x - (i as f64);
+        p = y * (S0 + y * (S1 + y * (S2 + y * (S3 + y * (S4 + y * (S5 + y * S6))))));
+        q = 1.0 + y * (R1 + y * (R2 + y * (R3 + y * (R4 + y * (R5 + y * R6)))));
+        r = 0.5 * y + p / q;
+        z = 1.0; /* lgamma(1+s) = log(s) + lgamma(s) */
+        // TODO: In C, this was implemented using switch jumps with fallthrough.
+        // Does this implementation have performance problems?
+        if i >= 7 {
+            z *= y + 6.0;
+        }
+        if i >= 6 {
+            z *= y + 5.0;
+        }
+        if i >= 5 {
+            z *= y + 4.0;
+        }
+        if i >= 4 {
+            z *= y + 3.0;
+        }
+        if i >= 3 {
+            z *= y + 2.0;
+            r += log(z);
+        }
+    } else if ix < 0x43900000 {
+        /* 8.0 <= x < 2**58 */
+        t = log(x);
+        z = 1.0 / x;
+        y = z * z;
+        w = W0 + z * (W1 + y * (W2 + y * (W3 + y * (W4 + y * (W5 + y * W6)))));
+        r = (x - 0.5) * (t - 1.0) + w;
+    } else {
+        /* 2**58 <= x <= inf */
+        r = x * (log(x) - 1.0);
+    }
+    if sign {
+        r = nadj - r;
+    }
+    return (r, signgam);
+}
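A quick way to see the negative-argument path above in action is the reflection identity Gamma(x) * Gamma(1-x) = pi / sin(pi*x), which lgamma_r applies through `nadj`. The sketch below is not part of this patch; it goes through the crate's public API, and the tolerance is an assumption chosen far looser than the ~1-2 ulp accuracy of the routines involved.

    // Illustrative check of the reflection identity used for x < 0:
    //   lgamma(x) + lgamma(1 - x) == ln(pi / |sin(pi * x)|)   (x not an integer)
    fn main() {
        let x = 0.3_f64;
        let (lg_x, _sign_x) = libm::lgamma_r(x);
        let (lg_1mx, _sign_1mx) = libm::lgamma_r(1.0 - x);
        let pi = core::f64::consts::PI;
        let rhs = libm::log(pi / libm::fabs(libm::sin(pi * x)));
        // loose absolute tolerance (an assumption, not a documented bound)
        assert!(libm::fabs(lg_x + lg_1mx - rhs) < 1e-12);
    }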
diff --git a/library/compiler-builtins/libm/src/math/lgammaf.rs b/library/compiler-builtins/libm/src/math/lgammaf.rs
new file mode 100644
index 00000000000..d37512397cb
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/lgammaf.rs
@@ -0,0 +1,8 @@
+use super::lgammaf_r;
+
+/// The natural logarithm of the
+/// [Gamma function](https://en.wikipedia.org/wiki/Gamma_function) (f32).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn lgammaf(x: f32) -> f32 {
+    lgammaf_r(x).0
+}
diff --git a/library/compiler-builtins/libm/src/math/lgammaf_r.rs b/library/compiler-builtins/libm/src/math/lgammaf_r.rs
new file mode 100644
index 00000000000..10cecee541c
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/lgammaf_r.rs
@@ -0,0 +1,256 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/e_lgammaf_r.c */
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
+ */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+use super::{floorf, k_cosf, k_sinf, logf};
+
+const PI: f32 = 3.1415927410e+00; /* 0x40490fdb */
+const A0: f32 = 7.7215664089e-02; /* 0x3d9e233f */
+const A1: f32 = 3.2246702909e-01; /* 0x3ea51a66 */
+const A2: f32 = 6.7352302372e-02; /* 0x3d89f001 */
+const A3: f32 = 2.0580807701e-02; /* 0x3ca89915 */
+const A4: f32 = 7.3855509982e-03; /* 0x3bf2027e */
+const A5: f32 = 2.8905137442e-03; /* 0x3b3d6ec6 */
+const A6: f32 = 1.1927076848e-03; /* 0x3a9c54a1 */
+const A7: f32 = 5.1006977446e-04; /* 0x3a05b634 */
+const A8: f32 = 2.2086278477e-04; /* 0x39679767 */
+const A9: f32 = 1.0801156895e-04; /* 0x38e28445 */
+const A10: f32 = 2.5214456400e-05; /* 0x37d383a2 */
+const A11: f32 = 4.4864096708e-05; /* 0x383c2c75 */
+const TC: f32 = 1.4616321325e+00; /* 0x3fbb16c3 */
+const TF: f32 = -1.2148628384e-01; /* 0xbdf8cdcd */
+/* TT = -(tail of TF) */
+const TT: f32 = 6.6971006518e-09; /* 0x31e61c52 */
+const T0: f32 = 4.8383611441e-01; /* 0x3ef7b95e */
+const T1: f32 = -1.4758771658e-01; /* 0xbe17213c */
+const T2: f32 = 6.4624942839e-02; /* 0x3d845a15 */
+const T3: f32 = -3.2788541168e-02; /* 0xbd064d47 */
+const T4: f32 = 1.7970675603e-02; /* 0x3c93373d */
+const T5: f32 = -1.0314224288e-02; /* 0xbc28fcfe */
+const T6: f32 = 6.1005386524e-03; /* 0x3bc7e707 */
+const T7: f32 = -3.6845202558e-03; /* 0xbb7177fe */
+const T8: f32 = 2.2596477065e-03; /* 0x3b141699 */
+const T9: f32 = -1.4034647029e-03; /* 0xbab7f476 */
+const T10: f32 = 8.8108185446e-04; /* 0x3a66f867 */
+const T11: f32 = -5.3859531181e-04; /* 0xba0d3085 */
+const T12: f32 = 3.1563205994e-04; /* 0x39a57b6b */
+const T13: f32 = -3.1275415677e-04; /* 0xb9a3f927 */
+const T14: f32 = 3.3552918467e-04; /* 0x39afe9f7 */
+const U0: f32 = -7.7215664089e-02; /* 0xbd9e233f */
+const U1: f32 = 6.3282704353e-01; /* 0x3f2200f4 */
+const U2: f32 = 1.4549225569e+00; /* 0x3fba3ae7 */
+const U3: f32 = 9.7771751881e-01; /* 0x3f7a4bb2 */
+const U4: f32 = 2.2896373272e-01; /* 0x3e6a7578 */
+const U5: f32 = 1.3381091878e-02; /* 0x3c5b3c5e */
+const V1: f32 = 2.4559779167e+00; /* 0x401d2ebe */
+const V2: f32 = 2.1284897327e+00; /* 0x4008392d */
+const V3: f32 = 7.6928514242e-01; /* 0x3f44efdf */
+const V4: f32 = 1.0422264785e-01; /* 0x3dd572af */
+const V5: f32 = 3.2170924824e-03; /* 0x3b52d5db */
+const S0: f32 = -7.7215664089e-02; /* 0xbd9e233f */
+const S1: f32 = 2.1498242021e-01; /* 0x3e5c245a */
+const S2: f32 = 3.2577878237e-01; /* 0x3ea6cc7a */
+const S3: f32 = 1.4635047317e-01; /* 0x3e15dce6 */
+const S4: f32 = 2.6642270386e-02; /* 0x3cda40e4 */
+const S5: f32 = 1.8402845599e-03; /* 0x3af135b4 */
+const S6: f32 = 3.1947532989e-05; /* 0x3805ff67 */
+const R1: f32 = 1.3920053244e+00; /* 0x3fb22d3b */
+const R2: f32 = 7.2193557024e-01; /* 0x3f38d0c5 */
+const R3: f32 = 1.7193385959e-01; /* 0x3e300f6e */
+const R4: f32 = 1.8645919859e-02; /* 0x3c98bf54 */
+const R5: f32 = 7.7794247773e-04; /* 0x3a4beed6 */
+const R6: f32 = 7.3266842264e-06; /* 0x36f5d7bd */
+const W0: f32 = 4.1893854737e-01; /* 0x3ed67f1d */
+const W1: f32 = 8.3333335817e-02; /* 0x3daaaaab */
+const W2: f32 = -2.7777778450e-03; /* 0xbb360b61 */
+const W3: f32 = 7.9365057172e-04; /* 0x3a500cfd */
+const W4: f32 = -5.9518753551e-04; /* 0xba1c065c */
+const W5: f32 = 8.3633989561e-04; /* 0x3a5b3dd2 */
+const W6: f32 = -1.6309292987e-03; /* 0xbad5c4e8 */
+
+/* sin(PI*x) assuming x > 2^-100, if sin(PI*x)==0 the sign is arbitrary */
+fn sin_pi(mut x: f32) -> f32 {
+    let mut y: f64;
+    let mut n: isize;
+
+    /* spurious inexact if odd int */
+    x = 2.0 * (x * 0.5 - floorf(x * 0.5)); /* x mod 2.0 */
+
+    n = (x * 4.0) as isize;
+    n = div!(n + 1, 2);
+    y = (x as f64) - (n as f64) * 0.5;
+    y *= 3.14159265358979323846;
+    match n {
+        1 => k_cosf(y),
+        2 => k_sinf(-y),
+        3 => -k_cosf(y),
+        // 0
+        _ => k_sinf(y),
+    }
+}
+
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn lgammaf_r(mut x: f32) -> (f32, i32) {
+    let u = x.to_bits();
+    let mut t: f32;
+    let y: f32;
+    let mut z: f32;
+    let nadj: f32;
+    let p: f32;
+    let p1: f32;
+    let p2: f32;
+    let p3: f32;
+    let q: f32;
+    let mut r: f32;
+    let w: f32;
+    let ix: u32;
+    let i: i32;
+    let sign: bool;
+    let mut signgam: i32;
+
+    /* purge off +-inf, NaN, +-0, tiny and negative arguments */
+    signgam = 1;
+    sign = (u >> 31) != 0;
+    ix = u & 0x7fffffff;
+    if ix >= 0x7f800000 {
+        return (x * x, signgam);
+    }
+    if ix < 0x35000000 {
+        /* |x| < 2**-21, return -log(|x|) */
+        if sign {
+            signgam = -1;
+            x = -x;
+        }
+        return (-logf(x), signgam);
+    }
+    if sign {
+        x = -x;
+        t = sin_pi(x);
+        if t == 0.0 {
+            /* -integer */
+            return (1.0 / (x - x), signgam);
+        }
+        if t > 0.0 {
+            signgam = -1;
+        } else {
+            t = -t;
+        }
+        nadj = logf(PI / (t * x));
+    } else {
+        nadj = 0.0;
+    }
+
+    /* purge off 1 and 2 */
+    if ix == 0x3f800000 || ix == 0x40000000 {
+        r = 0.0;
+    }
+    /* for x < 2.0 */
+    else if ix < 0x40000000 {
+        if ix <= 0x3f666666 {
+            /* lgamma(x) = lgamma(x+1)-log(x) */
+            r = -logf(x);
+            if ix >= 0x3f3b4a20 {
+                y = 1.0 - x;
+                i = 0;
+            } else if ix >= 0x3e6d3308 {
+                y = x - (TC - 1.0);
+                i = 1;
+            } else {
+                y = x;
+                i = 2;
+            }
+        } else {
+            r = 0.0;
+            if ix >= 0x3fdda618 {
+                /* [1.7316,2] */
+                y = 2.0 - x;
+                i = 0;
+            } else if ix >= 0x3F9da620 {
+                /* [1.23,1.73] */
+                y = x - TC;
+                i = 1;
+            } else {
+                y = x - 1.0;
+                i = 2;
+            }
+        }
+        match i {
+            0 => {
+                z = y * y;
+                p1 = A0 + z * (A2 + z * (A4 + z * (A6 + z * (A8 + z * A10))));
+                p2 = z * (A1 + z * (A3 + z * (A5 + z * (A7 + z * (A9 + z * A11)))));
+                p = y * p1 + p2;
+                r += p - 0.5 * y;
+            }
+            1 => {
+                z = y * y;
+                w = z * y;
+                p1 = T0 + w * (T3 + w * (T6 + w * (T9 + w * T12))); /* parallel comp */
+                p2 = T1 + w * (T4 + w * (T7 + w * (T10 + w * T13)));
+                p3 = T2 + w * (T5 + w * (T8 + w * (T11 + w * T14)));
+                p = z * p1 - (TT - w * (p2 + y * p3));
+                r += TF + p;
+            }
+            2 => {
+                p1 = y * (U0 + y * (U1 + y * (U2 + y * (U3 + y * (U4 + y * U5)))));
+                p2 = 1.0 + y * (V1 + y * (V2 + y * (V3 + y * (V4 + y * V5))));
+                r += -0.5 * y + p1 / p2;
+            }
+            #[cfg(debug_assertions)]
+            _ => unreachable!(),
+            #[cfg(not(debug_assertions))]
+            _ => {}
+        }
+    } else if ix < 0x41000000 {
+        /* x < 8.0 */
+        i = x as i32;
+        y = x - (i as f32);
+        p = y * (S0 + y * (S1 + y * (S2 + y * (S3 + y * (S4 + y * (S5 + y * S6))))));
+        q = 1.0 + y * (R1 + y * (R2 + y * (R3 + y * (R4 + y * (R5 + y * R6)))));
+        r = 0.5 * y + p / q;
+        z = 1.0; /* lgamma(1+s) = log(s) + lgamma(s) */
+        // TODO: In C, this was implemented using switch jumps with fallthrough.
+        // Does this implementation have performance problems?
+        if i >= 7 {
+            z *= y + 6.0;
+        }
+        if i >= 6 {
+            z *= y + 5.0;
+        }
+        if i >= 5 {
+            z *= y + 4.0;
+        }
+        if i >= 4 {
+            z *= y + 3.0;
+        }
+        if i >= 3 {
+            z *= y + 2.0;
+            r += logf(z);
+        }
+    } else if ix < 0x5c800000 {
+        /* 8.0 <= x < 2**58 */
+        t = logf(x);
+        z = 1.0 / x;
+        y = z * z;
+        w = W0 + z * (W1 + y * (W2 + y * (W3 + y * (W4 + y * (W5 + y * W6)))));
+        r = (x - 0.5) * (t - 1.0) + w;
+    } else {
+        /* 2**58 <= x <= inf */
+        r = x * (logf(x) - 1.0);
+    }
+    if sign {
+        r = nadj - r;
+    }
+    return (r, signgam);
+}
diff --git a/library/compiler-builtins/libm/src/math/log.rs b/library/compiler-builtins/libm/src/math/log.rs
new file mode 100644
index 00000000000..f2dc47ec5cc
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/log.rs
@@ -0,0 +1,118 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/e_log.c */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunSoft, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+/* log(x)
+ * Return the logarithm of x
+ *
+ * Method :
+ *   1. Argument Reduction: find k and f such that
+ *                      x = 2^k * (1+f),
+ *         where  sqrt(2)/2 < 1+f < sqrt(2) .
+ *
+ *   2. Approximation of log(1+f).
+ *      Let s = f/(2+f) ; based on log(1+f) = log(1+s) - log(1-s)
+ *               = 2s + 2/3 s**3 + 2/5 s**5 + .....,
+ *               = 2s + s*R
+ *      We use a special Remez algorithm on [0,0.1716] to generate
+ *      a polynomial of degree 14 to approximate R. The maximum error
+ *      of this polynomial approximation is bounded by 2**-58.45. In
+ *      other words,
+ *                      2      4      6      8      10      12      14
+ *          R(z) ~ Lg1*s +Lg2*s +Lg3*s +Lg4*s +Lg5*s  +Lg6*s  +Lg7*s
+ *      (the values of Lg1 to Lg7 are listed in the program)
+ *      and
+ *          |      2          14          |     -58.45
+ *          | Lg1*s +...+Lg7*s    -  R(z) | <= 2
+ *          |                             |
+ *      Note that 2s = f - s*f = f - hfsq + s*hfsq, where hfsq = f*f/2.
+ *      In order to guarantee error in log below 1ulp, we compute log
+ *      by
+ *              log(1+f) = f - s*(f - R)        (if f is not too large)
+ *              log(1+f) = f - (hfsq - s*(hfsq+R)).     (better accuracy)
+ *
+ *      3. Finally,  log(x) = k*ln2 + log(1+f).
+ *                          = k*ln2_hi+(f-(hfsq-(s*(hfsq+R)+k*ln2_lo)))
+ *         Here ln2 is split into two floating point number:
+ *                      ln2_hi + ln2_lo,
+ *         where n*ln2_hi is always exact for |n| < 2000.
+ *
+ * Special cases:
+ *      log(x) is NaN with signal if x < 0 (including -INF) ;
+ *      log(+INF) is +INF; log(0) is -INF with signal;
+ *      log(NaN) is that NaN with no signal.
+ *
+ * Accuracy:
+ *      according to an error analysis, the error is always less than
+ *      1 ulp (unit in the last place).
+ *
+ * Constants:
+ * The hexadecimal values are the intended ones for the following
+ * constants. The decimal values may be used, provided that the
+ * compiler will convert from decimal to binary accurately enough
+ * to produce the hexadecimal values shown.
+ */
+
+const LN2_HI: f64 = 6.93147180369123816490e-01; /* 3fe62e42 fee00000 */
+const LN2_LO: f64 = 1.90821492927058770002e-10; /* 3dea39ef 35793c76 */
+const LG1: f64 = 6.666666666666735130e-01; /* 3FE55555 55555593 */
+const LG2: f64 = 3.999999999940941908e-01; /* 3FD99999 9997FA04 */
+const LG3: f64 = 2.857142874366239149e-01; /* 3FD24924 94229359 */
+const LG4: f64 = 2.222219843214978396e-01; /* 3FCC71C5 1D8E78AF */
+const LG5: f64 = 1.818357216161805012e-01; /* 3FC74664 96CB03DE */
+const LG6: f64 = 1.531383769920937332e-01; /* 3FC39A09 D078C69F */
+const LG7: f64 = 1.479819860511658591e-01; /* 3FC2F112 DF3E5244 */
+
+/// The natural logarithm of `x` (f64).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn log(mut x: f64) -> f64 {
+    let x1p54 = f64::from_bits(0x4350000000000000); // 0x1p54 === 2 ^ 54
+
+    let mut ui = x.to_bits();
+    let mut hx: u32 = (ui >> 32) as u32;
+    let mut k: i32 = 0;
+
+    if (hx < 0x00100000) || ((hx >> 31) != 0) {
+        /* x < 2**-1022  */
+        if ui << 1 == 0 {
+            return -1. / (x * x); /* log(+-0)=-inf */
+        }
+        if hx >> 31 != 0 {
+            return (x - x) / 0.0; /* log(-#) = NaN */
+        }
+        /* subnormal number, scale x up */
+        k -= 54;
+        x *= x1p54;
+        ui = x.to_bits();
+        hx = (ui >> 32) as u32;
+    } else if hx >= 0x7ff00000 {
+        return x;
+    } else if hx == 0x3ff00000 && ui << 32 == 0 {
+        return 0.;
+    }
+
+    /* reduce x into [sqrt(2)/2, sqrt(2)] */
+    hx += 0x3ff00000 - 0x3fe6a09e;
+    k += ((hx >> 20) as i32) - 0x3ff;
+    hx = (hx & 0x000fffff) + 0x3fe6a09e;
+    ui = ((hx as u64) << 32) | (ui & 0xffffffff);
+    x = f64::from_bits(ui);
+
+    let f: f64 = x - 1.0;
+    let hfsq: f64 = 0.5 * f * f;
+    let s: f64 = f / (2.0 + f);
+    let z: f64 = s * s;
+    let w: f64 = z * z;
+    let t1: f64 = w * (LG2 + w * (LG4 + w * LG6));
+    let t2: f64 = z * (LG1 + w * (LG3 + w * (LG5 + w * LG7)));
+    let r: f64 = t2 + t1;
+    let dk: f64 = k as f64;
+    s * (hfsq + r) + dk * LN2_LO - hfsq + f + dk * LN2_HI
+}
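The argument-reduction step described in the header comment can be exercised in isolation. The sketch below is not part of this patch; it repeats the same high-word arithmetic for a normal, positive input and shows that 10 reduces to k = 3 and 1 + f = 1.25, since 1.25 lies inside [sqrt(2)/2, sqrt(2)].

    // Illustrative sketch of step 1: find k and f with x = 2^k * (1 + f)
    // and sqrt(2)/2 < 1 + f < sqrt(2), for a normal positive x.
    fn reduce(x: f64) -> (i32, f64) {
        let bits = x.to_bits();
        let mut hx = (bits >> 32) as u32;
        hx += 0x3ff00000 - 0x3fe6a09e;       // mantissas >= sqrt(2) roll over into the next exponent
        let k = ((hx >> 20) as i32) - 0x3ff; // unbiased exponent of the reduced x
        hx = (hx & 0x000fffff) + 0x3fe6a09e; // rebuild a high word in [sqrt(2)/2, sqrt(2))
        let reduced = f64::from_bits(((hx as u64) << 32) | (bits & 0xffffffff));
        (k, reduced - 1.0)
    }

    fn main() {
        let (k, f) = reduce(10.0); // 10 = 2^3 * 1.25
        assert_eq!(k, 3);
        assert!((f - 0.25).abs() < 1e-15);
    }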
diff --git a/library/compiler-builtins/libm/src/math/log10.rs b/library/compiler-builtins/libm/src/math/log10.rs
new file mode 100644
index 00000000000..8c9d68c492d
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/log10.rs
@@ -0,0 +1,118 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/e_log10.c */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunSoft, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+/*
+ * Return the base 10 logarithm of x.  See log.c for most comments.
+ *
+ * Reduce x to 2^k (1+f) and calculate r = log(1+f) - f + f*f/2
+ * as in log.c, then combine and scale in extra precision:
+ *    log10(x) = (f - f*f/2 + r)/log(10) + k*log10(2)
+ */
+
+use core::f64;
+
+const IVLN10HI: f64 = 4.34294481878168880939e-01; /* 0x3fdbcb7b, 0x15200000 */
+const IVLN10LO: f64 = 2.50829467116452752298e-11; /* 0x3dbb9438, 0xca9aadd5 */
+const LOG10_2HI: f64 = 3.01029995663611771306e-01; /* 0x3FD34413, 0x509F6000 */
+const LOG10_2LO: f64 = 3.69423907715893078616e-13; /* 0x3D59FEF3, 0x11F12B36 */
+const LG1: f64 = 6.666666666666735130e-01; /* 3FE55555 55555593 */
+const LG2: f64 = 3.999999999940941908e-01; /* 3FD99999 9997FA04 */
+const LG3: f64 = 2.857142874366239149e-01; /* 3FD24924 94229359 */
+const LG4: f64 = 2.222219843214978396e-01; /* 3FCC71C5 1D8E78AF */
+const LG5: f64 = 1.818357216161805012e-01; /* 3FC74664 96CB03DE */
+const LG6: f64 = 1.531383769920937332e-01; /* 3FC39A09 D078C69F */
+const LG7: f64 = 1.479819860511658591e-01; /* 3FC2F112 DF3E5244 */
+
+/// The base 10 logarithm of `x` (f64).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn log10(mut x: f64) -> f64 {
+    let x1p54 = f64::from_bits(0x4350000000000000); // 0x1p54 === 2 ^ 54
+
+    let mut ui: u64 = x.to_bits();
+    let hfsq: f64;
+    let f: f64;
+    let s: f64;
+    let z: f64;
+    let r: f64;
+    let mut w: f64;
+    let t1: f64;
+    let t2: f64;
+    let dk: f64;
+    let y: f64;
+    let mut hi: f64;
+    let lo: f64;
+    let mut val_hi: f64;
+    let mut val_lo: f64;
+    let mut hx: u32;
+    let mut k: i32;
+
+    hx = (ui >> 32) as u32;
+    k = 0;
+    if hx < 0x00100000 || (hx >> 31) > 0 {
+        if ui << 1 == 0 {
+            return -1. / (x * x); /* log(+-0)=-inf */
+        }
+        if (hx >> 31) > 0 {
+            return (x - x) / 0.0; /* log(-#) = NaN */
+        }
+        /* subnormal number, scale x up */
+        k -= 54;
+        x *= x1p54;
+        ui = x.to_bits();
+        hx = (ui >> 32) as u32;
+    } else if hx >= 0x7ff00000 {
+        return x;
+    } else if hx == 0x3ff00000 && ui << 32 == 0 {
+        return 0.;
+    }
+
+    /* reduce x into [sqrt(2)/2, sqrt(2)] */
+    hx += 0x3ff00000 - 0x3fe6a09e;
+    k += (hx >> 20) as i32 - 0x3ff;
+    hx = (hx & 0x000fffff) + 0x3fe6a09e;
+    ui = ((hx as u64) << 32) | (ui & 0xffffffff);
+    x = f64::from_bits(ui);
+
+    f = x - 1.0;
+    hfsq = 0.5 * f * f;
+    s = f / (2.0 + f);
+    z = s * s;
+    w = z * z;
+    t1 = w * (LG2 + w * (LG4 + w * LG6));
+    t2 = z * (LG1 + w * (LG3 + w * (LG5 + w * LG7)));
+    r = t2 + t1;
+
+    /* See log2.c for details. */
+    /* hi+lo = f - hfsq + s*(hfsq+R) ~ log(1+f) */
+    hi = f - hfsq;
+    ui = hi.to_bits();
+    ui &= (-1i64 as u64) << 32;
+    hi = f64::from_bits(ui);
+    lo = f - hi - hfsq + s * (hfsq + r);
+
+    /* val_hi+val_lo ~ log10(1+f) + k*log10(2) */
+    val_hi = hi * IVLN10HI;
+    dk = k as f64;
+    y = dk * LOG10_2HI;
+    val_lo = dk * LOG10_2LO + (lo + hi) * IVLN10LO + lo * IVLN10HI;
+
+    /*
+     * Extra precision for adding y is not strictly needed
+     * since there is no very large cancellation near x = sqrt(2) or
+     * x = 1/sqrt(2), but we do it anyway since it costs little on CPUs
+     * with some parallelism and it reduces the error for many args.
+     */
+    w = y + val_hi;
+    val_lo += (y - w) + val_hi;
+    val_hi = w;
+
+    val_lo + val_hi
+}
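The hi/lo handling above (and in log2.rs) is the usual double-double style trick: `hi` keeps only the upper 32 bits of `f - hfsq`, so the product with the HI constant carries essentially no rounding error, and the discarded bits re-enter through `lo`. A minimal sketch of the split itself, not part of this patch:

    // Split v into hi (low 32 bits of the representation cleared) and lo = v - hi.
    // For a positive, normal v both the subtraction and the re-addition are exact.
    fn split_hi_lo(v: f64) -> (f64, f64) {
        let hi = f64::from_bits(v.to_bits() & ((-1i64 as u64) << 32));
        (hi, v - hi)
    }

    fn main() {
        let v = 0.123456789_f64;
        let (hi, lo) = split_hi_lo(v);
        assert_eq!(hi.to_bits() & 0xffff_ffff, 0);    // low word cleared
        assert_eq!((hi + lo).to_bits(), v.to_bits()); // round trip is exact
    }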
diff --git a/library/compiler-builtins/libm/src/math/log10f.rs b/library/compiler-builtins/libm/src/math/log10f.rs
new file mode 100644
index 00000000000..18bf8fcc832
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/log10f.rs
@@ -0,0 +1,92 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/e_log10f.c */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+/*
+ * See comments in log10.c.
+ */
+
+use core::f32;
+
+const IVLN10HI: f32 = 4.3432617188e-01; /* 0x3ede6000 */
+const IVLN10LO: f32 = -3.1689971365e-05; /* 0xb804ead9 */
+const LOG10_2HI: f32 = 3.0102920532e-01; /* 0x3e9a2080 */
+const LOG10_2LO: f32 = 7.9034151668e-07; /* 0x355427db */
+/* |(log(1+s)-log(1-s))/s - Lg(s)| < 2**-34.24 (~[-4.95e-11, 4.97e-11]). */
+const LG1: f32 = 0.66666662693; /* 0xaaaaaa.0p-24 */
+const LG2: f32 = 0.40000972152; /* 0xccce13.0p-25 */
+const LG3: f32 = 0.28498786688; /* 0x91e9ee.0p-25 */
+const LG4: f32 = 0.24279078841; /* 0xf89e26.0p-26 */
+
+/// The base 10 logarithm of `x` (f32).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn log10f(mut x: f32) -> f32 {
+    let x1p25f = f32::from_bits(0x4c000000); // 0x1p25f === 2 ^ 25
+
+    let mut ui: u32 = x.to_bits();
+    let hfsq: f32;
+    let f: f32;
+    let s: f32;
+    let z: f32;
+    let r: f32;
+    let w: f32;
+    let t1: f32;
+    let t2: f32;
+    let dk: f32;
+    let mut hi: f32;
+    let lo: f32;
+    let mut ix: u32;
+    let mut k: i32;
+
+    ix = ui;
+    k = 0;
+    if ix < 0x00800000 || (ix >> 31) > 0 {
+        /* x < 2**-126  */
+        if ix << 1 == 0 {
+            return -1. / (x * x); /* log(+-0)=-inf */
+        }
+        if (ix >> 31) > 0 {
+            return (x - x) / 0.0; /* log(-#) = NaN */
+        }
+        /* subnormal number, scale up x */
+        k -= 25;
+        x *= x1p25f;
+        ui = x.to_bits();
+        ix = ui;
+    } else if ix >= 0x7f800000 {
+        return x;
+    } else if ix == 0x3f800000 {
+        return 0.;
+    }
+
+    /* reduce x into [sqrt(2)/2, sqrt(2)] */
+    ix += 0x3f800000 - 0x3f3504f3;
+    k += (ix >> 23) as i32 - 0x7f;
+    ix = (ix & 0x007fffff) + 0x3f3504f3;
+    ui = ix;
+    x = f32::from_bits(ui);
+
+    f = x - 1.0;
+    s = f / (2.0 + f);
+    z = s * s;
+    w = z * z;
+    t1 = w * (LG2 + w * LG4);
+    t2 = z * (LG1 + w * LG3);
+    r = t2 + t1;
+    hfsq = 0.5 * f * f;
+
+    hi = f - hfsq;
+    ui = hi.to_bits();
+    ui &= 0xfffff000;
+    hi = f32::from_bits(ui);
+    lo = f - hi - hfsq + s * (hfsq + r);
+    dk = k as f32;
+    dk * LOG10_2LO + (lo + hi) * IVLN10LO + lo * IVLN10HI + hi * IVLN10HI + dk * LOG10_2HI
+}
diff --git a/library/compiler-builtins/libm/src/math/log1p.rs b/library/compiler-builtins/libm/src/math/log1p.rs
new file mode 100644
index 00000000000..65142c0d622
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/log1p.rs
@@ -0,0 +1,144 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/s_log1p.c */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+/* double log1p(double x)
+ * Return the natural logarithm of 1+x.
+ *
+ * Method :
+ *   1. Argument Reduction: find k and f such that
+ *                      1+x = 2^k * (1+f),
+ *         where  sqrt(2)/2 < 1+f < sqrt(2) .
+ *
+ *      Note. If k=0, then f=x is exact. However, if k!=0, then f
+ *      may not be representable exactly. In that case, a correction
+ *      term is needed. Let u=1+x rounded. Let c = (1+x)-u, then
+ *      log(1+x) - log(u) ~ c/u. Thus, we proceed to compute log(u),
+ *      and add back the correction term c/u.
+ *      (Note: when x > 2**53, one can simply return log(x))
+ *
+ *   2. Approximation of log(1+f): See log.c
+ *
+ *   3. Finally, log1p(x) = k*ln2 + log(1+f) + c/u. See log.c
+ *
+ * Special cases:
+ *      log1p(x) is NaN with signal if x < -1 (including -INF) ;
+ *      log1p(+INF) is +INF; log1p(-1) is -INF with signal;
+ *      log1p(NaN) is that NaN with no signal.
+ *
+ * Accuracy:
+ *      according to an error analysis, the error is always less than
+ *      1 ulp (unit in the last place).
+ *
+ * Constants:
+ * The hexadecimal values are the intended ones for the following
+ * constants. The decimal values may be used, provided that the
+ * compiler will convert from decimal to binary accurately enough
+ * to produce the hexadecimal values shown.
+ *
+ * Note: Assuming log() returns an accurate answer, the following
+ *       algorithm can be used to compute log1p(x) to within a few ULP:
+ *
+ *              u = 1+x;
+ *              if(u==1.0) return x ; else
+ *                         return log(u)*(x/(u-1.0));
+ *
+ *       See HP-15C Advanced Functions Handbook, p.193.
+ */
+
+use core::f64;
+
+const LN2_HI: f64 = 6.93147180369123816490e-01; /* 3fe62e42 fee00000 */
+const LN2_LO: f64 = 1.90821492927058770002e-10; /* 3dea39ef 35793c76 */
+const LG1: f64 = 6.666666666666735130e-01; /* 3FE55555 55555593 */
+const LG2: f64 = 3.999999999940941908e-01; /* 3FD99999 9997FA04 */
+const LG3: f64 = 2.857142874366239149e-01; /* 3FD24924 94229359 */
+const LG4: f64 = 2.222219843214978396e-01; /* 3FCC71C5 1D8E78AF */
+const LG5: f64 = 1.818357216161805012e-01; /* 3FC74664 96CB03DE */
+const LG6: f64 = 1.531383769920937332e-01; /* 3FC39A09 D078C69F */
+const LG7: f64 = 1.479819860511658591e-01; /* 3FC2F112 DF3E5244 */
+
+/// The natural logarithm of 1+`x` (f64).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn log1p(x: f64) -> f64 {
+    let mut ui: u64 = x.to_bits();
+    let hfsq: f64;
+    let mut f: f64 = 0.;
+    let mut c: f64 = 0.;
+    let s: f64;
+    let z: f64;
+    let r: f64;
+    let w: f64;
+    let t1: f64;
+    let t2: f64;
+    let dk: f64;
+    let hx: u32;
+    let mut hu: u32;
+    let mut k: i32;
+
+    hx = (ui >> 32) as u32;
+    k = 1;
+    if hx < 0x3fda827a || (hx >> 31) > 0 {
+        /* 1+x < sqrt(2)+ */
+        if hx >= 0xbff00000 {
+            /* x <= -1.0 */
+            if x == -1. {
+                return x / 0.0; /* log1p(-1) = -inf */
+            }
+            return (x - x) / 0.0; /* log1p(x<-1) = NaN */
+        }
+        if hx << 1 < 0x3ca00000 << 1 {
+            /* |x| < 2**-53 */
+            /* underflow if subnormal */
+            if (hx & 0x7ff00000) == 0 {
+                force_eval!(x as f32);
+            }
+            return x;
+        }
+        if hx <= 0xbfd2bec4 {
+            /* sqrt(2)/2- <= 1+x < sqrt(2)+ */
+            k = 0;
+            c = 0.;
+            f = x;
+        }
+    } else if hx >= 0x7ff00000 {
+        return x;
+    }
+    if k > 0 {
+        ui = (1. + x).to_bits();
+        hu = (ui >> 32) as u32;
+        hu += 0x3ff00000 - 0x3fe6a09e;
+        k = (hu >> 20) as i32 - 0x3ff;
+        /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */
+        if k < 54 {
+            c = if k >= 2 {
+                1. - (f64::from_bits(ui) - x)
+            } else {
+                x - (f64::from_bits(ui) - 1.)
+            };
+            c /= f64::from_bits(ui);
+        } else {
+            c = 0.;
+        }
+        /* reduce u into [sqrt(2)/2, sqrt(2)] */
+        hu = (hu & 0x000fffff) + 0x3fe6a09e;
+        ui = ((hu as u64) << 32) | (ui & 0xffffffff);
+        f = f64::from_bits(ui) - 1.;
+    }
+    hfsq = 0.5 * f * f;
+    s = f / (2.0 + f);
+    z = s * s;
+    w = z * z;
+    t1 = w * (LG2 + w * (LG4 + w * LG6));
+    t2 = z * (LG1 + w * (LG3 + w * (LG5 + w * LG7)));
+    r = t2 + t1;
+    dk = k as f64;
+    s * (hfsq + r) + (dk * LN2_LO + c) - hfsq + f + dk * LN2_HI
+}
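The HP-15C fallback quoted in the header comment is easy to express directly; the sketch below, which is not part of this patch, compares it against the routine above through the crate's public API. It is only good to a few ulp, which is exactly why the full argument-reduction version is used here; the tolerance is a deliberately loose assumption.

    // The naive-but-corrected formula from the header comment:
    //   u = 1 + x; if u == 1 return x, else return log(u) * (x / (u - 1))
    fn log1p_naive(x: f64) -> f64 {
        let u = 1.0 + x;
        if u == 1.0 { x } else { libm::log(u) * (x / (u - 1.0)) }
    }

    fn main() {
        let x = 1.0e-10_f64;
        let accurate = libm::log1p(x);
        // both agree to within a few ulp here; 1e-16 is a loose absolute bound
        assert!(libm::fabs(log1p_naive(x) - accurate) < 1e-16);
    }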
diff --git a/library/compiler-builtins/libm/src/math/log1pf.rs b/library/compiler-builtins/libm/src/math/log1pf.rs
new file mode 100644
index 00000000000..23978e61c3c
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/log1pf.rs
@@ -0,0 +1,99 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/s_log1pf.c */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+use core::f32;
+
+const LN2_HI: f32 = 6.9313812256e-01; /* 0x3f317180 */
+const LN2_LO: f32 = 9.0580006145e-06; /* 0x3717f7d1 */
+/* |(log(1+s)-log(1-s))/s - Lg(s)| < 2**-34.24 (~[-4.95e-11, 4.97e-11]). */
+const LG1: f32 = 0.66666662693; /* 0xaaaaaa.0p-24 */
+const LG2: f32 = 0.40000972152; /* 0xccce13.0p-25 */
+const LG3: f32 = 0.28498786688; /* 0x91e9ee.0p-25 */
+const LG4: f32 = 0.24279078841; /* 0xf89e26.0p-26 */
+
+/// The natural logarithm of 1+`x` (f32).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn log1pf(x: f32) -> f32 {
+    let mut ui: u32 = x.to_bits();
+    let hfsq: f32;
+    let mut f: f32 = 0.;
+    let mut c: f32 = 0.;
+    let s: f32;
+    let z: f32;
+    let r: f32;
+    let w: f32;
+    let t1: f32;
+    let t2: f32;
+    let dk: f32;
+    let ix: u32;
+    let mut iu: u32;
+    let mut k: i32;
+
+    ix = ui;
+    k = 1;
+    if ix < 0x3ed413d0 || (ix >> 31) > 0 {
+        /* 1+x < sqrt(2)+  */
+        if ix >= 0xbf800000 {
+            /* x <= -1.0 */
+            if x == -1. {
+                return x / 0.0; /* log1p(-1) = -inf */
+            }
+            return (x - x) / 0.0; /* log1p(x<-1)=NaN */
+        }
+        if ix << 1 < 0x33800000 << 1 {
+            /* |x| < 2**-24 */
+            /* underflow if subnormal */
+            if (ix & 0x7f800000) == 0 {
+                force_eval!(x * x);
+            }
+            return x;
+        }
+        if ix <= 0xbe95f619 {
+            /* sqrt(2)/2- <= 1+x < sqrt(2)+ */
+            k = 0;
+            c = 0.;
+            f = x;
+        }
+    } else if ix >= 0x7f800000 {
+        return x;
+    }
+    if k > 0 {
+        ui = (1. + x).to_bits();
+        iu = ui;
+        iu += 0x3f800000 - 0x3f3504f3;
+        k = (iu >> 23) as i32 - 0x7f;
+        /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */
+        if k < 25 {
+            c = if k >= 2 {
+                1. - (f32::from_bits(ui) - x)
+            } else {
+                x - (f32::from_bits(ui) - 1.)
+            };
+            c /= f32::from_bits(ui);
+        } else {
+            c = 0.;
+        }
+        /* reduce u into [sqrt(2)/2, sqrt(2)] */
+        iu = (iu & 0x007fffff) + 0x3f3504f3;
+        ui = iu;
+        f = f32::from_bits(ui) - 1.;
+    }
+    s = f / (2.0 + f);
+    z = s * s;
+    w = z * z;
+    t1 = w * (LG2 + w * LG4);
+    t2 = z * (LG1 + w * LG3);
+    r = t2 + t1;
+    hfsq = 0.5 * f * f;
+    dk = k as f32;
+    s * (hfsq + r) + (dk * LN2_LO + c) - hfsq + f + dk * LN2_HI
+}
diff --git a/library/compiler-builtins/libm/src/math/log2.rs b/library/compiler-builtins/libm/src/math/log2.rs
new file mode 100644
index 00000000000..701f63c25e7
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/log2.rs
@@ -0,0 +1,107 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/e_log2.c */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunSoft, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+/*
+ * Return the base 2 logarithm of x.  See log.c for most comments.
+ *
+ * Reduce x to 2^k (1+f) and calculate r = log(1+f) - f + f*f/2
+ * as in log.c, then combine and scale in extra precision:
+ *    log2(x) = (f - f*f/2 + r)/log(2) + k
+ */
+
+use core::f64;
+
+const IVLN2HI: f64 = 1.44269504072144627571e+00; /* 0x3ff71547, 0x65200000 */
+const IVLN2LO: f64 = 1.67517131648865118353e-10; /* 0x3de705fc, 0x2eefa200 */
+const LG1: f64 = 6.666666666666735130e-01; /* 3FE55555 55555593 */
+const LG2: f64 = 3.999999999940941908e-01; /* 3FD99999 9997FA04 */
+const LG3: f64 = 2.857142874366239149e-01; /* 3FD24924 94229359 */
+const LG4: f64 = 2.222219843214978396e-01; /* 3FCC71C5 1D8E78AF */
+const LG5: f64 = 1.818357216161805012e-01; /* 3FC74664 96CB03DE */
+const LG6: f64 = 1.531383769920937332e-01; /* 3FC39A09 D078C69F */
+const LG7: f64 = 1.479819860511658591e-01; /* 3FC2F112 DF3E5244 */
+
+/// The base 2 logarithm of `x` (f64).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn log2(mut x: f64) -> f64 {
+    let x1p54 = f64::from_bits(0x4350000000000000); // 0x1p54 === 2 ^ 54
+
+    let mut ui: u64 = x.to_bits();
+    let hfsq: f64;
+    let f: f64;
+    let s: f64;
+    let z: f64;
+    let r: f64;
+    let mut w: f64;
+    let t1: f64;
+    let t2: f64;
+    let y: f64;
+    let mut hi: f64;
+    let lo: f64;
+    let mut val_hi: f64;
+    let mut val_lo: f64;
+    let mut hx: u32;
+    let mut k: i32;
+
+    hx = (ui >> 32) as u32;
+    k = 0;
+    if hx < 0x00100000 || (hx >> 31) > 0 {
+        if ui << 1 == 0 {
+            return -1. / (x * x); /* log(+-0)=-inf */
+        }
+        if (hx >> 31) > 0 {
+            return (x - x) / 0.0; /* log(-#) = NaN */
+        }
+        /* subnormal number, scale x up */
+        k -= 54;
+        x *= x1p54;
+        ui = x.to_bits();
+        hx = (ui >> 32) as u32;
+    } else if hx >= 0x7ff00000 {
+        return x;
+    } else if hx == 0x3ff00000 && ui << 32 == 0 {
+        return 0.;
+    }
+
+    /* reduce x into [sqrt(2)/2, sqrt(2)] */
+    hx += 0x3ff00000 - 0x3fe6a09e;
+    k += (hx >> 20) as i32 - 0x3ff;
+    hx = (hx & 0x000fffff) + 0x3fe6a09e;
+    ui = ((hx as u64) << 32) | (ui & 0xffffffff);
+    x = f64::from_bits(ui);
+
+    f = x - 1.0;
+    hfsq = 0.5 * f * f;
+    s = f / (2.0 + f);
+    z = s * s;
+    w = z * z;
+    t1 = w * (LG2 + w * (LG4 + w * LG6));
+    t2 = z * (LG1 + w * (LG3 + w * (LG5 + w * LG7)));
+    r = t2 + t1;
+
+    /* hi+lo = f - hfsq + s*(hfsq+R) ~ log(1+f) */
+    hi = f - hfsq;
+    ui = hi.to_bits();
+    ui &= (-1i64 as u64) << 32;
+    hi = f64::from_bits(ui);
+    lo = f - hi - hfsq + s * (hfsq + r);
+
+    val_hi = hi * IVLN2HI;
+    val_lo = (lo + hi) * IVLN2LO + lo * IVLN2HI;
+
+    /* spadd(val_hi, val_lo, y), except for not using double_t: */
+    y = k.into();
+    w = y + val_hi;
+    val_lo += (y - w) + val_hi;
+    val_hi = w;
+
+    val_lo + val_hi
+}
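One consequence of this structure worth noting: when x is an exact power of two, the reduction yields f = 0, every polynomial and hi/lo term vanishes, and the result is exactly k. A tiny check, not part of this patch, via the crate's public API:

    fn main() {
        assert_eq!(libm::log2(8.0), 3.0);
        assert_eq!(libm::log2(0.25), -2.0);
        assert_eq!(libm::log2(1.0), 0.0);
    }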
diff --git a/library/compiler-builtins/libm/src/math/log2f.rs b/library/compiler-builtins/libm/src/math/log2f.rs
new file mode 100644
index 00000000000..5ba2427d1d4
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/log2f.rs
@@ -0,0 +1,88 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/e_log2f.c */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+/*
+ * See comments in log2.c.
+ */
+
+use core::f32;
+
+const IVLN2HI: f32 = 1.4428710938e+00; /* 0x3fb8b000 */
+const IVLN2LO: f32 = -1.7605285393e-04; /* 0xb9389ad4 */
+/* |(log(1+s)-log(1-s))/s - Lg(s)| < 2**-34.24 (~[-4.95e-11, 4.97e-11]). */
+const LG1: f32 = 0.66666662693; /* 0xaaaaaa.0p-24 */
+const LG2: f32 = 0.40000972152; /* 0xccce13.0p-25 */
+const LG3: f32 = 0.28498786688; /* 0x91e9ee.0p-25 */
+const LG4: f32 = 0.24279078841; /* 0xf89e26.0p-26 */
+
+/// The base 2 logarithm of `x` (f32).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn log2f(mut x: f32) -> f32 {
+    let x1p25f = f32::from_bits(0x4c000000); // 0x1p25f === 2 ^ 25
+
+    let mut ui: u32 = x.to_bits();
+    let hfsq: f32;
+    let f: f32;
+    let s: f32;
+    let z: f32;
+    let r: f32;
+    let w: f32;
+    let t1: f32;
+    let t2: f32;
+    let mut hi: f32;
+    let lo: f32;
+    let mut ix: u32;
+    let mut k: i32;
+
+    ix = ui;
+    k = 0;
+    if ix < 0x00800000 || (ix >> 31) > 0 {
+        /* x < 2**-126  */
+        if ix << 1 == 0 {
+            return -1. / (x * x); /* log(+-0)=-inf */
+        }
+        if (ix >> 31) > 0 {
+            return (x - x) / 0.0; /* log(-#) = NaN */
+        }
+        /* subnormal number, scale up x */
+        k -= 25;
+        x *= x1p25f;
+        ui = x.to_bits();
+        ix = ui;
+    } else if ix >= 0x7f800000 {
+        return x;
+    } else if ix == 0x3f800000 {
+        return 0.;
+    }
+
+    /* reduce x into [sqrt(2)/2, sqrt(2)] */
+    ix += 0x3f800000 - 0x3f3504f3;
+    k += (ix >> 23) as i32 - 0x7f;
+    ix = (ix & 0x007fffff) + 0x3f3504f3;
+    ui = ix;
+    x = f32::from_bits(ui);
+
+    f = x - 1.0;
+    s = f / (2.0 + f);
+    z = s * s;
+    w = z * z;
+    t1 = w * (LG2 + w * LG4);
+    t2 = z * (LG1 + w * LG3);
+    r = t2 + t1;
+    hfsq = 0.5 * f * f;
+
+    hi = f - hfsq;
+    ui = hi.to_bits();
+    ui &= 0xfffff000;
+    hi = f32::from_bits(ui);
+    lo = f - hi - hfsq + s * (hfsq + r);
+    (lo + hi) * IVLN2LO + lo * IVLN2HI + hi * IVLN2HI + k as f32
+}
diff --git a/library/compiler-builtins/libm/src/math/logf.rs b/library/compiler-builtins/libm/src/math/logf.rs
new file mode 100644
index 00000000000..68d1943025e
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/logf.rs
@@ -0,0 +1,66 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/e_logf.c */
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
+ */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+const LN2_HI: f32 = 6.9313812256e-01; /* 0x3f317180 */
+const LN2_LO: f32 = 9.0580006145e-06; /* 0x3717f7d1 */
+/* |(log(1+s)-log(1-s))/s - Lg(s)| < 2**-34.24 (~[-4.95e-11, 4.97e-11]). */
+const LG1: f32 = 0.66666662693; /*  0xaaaaaa.0p-24*/
+const LG2: f32 = 0.40000972152; /*  0xccce13.0p-25 */
+const LG3: f32 = 0.28498786688; /*  0x91e9ee.0p-25 */
+const LG4: f32 = 0.24279078841; /*  0xf89e26.0p-26 */
+
+/// The natural logarithm of `x` (f32).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn logf(mut x: f32) -> f32 {
+    let x1p25 = f32::from_bits(0x4c000000); // 0x1p25f === 2 ^ 25
+
+    let mut ix = x.to_bits();
+    let mut k = 0i32;
+
+    if (ix < 0x00800000) || ((ix >> 31) != 0) {
+        /* x < 2**-126  */
+        if ix << 1 == 0 {
+            return -1. / (x * x); /* log(+-0)=-inf */
+        }
+        if (ix >> 31) != 0 {
+            return (x - x) / 0.; /* log(-#) = NaN */
+        }
+        /* subnormal number, scale up x */
+        k -= 25;
+        x *= x1p25;
+        ix = x.to_bits();
+    } else if ix >= 0x7f800000 {
+        return x;
+    } else if ix == 0x3f800000 {
+        return 0.;
+    }
+
+    /* reduce x into [sqrt(2)/2, sqrt(2)] */
+    ix += 0x3f800000 - 0x3f3504f3;
+    k += ((ix >> 23) as i32) - 0x7f;
+    ix = (ix & 0x007fffff) + 0x3f3504f3;
+    x = f32::from_bits(ix);
+
+    let f = x - 1.;
+    let s = f / (2. + f);
+    let z = s * s;
+    let w = z * z;
+    let t1 = w * (LG2 + w * LG4);
+    let t2 = z * (LG1 + w * LG3);
+    let r = t2 + t1;
+    let hfsq = 0.5 * f * f;
+    let dk = k as f32;
+    s * (hfsq + r) + dk * LN2_LO - hfsq + f + dk * LN2_HI
+}
diff --git a/library/compiler-builtins/libm/src/math/mod.rs b/library/compiler-builtins/libm/src/math/mod.rs
new file mode 100644
index 00000000000..ce9b8fc58bb
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/mod.rs
@@ -0,0 +1,394 @@
+macro_rules! force_eval {
+    ($e:expr) => {
+        unsafe { ::core::ptr::read_volatile(&$e) }
+    };
+}
+
+#[cfg(not(debug_assertions))]
+macro_rules! i {
+    ($array:expr, $index:expr) => {
+        unsafe { *$array.get_unchecked($index) }
+    };
+    ($array:expr, $index:expr, = , $rhs:expr) => {
+        unsafe {
+            *$array.get_unchecked_mut($index) = $rhs;
+        }
+    };
+    ($array:expr, $index:expr, += , $rhs:expr) => {
+        unsafe {
+            *$array.get_unchecked_mut($index) += $rhs;
+        }
+    };
+    ($array:expr, $index:expr, -= , $rhs:expr) => {
+        unsafe {
+            *$array.get_unchecked_mut($index) -= $rhs;
+        }
+    };
+    ($array:expr, $index:expr, &= , $rhs:expr) => {
+        unsafe {
+            *$array.get_unchecked_mut($index) &= $rhs;
+        }
+    };
+    ($array:expr, $index:expr, == , $rhs:expr) => {
+        unsafe { *$array.get_unchecked_mut($index) == $rhs }
+    };
+}
+
+#[cfg(debug_assertions)]
+macro_rules! i {
+    ($array:expr, $index:expr) => {
+        *$array.get($index).unwrap()
+    };
+    ($array:expr, $index:expr, = , $rhs:expr) => {
+        *$array.get_mut($index).unwrap() = $rhs;
+    };
+    ($array:expr, $index:expr, -= , $rhs:expr) => {
+        *$array.get_mut($index).unwrap() -= $rhs;
+    };
+    ($array:expr, $index:expr, += , $rhs:expr) => {
+        *$array.get_mut($index).unwrap() += $rhs;
+    };
+    ($array:expr, $index:expr, &= , $rhs:expr) => {
+        *$array.get_mut($index).unwrap() &= $rhs;
+    };
+    ($array:expr, $index:expr, == , $rhs:expr) => {
+        *$array.get_mut($index).unwrap() == $rhs
+    };
+}
+
+// Temporary macro to avoid panic codegen for division (in debug mode too). At
+// the time of this writing this is only used in a few places, and once
+// rust-lang/rust#72751 is fixed then this macro will no longer be necessary and
+// the native `/` operator can be used and panics won't be codegen'd.
+#[cfg(any(debug_assertions, not(intrinsics_enabled)))]
+macro_rules! div {
+    ($a:expr, $b:expr) => {
+        $a / $b
+    };
+}
+
+#[cfg(all(not(debug_assertions), intrinsics_enabled))]
+macro_rules! div {
+    ($a:expr, $b:expr) => {
+        unsafe { core::intrinsics::unchecked_div($a, $b) }
+    };
+}
+
+// `support` may be public for testing
+#[macro_use]
+#[cfg(feature = "unstable-public-internals")]
+pub mod support;
+
+#[macro_use]
+#[cfg(not(feature = "unstable-public-internals"))]
+pub(crate) mod support;
+
+cfg_if! {
+    if #[cfg(feature = "unstable-public-internals")] {
+        pub mod generic;
+    } else {
+        mod generic;
+    }
+}
+
+// Private modules
+mod arch;
+mod expo2;
+mod k_cos;
+mod k_cosf;
+mod k_expo2;
+mod k_expo2f;
+mod k_sin;
+mod k_sinf;
+mod k_tan;
+mod k_tanf;
+mod rem_pio2;
+mod rem_pio2_large;
+mod rem_pio2f;
+
+// Private re-imports
+use self::expo2::expo2;
+use self::k_cos::k_cos;
+use self::k_cosf::k_cosf;
+use self::k_expo2::k_expo2;
+use self::k_expo2f::k_expo2f;
+use self::k_sin::k_sin;
+use self::k_sinf::k_sinf;
+use self::k_tan::k_tan;
+use self::k_tanf::k_tanf;
+use self::rem_pio2::rem_pio2;
+use self::rem_pio2_large::rem_pio2_large;
+use self::rem_pio2f::rem_pio2f;
+#[allow(unused_imports)]
+use self::support::{CastFrom, CastInto, DFloat, DInt, Float, HFloat, HInt, Int, IntTy, MinInt};
+
+// Public modules
+mod acos;
+mod acosf;
+mod acosh;
+mod acoshf;
+mod asin;
+mod asinf;
+mod asinh;
+mod asinhf;
+mod atan;
+mod atan2;
+mod atan2f;
+mod atanf;
+mod atanh;
+mod atanhf;
+mod cbrt;
+mod cbrtf;
+mod ceil;
+mod copysign;
+mod cos;
+mod cosf;
+mod cosh;
+mod coshf;
+mod erf;
+mod erff;
+mod exp;
+mod exp10;
+mod exp10f;
+mod exp2;
+mod exp2f;
+mod expf;
+mod expm1;
+mod expm1f;
+mod fabs;
+mod fdim;
+mod floor;
+mod fma;
+mod fmin_fmax;
+mod fminimum_fmaximum;
+mod fminimum_fmaximum_num;
+mod fmod;
+mod frexp;
+mod frexpf;
+mod hypot;
+mod hypotf;
+mod ilogb;
+mod ilogbf;
+mod j0;
+mod j0f;
+mod j1;
+mod j1f;
+mod jn;
+mod jnf;
+mod ldexp;
+mod lgamma;
+mod lgamma_r;
+mod lgammaf;
+mod lgammaf_r;
+mod log;
+mod log10;
+mod log10f;
+mod log1p;
+mod log1pf;
+mod log2;
+mod log2f;
+mod logf;
+mod modf;
+mod modff;
+mod nextafter;
+mod nextafterf;
+mod pow;
+mod powf;
+mod remainder;
+mod remainderf;
+mod remquo;
+mod remquof;
+mod rint;
+mod round;
+mod roundeven;
+mod scalbn;
+mod sin;
+mod sincos;
+mod sincosf;
+mod sinf;
+mod sinh;
+mod sinhf;
+mod sqrt;
+mod tan;
+mod tanf;
+mod tanh;
+mod tanhf;
+mod tgamma;
+mod tgammaf;
+mod trunc;
+
+// Use separated imports instead of {}-grouped imports for easier merging.
+pub use self::acos::acos;
+pub use self::acosf::acosf;
+pub use self::acosh::acosh;
+pub use self::acoshf::acoshf;
+pub use self::asin::asin;
+pub use self::asinf::asinf;
+pub use self::asinh::asinh;
+pub use self::asinhf::asinhf;
+pub use self::atan::atan;
+pub use self::atan2::atan2;
+pub use self::atan2f::atan2f;
+pub use self::atanf::atanf;
+pub use self::atanh::atanh;
+pub use self::atanhf::atanhf;
+pub use self::cbrt::cbrt;
+pub use self::cbrtf::cbrtf;
+pub use self::ceil::{ceil, ceilf};
+pub use self::copysign::{copysign, copysignf};
+pub use self::cos::cos;
+pub use self::cosf::cosf;
+pub use self::cosh::cosh;
+pub use self::coshf::coshf;
+pub use self::erf::{erf, erfc};
+pub use self::erff::{erfcf, erff};
+pub use self::exp::exp;
+pub use self::exp2::exp2;
+pub use self::exp2f::exp2f;
+pub use self::exp10::exp10;
+pub use self::exp10f::exp10f;
+pub use self::expf::expf;
+pub use self::expm1::expm1;
+pub use self::expm1f::expm1f;
+pub use self::fabs::{fabs, fabsf};
+pub use self::fdim::{fdim, fdimf};
+pub use self::floor::{floor, floorf};
+pub use self::fma::{fma, fmaf};
+pub use self::fmin_fmax::{fmax, fmaxf, fmin, fminf};
+pub use self::fminimum_fmaximum::{fmaximum, fmaximumf, fminimum, fminimumf};
+pub use self::fminimum_fmaximum_num::{fmaximum_num, fmaximum_numf, fminimum_num, fminimum_numf};
+pub use self::fmod::{fmod, fmodf};
+pub use self::frexp::frexp;
+pub use self::frexpf::frexpf;
+pub use self::hypot::hypot;
+pub use self::hypotf::hypotf;
+pub use self::ilogb::ilogb;
+pub use self::ilogbf::ilogbf;
+pub use self::j0::{j0, y0};
+pub use self::j0f::{j0f, y0f};
+pub use self::j1::{j1, y1};
+pub use self::j1f::{j1f, y1f};
+pub use self::jn::{jn, yn};
+pub use self::jnf::{jnf, ynf};
+pub use self::ldexp::{ldexp, ldexpf};
+pub use self::lgamma::lgamma;
+pub use self::lgamma_r::lgamma_r;
+pub use self::lgammaf::lgammaf;
+pub use self::lgammaf_r::lgammaf_r;
+pub use self::log::log;
+pub use self::log1p::log1p;
+pub use self::log1pf::log1pf;
+pub use self::log2::log2;
+pub use self::log2f::log2f;
+pub use self::log10::log10;
+pub use self::log10f::log10f;
+pub use self::logf::logf;
+pub use self::modf::modf;
+pub use self::modff::modff;
+pub use self::nextafter::nextafter;
+pub use self::nextafterf::nextafterf;
+pub use self::pow::pow;
+pub use self::powf::powf;
+pub use self::remainder::remainder;
+pub use self::remainderf::remainderf;
+pub use self::remquo::remquo;
+pub use self::remquof::remquof;
+pub use self::rint::{rint, rintf};
+pub use self::round::{round, roundf};
+pub use self::roundeven::{roundeven, roundevenf};
+pub use self::scalbn::{scalbn, scalbnf};
+pub use self::sin::sin;
+pub use self::sincos::sincos;
+pub use self::sincosf::sincosf;
+pub use self::sinf::sinf;
+pub use self::sinh::sinh;
+pub use self::sinhf::sinhf;
+pub use self::sqrt::{sqrt, sqrtf};
+pub use self::tan::tan;
+pub use self::tanf::tanf;
+pub use self::tanh::tanh;
+pub use self::tanhf::tanhf;
+pub use self::tgamma::tgamma;
+pub use self::tgammaf::tgammaf;
+pub use self::trunc::{trunc, truncf};
+
+cfg_if! {
+    if #[cfg(f16_enabled)] {
+        // verify-sorted-start
+        pub use self::ceil::ceilf16;
+        pub use self::copysign::copysignf16;
+        pub use self::fabs::fabsf16;
+        pub use self::fdim::fdimf16;
+        pub use self::floor::floorf16;
+        pub use self::fmin_fmax::{fmaxf16, fminf16};
+        pub use self::fminimum_fmaximum::{fmaximumf16, fminimumf16};
+        pub use self::fminimum_fmaximum_num::{fmaximum_numf16, fminimum_numf16};
+        pub use self::fmod::fmodf16;
+        pub use self::ldexp::ldexpf16;
+        pub use self::rint::rintf16;
+        pub use self::round::roundf16;
+        pub use self::roundeven::roundevenf16;
+        pub use self::scalbn::scalbnf16;
+        pub use self::sqrt::sqrtf16;
+        pub use self::trunc::truncf16;
+        // verify-sorted-end
+
+        #[allow(unused_imports)]
+        pub(crate) use self::fma::fmaf16;
+    }
+}
+
+cfg_if! {
+    if #[cfg(f128_enabled)] {
+        // verify-sorted-start
+        pub use self::ceil::ceilf128;
+        pub use self::copysign::copysignf128;
+        pub use self::fabs::fabsf128;
+        pub use self::fdim::fdimf128;
+        pub use self::floor::floorf128;
+        pub use self::fma::fmaf128;
+        pub use self::fmin_fmax::{fmaxf128, fminf128};
+        pub use self::fminimum_fmaximum::{fmaximumf128, fminimumf128};
+        pub use self::fminimum_fmaximum_num::{fmaximum_numf128, fminimum_numf128};
+        pub use self::fmod::fmodf128;
+        pub use self::ldexp::ldexpf128;
+        pub use self::rint::rintf128;
+        pub use self::round::roundf128;
+        pub use self::roundeven::roundevenf128;
+        pub use self::scalbn::scalbnf128;
+        pub use self::sqrt::sqrtf128;
+        pub use self::trunc::truncf128;
+        // verify-sorted-end
+    }
+}
+
+#[inline]
+fn get_high_word(x: f64) -> u32 {
+    (x.to_bits() >> 32) as u32
+}
+
+#[inline]
+fn get_low_word(x: f64) -> u32 {
+    x.to_bits() as u32
+}
+
+#[inline]
+fn with_set_high_word(f: f64, hi: u32) -> f64 {
+    let mut tmp = f.to_bits();
+    tmp &= 0x00000000_ffffffff;
+    tmp |= (hi as u64) << 32;
+    f64::from_bits(tmp)
+}
+
+#[inline]
+fn with_set_low_word(f: f64, lo: u32) -> f64 {
+    let mut tmp = f.to_bits();
+    tmp &= 0xffffffff_00000000;
+    tmp |= lo as u64;
+    f64::from_bits(tmp)
+}
+
+#[inline]
+fn combine_words(hi: u32, lo: u32) -> f64 {
+    f64::from_bits(((hi as u64) << 32) | lo as u64)
+}
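The word-manipulation helpers at the bottom mirror the GET_HIGH_WORD / SET_LOW_WORD style macros from the C sources. A small illustration of what they do; this is a sketch that would have to live next to them in this module (they are private) and is not part of this patch:

    #[test]
    fn word_helper_examples() {
        // 2.0f64 is 0x4000_0000_0000_0000
        assert_eq!(get_high_word(2.0), 0x40000000);
        assert_eq!(get_low_word(2.0), 0);
        // replacing the exponent-carrying high word turns 2.0 into 1.0
        assert_eq!(with_set_high_word(2.0, 0x3ff00000), 1.0);
        assert_eq!(with_set_low_word(1.0, 0), 1.0);
        assert_eq!(combine_words(0x40000000, 0), 2.0);
    }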
diff --git a/library/compiler-builtins/libm/src/math/modf.rs b/library/compiler-builtins/libm/src/math/modf.rs
new file mode 100644
index 00000000000..6541862cdd9
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/modf.rs
@@ -0,0 +1,35 @@
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn modf(x: f64) -> (f64, f64) {
+    let rv2: f64;
+    let mut u = x.to_bits();
+    let mask: u64;
+    let e = (((u >> 52) & 0x7ff) as i32) - 0x3ff;
+
+    /* no fractional part */
+    if e >= 52 {
+        rv2 = x;
+        if e == 0x400 && (u << 12) != 0 {
+            /* nan */
+            return (x, rv2);
+        }
+        u &= 1 << 63;
+        return (f64::from_bits(u), rv2);
+    }
+
+    /* no integral part*/
+    if e < 0 {
+        u &= 1 << 63;
+        rv2 = f64::from_bits(u);
+        return (x, rv2);
+    }
+
+    mask = ((!0) >> 12) >> e;
+    if (u & mask) == 0 {
+        rv2 = x;
+        u &= 1 << 63;
+        return (f64::from_bits(u), rv2);
+    }
+    u &= !mask;
+    rv2 = f64::from_bits(u);
+    return (x - rv2, rv2);
+}
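Unlike C's modf(double, double*), this version returns both parts as a tuple, fractional part first; both keep the sign of the input, and NaN propagates to both components. A quick usage sketch, not part of this patch, via the crate's public API:

    fn main() {
        let (frac, int) = libm::modf(-3.25);
        assert_eq!(int, -3.0);
        assert_eq!(frac, -0.25);
        // NaN propagates to both components
        let (f, i) = libm::modf(f64::NAN);
        assert!(f.is_nan() && i.is_nan());
    }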
diff --git a/library/compiler-builtins/libm/src/math/modff.rs b/library/compiler-builtins/libm/src/math/modff.rs
new file mode 100644
index 00000000000..90c6bca7d8d
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/modff.rs
@@ -0,0 +1,34 @@
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn modff(x: f32) -> (f32, f32) {
+    let rv2: f32;
+    let mut u: u32 = x.to_bits();
+    let mask: u32;
+    let e = (((u >> 23) & 0xff) as i32) - 0x7f;
+
+    /* no fractional part */
+    if e >= 23 {
+        rv2 = x;
+        if e == 0x80 && (u << 9) != 0 {
+            /* nan */
+            return (x, rv2);
+        }
+        u &= 0x80000000;
+        return (f32::from_bits(u), rv2);
+    }
+    /* no integral part */
+    if e < 0 {
+        u &= 0x80000000;
+        rv2 = f32::from_bits(u);
+        return (x, rv2);
+    }
+
+    mask = 0x007fffff >> e;
+    if (u & mask) == 0 {
+        rv2 = x;
+        u &= 0x80000000;
+        return (f32::from_bits(u), rv2);
+    }
+    u &= !mask;
+    rv2 = f32::from_bits(u);
+    return (x - rv2, rv2);
+}
diff --git a/library/compiler-builtins/libm/src/math/nextafter.rs b/library/compiler-builtins/libm/src/math/nextafter.rs
new file mode 100644
index 00000000000..c991ff6f233
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/nextafter.rs
@@ -0,0 +1,37 @@
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn nextafter(x: f64, y: f64) -> f64 {
+    if x.is_nan() || y.is_nan() {
+        return x + y;
+    }
+
+    let mut ux_i = x.to_bits();
+    let uy_i = y.to_bits();
+    if ux_i == uy_i {
+        return y;
+    }
+
+    let ax = ux_i & (!1_u64 / 2);
+    let ay = uy_i & (!1_u64 / 2);
+    if ax == 0 {
+        if ay == 0 {
+            return y;
+        }
+        ux_i = (uy_i & (1_u64 << 63)) | 1;
+    } else if ax > ay || ((ux_i ^ uy_i) & (1_u64 << 63)) != 0 {
+        ux_i -= 1;
+    } else {
+        ux_i += 1;
+    }
+
+    let e = (ux_i >> 52) & 0x7ff;
+    // raise overflow if ux_f is infinite and x is finite
+    if e == 0x7ff {
+        force_eval!(x + x);
+    }
+    let ux_f = f64::from_bits(ux_i);
+    // raise underflow if ux_f is subnormal or zero
+    if e == 0 {
+        force_eval!(x * x + ux_f * ux_f);
+    }
+    ux_f
+}
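nextafter steps exactly one representable value from x toward y, and the force_eval! calls raise overflow/underflow when that step leaves the normal range. A quick usage sketch, not part of this patch, via the crate's public API:

    fn main() {
        let up = libm::nextafter(1.0, 2.0);
        assert_eq!(up, 1.0 + f64::EPSILON);         // one ulp above 1.0
        assert_eq!(libm::nextafter(up, 0.0), 1.0);  // one step back down
        assert_eq!(libm::nextafter(1.0, 1.0), 1.0); // x == y returns y
    }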
diff --git a/library/compiler-builtins/libm/src/math/nextafterf.rs b/library/compiler-builtins/libm/src/math/nextafterf.rs
new file mode 100644
index 00000000000..8ba3833562f
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/nextafterf.rs
@@ -0,0 +1,37 @@
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn nextafterf(x: f32, y: f32) -> f32 {
+    if x.is_nan() || y.is_nan() {
+        return x + y;
+    }
+
+    let mut ux_i = x.to_bits();
+    let uy_i = y.to_bits();
+    if ux_i == uy_i {
+        return y;
+    }
+
+    let ax = ux_i & 0x7fff_ffff_u32;
+    let ay = uy_i & 0x7fff_ffff_u32;
+    if ax == 0 {
+        if ay == 0 {
+            return y;
+        }
+        ux_i = (uy_i & 0x8000_0000_u32) | 1;
+    } else if ax > ay || ((ux_i ^ uy_i) & 0x8000_0000_u32) != 0 {
+        ux_i -= 1;
+    } else {
+        ux_i += 1;
+    }
+
+    let e = ux_i & 0x7f80_0000_u32;
+    // raise overflow if ux_f is infinite and x is finite
+    if e == 0x7f80_0000_u32 {
+        force_eval!(x + x);
+    }
+    let ux_f = f32::from_bits(ux_i);
+    // raise underflow if ux_f is subnormal or zero
+    if e == 0 {
+        force_eval!(x * x + ux_f * ux_f);
+    }
+    ux_f
+}
diff --git a/library/compiler-builtins/libm/src/math/pow.rs b/library/compiler-builtins/libm/src/math/pow.rs
new file mode 100644
index 00000000000..94ae31cf0da
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/pow.rs
@@ -0,0 +1,624 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/e_pow.c */
+/*
+ * ====================================================
+ * Copyright (C) 2004 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+// pow(x,y) returns x**y
+//
+// Method:  Let x = 2^n * (1+f)
+//      1. Compute and return log2(x) in two pieces:
+//              log2(x) = w1 + w2,
+//         where w1 has 53 - 24 = 29 trailing zero bits.
+//      2. Perform y*log2(x) = n+y' by simulating multi-precision
+//         arithmetic, where |y'|<=0.5.
+//      3. Return x**y = 2**n*exp(y'*log2)
+//
+// Special cases:
+//      1.  (anything) ** 0  is 1
+//      2.  1 ** (anything)  is 1
+//      3.  (anything except 1) ** NAN is NAN
+//      4.  NAN ** (anything except 0) is NAN
+//      5.  +-(|x| > 1) **  +INF is +INF
+//      6.  +-(|x| > 1) **  -INF is +0
+//      7.  +-(|x| < 1) **  +INF is +0
+//      8.  +-(|x| < 1) **  -INF is +INF
+//      9.  -1          ** +-INF is 1
+//      10. +0 ** (+anything except 0, NAN)               is +0
+//      11. -0 ** (+anything except 0, NAN, odd integer)  is +0
+//      12. +0 ** (-anything except 0, NAN)               is +INF, raise divbyzero
+//      13. -0 ** (-anything except 0, NAN, odd integer)  is +INF, raise divbyzero
+//      14. -0 ** (+odd integer) is -0
+//      15. -0 ** (-odd integer) is -INF, raise divbyzero
+//      16. +INF ** (+anything except 0,NAN) is +INF
+//      17. +INF ** (-anything except 0,NAN) is +0
+//      18. -INF ** (+odd integer) is -INF
+//      19. -INF ** (anything) = -0 ** (-anything), (anything except odd integer)
+//      20. (anything) ** 1 is (anything)
+//      21. (anything) ** -1 is 1/(anything)
+//      22. (-anything) ** (integer) is (-1)**(integer)*(+anything**integer)
+//      23. (-anything except 0 and inf) ** (non-integer) is NAN
+//
+// Accuracy:
+//      pow(x,y) returns x**y nearly rounded. In particular
+//                      pow(integer,integer)
+//      always returns the correct integer provided it is
+//      representable.
+//
+// Constants :
+// The hexadecimal values are the intended ones for the following
+// constants. The decimal values may be used, provided that the
+// compiler will convert from decimal to binary accurately enough
+// to produce the hexadecimal values shown.
+//
+use super::{fabs, get_high_word, scalbn, sqrt, with_set_high_word, with_set_low_word};
+
+const BP: [f64; 2] = [1.0, 1.5];
+const DP_H: [f64; 2] = [0.0, 5.84962487220764160156e-01]; /* 0x3fe2b803_40000000 */
+const DP_L: [f64; 2] = [0.0, 1.35003920212974897128e-08]; /* 0x3E4CFDEB, 0x43CFD006 */
+const TWO53: f64 = 9007199254740992.0; /* 0x43400000_00000000 */
+const HUGE: f64 = 1.0e300;
+const TINY: f64 = 1.0e-300;
+
+// poly coefs for (3/2)*(log(x) - 2s - 2/3*s**3 - ...):
+const L1: f64 = 5.99999999999994648725e-01; /* 0x3fe33333_33333303 */
+const L2: f64 = 4.28571428578550184252e-01; /* 0x3fdb6db6_db6fabff */
+const L3: f64 = 3.33333329818377432918e-01; /* 0x3fd55555_518f264d */
+const L4: f64 = 2.72728123808534006489e-01; /* 0x3fd17460_a91d4101 */
+const L5: f64 = 2.30660745775561754067e-01; /* 0x3fcd864a_93c9db65 */
+const L6: f64 = 2.06975017800338417784e-01; /* 0x3fca7e28_4a454eef */
+const P1: f64 = 1.66666666666666019037e-01; /* 0x3fc55555_5555553e */
+const P2: f64 = -2.77777777770155933842e-03; /* 0xbf66c16c_16bebd93 */
+const P3: f64 = 6.61375632143793436117e-05; /* 0x3f11566a_af25de2c */
+const P4: f64 = -1.65339022054652515390e-06; /* 0xbebbbd41_c5d26bf1 */
+const P5: f64 = 4.13813679705723846039e-08; /* 0x3e663769_72bea4d0 */
+const LG2: f64 = 6.93147180559945286227e-01; /* 0x3fe62e42_fefa39ef */
+const LG2_H: f64 = 6.93147182464599609375e-01; /* 0x3fe62e43_00000000 */
+const LG2_L: f64 = -1.90465429995776804525e-09; /* 0xbe205c61_0ca86c39 */
+const OVT: f64 = 8.0085662595372944372e-017; /* -(1024-log2(ovfl+.5ulp)) */
+const CP: f64 = 9.61796693925975554329e-01; /* 0x3feec709_dc3a03fd =2/(3ln2) */
+const CP_H: f64 = 9.61796700954437255859e-01; /* 0x3feec709_e0000000 =(float)cp */
+const CP_L: f64 = -7.02846165095275826516e-09; /* 0xbe3e2fe0_145b01f5 =tail of cp_h*/
+const IVLN2: f64 = 1.44269504088896338700e+00; /* 0x3ff71547_652b82fe =1/ln2 */
+const IVLN2_H: f64 = 1.44269502162933349609e+00; /* 0x3ff71547_60000000 =24b 1/ln2*/
+const IVLN2_L: f64 = 1.92596299112661746887e-08; /* 0x3e54ae0b_f85ddf44 =1/ln2 tail*/
+
+/// Returns `x` to the power of `y` (f64).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn pow(x: f64, y: f64) -> f64 {
+    let t1: f64;
+    let t2: f64;
+
+    let (hx, lx): (i32, u32) = ((x.to_bits() >> 32) as i32, x.to_bits() as u32);
+    let (hy, ly): (i32, u32) = ((y.to_bits() >> 32) as i32, y.to_bits() as u32);
+
+    let mut ix: i32 = hx & 0x7fffffff_i32;
+    let iy: i32 = hy & 0x7fffffff_i32;
+
+    /* x**0 = 1, even if x is NaN */
+    if ((iy as u32) | ly) == 0 {
+        return 1.0;
+    }
+
+    /* 1**y = 1, even if y is NaN */
+    if hx == 0x3ff00000 && lx == 0 {
+        return 1.0;
+    }
+
+    /* NaN if either arg is NaN */
+    if ix > 0x7ff00000
+        || (ix == 0x7ff00000 && lx != 0)
+        || iy > 0x7ff00000
+        || (iy == 0x7ff00000 && ly != 0)
+    {
+        return x + y;
+    }
+
+    /* determine if y is an odd int when x < 0
+     * yisint = 0       ... y is not an integer
+     * yisint = 1       ... y is an odd int
+     * yisint = 2       ... y is an even int
+     */
+    let mut yisint: i32 = 0;
+    let mut k: i32;
+    let mut j: i32;
+    if hx < 0 {
+        if iy >= 0x43400000 {
+            yisint = 2; /* even integer y */
+        } else if iy >= 0x3ff00000 {
+            k = (iy >> 20) - 0x3ff; /* exponent */
+
+            if k > 20 {
+                j = (ly >> (52 - k)) as i32;
+
+                if (j << (52 - k)) == (ly as i32) {
+                    yisint = 2 - (j & 1);
+                }
+            } else if ly == 0 {
+                j = iy >> (20 - k);
+
+                if (j << (20 - k)) == iy {
+                    yisint = 2 - (j & 1);
+                }
+            }
+        }
+    }
+
+    if ly == 0 {
+        /* special value of y */
+        if iy == 0x7ff00000 {
+            /* y is +-inf */
+
+            return if ((ix - 0x3ff00000) | (lx as i32)) == 0 {
+                /* (-1)**+-inf is 1 */
+                1.0
+            } else if ix >= 0x3ff00000 {
+                /* (|x|>1)**+-inf = inf,0 */
+                if hy >= 0 { y } else { 0.0 }
+            } else {
+                /* (|x|<1)**+-inf = 0,inf */
+                if hy >= 0 { 0.0 } else { -y }
+            };
+        }
+
+        if iy == 0x3ff00000 {
+            /* y is +-1 */
+            return if hy >= 0 { x } else { 1.0 / x };
+        }
+
+        if hy == 0x40000000 {
+            /* y is 2 */
+            return x * x;
+        }
+
+        if hy == 0x3fe00000 {
+            /* y is 0.5 */
+            if hx >= 0 {
+                /* x >= +0 */
+                return sqrt(x);
+            }
+        }
+    }
+
+    let mut ax: f64 = fabs(x);
+    if lx == 0 {
+        /* special value of x */
+        if ix == 0x7ff00000 || ix == 0 || ix == 0x3ff00000 {
+            /* x is +-0,+-inf,+-1 */
+            let mut z: f64 = ax;
+
+            if hy < 0 {
+                /* z = (1/|x|) */
+                z = 1.0 / z;
+            }
+
+            if hx < 0 {
+                if ((ix - 0x3ff00000) | yisint) == 0 {
+                    z = (z - z) / (z - z); /* (-1)**non-int is NaN */
+                } else if yisint == 1 {
+                    z = -z; /* (x<0)**odd = -(|x|**odd) */
+                }
+            }
+
+            return z;
+        }
+    }
+
+    let mut s: f64 = 1.0; /* sign of result */
+    if hx < 0 {
+        if yisint == 0 {
+            /* (x<0)**(non-int) is NaN */
+            return (x - x) / (x - x);
+        }
+
+        if yisint == 1 {
+            /* (x<0)**(odd int) */
+            s = -1.0;
+        }
+    }
+
+    /* |y| is HUGE */
+    if iy > 0x41e00000 {
+        /* if |y| > 2**31 */
+        if iy > 0x43f00000 {
+            /* if |y| > 2**64, must o/uflow */
+            if ix <= 0x3fefffff {
+                return if hy < 0 { HUGE * HUGE } else { TINY * TINY };
+            }
+
+            if ix >= 0x3ff00000 {
+                return if hy > 0 { HUGE * HUGE } else { TINY * TINY };
+            }
+        }
+
+        /* over/underflow if x is not close to one */
+        if ix < 0x3fefffff {
+            return if hy < 0 {
+                s * HUGE * HUGE
+            } else {
+                s * TINY * TINY
+            };
+        }
+        if ix > 0x3ff00000 {
+            return if hy > 0 {
+                s * HUGE * HUGE
+            } else {
+                s * TINY * TINY
+            };
+        }
+
+        /* now |1-x| is TINY <= 2**-20; it suffices to compute
+        log(x) by x - x^2/2 + x^3/3 - x^4/4 */
+        let t: f64 = ax - 1.0; /* t has 20 trailing zeros */
+        let w: f64 = (t * t) * (0.5 - t * (0.3333333333333333333333 - t * 0.25));
+        let u: f64 = IVLN2_H * t; /* ivln2_h has 21 sig. bits */
+        let v: f64 = t * IVLN2_L - w * IVLN2;
+        t1 = with_set_low_word(u + v, 0);
+        t2 = v - (t1 - u);
+    } else {
+        // double ss,s2,s_h,s_l,t_h,t_l;
+        let mut n: i32 = 0;
+
+        if ix < 0x00100000 {
+            /* take care of subnormal numbers */
+            ax *= TWO53;
+            n -= 53;
+            ix = get_high_word(ax) as i32;
+        }
+
+        n += (ix >> 20) - 0x3ff;
+        j = ix & 0x000fffff;
+
+        /* determine interval */
+        let k: i32;
+        ix = j | 0x3ff00000; /* normalize ix */
+        if j <= 0x3988E {
+            /* |x|<sqrt(3/2) */
+            k = 0;
+        } else if j < 0xBB67A {
+            /* |x|<sqrt(3)   */
+            k = 1;
+        } else {
+            k = 0;
+            n += 1;
+            ix -= 0x00100000;
+        }
+        ax = with_set_high_word(ax, ix as u32);
+
+        /* compute ss = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
+        let u: f64 = ax - i!(BP, k as usize); /* bp[0]=1.0, bp[1]=1.5 */
+        let v: f64 = 1.0 / (ax + i!(BP, k as usize));
+        let ss: f64 = u * v;
+        let s_h = with_set_low_word(ss, 0);
+
+        /* t_h=ax+bp[k] High */
+        let t_h: f64 = with_set_high_word(
+            0.0,
+            ((ix as u32 >> 1) | 0x20000000) + 0x00080000 + ((k as u32) << 18),
+        );
+        let t_l: f64 = ax - (t_h - i!(BP, k as usize));
+        let s_l: f64 = v * ((u - s_h * t_h) - s_h * t_l);
+
+        /* compute log(ax) */
+        let s2: f64 = ss * ss;
+        let mut r: f64 = s2 * s2 * (L1 + s2 * (L2 + s2 * (L3 + s2 * (L4 + s2 * (L5 + s2 * L6)))));
+        r += s_l * (s_h + ss);
+        let s2: f64 = s_h * s_h;
+        let t_h: f64 = with_set_low_word(3.0 + s2 + r, 0);
+        let t_l: f64 = r - ((t_h - 3.0) - s2);
+
+        /* u+v = ss*(1+...) */
+        let u: f64 = s_h * t_h;
+        let v: f64 = s_l * t_h + t_l * ss;
+
+        /* 2/(3log2)*(ss+...) */
+        let p_h: f64 = with_set_low_word(u + v, 0);
+        let p_l = v - (p_h - u);
+        let z_h: f64 = CP_H * p_h; /* cp_h+cp_l = 2/(3*log2) */
+        let z_l: f64 = CP_L * p_h + p_l * CP + i!(DP_L, k as usize);
+
+        /* log2(ax) = (ss+..)*2/(3*log2) = n + dp_h + z_h + z_l */
+        let t: f64 = n as f64;
+        t1 = with_set_low_word(((z_h + z_l) + i!(DP_H, k as usize)) + t, 0);
+        t2 = z_l - (((t1 - t) - i!(DP_H, k as usize)) - z_h);
+    }
+
+    /* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */
+    let y1: f64 = with_set_low_word(y, 0);
+    let p_l: f64 = (y - y1) * t1 + y * t2;
+    let mut p_h: f64 = y1 * t1;
+    let z: f64 = p_l + p_h;
+    let mut j: i32 = (z.to_bits() >> 32) as i32;
+    let i: i32 = z.to_bits() as i32;
+    // let (j, i): (i32, i32) = ((z.to_bits() >> 32) as i32, z.to_bits() as i32);
+
+    if j >= 0x40900000 {
+        /* z >= 1024 */
+        if (j - 0x40900000) | i != 0 {
+            /* if z > 1024 */
+            return s * HUGE * HUGE; /* overflow */
+        }
+
+        if p_l + OVT > z - p_h {
+            return s * HUGE * HUGE; /* overflow */
+        }
+    } else if (j & 0x7fffffff) >= 0x4090cc00 {
+        /* z <= -1075 */
+        // FIXME: instead of abs(j) use unsigned j
+
+        if (((j as u32) - 0xc090cc00) | (i as u32)) != 0 {
+            /* z < -1075 */
+            return s * TINY * TINY; /* underflow */
+        }
+
+        if p_l <= z - p_h {
+            return s * TINY * TINY; /* underflow */
+        }
+    }
+
+    /* compute 2**(p_h+p_l) */
+    let i: i32 = j & 0x7fffffff_i32;
+    k = (i >> 20) - 0x3ff;
+    let mut n: i32 = 0;
+
+    if i > 0x3fe00000 {
+        /* if |z| > 0.5, set n = [z+0.5] */
+        n = j + (0x00100000 >> (k + 1));
+        k = ((n & 0x7fffffff) >> 20) - 0x3ff; /* new k for n */
+        let t: f64 = with_set_high_word(0.0, (n & !(0x000fffff >> k)) as u32);
+        n = ((n & 0x000fffff) | 0x00100000) >> (20 - k);
+        if j < 0 {
+            n = -n;
+        }
+        p_h -= t;
+    }
+
+    let t: f64 = with_set_low_word(p_l + p_h, 0);
+    let u: f64 = t * LG2_H;
+    let v: f64 = (p_l - (t - p_h)) * LG2 + t * LG2_L;
+    let mut z: f64 = u + v;
+    let w: f64 = v - (z - u);
+    let t: f64 = z * z;
+    let t1: f64 = z - t * (P1 + t * (P2 + t * (P3 + t * (P4 + t * P5))));
+    let r: f64 = (z * t1) / (t1 - 2.0) - (w + z * w);
+    z = 1.0 - (r - z);
+    j = get_high_word(z) as i32;
+    j += n << 20;
+
+    if (j >> 20) <= 0 {
+        /* subnormal output */
+        z = scalbn(z, n);
+    } else {
+        z = with_set_high_word(z, j as u32);
+    }
+
+    s * z
+}
+
+#[cfg(test)]
+mod tests {
+    extern crate core;
+
+    use self::core::f64::consts::{E, PI};
+    use super::pow;
+
+    const POS_ZERO: &[f64] = &[0.0];
+    const NEG_ZERO: &[f64] = &[-0.0];
+    const POS_ONE: &[f64] = &[1.0];
+    const NEG_ONE: &[f64] = &[-1.0];
+    const POS_FLOATS: &[f64] = &[99.0 / 70.0, E, PI];
+    const NEG_FLOATS: &[f64] = &[-99.0 / 70.0, -E, -PI];
+    const POS_SMALL_FLOATS: &[f64] = &[(1.0 / 2.0), f64::MIN_POSITIVE, f64::EPSILON];
+    const NEG_SMALL_FLOATS: &[f64] = &[-(1.0 / 2.0), -f64::MIN_POSITIVE, -f64::EPSILON];
+    const POS_EVENS: &[f64] = &[2.0, 6.0, 8.0, 10.0, 22.0, 100.0, f64::MAX];
+    const NEG_EVENS: &[f64] = &[f64::MIN, -100.0, -22.0, -10.0, -8.0, -6.0, -2.0];
+    const POS_ODDS: &[f64] = &[3.0, 7.0];
+    const NEG_ODDS: &[f64] = &[-7.0, -3.0];
+    const NANS: &[f64] = &[f64::NAN];
+    const POS_INF: &[f64] = &[f64::INFINITY];
+    const NEG_INF: &[f64] = &[f64::NEG_INFINITY];
+
+    const ALL: &[&[f64]] = &[
+        POS_ZERO,
+        NEG_ZERO,
+        NANS,
+        NEG_SMALL_FLOATS,
+        POS_SMALL_FLOATS,
+        NEG_FLOATS,
+        POS_FLOATS,
+        NEG_EVENS,
+        POS_EVENS,
+        NEG_ODDS,
+        POS_ODDS,
+        NEG_INF,
+        POS_INF,
+        NEG_ONE,
+        POS_ONE,
+    ];
+    const POS: &[&[f64]] = &[POS_ZERO, POS_ODDS, POS_ONE, POS_FLOATS, POS_EVENS, POS_INF];
+    const NEG: &[&[f64]] = &[NEG_ZERO, NEG_ODDS, NEG_ONE, NEG_FLOATS, NEG_EVENS, NEG_INF];
+
+    fn pow_test(base: f64, exponent: f64, expected: f64) {
+        let res = pow(base, exponent);
+        assert!(
+            if expected.is_nan() {
+                res.is_nan()
+            } else {
+                pow(base, exponent) == expected
+            },
+            "{base} ** {exponent} was {res} instead of {expected}",
+        );
+    }
+
+    fn test_sets_as_base(sets: &[&[f64]], exponent: f64, expected: f64) {
+        sets.iter()
+            .for_each(|s| s.iter().for_each(|val| pow_test(*val, exponent, expected)));
+    }
+
+    fn test_sets_as_exponent(base: f64, sets: &[&[f64]], expected: f64) {
+        sets.iter()
+            .for_each(|s| s.iter().for_each(|val| pow_test(base, *val, expected)));
+    }
+
+    fn test_sets(sets: &[&[f64]], computed: &dyn Fn(f64) -> f64, expected: &dyn Fn(f64) -> f64) {
+        sets.iter().for_each(|s| {
+            s.iter().for_each(|val| {
+                let exp = expected(*val);
+                let res = computed(*val);
+
+                #[cfg(all(target_arch = "x86", not(target_feature = "sse2")))]
+                let exp = force_eval!(exp);
+                #[cfg(all(target_arch = "x86", not(target_feature = "sse2")))]
+                let res = force_eval!(res);
+                assert!(
+                    if exp.is_nan() {
+                        res.is_nan()
+                    } else {
+                        exp == res
+                    },
+                    "test for {val} was {res} instead of {exp}",
+                );
+            })
+        });
+    }
+
+    #[test]
+    fn zero_as_exponent() {
+        test_sets_as_base(ALL, 0.0, 1.0);
+        test_sets_as_base(ALL, -0.0, 1.0);
+    }
+
+    #[test]
+    fn one_as_base() {
+        test_sets_as_exponent(1.0, ALL, 1.0);
+    }
+
+    #[test]
+    fn nan_inputs() {
+        // NAN as the base:
+        // (f64::NAN ^ anything *but 0* should be f64::NAN)
+        test_sets_as_exponent(f64::NAN, &ALL[2..], f64::NAN);
+
+        // f64::NAN as the exponent:
+        // (anything *but 1* ^ f64::NAN should be f64::NAN)
+        test_sets_as_base(&ALL[..(ALL.len() - 2)], f64::NAN, f64::NAN);
+    }
+
+    #[test]
+    fn infinity_as_base() {
+        // Positive Infinity as the base:
+        // (+Infinity ^ positive anything but 0 and f64::NAN should be +Infinity)
+        test_sets_as_exponent(f64::INFINITY, &POS[1..], f64::INFINITY);
+
+        // (+Infinity ^ negative anything except 0 and f64::NAN should be 0.0)
+        test_sets_as_exponent(f64::INFINITY, &NEG[1..], 0.0);
+
+        // Negative Infinity as the base:
+        // (-Infinity ^ positive odd ints should be -Infinity)
+        test_sets_as_exponent(f64::NEG_INFINITY, &[POS_ODDS], f64::NEG_INFINITY);
+
+        // (-Infinity ^ anything but odd ints should be == -0 ^ (-anything))
+        // We can lump in pos/neg odd ints here because they don't seem to
+        // cause panics (div by zero) in release mode (I think).
+        test_sets(ALL, &|v: f64| pow(f64::NEG_INFINITY, v), &|v: f64| {
+            pow(-0.0, -v)
+        });
+    }
+
+    #[test]
+    fn infinity_as_exponent() {
+        // Positive/Negative base greater than 1:
+        // (pos/neg > 1 ^ Infinity should be Infinity - note this excludes f64::NAN as the base)
+        test_sets_as_base(&ALL[5..(ALL.len() - 2)], f64::INFINITY, f64::INFINITY);
+
+        // (pos/neg > 1 ^ -Infinity should be 0.0)
+        test_sets_as_base(&ALL[5..ALL.len() - 2], f64::NEG_INFINITY, 0.0);
+
+        // Positive/Negative base less than 1:
+        let base_below_one = &[POS_ZERO, NEG_ZERO, NEG_SMALL_FLOATS, POS_SMALL_FLOATS];
+
+        // (pos/neg < 1 ^ Infinity should be 0.0 - this also excludes f64::NAN as the base)
+        test_sets_as_base(base_below_one, f64::INFINITY, 0.0);
+
+        // (pos/neg < 1 ^ -Infinity should be Infinity)
+        test_sets_as_base(base_below_one, f64::NEG_INFINITY, f64::INFINITY);
+
+        // Positive/Negative 1 as the base:
+        // (pos/neg 1 ^ Infinity should be 1)
+        test_sets_as_base(&[NEG_ONE, POS_ONE], f64::INFINITY, 1.0);
+
+        // (pos/neg 1 ^ -Infinity should be 1)
+        test_sets_as_base(&[NEG_ONE, POS_ONE], f64::NEG_INFINITY, 1.0);
+    }
+
+    #[test]
+    fn zero_as_base() {
+        // Positive Zero as the base:
+        // (+0 ^ anything positive but 0 and f64::NAN should be +0)
+        test_sets_as_exponent(0.0, &POS[1..], 0.0);
+
+        // (+0 ^ anything negative but 0 and f64::NAN should be Infinity)
+        // (this should panic because we're dividing by zero)
+        test_sets_as_exponent(0.0, &NEG[1..], f64::INFINITY);
+
+        // Negative Zero as the base:
+        // (-0 ^ anything positive but 0, f64::NAN, and odd ints should be +0)
+        test_sets_as_exponent(-0.0, &POS[3..], 0.0);
+
+        // (-0 ^ anything negative but 0, f64::NAN, and odd ints should be Infinity)
+        // (should panic because of divide by zero)
+        test_sets_as_exponent(-0.0, &NEG[3..], f64::INFINITY);
+
+        // (-0 ^ positive odd ints should be -0)
+        test_sets_as_exponent(-0.0, &[POS_ODDS], -0.0);
+
+        // (-0 ^ negative odd ints should be -Infinity)
+        // (should panic because of divide by zero)
+        test_sets_as_exponent(-0.0, &[NEG_ODDS], f64::NEG_INFINITY);
+    }
+
+    #[test]
+    fn special_cases() {
+        // One as the exponent:
+        // (anything ^ 1 should be anything - i.e. the base)
+        test_sets(ALL, &|v: f64| pow(v, 1.0), &|v: f64| v);
+
+        // Negative One as the exponent:
+        // (anything ^ -1 should be 1/anything)
+        test_sets(ALL, &|v: f64| pow(v, -1.0), &|v: f64| 1.0 / v);
+
+        // Factoring -1 out:
+        // (negative anything ^ integer should be (-1 ^ integer) * (positive anything ^ integer))
+        [POS_ZERO, NEG_ZERO, POS_ONE, NEG_ONE, POS_EVENS, NEG_EVENS]
+            .iter()
+            .for_each(|int_set| {
+                int_set.iter().for_each(|int| {
+                    test_sets(ALL, &|v: f64| pow(-v, *int), &|v: f64| {
+                        pow(-1.0, *int) * pow(v, *int)
+                    });
+                })
+            });
+
+        // Negative base (imaginary results):
+        // (-anything except 0 and Infinity ^ non-integer should be NAN)
+        NEG[1..(NEG.len() - 1)].iter().for_each(|set| {
+            set.iter().for_each(|val| {
+                test_sets(&ALL[3..7], &|v: f64| pow(*val, v), &|_| f64::NAN);
+            })
+        });
+    }
+
+    #[test]
+    fn normal_cases() {
+        assert_eq!(pow(2.0, 20.0), (1 << 20) as f64);
+        assert_eq!(pow(-1.0, 9.0), -1.0);
+        assert!(pow(-1.0, 2.2).is_nan());
+        assert!(pow(-1.0, -1.14).is_nan());
+    }
+}
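A minimal sketch of the head/tail splitting that `pow` leans on through `with_set_low_word` (not part of the patch; `split_hi_lo` is an illustrative name). Clearing the low 32 bits of an `f64` leaves a head with at most 21 significand bits, and the exact remainder becomes the tail; this is how `y` is split into `y1 + y2` before multiplying by `t1 + t2` above.

    // Illustrative helper: mirror with_set_low_word(x, 0) and return (head, tail).
    fn split_hi_lo(x: f64) -> (f64, f64) {
        let hi = f64::from_bits(x.to_bits() & 0xffff_ffff_0000_0000);
        (hi, x - hi) // x - hi is exact, so head + tail reconstructs x exactly
    }

    let y = core::f64::consts::PI;
    let (y1, y2) = split_hi_lo(y);
    assert_eq!(y1 + y2, y);             // the split loses nothing
    assert_eq!(y1.to_bits() as u32, 0); // the low word of the head is zero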
diff --git a/library/compiler-builtins/libm/src/math/powf.rs b/library/compiler-builtins/libm/src/math/powf.rs
new file mode 100644
index 00000000000..11c7a7cbd94
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/powf.rs
@@ -0,0 +1,343 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/e_powf.c */
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
+ */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+use core::cmp::Ordering;
+
+use super::{fabsf, scalbnf, sqrtf};
+
+const BP: [f32; 2] = [1.0, 1.5];
+const DP_H: [f32; 2] = [0.0, 5.84960938e-01]; /* 0x3f15c000 */
+const DP_L: [f32; 2] = [0.0, 1.56322085e-06]; /* 0x35d1cfdc */
+const TWO24: f32 = 16777216.0; /* 0x4b800000 */
+const HUGE: f32 = 1.0e30;
+const TINY: f32 = 1.0e-30;
+const L1: f32 = 6.0000002384e-01; /* 0x3f19999a */
+const L2: f32 = 4.2857143283e-01; /* 0x3edb6db7 */
+const L3: f32 = 3.3333334327e-01; /* 0x3eaaaaab */
+const L4: f32 = 2.7272811532e-01; /* 0x3e8ba305 */
+const L5: f32 = 2.3066075146e-01; /* 0x3e6c3255 */
+const L6: f32 = 2.0697501302e-01; /* 0x3e53f142 */
+const P1: f32 = 1.6666667163e-01; /* 0x3e2aaaab */
+const P2: f32 = -2.7777778450e-03; /* 0xbb360b61 */
+const P3: f32 = 6.6137559770e-05; /* 0x388ab355 */
+const P4: f32 = -1.6533901999e-06; /* 0xb5ddea0e */
+const P5: f32 = 4.1381369442e-08; /* 0x3331bb4c */
+const LG2: f32 = 6.9314718246e-01; /* 0x3f317218 */
+const LG2_H: f32 = 6.93145752e-01; /* 0x3f317200 */
+const LG2_L: f32 = 1.42860654e-06; /* 0x35bfbe8c */
+const OVT: f32 = 4.2995665694e-08; /* -(128-log2(ovfl+.5ulp)) */
+const CP: f32 = 9.6179670095e-01; /* 0x3f76384f =2/(3ln2) */
+const CP_H: f32 = 9.6191406250e-01; /* 0x3f764000 =12b cp */
+const CP_L: f32 = -1.1736857402e-04; /* 0xb8f623c6 =tail of cp_h */
+const IVLN2: f32 = 1.4426950216e+00;
+const IVLN2_H: f32 = 1.4426879883e+00;
+const IVLN2_L: f32 = 7.0526075433e-06;
+
+/// Returns `x` to the power of `y` (f32).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn powf(x: f32, y: f32) -> f32 {
+    let mut z: f32;
+    let mut ax: f32;
+    let z_h: f32;
+    let z_l: f32;
+    let mut p_h: f32;
+    let mut p_l: f32;
+    let y1: f32;
+    let mut t1: f32;
+    let t2: f32;
+    let mut r: f32;
+    let s: f32;
+    let mut sn: f32;
+    let mut t: f32;
+    let mut u: f32;
+    let mut v: f32;
+    let mut w: f32;
+    let i: i32;
+    let mut j: i32;
+    let mut k: i32;
+    let mut yisint: i32;
+    let mut n: i32;
+    let hx: i32;
+    let hy: i32;
+    let mut ix: i32;
+    let iy: i32;
+    let mut is: i32;
+
+    hx = x.to_bits() as i32;
+    hy = y.to_bits() as i32;
+
+    ix = hx & 0x7fffffff;
+    iy = hy & 0x7fffffff;
+
+    /* x**0 = 1, even if x is NaN */
+    if iy == 0 {
+        return 1.0;
+    }
+
+    /* 1**y = 1, even if y is NaN */
+    if hx == 0x3f800000 {
+        return 1.0;
+    }
+
+    /* NaN if either arg is NaN */
+    if ix > 0x7f800000 || iy > 0x7f800000 {
+        return x + y;
+    }
+
+    /* determine if y is an odd int when x < 0
+     * yisint = 0       ... y is not an integer
+     * yisint = 1       ... y is an odd int
+     * yisint = 2       ... y is an even int
+     */
+    yisint = 0;
+    if hx < 0 {
+        if iy >= 0x4b800000 {
+            yisint = 2; /* even integer y */
+        } else if iy >= 0x3f800000 {
+            k = (iy >> 23) - 0x7f; /* exponent */
+            j = iy >> (23 - k);
+            if (j << (23 - k)) == iy {
+                yisint = 2 - (j & 1);
+            }
+        }
+    }
+
+    /* special value of y */
+    if iy == 0x7f800000 {
+        /* y is +-inf */
+        match ix.cmp(&0x3f800000) {
+            /* (-1)**+-inf is 1 */
+            Ordering::Equal => return 1.0,
+            /* (|x|>1)**+-inf = inf,0 */
+            Ordering::Greater => return if hy >= 0 { y } else { 0.0 },
+            /* (|x|<1)**+-inf = 0,inf */
+            Ordering::Less => return if hy >= 0 { 0.0 } else { -y },
+        }
+    }
+    if iy == 0x3f800000 {
+        /* y is +-1 */
+        return if hy >= 0 { x } else { 1.0 / x };
+    }
+
+    if hy == 0x40000000 {
+        /* y is 2 */
+        return x * x;
+    }
+
+    if hy == 0x3f000000
+       /* y is  0.5 */
+       && hx >= 0
+    {
+        /* x >= +0 */
+        return sqrtf(x);
+    }
+
+    ax = fabsf(x);
+    /* special value of x */
+    if ix == 0x7f800000 || ix == 0 || ix == 0x3f800000 {
+        /* x is +-0,+-inf,+-1 */
+        z = ax;
+        if hy < 0 {
+            /* z = (1/|x|) */
+            z = 1.0 / z;
+        }
+
+        if hx < 0 {
+            if ((ix - 0x3f800000) | yisint) == 0 {
+                z = (z - z) / (z - z); /* (-1)**non-int is NaN */
+            } else if yisint == 1 {
+                z = -z; /* (x<0)**odd = -(|x|**odd) */
+            }
+        }
+        return z;
+    }
+
+    sn = 1.0; /* sign of result */
+    if hx < 0 {
+        if yisint == 0 {
+            /* (x<0)**(non-int) is NaN */
+            return (x - x) / (x - x);
+        }
+
+        if yisint == 1 {
+            /* (x<0)**(odd int) */
+            sn = -1.0;
+        }
+    }
+
+    /* |y| is HUGE */
+    if iy > 0x4d000000 {
+        /* if |y| > 2**27 */
+        /* over/underflow if x is not close to one */
+        if ix < 0x3f7ffff8 {
+            return if hy < 0 {
+                sn * HUGE * HUGE
+            } else {
+                sn * TINY * TINY
+            };
+        }
+
+        if ix > 0x3f800007 {
+            return if hy > 0 {
+                sn * HUGE * HUGE
+            } else {
+                sn * TINY * TINY
+            };
+        }
+
+        /* now |1-x| is TINY <= 2**-20; it suffices to compute
+        log(x) by x - x^2/2 + x^3/3 - x^4/4 */
+        t = ax - 1.; /* t has 20 trailing zeros */
+        w = (t * t) * (0.5 - t * (0.333333333333 - t * 0.25));
+        u = IVLN2_H * t; /* IVLN2_H has 16 sig. bits */
+        v = t * IVLN2_L - w * IVLN2;
+        t1 = u + v;
+        is = t1.to_bits() as i32;
+        t1 = f32::from_bits(is as u32 & 0xfffff000);
+        t2 = v - (t1 - u);
+    } else {
+        let mut s2: f32;
+        let mut s_h: f32;
+        let s_l: f32;
+        let mut t_h: f32;
+        let mut t_l: f32;
+
+        n = 0;
+        /* take care of subnormal numbers */
+        if ix < 0x00800000 {
+            ax *= TWO24;
+            n -= 24;
+            ix = ax.to_bits() as i32;
+        }
+        n += ((ix) >> 23) - 0x7f;
+        j = ix & 0x007fffff;
+        /* determine interval */
+        ix = j | 0x3f800000; /* normalize ix */
+        if j <= 0x1cc471 {
+            /* |x|<sqrt(3/2) */
+            k = 0;
+        } else if j < 0x5db3d7 {
+            /* |x|<sqrt(3)   */
+            k = 1;
+        } else {
+            k = 0;
+            n += 1;
+            ix -= 0x00800000;
+        }
+        ax = f32::from_bits(ix as u32);
+
+        /* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
+        u = ax - i!(BP, k as usize); /* bp[0]=1.0, bp[1]=1.5 */
+        v = 1.0 / (ax + i!(BP, k as usize));
+        s = u * v;
+        s_h = s;
+        is = s_h.to_bits() as i32;
+        s_h = f32::from_bits(is as u32 & 0xfffff000);
+        /* t_h=ax+bp[k] High */
+        is = (((ix as u32 >> 1) & 0xfffff000) | 0x20000000) as i32;
+        t_h = f32::from_bits(is as u32 + 0x00400000 + ((k as u32) << 21));
+        t_l = ax - (t_h - i!(BP, k as usize));
+        s_l = v * ((u - s_h * t_h) - s_h * t_l);
+        /* compute log(ax) */
+        s2 = s * s;
+        r = s2 * s2 * (L1 + s2 * (L2 + s2 * (L3 + s2 * (L4 + s2 * (L5 + s2 * L6)))));
+        r += s_l * (s_h + s);
+        s2 = s_h * s_h;
+        t_h = 3.0 + s2 + r;
+        is = t_h.to_bits() as i32;
+        t_h = f32::from_bits(is as u32 & 0xfffff000);
+        t_l = r - ((t_h - 3.0) - s2);
+        /* u+v = s*(1+...) */
+        u = s_h * t_h;
+        v = s_l * t_h + t_l * s;
+        /* 2/(3log2)*(s+...) */
+        p_h = u + v;
+        is = p_h.to_bits() as i32;
+        p_h = f32::from_bits(is as u32 & 0xfffff000);
+        p_l = v - (p_h - u);
+        z_h = CP_H * p_h; /* cp_h+cp_l = 2/(3*log2) */
+        z_l = CP_L * p_h + p_l * CP + i!(DP_L, k as usize);
+        /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
+        t = n as f32;
+        t1 = ((z_h + z_l) + i!(DP_H, k as usize)) + t;
+        is = t1.to_bits() as i32;
+        t1 = f32::from_bits(is as u32 & 0xfffff000);
+        t2 = z_l - (((t1 - t) - i!(DP_H, k as usize)) - z_h);
+    };
+
+    /* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */
+    is = y.to_bits() as i32;
+    y1 = f32::from_bits(is as u32 & 0xfffff000);
+    p_l = (y - y1) * t1 + y * t2;
+    p_h = y1 * t1;
+    z = p_l + p_h;
+    j = z.to_bits() as i32;
+    if j > 0x43000000 {
+        /* if z > 128 */
+        return sn * HUGE * HUGE; /* overflow */
+    } else if j == 0x43000000 {
+        /* if z == 128 */
+        if p_l + OVT > z - p_h {
+            return sn * HUGE * HUGE; /* overflow */
+        }
+    } else if (j & 0x7fffffff) > 0x43160000 {
+        /* z < -150 */
+        // FIXME: check should be  (uint32_t)j > 0xc3160000
+        return sn * TINY * TINY; /* underflow */
+    } else if j as u32 == 0xc3160000
+              /* z == -150 */
+              && p_l <= z - p_h
+    {
+        return sn * TINY * TINY; /* underflow */
+    }
+
+    /*
+     * compute 2**(p_h+p_l)
+     */
+    i = j & 0x7fffffff;
+    k = (i >> 23) - 0x7f;
+    n = 0;
+    if i > 0x3f000000 {
+        /* if |z| > 0.5, set n = [z+0.5] */
+        n = j + (0x00800000 >> (k + 1));
+        k = ((n & 0x7fffffff) >> 23) - 0x7f; /* new k for n */
+        t = f32::from_bits(n as u32 & !(0x007fffff >> k));
+        n = ((n & 0x007fffff) | 0x00800000) >> (23 - k);
+        if j < 0 {
+            n = -n;
+        }
+        p_h -= t;
+    }
+    t = p_l + p_h;
+    is = t.to_bits() as i32;
+    t = f32::from_bits(is as u32 & 0xffff8000);
+    u = t * LG2_H;
+    v = (p_l - (t - p_h)) * LG2 + t * LG2_L;
+    z = u + v;
+    w = v - (z - u);
+    t = z * z;
+    t1 = z - t * (P1 + t * (P2 + t * (P3 + t * (P4 + t * P5))));
+    r = (z * t1) / (t1 - 2.0) - (w + z * w);
+    z = 1.0 - (r - z);
+    j = z.to_bits() as i32;
+    j += n << 23;
+    if (j >> 23) <= 0 {
+        /* subnormal output */
+        z = scalbnf(z, n);
+    } else {
+        z = f32::from_bits(j as u32);
+    }
+    sn * z
+}
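A standalone sketch of the odd/even-integer test that both `pow` and `powf` perform on the raw exponent bits (the `yisint` variable above); `classify_exponent` is an illustrative name, not part of the patch, and it assumes a nonzero finite f32 input.

    // 0 = not an integer, 1 = odd integer, 2 = even integer.
    fn classify_exponent(y: f32) -> i32 {
        let iy = (y.to_bits() & 0x7fff_ffff) as i32;
        if iy >= 0x4b80_0000 {
            return 2; // |y| >= 2^24: every such float is an even integer
        }
        if iy < 0x3f80_0000 {
            return 0; // 0 < |y| < 1 cannot be an integer
        }
        let k = (iy >> 23) - 0x7f; // unbiased exponent, 0..=23 here
        let j = iy >> (23 - k);    // integer part of the significand
        if (j << (23 - k)) == iy { 2 - (j & 1) } else { 0 }
    }

    assert_eq!(classify_exponent(3.0), 1); // odd integer
    assert_eq!(classify_exponent(4.0), 2); // even integer
    assert_eq!(classify_exponent(0.5), 0); // not an integer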
diff --git a/library/compiler-builtins/libm/src/math/rem_pio2.rs b/library/compiler-builtins/libm/src/math/rem_pio2.rs
new file mode 100644
index 00000000000..d677fd9dcb3
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/rem_pio2.rs
@@ -0,0 +1,235 @@
+// origin: FreeBSD /usr/src/lib/msun/src/e_rem_pio2.c
+//
+// ====================================================
+// Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+//
+// Developed at SunPro, a Sun Microsystems, Inc. business.
+// Permission to use, copy, modify, and distribute this
+// software is freely granted, provided that this notice
+// is preserved.
+// ====================================================
+//
+// Optimized by Bruce D. Evans.
+use super::rem_pio2_large;
+
+// #if FLT_EVAL_METHOD==0 || FLT_EVAL_METHOD==1
+// #define EPS DBL_EPSILON
+const EPS: f64 = 2.2204460492503131e-16;
+// #elif FLT_EVAL_METHOD==2
+// #define EPS LDBL_EPSILON
+// #endif
+
+// TODO: Support FLT_EVAL_METHOD?
+
+const TO_INT: f64 = 1.5 / EPS;
+/// 53 bits of 2/pi
+const INV_PIO2: f64 = 6.36619772367581382433e-01; /* 0x3FE45F30, 0x6DC9C883 */
+/// first 33 bits of pi/2
+const PIO2_1: f64 = 1.57079632673412561417e+00; /* 0x3FF921FB, 0x54400000 */
+/// pi/2 - PIO2_1
+const PIO2_1T: f64 = 6.07710050650619224932e-11; /* 0x3DD0B461, 0x1A626331 */
+/// second 33 bits of pi/2
+const PIO2_2: f64 = 6.07710050630396597660e-11; /* 0x3DD0B461, 0x1A600000 */
+/// pi/2 - (PIO2_1+PIO2_2)
+const PIO2_2T: f64 = 2.02226624879595063154e-21; /* 0x3BA3198A, 0x2E037073 */
+/// third 33 bits of pi/2
+const PIO2_3: f64 = 2.02226624871116645580e-21; /* 0x3BA3198A, 0x2E000000 */
+/// pi/2 - (PIO2_1+PIO2_2+PIO2_3)
+const PIO2_3T: f64 = 8.47842766036889956997e-32; /* 0x397B839A, 0x252049C1 */
+
+// Return the remainder of x rem pi/2 in the pair (y0, y1).
+// Use rem_pio2_large() for large x.
+//
+// The caller must handle the case when reduction is not needed: |x| ~<= pi/4.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub(crate) fn rem_pio2(x: f64) -> (i32, f64, f64) {
+    let x1p24 = f64::from_bits(0x4170000000000000);
+
+    let sign = (f64::to_bits(x) >> 63) as i32;
+    let ix = (f64::to_bits(x) >> 32) as u32 & 0x7fffffff;
+
+    fn medium(x: f64, ix: u32) -> (i32, f64, f64) {
+        /* rint(x/(pi/2)); assumes round-to-nearest. */
+        let tmp = x * INV_PIO2 + TO_INT;
+        // force rounding of tmp to its storage format on x87 to avoid
+        // excess precision issues.
+        #[cfg(all(target_arch = "x86", not(target_feature = "sse2")))]
+        let tmp = force_eval!(tmp);
+        let f_n = tmp - TO_INT;
+        let n = f_n as i32;
+        let mut r = x - f_n * PIO2_1;
+        let mut w = f_n * PIO2_1T; /* 1st round, good to 85 bits */
+        let mut y0 = r - w;
+        let ui = f64::to_bits(y0);
+        let ey = (ui >> 52) as i32 & 0x7ff;
+        let ex = (ix >> 20) as i32;
+        if ex - ey > 16 {
+            /* 2nd round, good to 118 bits */
+            let t = r;
+            w = f_n * PIO2_2;
+            r = t - w;
+            w = f_n * PIO2_2T - ((t - r) - w);
+            y0 = r - w;
+            let ey = (f64::to_bits(y0) >> 52) as i32 & 0x7ff;
+            if ex - ey > 49 {
+                /* 3rd round, good to 151 bits, covers all cases */
+                let t = r;
+                w = f_n * PIO2_3;
+                r = t - w;
+                w = f_n * PIO2_3T - ((t - r) - w);
+                y0 = r - w;
+            }
+        }
+        let y1 = (r - y0) - w;
+        (n, y0, y1)
+    }
+
+    if ix <= 0x400f6a7a {
+        /* |x| ~<= 5pi/4 */
+        if (ix & 0xfffff) == 0x921fb {
+            /* |x| ~= pi/2 or 2pi/2 */
+            return medium(x, ix); /* cancellation -- use medium case */
+        }
+        if ix <= 0x4002d97c {
+            /* |x| ~<= 3pi/4 */
+            if sign == 0 {
+                let z = x - PIO2_1; /* one round good to 85 bits */
+                let y0 = z - PIO2_1T;
+                let y1 = (z - y0) - PIO2_1T;
+                return (1, y0, y1);
+            } else {
+                let z = x + PIO2_1;
+                let y0 = z + PIO2_1T;
+                let y1 = (z - y0) + PIO2_1T;
+                return (-1, y0, y1);
+            }
+        } else if sign == 0 {
+            let z = x - 2.0 * PIO2_1;
+            let y0 = z - 2.0 * PIO2_1T;
+            let y1 = (z - y0) - 2.0 * PIO2_1T;
+            return (2, y0, y1);
+        } else {
+            let z = x + 2.0 * PIO2_1;
+            let y0 = z + 2.0 * PIO2_1T;
+            let y1 = (z - y0) + 2.0 * PIO2_1T;
+            return (-2, y0, y1);
+        }
+    }
+    if ix <= 0x401c463b {
+        /* |x| ~<= 9pi/4 */
+        if ix <= 0x4015fdbc {
+            /* |x| ~<= 7pi/4 */
+            if ix == 0x4012d97c {
+                /* |x| ~= 3pi/2 */
+                return medium(x, ix);
+            }
+            if sign == 0 {
+                let z = x - 3.0 * PIO2_1;
+                let y0 = z - 3.0 * PIO2_1T;
+                let y1 = (z - y0) - 3.0 * PIO2_1T;
+                return (3, y0, y1);
+            } else {
+                let z = x + 3.0 * PIO2_1;
+                let y0 = z + 3.0 * PIO2_1T;
+                let y1 = (z - y0) + 3.0 * PIO2_1T;
+                return (-3, y0, y1);
+            }
+        } else {
+            if ix == 0x401921fb {
+                /* |x| ~= 4pi/2 */
+                return medium(x, ix);
+            }
+            if sign == 0 {
+                let z = x - 4.0 * PIO2_1;
+                let y0 = z - 4.0 * PIO2_1T;
+                let y1 = (z - y0) - 4.0 * PIO2_1T;
+                return (4, y0, y1);
+            } else {
+                let z = x + 4.0 * PIO2_1;
+                let y0 = z + 4.0 * PIO2_1T;
+                let y1 = (z - y0) + 4.0 * PIO2_1T;
+                return (-4, y0, y1);
+            }
+        }
+    }
+    if ix < 0x413921fb {
+        /* |x| ~< 2^20*(pi/2), medium size */
+        return medium(x, ix);
+    }
+    /*
+     * all other (large) arguments
+     */
+    if ix >= 0x7ff00000 {
+        /* x is inf or NaN */
+        let y0 = x - x;
+        let y1 = y0;
+        return (0, y0, y1);
+    }
+    /* set z = scalbn(|x|,-ilogb(x)+23) */
+    let mut ui = f64::to_bits(x);
+    ui &= (!1) >> 12;
+    ui |= (0x3ff + 23) << 52;
+    let mut z = f64::from_bits(ui);
+    let mut tx = [0.0; 3];
+    for i in 0..2 {
+        i!(tx, i, =, z as i32 as f64);
+        z = (z - i!(tx, i)) * x1p24;
+    }
+    i!(tx, 2, =, z);
+    /* skip zero terms, first term is non-zero */
+    let mut i = 2;
+    while i != 0 && i!(tx, i) == 0.0 {
+        i -= 1;
+    }
+    let mut ty = [0.0; 3];
+    let n = rem_pio2_large(&tx[..=i], &mut ty, ((ix as i32) >> 20) - (0x3ff + 23), 1);
+    if sign != 0 {
+        return (-n, -i!(ty, 0), -i!(ty, 1));
+    }
+    (n, i!(ty, 0), i!(ty, 1))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::rem_pio2;
+
+    #[test]
+    // FIXME(correctness): inaccurate results on i586
+    #[cfg_attr(all(target_arch = "x86", not(target_feature = "sse")), ignore)]
+    fn test_near_pi() {
+        let arg = 3.141592025756836;
+        let arg = force_eval!(arg);
+        assert_eq!(
+            rem_pio2(arg),
+            (2, -6.278329573009626e-7, -2.1125998133974653e-23)
+        );
+        let arg = 3.141592033207416;
+        let arg = force_eval!(arg);
+        assert_eq!(
+            rem_pio2(arg),
+            (2, -6.20382377148128e-7, -2.1125998133974653e-23)
+        );
+        let arg = 3.141592144966125;
+        let arg = force_eval!(arg);
+        assert_eq!(
+            rem_pio2(arg),
+            (2, -5.086236681942706e-7, -2.1125998133974653e-23)
+        );
+        let arg = 3.141592979431152;
+        let arg = force_eval!(arg);
+        assert_eq!(
+            rem_pio2(arg),
+            (2, 3.2584135866119817e-7, -2.1125998133974653e-23)
+        );
+    }
+
+    #[test]
+    fn test_overflow_b9b847() {
+        let _ = rem_pio2(-3054214.5490637687);
+    }
+
+    #[test]
+    fn test_overflow_4747b9() {
+        let _ = rem_pio2(917340800458.2274);
+    }
+}
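A hedged sketch of the contract that the tests above exercise, written as if it lived in the crate's test module (the function is `pub(crate)`): `rem_pio2` returns the quadrant count `n` and a two-part remainder with `x ~= n*(pi/2) + y0 + y1` and `|y0 + y1| <= pi/4`.

    let x = 3.141592025756836_f64; // value reused from test_near_pi above
    let (n, y0, y1) = rem_pio2(x);
    assert_eq!(n, 2); // x is about 2 * (pi/2)
    // y1 carries the low-order bits that y0 alone cannot represent.
    let reconstructed = (n as f64) * core::f64::consts::FRAC_PI_2 + y0 + y1;
    assert!((reconstructed - x).abs() < 1e-15);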
diff --git a/library/compiler-builtins/libm/src/math/rem_pio2_large.rs b/library/compiler-builtins/libm/src/math/rem_pio2_large.rs
new file mode 100644
index 00000000000..6d679bbe98c
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/rem_pio2_large.rs
@@ -0,0 +1,468 @@
+#![allow(unused_unsafe)]
+/* origin: FreeBSD /usr/src/lib/msun/src/k_rem_pio2.c */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunSoft, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+use super::{floor, scalbn};
+
+// initial value for jk
+const INIT_JK: [usize; 4] = [3, 4, 4, 6];
+
+// Table of constants for 2/pi, 396 Hex digits (476 decimal) of 2/pi
+//
+//              integer array, contains the (24*i)-th to (24*i+23)-th
+//              bit of 2/pi after binary point. The corresponding
+//              floating value is
+//
+//                      ipio2[i] * 2^(-24(i+1)).
+//
+// NB: This table must have at least (e0-3)/24 + jk terms.
+//     For quad precision (e0 <= 16360, jk = 6), this is 686.
+#[cfg(any(target_pointer_width = "32", target_pointer_width = "16"))]
+const IPIO2: [i32; 66] = [
+    0xA2F983, 0x6E4E44, 0x1529FC, 0x2757D1, 0xF534DD, 0xC0DB62, 0x95993C, 0x439041, 0xFE5163,
+    0xABDEBB, 0xC561B7, 0x246E3A, 0x424DD2, 0xE00649, 0x2EEA09, 0xD1921C, 0xFE1DEB, 0x1CB129,
+    0xA73EE8, 0x8235F5, 0x2EBB44, 0x84E99C, 0x7026B4, 0x5F7E41, 0x3991D6, 0x398353, 0x39F49C,
+    0x845F8B, 0xBDF928, 0x3B1FF8, 0x97FFDE, 0x05980F, 0xEF2F11, 0x8B5A0A, 0x6D1F6D, 0x367ECF,
+    0x27CB09, 0xB74F46, 0x3F669E, 0x5FEA2D, 0x7527BA, 0xC7EBE5, 0xF17B3D, 0x0739F7, 0x8A5292,
+    0xEA6BFB, 0x5FB11F, 0x8D5D08, 0x560330, 0x46FC7B, 0x6BABF0, 0xCFBC20, 0x9AF436, 0x1DA9E3,
+    0x91615E, 0xE61B08, 0x659985, 0x5F14A0, 0x68408D, 0xFFD880, 0x4D7327, 0x310606, 0x1556CA,
+    0x73A8C9, 0x60E27B, 0xC08C6B,
+];
+
+#[cfg(target_pointer_width = "64")]
+const IPIO2: [i32; 690] = [
+    0xA2F983, 0x6E4E44, 0x1529FC, 0x2757D1, 0xF534DD, 0xC0DB62, 0x95993C, 0x439041, 0xFE5163,
+    0xABDEBB, 0xC561B7, 0x246E3A, 0x424DD2, 0xE00649, 0x2EEA09, 0xD1921C, 0xFE1DEB, 0x1CB129,
+    0xA73EE8, 0x8235F5, 0x2EBB44, 0x84E99C, 0x7026B4, 0x5F7E41, 0x3991D6, 0x398353, 0x39F49C,
+    0x845F8B, 0xBDF928, 0x3B1FF8, 0x97FFDE, 0x05980F, 0xEF2F11, 0x8B5A0A, 0x6D1F6D, 0x367ECF,
+    0x27CB09, 0xB74F46, 0x3F669E, 0x5FEA2D, 0x7527BA, 0xC7EBE5, 0xF17B3D, 0x0739F7, 0x8A5292,
+    0xEA6BFB, 0x5FB11F, 0x8D5D08, 0x560330, 0x46FC7B, 0x6BABF0, 0xCFBC20, 0x9AF436, 0x1DA9E3,
+    0x91615E, 0xE61B08, 0x659985, 0x5F14A0, 0x68408D, 0xFFD880, 0x4D7327, 0x310606, 0x1556CA,
+    0x73A8C9, 0x60E27B, 0xC08C6B, 0x47C419, 0xC367CD, 0xDCE809, 0x2A8359, 0xC4768B, 0x961CA6,
+    0xDDAF44, 0xD15719, 0x053EA5, 0xFF0705, 0x3F7E33, 0xE832C2, 0xDE4F98, 0x327DBB, 0xC33D26,
+    0xEF6B1E, 0x5EF89F, 0x3A1F35, 0xCAF27F, 0x1D87F1, 0x21907C, 0x7C246A, 0xFA6ED5, 0x772D30,
+    0x433B15, 0xC614B5, 0x9D19C3, 0xC2C4AD, 0x414D2C, 0x5D000C, 0x467D86, 0x2D71E3, 0x9AC69B,
+    0x006233, 0x7CD2B4, 0x97A7B4, 0xD55537, 0xF63ED7, 0x1810A3, 0xFC764D, 0x2A9D64, 0xABD770,
+    0xF87C63, 0x57B07A, 0xE71517, 0x5649C0, 0xD9D63B, 0x3884A7, 0xCB2324, 0x778AD6, 0x23545A,
+    0xB91F00, 0x1B0AF1, 0xDFCE19, 0xFF319F, 0x6A1E66, 0x615799, 0x47FBAC, 0xD87F7E, 0xB76522,
+    0x89E832, 0x60BFE6, 0xCDC4EF, 0x09366C, 0xD43F5D, 0xD7DE16, 0xDE3B58, 0x929BDE, 0x2822D2,
+    0xE88628, 0x4D58E2, 0x32CAC6, 0x16E308, 0xCB7DE0, 0x50C017, 0xA71DF3, 0x5BE018, 0x34132E,
+    0x621283, 0x014883, 0x5B8EF5, 0x7FB0AD, 0xF2E91E, 0x434A48, 0xD36710, 0xD8DDAA, 0x425FAE,
+    0xCE616A, 0xA4280A, 0xB499D3, 0xF2A606, 0x7F775C, 0x83C2A3, 0x883C61, 0x78738A, 0x5A8CAF,
+    0xBDD76F, 0x63A62D, 0xCBBFF4, 0xEF818D, 0x67C126, 0x45CA55, 0x36D9CA, 0xD2A828, 0x8D61C2,
+    0x77C912, 0x142604, 0x9B4612, 0xC459C4, 0x44C5C8, 0x91B24D, 0xF31700, 0xAD43D4, 0xE54929,
+    0x10D5FD, 0xFCBE00, 0xCC941E, 0xEECE70, 0xF53E13, 0x80F1EC, 0xC3E7B3, 0x28F8C7, 0x940593,
+    0x3E71C1, 0xB3092E, 0xF3450B, 0x9C1288, 0x7B20AB, 0x9FB52E, 0xC29247, 0x2F327B, 0x6D550C,
+    0x90A772, 0x1FE76B, 0x96CB31, 0x4A1679, 0xE27941, 0x89DFF4, 0x9794E8, 0x84E6E2, 0x973199,
+    0x6BED88, 0x365F5F, 0x0EFDBB, 0xB49A48, 0x6CA467, 0x427271, 0x325D8D, 0xB8159F, 0x09E5BC,
+    0x25318D, 0x3974F7, 0x1C0530, 0x010C0D, 0x68084B, 0x58EE2C, 0x90AA47, 0x02E774, 0x24D6BD,
+    0xA67DF7, 0x72486E, 0xEF169F, 0xA6948E, 0xF691B4, 0x5153D1, 0xF20ACF, 0x339820, 0x7E4BF5,
+    0x6863B2, 0x5F3EDD, 0x035D40, 0x7F8985, 0x295255, 0xC06437, 0x10D86D, 0x324832, 0x754C5B,
+    0xD4714E, 0x6E5445, 0xC1090B, 0x69F52A, 0xD56614, 0x9D0727, 0x50045D, 0xDB3BB4, 0xC576EA,
+    0x17F987, 0x7D6B49, 0xBA271D, 0x296996, 0xACCCC6, 0x5414AD, 0x6AE290, 0x89D988, 0x50722C,
+    0xBEA404, 0x940777, 0x7030F3, 0x27FC00, 0xA871EA, 0x49C266, 0x3DE064, 0x83DD97, 0x973FA3,
+    0xFD9443, 0x8C860D, 0xDE4131, 0x9D3992, 0x8C70DD, 0xE7B717, 0x3BDF08, 0x2B3715, 0xA0805C,
+    0x93805A, 0x921110, 0xD8E80F, 0xAF806C, 0x4BFFDB, 0x0F9038, 0x761859, 0x15A562, 0xBBCB61,
+    0xB989C7, 0xBD4010, 0x04F2D2, 0x277549, 0xF6B6EB, 0xBB22DB, 0xAA140A, 0x2F2689, 0x768364,
+    0x333B09, 0x1A940E, 0xAA3A51, 0xC2A31D, 0xAEEDAF, 0x12265C, 0x4DC26D, 0x9C7A2D, 0x9756C0,
+    0x833F03, 0xF6F009, 0x8C402B, 0x99316D, 0x07B439, 0x15200C, 0x5BC3D8, 0xC492F5, 0x4BADC6,
+    0xA5CA4E, 0xCD37A7, 0x36A9E6, 0x9492AB, 0x6842DD, 0xDE6319, 0xEF8C76, 0x528B68, 0x37DBFC,
+    0xABA1AE, 0x3115DF, 0xA1AE00, 0xDAFB0C, 0x664D64, 0xB705ED, 0x306529, 0xBF5657, 0x3AFF47,
+    0xB9F96A, 0xF3BE75, 0xDF9328, 0x3080AB, 0xF68C66, 0x15CB04, 0x0622FA, 0x1DE4D9, 0xA4B33D,
+    0x8F1B57, 0x09CD36, 0xE9424E, 0xA4BE13, 0xB52333, 0x1AAAF0, 0xA8654F, 0xA5C1D2, 0x0F3F0B,
+    0xCD785B, 0x76F923, 0x048B7B, 0x721789, 0x53A6C6, 0xE26E6F, 0x00EBEF, 0x584A9B, 0xB7DAC4,
+    0xBA66AA, 0xCFCF76, 0x1D02D1, 0x2DF1B1, 0xC1998C, 0x77ADC3, 0xDA4886, 0xA05DF7, 0xF480C6,
+    0x2FF0AC, 0x9AECDD, 0xBC5C3F, 0x6DDED0, 0x1FC790, 0xB6DB2A, 0x3A25A3, 0x9AAF00, 0x9353AD,
+    0x0457B6, 0xB42D29, 0x7E804B, 0xA707DA, 0x0EAA76, 0xA1597B, 0x2A1216, 0x2DB7DC, 0xFDE5FA,
+    0xFEDB89, 0xFDBE89, 0x6C76E4, 0xFCA906, 0x70803E, 0x156E85, 0xFF87FD, 0x073E28, 0x336761,
+    0x86182A, 0xEABD4D, 0xAFE7B3, 0x6E6D8F, 0x396795, 0x5BBF31, 0x48D784, 0x16DF30, 0x432DC7,
+    0x356125, 0xCE70C9, 0xB8CB30, 0xFD6CBF, 0xA200A4, 0xE46C05, 0xA0DD5A, 0x476F21, 0xD21262,
+    0x845CB9, 0x496170, 0xE0566B, 0x015299, 0x375550, 0xB7D51E, 0xC4F133, 0x5F6E13, 0xE4305D,
+    0xA92E85, 0xC3B21D, 0x3632A1, 0xA4B708, 0xD4B1EA, 0x21F716, 0xE4698F, 0x77FF27, 0x80030C,
+    0x2D408D, 0xA0CD4F, 0x99A520, 0xD3A2B3, 0x0A5D2F, 0x42F9B4, 0xCBDA11, 0xD0BE7D, 0xC1DB9B,
+    0xBD17AB, 0x81A2CA, 0x5C6A08, 0x17552E, 0x550027, 0xF0147F, 0x8607E1, 0x640B14, 0x8D4196,
+    0xDEBE87, 0x2AFDDA, 0xB6256B, 0x34897B, 0xFEF305, 0x9EBFB9, 0x4F6A68, 0xA82A4A, 0x5AC44F,
+    0xBCF82D, 0x985AD7, 0x95C7F4, 0x8D4D0D, 0xA63A20, 0x5F57A4, 0xB13F14, 0x953880, 0x0120CC,
+    0x86DD71, 0xB6DEC9, 0xF560BF, 0x11654D, 0x6B0701, 0xACB08C, 0xD0C0B2, 0x485551, 0x0EFB1E,
+    0xC37295, 0x3B06A3, 0x3540C0, 0x7BDC06, 0xCC45E0, 0xFA294E, 0xC8CAD6, 0x41F3E8, 0xDE647C,
+    0xD8649B, 0x31BED9, 0xC397A4, 0xD45877, 0xC5E369, 0x13DAF0, 0x3C3ABA, 0x461846, 0x5F7555,
+    0xF5BDD2, 0xC6926E, 0x5D2EAC, 0xED440E, 0x423E1C, 0x87C461, 0xE9FD29, 0xF3D6E7, 0xCA7C22,
+    0x35916F, 0xC5E008, 0x8DD7FF, 0xE26A6E, 0xC6FDB0, 0xC10893, 0x745D7C, 0xB2AD6B, 0x9D6ECD,
+    0x7B723E, 0x6A11C6, 0xA9CFF7, 0xDF7329, 0xBAC9B5, 0x5100B7, 0x0DB2E2, 0x24BA74, 0x607DE5,
+    0x8AD874, 0x2C150D, 0x0C1881, 0x94667E, 0x162901, 0x767A9F, 0xBEFDFD, 0xEF4556, 0x367ED9,
+    0x13D9EC, 0xB9BA8B, 0xFC97C4, 0x27A831, 0xC36EF1, 0x36C594, 0x56A8D8, 0xB5A8B4, 0x0ECCCF,
+    0x2D8912, 0x34576F, 0x89562C, 0xE3CE99, 0xB920D6, 0xAA5E6B, 0x9C2A3E, 0xCC5F11, 0x4A0BFD,
+    0xFBF4E1, 0x6D3B8E, 0x2C86E2, 0x84D4E9, 0xA9B4FC, 0xD1EEEF, 0xC9352E, 0x61392F, 0x442138,
+    0xC8D91B, 0x0AFC81, 0x6A4AFB, 0xD81C2F, 0x84B453, 0x8C994E, 0xCC2254, 0xDC552A, 0xD6C6C0,
+    0x96190B, 0xB8701A, 0x649569, 0x605A26, 0xEE523F, 0x0F117F, 0x11B5F4, 0xF5CBFC, 0x2DBC34,
+    0xEEBC34, 0xCC5DE8, 0x605EDD, 0x9B8E67, 0xEF3392, 0xB817C9, 0x9B5861, 0xBC57E1, 0xC68351,
+    0x103ED8, 0x4871DD, 0xDD1C2D, 0xA118AF, 0x462C21, 0xD7F359, 0x987AD9, 0xC0549E, 0xFA864F,
+    0xFC0656, 0xAE79E5, 0x362289, 0x22AD38, 0xDC9367, 0xAAE855, 0x382682, 0x9BE7CA, 0xA40D51,
+    0xB13399, 0x0ED7A9, 0x480569, 0xF0B265, 0xA7887F, 0x974C88, 0x36D1F9, 0xB39221, 0x4A827B,
+    0x21CF98, 0xDC9F40, 0x5547DC, 0x3A74E1, 0x42EB67, 0xDF9DFE, 0x5FD45E, 0xA4677B, 0x7AACBA,
+    0xA2F655, 0x23882B, 0x55BA41, 0x086E59, 0x862A21, 0x834739, 0xE6E389, 0xD49EE5, 0x40FB49,
+    0xE956FF, 0xCA0F1C, 0x8A59C5, 0x2BFA94, 0xC5C1D3, 0xCFC50F, 0xAE5ADB, 0x86C547, 0x624385,
+    0x3B8621, 0x94792C, 0x876110, 0x7B4C2A, 0x1A2C80, 0x12BF43, 0x902688, 0x893C78, 0xE4C4A8,
+    0x7BDBE5, 0xC23AC4, 0xEAF426, 0x8A67F7, 0xBF920D, 0x2BA365, 0xB1933D, 0x0B7CBD, 0xDC51A4,
+    0x63DD27, 0xDDE169, 0x19949A, 0x9529A8, 0x28CE68, 0xB4ED09, 0x209F44, 0xCA984E, 0x638270,
+    0x237C7E, 0x32B90F, 0x8EF5A7, 0xE75614, 0x08F121, 0x2A9DB5, 0x4D7E6F, 0x5119A5, 0xABF9B5,
+    0xD6DF82, 0x61DD96, 0x023616, 0x9F3AC4, 0xA1A283, 0x6DED72, 0x7A8D39, 0xA9B882, 0x5C326B,
+    0x5B2746, 0xED3400, 0x7700D2, 0x55F4FC, 0x4D5901, 0x8071E0,
+];
+
+const PIO2: [f64; 8] = [
+    1.57079625129699707031e+00, /* 0x3FF921FB, 0x40000000 */
+    7.54978941586159635335e-08, /* 0x3E74442D, 0x00000000 */
+    5.39030252995776476554e-15, /* 0x3CF84698, 0x80000000 */
+    3.28200341580791294123e-22, /* 0x3B78CC51, 0x60000000 */
+    1.27065575308067607349e-29, /* 0x39F01B83, 0x80000000 */
+    1.22933308981111328932e-36, /* 0x387A2520, 0x40000000 */
+    2.73370053816464559624e-44, /* 0x36E38222, 0x80000000 */
+    2.16741683877804819444e-51, /* 0x3569F31D, 0x00000000 */
+];
+
+// fn rem_pio2_large(x : &[f64], y : &mut [f64], e0 : i32, prec : usize) -> i32
+//
+// Input parameters:
+//      x[]     The input value (must be positive) is broken into nx
+//              pieces of 24-bit integers in double precision format.
+//              x[i] will be the i-th 24-bit chunk of x. The scaled exponent
+//              of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
+//              matches x up to 24 bits).
+//
+//              Example of breaking a double positive z into x[0]+x[1]+x[2]:
+//                      e0 = ilogb(z)-23
+//                      z  = scalbn(z,-e0)
+//              for i = 0,1,2
+//                      x[i] = floor(z)
+//                      z    = (z-x[i])*2**24
+//
+//      y[]     output result in an array of double precision numbers.
+//              The dimension of y[] is:
+//                      24-bit  precision       1
+//                      53-bit  precision       2
+//                      64-bit  precision       2
+//                      113-bit precision       3
+//              The actual value is the sum of them. Thus for 113-bit
+//              precision, one may have to do something like:
+//
+//              long double t,w,r_head, r_tail;
+//              t = (long double)y[2] + (long double)y[1];
+//              w = (long double)y[0];
+//              r_head = t+w;
+//              r_tail = w - (r_head - t);
+//
+//      e0      The exponent of x[0]. Must be <= 16360 or you need to
+//              expand the ipio2 table.
+//
+//      prec    an integer indicating the precision:
+//                      0       24  bits (single)
+//                      1       53  bits (double)
+//                      2       64  bits (extended)
+//                      3       113 bits (quad)
+//
+// Here is the description of some local variables:
+//
+//      jk      jk+1 is the initial number of terms of ipio2[] needed
+//              in the computation. The minimum and recommended value
+//              for jk is 3,4,4,6 for single, double, extended, and quad.
+//              jk+1 must be 2 larger than you might expect so that our
+//              recomputation test works. (Up to 24 bits in the integer
+//              part (the 24 bits of it that we compute) and 23 bits in
+//              the fraction part may be lost to cancellation before we
+//              recompute.)
+//
+//      jz      local integer variable indicating the number of
+//              terms of ipio2[] used.
+//
+//      jx      nx - 1
+//
+//      jv      index for pointing to the suitable ipio2[] for the
+//              computation. In general, we want
+//                      ( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
+//              to be an integer. Thus
+//                      e0-3-24*jv >= 0 or (e0-3)/24 >= jv
+//              Hence jv = max(0,(e0-3)/24).
+//
+//      jp      jp+1 is the number of terms in PIo2[] needed, jp = jk.
+//
+//      q[]     double array with integral value, representing the
+//              24-bit chunks of the product of x and 2/pi.
+//
+//      q0      the corresponding exponent of q[0]. Note that the
+//              exponent for q[i] would be q0-24*i.
+//
+//      PIo2[]  double precision array, obtained by cutting pi/2
+//              into 24-bit chunks.
+//
+//      f[]     ipio2[] in floating point
+//
+//      iq[]    integer array obtained by breaking up q[] into 24-bit chunks.
+//
+//      fq[]    final product of x*(2/pi) in fq[0],..,fq[jk]
+//
+//      ih      integer. If >0 it indicates q[] is >= 0.5, hence
+//              it also indicates the *sign* of the result.
+
+/// Return the last three digits of N with y = x - N*pi/2
+/// so that |y| < pi/2.
+///
+/// The method is to compute the integer (mod 8) and fraction parts of
+/// (2/pi)*x without doing the full multiplication. In general we
+/// skip the parts of the product that are known to be a huge integer
+/// (more accurately, = 0 mod 8). Thus the number of operations is
+/// independent of the exponent of the input.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub(crate) fn rem_pio2_large(x: &[f64], y: &mut [f64], e0: i32, prec: usize) -> i32 {
+    let x1p24 = f64::from_bits(0x4170000000000000); // 0x1p24 === 2 ^ 24
+    let x1p_24 = f64::from_bits(0x3e70000000000000); // 0x1p_24 === 2 ^ (-24)
+
+    if cfg!(target_pointer_width = "64") {
+        debug_assert!(e0 <= 16360);
+    }
+
+    let nx = x.len();
+
+    let mut fw: f64;
+    let mut n: i32;
+    let mut ih: i32;
+    let mut z: f64;
+    let mut f: [f64; 20] = [0.; 20];
+    let mut fq: [f64; 20] = [0.; 20];
+    let mut q: [f64; 20] = [0.; 20];
+    let mut iq: [i32; 20] = [0; 20];
+
+    /* initialize jk*/
+    let jk = i!(INIT_JK, prec);
+    let jp = jk;
+
+    /* determine jx,jv,q0, note that 3>q0 */
+    let jx = nx - 1;
+    let mut jv = div!(e0 - 3, 24);
+    if jv < 0 {
+        jv = 0;
+    }
+    let mut q0 = e0 - 24 * (jv + 1);
+    let jv = jv as usize;
+
+    /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
+    let mut j = (jv as i32) - (jx as i32);
+    let m = jx + jk;
+    for i in 0..=m {
+        i!(f, i, =, if j < 0 {
+            0.
+        } else {
+            i!(IPIO2, j as usize) as f64
+        });
+        j += 1;
+    }
+
+    /* compute q[0],q[1],...q[jk] */
+    for i in 0..=jk {
+        fw = 0f64;
+        for j in 0..=jx {
+            fw += i!(x, j) * i!(f, jx + i - j);
+        }
+        i!(q, i, =, fw);
+    }
+
+    let mut jz = jk;
+
+    'recompute: loop {
+        /* distill q[] into iq[] reversingly */
+        let mut i = 0i32;
+        z = i!(q, jz);
+        for j in (1..=jz).rev() {
+            fw = (x1p_24 * z) as i32 as f64;
+            i!(iq, i as usize, =, (z - x1p24 * fw) as i32);
+            z = i!(q, j - 1) + fw;
+            i += 1;
+        }
+
+        /* compute n */
+        z = scalbn(z, q0); /* actual value of z */
+        z -= 8.0 * floor(z * 0.125); /* trim off integer >= 8 */
+        n = z as i32;
+        z -= n as f64;
+        ih = 0;
+        if q0 > 0 {
+            /* need iq[jz-1] to determine n */
+            i = i!(iq, jz - 1) >> (24 - q0);
+            n += i;
+            i!(iq, jz - 1, -=, i << (24 - q0));
+            ih = i!(iq, jz - 1) >> (23 - q0);
+        } else if q0 == 0 {
+            ih = i!(iq, jz - 1) >> 23;
+        } else if z >= 0.5 {
+            ih = 2;
+        }
+
+        if ih > 0 {
+            /* q > 0.5 */
+            n += 1;
+            let mut carry = 0i32;
+            for i in 0..jz {
+                /* compute 1-q */
+                let j = i!(iq, i);
+                if carry == 0 {
+                    if j != 0 {
+                        carry = 1;
+                        i!(iq, i, =, 0x1000000 - j);
+                    }
+                } else {
+                    i!(iq, i, =, 0xffffff - j);
+                }
+            }
+            if q0 > 0 {
+                /* rare case: chance is 1 in 12 */
+                match q0 {
+                    1 => {
+                        i!(iq, jz - 1, &=, 0x7fffff);
+                    }
+                    2 => {
+                        i!(iq, jz - 1, &=, 0x3fffff);
+                    }
+                    _ => {}
+                }
+            }
+            if ih == 2 {
+                z = 1. - z;
+                if carry != 0 {
+                    z -= scalbn(1., q0);
+                }
+            }
+        }
+
+        /* check if recomputation is needed */
+        if z == 0. {
+            let mut j = 0;
+            for i in (jk..=jz - 1).rev() {
+                j |= i!(iq, i);
+            }
+            if j == 0 {
+                /* need recomputation */
+                let mut k = 1;
+                while i!(iq, jk - k, ==, 0) {
+                    k += 1; /* k = no. of terms needed */
+                }
+
+                for i in (jz + 1)..=(jz + k) {
+                    /* add q[jz+1] to q[jz+k] */
+                    i!(f, jx + i, =, i!(IPIO2, jv + i) as f64);
+                    fw = 0f64;
+                    for j in 0..=jx {
+                        fw += i!(x, j) * i!(f, jx + i - j);
+                    }
+                    i!(q, i, =, fw);
+                }
+                jz += k;
+                continue 'recompute;
+            }
+        }
+
+        break;
+    }
+
+    /* chop off zero terms */
+    if z == 0. {
+        jz -= 1;
+        q0 -= 24;
+        while i!(iq, jz) == 0 {
+            jz -= 1;
+            q0 -= 24;
+        }
+    } else {
+        /* break z into 24-bit chunks if necessary */
+        z = scalbn(z, -q0);
+        if z >= x1p24 {
+            fw = (x1p_24 * z) as i32 as f64;
+            i!(iq, jz, =, (z - x1p24 * fw) as i32);
+            jz += 1;
+            q0 += 24;
+            i!(iq, jz, =, fw as i32);
+        } else {
+            i!(iq, jz, =, z as i32);
+        }
+    }
+
+    /* convert integer "bit" chunk to floating-point value */
+    fw = scalbn(1., q0);
+    for i in (0..=jz).rev() {
+        i!(q, i, =, fw * (i!(iq, i) as f64));
+        fw *= x1p_24;
+    }
+
+    /* compute PIo2[0,...,jp]*q[jz,...,0] */
+    for i in (0..=jz).rev() {
+        fw = 0f64;
+        let mut k = 0;
+        while (k <= jp) && (k <= jz - i) {
+            fw += i!(PIO2, k) * i!(q, i + k);
+            k += 1;
+        }
+        i!(fq, jz - i, =, fw);
+    }
+
+    /* compress fq[] into y[] */
+    match prec {
+        0 => {
+            fw = 0f64;
+            for i in (0..=jz).rev() {
+                fw += i!(fq, i);
+            }
+            i!(y, 0, =, if ih == 0 { fw } else { -fw });
+        }
+        1 | 2 => {
+            fw = 0f64;
+            for i in (0..=jz).rev() {
+                fw += i!(fq, i);
+            }
+            i!(y, 0, =, if ih == 0 { fw } else { -fw });
+            fw = i!(fq, 0) - fw;
+            for i in 1..=jz {
+                fw += i!(fq, i);
+            }
+            i!(y, 1, =, if ih == 0 { fw } else { -fw });
+        }
+        3 => {
+            /* painful */
+            for i in (1..=jz).rev() {
+                fw = i!(fq, i - 1) + i!(fq, i);
+                i!(fq, i, +=, i!(fq, i - 1) - fw);
+                i!(fq, i - 1, =, fw);
+            }
+            for i in (2..=jz).rev() {
+                fw = i!(fq, i - 1) + i!(fq, i);
+                i!(fq, i, +=, i!(fq, i - 1) - fw);
+                i!(fq, i - 1, =, fw);
+            }
+            fw = 0f64;
+            for i in (2..=jz).rev() {
+                fw += i!(fq, i);
+            }
+            if ih == 0 {
+                i!(y, 0, =, i!(fq, 0));
+                i!(y, 1, =, i!(fq, 1));
+                i!(y, 2, =, fw);
+            } else {
+                i!(y, 0, =, -i!(fq, 0));
+                i!(y, 1, =, -i!(fq, 1));
+                i!(y, 2, =, -fw);
+            }
+        }
+        #[cfg(debug_assertions)]
+        _ => unreachable!(),
+        #[cfg(not(debug_assertions))]
+        _ => {}
+    }
+    n & 7
+}
diff --git a/library/compiler-builtins/libm/src/math/rem_pio2f.rs b/library/compiler-builtins/libm/src/math/rem_pio2f.rs
new file mode 100644
index 00000000000..3c658fe3dbc
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/rem_pio2f.rs
@@ -0,0 +1,67 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/e_rem_pio2f.c */
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
+ * Debugged and optimized by Bruce D. Evans.
+ */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+use core::f64;
+
+use super::rem_pio2_large;
+
+const TOINT: f64 = 1.5 / f64::EPSILON;
+
+/// 53 bits of 2/pi
+const INV_PIO2: f64 = 6.36619772367581382433e-01; /* 0x3FE45F30, 0x6DC9C883 */
+/// first 25 bits of pi/2
+const PIO2_1: f64 = 1.57079631090164184570e+00; /* 0x3FF921FB, 0x50000000 */
+/// pi/2 - pio2_1
+const PIO2_1T: f64 = 1.58932547735281966916e-08; /* 0x3E5110b4, 0x611A6263 */
+
+/// Return the remainder of x rem pi/2 together with the multiple n of pi/2 that
+/// was removed, as the tuple (n, y) with x ~= n*(pi/2) + y.
+///
+/// Use double precision for everything except passing x;
+/// use rem_pio2_large() for large x.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub(crate) fn rem_pio2f(x: f32) -> (i32, f64) {
+    let x64 = x as f64;
+
+    let mut tx: [f64; 1] = [0.];
+    let mut ty: [f64; 1] = [0.];
+
+    let ix = x.to_bits() & 0x7fffffff;
+    /* 25+53 bit pi is good enough for medium size */
+    if ix < 0x4dc90fdb {
+        /* |x| ~< 2^28*(pi/2), medium size */
+        /* Use a specialized rint() to get fn.  Assume round-to-nearest. */
+        let tmp = x64 * INV_PIO2 + TOINT;
+    // force rounding of tmp to its storage format on x87 to avoid
+        // excess precision issues.
+        #[cfg(all(target_arch = "x86", not(target_feature = "sse2")))]
+        let tmp = force_eval!(tmp);
+        let f_n = tmp - TOINT;
+        return (f_n as i32, x64 - f_n * PIO2_1 - f_n * PIO2_1T);
+    }
+    if ix >= 0x7f800000 {
+        /* x is inf or NaN */
+        return (0, x64 - x64);
+    }
+    /* scale x into [2^23, 2^24-1] */
+    let sign = (x.to_bits() >> 31) != 0;
+    let e0 = ((ix >> 23) - (0x7f + 23)) as i32; /* e0 = ilogb(|x|)-23, positive */
+    tx[0] = f32::from_bits(ix - (e0 << 23) as u32) as f64;
+    let n = rem_pio2_large(&tx, &mut ty, e0, 0);
+    if sign {
+        return (-n, -ty[0]);
+    }
+    (n, ty[0])
+}
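+
+// A minimal illustrative sketch (not from the original musl source) of the TOINT
+// rounding trick used above: in round-to-nearest mode, adding 1.5 / f64::EPSILON
+// (0x1.8p52) pushes the fractional bits out of the mantissa, so `(v + TOINT) - TOINT`
+// behaves like rint(v) for small-magnitude values, including ties-to-even.
+#[cfg(test)]
+mod tests {
+    use super::TOINT;
+
+    #[test]
+    #[cfg_attr(x86_no_sse, ignore = "x87 excess precision defeats the trick without force_eval")]
+    fn toint_rounding_trick() {
+        let rint_like = |v: f64| (v + TOINT) - TOINT;
+        assert_eq!(rint_like(2.4), 2.0);
+        assert_eq!(rint_like(2.5), 2.0); // tie rounds to even
+        assert_eq!(rint_like(3.5), 4.0);
+        assert_eq!(rint_like(-1.5), -2.0);
+    }
+}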
diff --git a/library/compiler-builtins/libm/src/math/remainder.rs b/library/compiler-builtins/libm/src/math/remainder.rs
new file mode 100644
index 00000000000..9e966c9ed7f
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/remainder.rs
@@ -0,0 +1,5 @@
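+/// The IEEE 754 remainder of `x` with respect to `y` (f64): `x - n*y`, where `n` is
+/// `x / y` rounded to the nearest integer, ties to even.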
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn remainder(x: f64, y: f64) -> f64 {
+    let (result, _) = super::remquo(x, y);
+    result
+}
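+
+// Illustrative checks of the ties-to-even quotient (a minimal sketch assuming the
+// standard IEEE 754 `remainder` semantics; all values below are exact in binary).
+#[cfg(test)]
+mod tests {
+    use super::remainder;
+
+    #[test]
+    fn ties_round_quotient_to_even() {
+        assert_eq!(remainder(5.0, 2.0), 1.0); // 5/2 = 2.5 rounds to 2, 5 - 2*2 = 1
+        assert_eq!(remainder(5.5, 2.0), -0.5); // 5.5/2 = 2.75 rounds to 3, 5.5 - 3*2 = -0.5
+        assert_eq!(remainder(6.0, 4.0), -2.0); // 6/4 = 1.5 rounds to 2, 6 - 2*4 = -2
+    }
+}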
diff --git a/library/compiler-builtins/libm/src/math/remainderf.rs b/library/compiler-builtins/libm/src/math/remainderf.rs
new file mode 100644
index 00000000000..b1407cf2ace
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/remainderf.rs
@@ -0,0 +1,5 @@
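+/// The IEEE 754 remainder of `x` with respect to `y` (f32): `x - n*y`, where `n` is
+/// `x / y` rounded to the nearest integer, ties to even.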
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn remainderf(x: f32, y: f32) -> f32 {
+    let (result, _) = super::remquof(x, y);
+    result
+}
diff --git a/library/compiler-builtins/libm/src/math/remquo.rs b/library/compiler-builtins/libm/src/math/remquo.rs
new file mode 100644
index 00000000000..4c11e848746
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/remquo.rs
@@ -0,0 +1,106 @@
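+/// The IEEE 754 remainder of `x` with respect to `y` (f64), together with the low
+/// bits of the rounded quotient `x / y` (carrying the sign of `x / y`).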
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn remquo(mut x: f64, mut y: f64) -> (f64, i32) {
+    let ux: u64 = x.to_bits();
+    let mut uy: u64 = y.to_bits();
+    let mut ex = ((ux >> 52) & 0x7ff) as i32;
+    let mut ey = ((uy >> 52) & 0x7ff) as i32;
+    let sx = (ux >> 63) != 0;
+    let sy = (uy >> 63) != 0;
+    let mut q: u32;
+    let mut i: u64;
+    let mut uxi: u64 = ux;
+
+    if (uy << 1) == 0 || y.is_nan() || ex == 0x7ff {
+        return ((x * y) / (x * y), 0);
+    }
+    if (ux << 1) == 0 {
+        return (x, 0);
+    }
+
+    /* normalize x and y */
+    if ex == 0 {
+        i = uxi << 12;
+        while (i >> 63) == 0 {
+            ex -= 1;
+            i <<= 1;
+        }
+        uxi <<= -ex + 1;
+    } else {
+        uxi &= (!0) >> 12;
+        uxi |= 1 << 52;
+    }
+    if ey == 0 {
+        i = uy << 12;
+        while (i >> 63) == 0 {
+            ey -= 1;
+            i <<= 1;
+        }
+        uy <<= -ey + 1;
+    } else {
+        uy &= (!0) >> 12;
+        uy |= 1 << 52;
+    }
+
+    q = 0;
+
+    if ex + 1 != ey {
+        if ex < ey {
+            return (x, 0);
+        }
+        /* x mod y */
+        while ex > ey {
+            i = uxi.wrapping_sub(uy);
+            if (i >> 63) == 0 {
+                uxi = i;
+                q += 1;
+            }
+            uxi <<= 1;
+            q <<= 1;
+            ex -= 1;
+        }
+        i = uxi.wrapping_sub(uy);
+        if (i >> 63) == 0 {
+            uxi = i;
+            q += 1;
+        }
+        if uxi == 0 {
+            ex = -60;
+        } else {
+            while (uxi >> 52) == 0 {
+                uxi <<= 1;
+                ex -= 1;
+            }
+        }
+    }
+
+    /* scale result and decide between |x| and |x|-|y| */
+    if ex > 0 {
+        uxi -= 1 << 52;
+        uxi |= (ex as u64) << 52;
+    } else {
+        uxi >>= -ex + 1;
+    }
+    x = f64::from_bits(uxi);
+    if sy {
+        y = -y;
+    }
+    if ex == ey || (ex + 1 == ey && (2.0 * x > y || (2.0 * x == y && (q % 2) != 0))) {
+        x -= y;
+        // TODO: this matches musl behavior, but it is incorrect
+        q = q.wrapping_add(1);
+    }
+    q &= 0x7fffffff;
+    let quo = if sx ^ sy { -(q as i32) } else { q as i32 };
+    if sx { (-x, quo) } else { (x, quo) }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::remquo;
+
+    #[test]
+    fn test_q_overflow() {
+        // 0xc000000000000001, 0x04c0000000000004
+        let _ = remquo(-2.0000000000000004, 8.406091369059082e-286);
+    }
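+
+    // A minimal sketch of expected small-value results, assuming IEEE 754 semantics:
+    // the quotient is rounded to nearest (ties to even) and `quo` carries its low bits.
+    #[test]
+    fn basic_values() {
+        assert_eq!(remquo(5.0, 2.0), (1.0, 2)); // 5/2 = 2.5 rounds to 2
+        assert_eq!(remquo(7.0, 2.0), (-1.0, 4)); // 7/2 = 3.5 rounds to 4
+        assert_eq!(remquo(-7.0, 2.0), (1.0, -4)); // quo takes the sign of x/y
+    }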
+}
diff --git a/library/compiler-builtins/libm/src/math/remquof.rs b/library/compiler-builtins/libm/src/math/remquof.rs
new file mode 100644
index 00000000000..b0e85ca6611
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/remquof.rs
@@ -0,0 +1,93 @@
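+/// The IEEE 754 remainder of `x` with respect to `y` (f32), together with the low
+/// bits of the rounded quotient `x / y` (carrying the sign of `x / y`).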
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn remquof(mut x: f32, mut y: f32) -> (f32, i32) {
+    let ux: u32 = x.to_bits();
+    let mut uy: u32 = y.to_bits();
+    let mut ex = ((ux >> 23) & 0xff) as i32;
+    let mut ey = ((uy >> 23) & 0xff) as i32;
+    let sx = (ux >> 31) != 0;
+    let sy = (uy >> 31) != 0;
+    let mut q: u32;
+    let mut i: u32;
+    let mut uxi: u32 = ux;
+
+    if (uy << 1) == 0 || y.is_nan() || ex == 0xff {
+        return ((x * y) / (x * y), 0);
+    }
+    if (ux << 1) == 0 {
+        return (x, 0);
+    }
+
+    /* normalize x and y */
+    if ex == 0 {
+        i = uxi << 9;
+        while (i >> 31) == 0 {
+            ex -= 1;
+            i <<= 1;
+        }
+        uxi <<= -ex + 1;
+    } else {
+        uxi &= (!0) >> 9;
+        uxi |= 1 << 23;
+    }
+    if ey == 0 {
+        i = uy << 9;
+        while (i >> 31) == 0 {
+            ey -= 1;
+            i <<= 1;
+        }
+        uy <<= -ey + 1;
+    } else {
+        uy &= (!0) >> 9;
+        uy |= 1 << 23;
+    }
+
+    q = 0;
+    if ex + 1 != ey {
+        if ex < ey {
+            return (x, 0);
+        }
+        /* x mod y */
+        while ex > ey {
+            i = uxi.wrapping_sub(uy);
+            if (i >> 31) == 0 {
+                uxi = i;
+                q += 1;
+            }
+            uxi <<= 1;
+            q <<= 1;
+            ex -= 1;
+        }
+        i = uxi.wrapping_sub(uy);
+        if (i >> 31) == 0 {
+            uxi = i;
+            q += 1;
+        }
+        if uxi == 0 {
+            ex = -30;
+        } else {
+            while (uxi >> 23) == 0 {
+                uxi <<= 1;
+                ex -= 1;
+            }
+        }
+    }
+
+    /* scale result and decide between |x| and |x|-|y| */
+    if ex > 0 {
+        uxi -= 1 << 23;
+        uxi |= (ex as u32) << 23;
+    } else {
+        uxi >>= -ex + 1;
+    }
+    x = f32::from_bits(uxi);
+    if sy {
+        y = -y;
+    }
+    if ex == ey || (ex + 1 == ey && (2.0 * x > y || (2.0 * x == y && (q % 2) != 0))) {
+        x -= y;
+        q += 1;
+    }
+    q &= 0x7fffffff;
+    let quo = if sx ^ sy { -(q as i32) } else { q as i32 };
+    if sx { (-x, quo) } else { (x, quo) }
+}
diff --git a/library/compiler-builtins/libm/src/math/rint.rs b/library/compiler-builtins/libm/src/math/rint.rs
new file mode 100644
index 00000000000..e1c32c94355
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/rint.rs
@@ -0,0 +1,51 @@
+use super::support::Round;
+
+/// Round `x` to the nearest integer, breaking ties toward even.
+#[cfg(f16_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn rintf16(x: f16) -> f16 {
+    select_implementation! {
+        name: rintf16,
+        use_arch: all(target_arch = "aarch64", target_feature = "fp16"),
+        args: x,
+    }
+
+    super::generic::rint_round(x, Round::Nearest).val
+}
+
+/// Round `x` to the nearest integer, breaking ties toward even.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn rintf(x: f32) -> f32 {
+    select_implementation! {
+        name: rintf,
+        use_arch: any(
+            all(target_arch = "aarch64", target_feature = "neon"),
+            all(target_arch = "wasm32", intrinsics_enabled),
+        ),
+        args: x,
+    }
+
+    super::generic::rint_round(x, Round::Nearest).val
+}
+
+/// Round `x` to the nearest integer, breaking ties toward even.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn rint(x: f64) -> f64 {
+    select_implementation! {
+        name: rint,
+        use_arch: any(
+            all(target_arch = "aarch64", target_feature = "neon"),
+            all(target_arch = "wasm32", intrinsics_enabled),
+        ),
+        args: x,
+    }
+
+    super::generic::rint_round(x, Round::Nearest).val
+}
+
+/// Round `x` to the nearest integer, breaking ties toward even.
+#[cfg(f128_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn rintf128(x: f128) -> f128 {
+    super::generic::rint_round(x, Round::Nearest).val
+}
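+
+// A brief illustrative check (a sketch of the ties-to-even behavior documented above).
+#[cfg(test)]
+mod tests {
+    use super::rint;
+
+    #[test]
+    fn ties_to_even() {
+        assert_eq!(rint(2.4), 2.0);
+        assert_eq!(rint(2.5), 2.0);
+        assert_eq!(rint(3.5), 4.0);
+        assert_eq!(rint(-0.5), 0.0);
+    }
+}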
diff --git a/library/compiler-builtins/libm/src/math/round.rs b/library/compiler-builtins/libm/src/math/round.rs
new file mode 100644
index 00000000000..6cd091cd73c
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/round.rs
@@ -0,0 +1,25 @@
+/// Round `x` to the nearest integer, breaking ties away from zero.
+#[cfg(f16_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn roundf16(x: f16) -> f16 {
+    super::generic::round(x)
+}
+
+/// Round `x` to the nearest integer, breaking ties away from zero.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn roundf(x: f32) -> f32 {
+    super::generic::round(x)
+}
+
+/// Round `x` to the nearest integer, breaking ties away from zero.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn round(x: f64) -> f64 {
+    super::generic::round(x)
+}
+
+/// Round `x` to the nearest integer, breaking ties away from zero.
+#[cfg(f128_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn roundf128(x: f128) -> f128 {
+    super::generic::round(x)
+}
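+
+// A brief illustrative check (a sketch of the ties-away-from-zero behavior documented
+// above; contrast with `rint`, which rounds ties to even).
+#[cfg(test)]
+mod tests {
+    use super::round;
+
+    #[test]
+    fn ties_away_from_zero() {
+        assert_eq!(round(0.5), 1.0);
+        assert_eq!(round(-0.5), -1.0);
+        assert_eq!(round(2.5), 3.0);
+        assert_eq!(round(-2.5), -3.0);
+    }
+}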
diff --git a/library/compiler-builtins/libm/src/math/roundeven.rs b/library/compiler-builtins/libm/src/math/roundeven.rs
new file mode 100644
index 00000000000..6e621d7628f
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/roundeven.rs
@@ -0,0 +1,36 @@
+use super::support::{Float, Round};
+
+/// Round `x` to the nearest integer, breaking ties toward even. This is IEEE 754
+/// `roundToIntegralTiesToEven`.
+#[cfg(f16_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn roundevenf16(x: f16) -> f16 {
+    roundeven_impl(x)
+}
+
+/// Round `x` to the nearest integer, breaking ties toward even. This is IEEE 754
+/// `roundToIntegralTiesToEven`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn roundevenf(x: f32) -> f32 {
+    roundeven_impl(x)
+}
+
+/// Round `x` to the nearest integer, breaking ties toward even. This is IEEE 754
+/// `roundToIntegralTiesToEven`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn roundeven(x: f64) -> f64 {
+    roundeven_impl(x)
+}
+
+/// Round `x` to the nearest integer, breaking ties toward even. This is IEEE 754
+/// `roundToIntegralTiesToEven`.
+#[cfg(f128_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn roundevenf128(x: f128) -> f128 {
+    roundeven_impl(x)
+}
+
+#[inline]
+pub fn roundeven_impl<F: Float>(x: F) -> F {
+    super::generic::rint_round(x, Round::Nearest).val
+}
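+
+// A brief illustrative contrast with `round` (a sketch: `roundeven` resolves ties
+// toward the even integer instead of away from zero).
+#[cfg(test)]
+mod tests {
+    use super::roundeven;
+
+    #[test]
+    fn ties_to_even() {
+        assert_eq!(roundeven(0.5), 0.0);
+        assert_eq!(roundeven(1.5), 2.0);
+        assert_eq!(roundeven(2.5), 2.0);
+        assert_eq!(roundeven(-2.5), -2.0);
+    }
+}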
diff --git a/library/compiler-builtins/libm/src/math/roundf.rs b/library/compiler-builtins/libm/src/math/roundf.rs
new file mode 100644
index 00000000000..b5d7c9d693e
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/roundf.rs
@@ -0,0 +1,5 @@
+/// Round `x` to the nearest integer, breaking ties away from zero.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn roundf(x: f32) -> f32 {
+    super::generic::round(x)
+}
diff --git a/library/compiler-builtins/libm/src/math/roundf128.rs b/library/compiler-builtins/libm/src/math/roundf128.rs
new file mode 100644
index 00000000000..fc3164929fe
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/roundf128.rs
@@ -0,0 +1,5 @@
+/// Round `x` to the nearest integer, breaking ties away from zero.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn roundf128(x: f128) -> f128 {
+    super::generic::round(x)
+}
diff --git a/library/compiler-builtins/libm/src/math/roundf16.rs b/library/compiler-builtins/libm/src/math/roundf16.rs
new file mode 100644
index 00000000000..8b356eaabee
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/roundf16.rs
@@ -0,0 +1,5 @@
+/// Round `x` to the nearest integer, breaking ties away from zero.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn roundf16(x: f16) -> f16 {
+    super::generic::round(x)
+}
diff --git a/library/compiler-builtins/libm/src/math/scalbn.rs b/library/compiler-builtins/libm/src/math/scalbn.rs
new file mode 100644
index 00000000000..ed73c3f94f0
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/scalbn.rs
@@ -0,0 +1,87 @@
+#[cfg(f16_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn scalbnf16(x: f16, n: i32) -> f16 {
+    super::generic::scalbn(x, n)
+}
+
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn scalbnf(x: f32, n: i32) -> f32 {
+    super::generic::scalbn(x, n)
+}
+
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn scalbn(x: f64, n: i32) -> f64 {
+    super::generic::scalbn(x, n)
+}
+
+#[cfg(f128_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn scalbnf128(x: f128, n: i32) -> f128 {
+    super::generic::scalbn(x, n)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::support::{CastFrom, CastInto, Float};
+
+    // Tests against N3220
+    fn spec_test<F: Float>(f: impl Fn(F, i32) -> F)
+    where
+        u32: CastInto<F::Int>,
+        F::Int: CastFrom<i32>,
+        F::Int: CastFrom<u32>,
+    {
+        // `scalbn(±0, n)` returns `±0`.
+        assert_biteq!(f(F::NEG_ZERO, 10), F::NEG_ZERO);
+        assert_biteq!(f(F::NEG_ZERO, 0), F::NEG_ZERO);
+        assert_biteq!(f(F::NEG_ZERO, -10), F::NEG_ZERO);
+        assert_biteq!(f(F::ZERO, 10), F::ZERO);
+        assert_biteq!(f(F::ZERO, 0), F::ZERO);
+        assert_biteq!(f(F::ZERO, -10), F::ZERO);
+
+        // `scalbn(x, 0)` returns `x`.
+        assert_biteq!(f(F::MIN, 0), F::MIN);
+        assert_biteq!(f(F::MAX, 0), F::MAX);
+        assert_biteq!(f(F::INFINITY, 0), F::INFINITY);
+        assert_biteq!(f(F::NEG_INFINITY, 0), F::NEG_INFINITY);
+        assert_biteq!(f(F::ZERO, 0), F::ZERO);
+        assert_biteq!(f(F::NEG_ZERO, 0), F::NEG_ZERO);
+
+        // `scalbn(±∞, n)` returns `±∞`.
+        assert_biteq!(f(F::INFINITY, 10), F::INFINITY);
+        assert_biteq!(f(F::INFINITY, -10), F::INFINITY);
+        assert_biteq!(f(F::NEG_INFINITY, 10), F::NEG_INFINITY);
+        assert_biteq!(f(F::NEG_INFINITY, -10), F::NEG_INFINITY);
+
+        // NaN should remain NaNs.
+        assert!(f(F::NAN, 10).is_nan());
+        assert!(f(F::NAN, 0).is_nan());
+        assert!(f(F::NAN, -10).is_nan());
+        assert!(f(-F::NAN, 10).is_nan());
+        assert!(f(-F::NAN, 0).is_nan());
+        assert!(f(-F::NAN, -10).is_nan());
+    }
+
+    #[test]
+    #[cfg(f16_enabled)]
+    fn spec_test_f16() {
+        spec_test::<f16>(scalbnf16);
+    }
+
+    #[test]
+    fn spec_test_f32() {
+        spec_test::<f32>(scalbnf);
+    }
+
+    #[test]
+    fn spec_test_f64() {
+        spec_test::<f64>(scalbn);
+    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn spec_test_f128() {
+        spec_test::<f128>(scalbnf128);
+    }
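+
+    // A small sketch of the basic scaling identity `scalbn(x, n) == x * 2^n` for
+    // values where no overflow or underflow occurs (all results below are exact).
+    #[test]
+    fn basic_scaling() {
+        assert_biteq!(scalbn(1.5, 3), 12.0);
+        assert_biteq!(scalbn(12.0, -3), 1.5);
+        assert_biteq!(scalbnf(0.75, 2), 3.0);
+    }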
+}
diff --git a/library/compiler-builtins/libm/src/math/scalbnf.rs b/library/compiler-builtins/libm/src/math/scalbnf.rs
new file mode 100644
index 00000000000..57e7ba76f60
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/scalbnf.rs
@@ -0,0 +1,4 @@
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn scalbnf(x: f32, n: i32) -> f32 {
+    super::generic::scalbn(x, n)
+}
diff --git a/library/compiler-builtins/libm/src/math/scalbnf128.rs b/library/compiler-builtins/libm/src/math/scalbnf128.rs
new file mode 100644
index 00000000000..c1d2b485585
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/scalbnf128.rs
@@ -0,0 +1,4 @@
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn scalbnf128(x: f128, n: i32) -> f128 {
+    super::generic::scalbn(x, n)
+}
diff --git a/library/compiler-builtins/libm/src/math/scalbnf16.rs b/library/compiler-builtins/libm/src/math/scalbnf16.rs
new file mode 100644
index 00000000000..2209e1a1795
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/scalbnf16.rs
@@ -0,0 +1,4 @@
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn scalbnf16(x: f16, n: i32) -> f16 {
+    super::generic::scalbn(x, n)
+}
diff --git a/library/compiler-builtins/libm/src/math/sin.rs b/library/compiler-builtins/libm/src/math/sin.rs
new file mode 100644
index 00000000000..229fa4bef08
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/sin.rs
@@ -0,0 +1,95 @@
+// origin: FreeBSD /usr/src/lib/msun/src/s_sin.c
+//
+// ====================================================
+// Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+//
+// Developed at SunPro, a Sun Microsystems, Inc. business.
+// Permission to use, copy, modify, and distribute this
+// software is freely granted, provided that this notice
+// is preserved.
+// ====================================================
+
+use super::{k_cos, k_sin, rem_pio2};
+
+// sin(x)
+// Return sine function of x.
+//
+// kernel function:
+//      k_sin            ... sine function on [-pi/4,pi/4]
+//      k_cos            ... cosine function on [-pi/4,pi/4]
+//      rem_pio2         ... argument reduction routine
+//
+// Method.
+//      Let S,C and T denote the sin, cos and tan respectively on
+//      [-PI/4, +PI/4]. Reduce the argument x to y1+y2 = x-k*pi/2
+//      in [-pi/4 , +pi/4], and let n = k mod 4.
+//      We have
+//
+//          n        sin(x)      cos(x)        tan(x)
+//     ----------------------------------------------------------
+//          0          S           C             T
+//          1          C          -S            -1/T
+//          2         -S          -C             T
+//          3         -C           S            -1/T
+//     ----------------------------------------------------------
+//
+// Special cases:
+//      Let trig be any of sin, cos, or tan.
+//      trig(+-INF)  is NaN, with signals;
+//      trig(NaN)    is that NaN;
+//
+// Accuracy:
+//      TRIG(x) returns trig(x) nearly rounded
+
+/// The sine of `x` (f64).
+///
+/// `x` is specified in radians.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn sin(x: f64) -> f64 {
+    let x1p120 = f64::from_bits(0x4770000000000000); // 0x1p120f === 2 ^ 120
+
+    /* High word of x. */
+    let ix = (f64::to_bits(x) >> 32) as u32 & 0x7fffffff;
+
+    /* |x| ~< pi/4 */
+    if ix <= 0x3fe921fb {
+        if ix < 0x3e500000 {
+            /* |x| < 2**-26 */
+            /* raise inexact if x != 0 and underflow if subnormal */
+            if ix < 0x00100000 {
+                force_eval!(x / x1p120);
+            } else {
+                force_eval!(x + x1p120);
+            }
+            return x;
+        }
+        return k_sin(x, 0.0, 0);
+    }
+
+    /* sin(Inf or NaN) is NaN */
+    if ix >= 0x7ff00000 {
+        return x - x;
+    }
+
+    /* argument reduction needed */
+    let (n, y0, y1) = rem_pio2(x);
+    match n & 3 {
+        0 => k_sin(y0, y1, 1),
+        1 => k_cos(y0, y1),
+        2 => -k_sin(y0, y1, 1),
+        _ => -k_cos(y0, y1),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    #[cfg_attr(x86_no_sse, ignore = "FIXME(i586): possible incorrect rounding")]
+    fn test_near_pi() {
+        let x = f64::from_bits(0x400921fb000FD5DD); // 3.141592026217707
+        let sx = f64::from_bits(0x3ea50d15ced1a4a2); // 6.273720864039205e-7
+        assert_eq!(sin(x), sx);
+    }
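+
+    // A few special-case checks sketched from the code paths above: tiny inputs
+    // return `x` unchanged, and infinities and NaN propagate as NaN.
+    #[test]
+    fn special_cases() {
+        assert_eq!(sin(0.0), 0.0);
+        assert!(sin(f64::NAN).is_nan());
+        assert!(sin(f64::INFINITY).is_nan());
+        assert!(sin(f64::NEG_INFINITY).is_nan());
+    }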
+}
diff --git a/library/compiler-builtins/libm/src/math/sincos.rs b/library/compiler-builtins/libm/src/math/sincos.rs
new file mode 100644
index 00000000000..ebf482f2df3
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/sincos.rs
@@ -0,0 +1,137 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/s_sin.c */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+use super::{get_high_word, k_cos, k_sin, rem_pio2};
+
+/// Both the sine and cosine of `x` (f64).
+///
+/// `x` is specified in radians and the return value is (sin(x), cos(x)).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn sincos(x: f64) -> (f64, f64) {
+    let s: f64;
+    let c: f64;
+    let mut ix: u32;
+
+    ix = get_high_word(x);
+    ix &= 0x7fffffff;
+
+    /* |x| ~< pi/4 */
+    if ix <= 0x3fe921fb {
+        /* if |x| < 2**-27 * sqrt(2) */
+        if ix < 0x3e46a09e {
+            /* raise inexact if x!=0 and underflow if subnormal */
+            let x1p120 = f64::from_bits(0x4770000000000000); // 0x1p120 == 2^120
+            if ix < 0x00100000 {
+                force_eval!(x / x1p120);
+            } else {
+                force_eval!(x + x1p120);
+            }
+            return (x, 1.0);
+        }
+        return (k_sin(x, 0.0, 0), k_cos(x, 0.0));
+    }
+
+    /* sincos(Inf or NaN) is NaN */
+    if ix >= 0x7ff00000 {
+        let rv = x - x;
+        return (rv, rv);
+    }
+
+    /* argument reduction needed */
+    let (n, y0, y1) = rem_pio2(x);
+    s = k_sin(y0, y1, 1);
+    c = k_cos(y0, y1);
+    match n & 3 {
+        0 => (s, c),
+        1 => (c, -s),
+        2 => (-s, -c),
+        3 => (-c, s),
+        #[cfg(debug_assertions)]
+        _ => unreachable!(),
+        #[cfg(not(debug_assertions))]
+        _ => (0.0, 1.0),
+    }
+}
+
+// These tests are based on those from sincosf.rs
+#[cfg(test)]
+mod tests {
+    use super::sincos;
+
+    const TOLERANCE: f64 = 1e-6;
+
+    #[test]
+    fn with_pi() {
+        let (s, c) = sincos(core::f64::consts::PI);
+        assert!(
+            (s - 0.0).abs() < TOLERANCE,
+            "|{} - {}| = {} >= {}",
+            s,
+            0.0,
+            (s - 0.0).abs(),
+            TOLERANCE
+        );
+        assert!(
+            (c + 1.0).abs() < TOLERANCE,
+            "|{} + {}| = {} >= {}",
+            c,
+            1.0,
+            (c + 1.0).abs(),
+            TOLERANCE
+        );
+    }
+
+    #[test]
+    fn rotational_symmetry() {
+        use core::f64::consts::PI;
+        const N: usize = 24;
+        for n in 0..N {
+            let theta = 2. * PI * (n as f64) / (N as f64);
+            let (s, c) = sincos(theta);
+            let (s_plus, c_plus) = sincos(theta + 2. * PI);
+            let (s_minus, c_minus) = sincos(theta - 2. * PI);
+
+            assert!(
+                (s - s_plus).abs() < TOLERANCE,
+                "|{} - {}| = {} >= {}",
+                s,
+                s_plus,
+                (s - s_plus).abs(),
+                TOLERANCE
+            );
+            assert!(
+                (s - s_minus).abs() < TOLERANCE,
+                "|{} - {}| = {} >= {}",
+                s,
+                s_minus,
+                (s - s_minus).abs(),
+                TOLERANCE
+            );
+            assert!(
+                (c - c_plus).abs() < TOLERANCE,
+                "|{} - {}| = {} >= {}",
+                c,
+                c_plus,
+                (c - c_plus).abs(),
+                TOLERANCE
+            );
+            assert!(
+                (c - c_minus).abs() < TOLERANCE,
+                "|{} - {}| = {} >= {}",
+                c,
+                c_minus,
+                (c - c_minus).abs(),
+                TOLERANCE
+            );
+        }
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/sincosf.rs b/library/compiler-builtins/libm/src/math/sincosf.rs
new file mode 100644
index 00000000000..f3360767683
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/sincosf.rs
@@ -0,0 +1,176 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/s_sinf.c */
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
+ * Optimized by Bruce D. Evans.
+ */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+use super::{k_cosf, k_sinf, rem_pio2f};
+
+/* Small multiples of pi/2 rounded to double precision. */
+const PI_2: f64 = 0.5 * 3.1415926535897931160E+00;
+const S1PIO2: f64 = 1.0 * PI_2; /* 0x3FF921FB, 0x54442D18 */
+const S2PIO2: f64 = 2.0 * PI_2; /* 0x400921FB, 0x54442D18 */
+const S3PIO2: f64 = 3.0 * PI_2; /* 0x4012D97C, 0x7F3321D2 */
+const S4PIO2: f64 = 4.0 * PI_2; /* 0x401921FB, 0x54442D18 */
+
+/// Both the sine and cosine of `x` (f32).
+///
+/// `x` is specified in radians and the return value is (sin(x), cos(x)).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn sincosf(x: f32) -> (f32, f32) {
+    let s: f32;
+    let c: f32;
+    let mut ix: u32;
+    let sign: bool;
+
+    ix = x.to_bits();
+    sign = (ix >> 31) != 0;
+    ix &= 0x7fffffff;
+
+    /* |x| ~<= pi/4 */
+    if ix <= 0x3f490fda {
+        /* |x| < 2**-12 */
+        if ix < 0x39800000 {
+            /* raise inexact if x!=0 and underflow if subnormal */
+
+            let x1p120 = f32::from_bits(0x7b800000); // 0x1p120 == 2^120
+            if ix < 0x00100000 {
+                force_eval!(x / x1p120);
+            } else {
+                force_eval!(x + x1p120);
+            }
+            return (x, 1.0);
+        }
+        return (k_sinf(x as f64), k_cosf(x as f64));
+    }
+
+    /* |x| ~<= 5*pi/4 */
+    if ix <= 0x407b53d1 {
+        if ix <= 0x4016cbe3 {
+            /* |x| ~<= 3pi/4 */
+            if sign {
+                s = -k_cosf(x as f64 + S1PIO2);
+                c = k_sinf(x as f64 + S1PIO2);
+            } else {
+                s = k_cosf(S1PIO2 - x as f64);
+                c = k_sinf(S1PIO2 - x as f64);
+            }
+        }
+        /* -sin(x+c) is not correct if x+c could be 0: -0 vs +0 */
+        else if sign {
+            s = -k_sinf(x as f64 + S2PIO2);
+            c = -k_cosf(x as f64 + S2PIO2);
+        } else {
+            s = -k_sinf(x as f64 - S2PIO2);
+            c = -k_cosf(x as f64 - S2PIO2);
+        }
+
+        return (s, c);
+    }
+
+    /* |x| ~<= 9*pi/4 */
+    if ix <= 0x40e231d5 {
+        if ix <= 0x40afeddf {
+            /* |x| ~<= 7*pi/4 */
+            if sign {
+                s = k_cosf(x as f64 + S3PIO2);
+                c = -k_sinf(x as f64 + S3PIO2);
+            } else {
+                s = -k_cosf(x as f64 - S3PIO2);
+                c = k_sinf(x as f64 - S3PIO2);
+            }
+        } else if sign {
+            s = k_sinf(x as f64 + S4PIO2);
+            c = k_cosf(x as f64 + S4PIO2);
+        } else {
+            s = k_sinf(x as f64 - S4PIO2);
+            c = k_cosf(x as f64 - S4PIO2);
+        }
+
+        return (s, c);
+    }
+
+    /* sin(Inf or NaN) is NaN */
+    if ix >= 0x7f800000 {
+        let rv = x - x;
+        return (rv, rv);
+    }
+
+    /* general argument reduction needed */
+    let (n, y) = rem_pio2f(x);
+    s = k_sinf(y);
+    c = k_cosf(y);
+    match n & 3 {
+        0 => (s, c),
+        1 => (c, -s),
+        2 => (-s, -c),
+        3 => (-c, s),
+        #[cfg(debug_assertions)]
+        _ => unreachable!(),
+        #[cfg(not(debug_assertions))]
+        _ => (0.0, 1.0),
+    }
+}
+
+// PowerPC tests are failing on LLVM 13: https://github.com/rust-lang/rust/issues/88520
+#[cfg(not(target_arch = "powerpc64"))]
+#[cfg(test)]
+mod tests {
+    use super::sincosf;
+
+    #[test]
+    fn rotational_symmetry() {
+        use core::f32::consts::PI;
+        const N: usize = 24;
+        for n in 0..N {
+            let theta = 2. * PI * (n as f32) / (N as f32);
+            let (s, c) = sincosf(theta);
+            let (s_plus, c_plus) = sincosf(theta + 2. * PI);
+            let (s_minus, c_minus) = sincosf(theta - 2. * PI);
+
+            const TOLERANCE: f32 = 1e-6;
+            assert!(
+                (s - s_plus).abs() < TOLERANCE,
+                "|{} - {}| = {} >= {}",
+                s,
+                s_plus,
+                (s - s_plus).abs(),
+                TOLERANCE
+            );
+            assert!(
+                (s - s_minus).abs() < TOLERANCE,
+                "|{} - {}| = {} >= {}",
+                s,
+                s_minus,
+                (s - s_minus).abs(),
+                TOLERANCE
+            );
+            assert!(
+                (c - c_plus).abs() < TOLERANCE,
+                "|{} - {}| = {} >= {}",
+                c,
+                c_plus,
+                (c - c_plus).abs(),
+                TOLERANCE
+            );
+            assert!(
+                (c - c_minus).abs() < TOLERANCE,
+                "|{} - {}| = {} >= {}",
+                c,
+                c_minus,
+                (c - c_minus).abs(),
+                TOLERANCE
+            );
+        }
+    }
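+
+    // A small sketch of the tiny-argument fast path: for |x| < 2^-12 the function
+    // returns (x, 1.0) directly.
+    #[test]
+    fn tiny_argument() {
+        assert_eq!(sincosf(0.0), (0.0, 1.0));
+        let tiny = f32::from_bits(0x39000000); // 2^-13
+        assert_eq!(sincosf(tiny), (tiny, 1.0));
+    }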
+}
diff --git a/library/compiler-builtins/libm/src/math/sinf.rs b/library/compiler-builtins/libm/src/math/sinf.rs
new file mode 100644
index 00000000000..709b63fcf29
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/sinf.rs
@@ -0,0 +1,96 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/s_sinf.c */
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
+ * Optimized by Bruce D. Evans.
+ */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+use core::f64::consts::FRAC_PI_2;
+
+use super::{k_cosf, k_sinf, rem_pio2f};
+
+/* Small multiples of pi/2 rounded to double precision. */
+const S1_PIO2: f64 = 1. * FRAC_PI_2; /* 0x3FF921FB, 0x54442D18 */
+const S2_PIO2: f64 = 2. * FRAC_PI_2; /* 0x400921FB, 0x54442D18 */
+const S3_PIO2: f64 = 3. * FRAC_PI_2; /* 0x4012D97C, 0x7F3321D2 */
+const S4_PIO2: f64 = 4. * FRAC_PI_2; /* 0x401921FB, 0x54442D18 */
+
+/// The sine of `x` (f32).
+///
+/// `x` is specified in radians.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn sinf(x: f32) -> f32 {
+    let x64 = x as f64;
+
+    let x1p120 = f32::from_bits(0x7b800000); // 0x1p120f === 2 ^ 120
+
+    let mut ix = x.to_bits();
+    let sign = (ix >> 31) != 0;
+    ix &= 0x7fffffff;
+
+    if ix <= 0x3f490fda {
+        /* |x| ~<= pi/4 */
+        if ix < 0x39800000 {
+            /* |x| < 2**-12 */
+            /* raise inexact if x!=0 and underflow if subnormal */
+            force_eval!(if ix < 0x00800000 {
+                x / x1p120
+            } else {
+                x + x1p120
+            });
+            return x;
+        }
+        return k_sinf(x64);
+    }
+    if ix <= 0x407b53d1 {
+        /* |x| ~<= 5*pi/4 */
+        if ix <= 0x4016cbe3 {
+            /* |x| ~<= 3pi/4 */
+            if sign {
+                return -k_cosf(x64 + S1_PIO2);
+            } else {
+                return k_cosf(x64 - S1_PIO2);
+            }
+        }
+        return k_sinf(if sign {
+            -(x64 + S2_PIO2)
+        } else {
+            -(x64 - S2_PIO2)
+        });
+    }
+    if ix <= 0x40e231d5 {
+        /* |x| ~<= 9*pi/4 */
+        if ix <= 0x40afeddf {
+            /* |x| ~<= 7*pi/4 */
+            if sign {
+                return k_cosf(x64 + S3_PIO2);
+            } else {
+                return -k_cosf(x64 - S3_PIO2);
+            }
+        }
+        return k_sinf(if sign { x64 + S4_PIO2 } else { x64 - S4_PIO2 });
+    }
+
+    /* sin(Inf or NaN) is NaN */
+    if ix >= 0x7f800000 {
+        return x - x;
+    }
+
+    /* general argument reduction needed */
+    let (n, y) = rem_pio2f(x);
+    match n & 3 {
+        0 => k_sinf(y),
+        1 => k_cosf(y),
+        2 => k_sinf(-y),
+        _ => -k_cosf(y),
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/sinh.rs b/library/compiler-builtins/libm/src/math/sinh.rs
new file mode 100644
index 00000000000..79184198263
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/sinh.rs
@@ -0,0 +1,51 @@
+use super::{expm1, expo2};
+
+// sinh(x) = (exp(x) - 1/exp(x))/2
+//         = (exp(x)-1 + (exp(x)-1)/exp(x))/2
+//         = x + x^3/6 + o(x^5)
+//
+
+/// The hyperbolic sine of `x` (f64).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn sinh(x: f64) -> f64 {
+    let mut uf: f64 = x;
+    let mut ui: u64 = f64::to_bits(uf);
+    let w: u32;
+    let t: f64;
+    let mut h: f64;
+    let absx: f64;
+
+    h = 0.5;
+    if ui >> 63 != 0 {
+        h = -h;
+    }
+    /* |x| */
+    ui &= !1 / 2;
+    uf = f64::from_bits(ui);
+    absx = uf;
+    w = (ui >> 32) as u32;
+
+    /* |x| < log(DBL_MAX) */
+    if w < 0x40862e42 {
+        t = expm1(absx);
+        if w < 0x3ff00000 {
+            if w < 0x3ff00000 - (26 << 20) {
+                /* note: inexact and underflow are raised by expm1 */
+                /* note: this branch avoids spurious underflow */
+                return x;
+            }
+            return h * (2.0 * t - t * t / (t + 1.0));
+        }
+        /* note: |x|>log(0x1p26)+eps could be just h*exp(x) */
+        return h * (t + t / (t + 1.0));
+    }
+
+    /* |x| > log(DBL_MAX) or nan */
+    /* note: the result is stored to handle overflow */
+    t = 2.0 * h * expo2(absx);
+    t
+}
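+
+// A brief illustrative check (a sketch of the odd symmetry that follows from the
+// formulation above: only the sign of `h` depends on the sign of `x`).
+#[cfg(test)]
+mod tests {
+    use super::sinh;
+
+    #[test]
+    fn odd_symmetry() {
+        assert_eq!(sinh(0.0), 0.0);
+        for &x in &[0.5, 1.0, 10.0, 700.0] {
+            assert_eq!(sinh(-x), -sinh(x));
+        }
+    }
+}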
diff --git a/library/compiler-builtins/libm/src/math/sinhf.rs b/library/compiler-builtins/libm/src/math/sinhf.rs
new file mode 100644
index 00000000000..44d2e3560d5
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/sinhf.rs
@@ -0,0 +1,30 @@
+use super::{expm1f, k_expo2f};
+
+/// The hyperbolic sine of `x` (f32).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn sinhf(x: f32) -> f32 {
+    let mut h = 0.5f32;
+    let mut ix = x.to_bits();
+    if (ix >> 31) != 0 {
+        h = -h;
+    }
+    /* |x| */
+    ix &= 0x7fffffff;
+    let absx = f32::from_bits(ix);
+    let w = ix;
+
+    /* |x| < log(FLT_MAX) */
+    if w < 0x42b17217 {
+        let t = expm1f(absx);
+        if w < 0x3f800000 {
+            if w < (0x3f800000 - (12 << 23)) {
+                return x;
+            }
+            return h * (2. * t - t * t / (t + 1.));
+        }
+        return h * (t + t / (t + 1.));
+    }
+
+    /* |x| > logf(FLT_MAX) or nan */
+    2. * h * k_expo2f(absx)
+}
diff --git a/library/compiler-builtins/libm/src/math/sqrt.rs b/library/compiler-builtins/libm/src/math/sqrt.rs
new file mode 100644
index 00000000000..76bc240cf01
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/sqrt.rs
@@ -0,0 +1,51 @@
+/// The square root of `x` (f16).
+#[cfg(f16_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn sqrtf16(x: f16) -> f16 {
+    select_implementation! {
+        name: sqrtf16,
+        use_arch: all(target_arch = "aarch64", target_feature = "fp16"),
+        args: x,
+    }
+
+    return super::generic::sqrt(x);
+}
+
+/// The square root of `x` (f32).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn sqrtf(x: f32) -> f32 {
+    select_implementation! {
+        name: sqrtf,
+        use_arch: any(
+            all(target_arch = "aarch64", target_feature = "neon"),
+            all(target_arch = "wasm32", intrinsics_enabled),
+            target_feature = "sse2"
+        ),
+        args: x,
+    }
+
+    super::generic::sqrt(x)
+}
+
+/// The square root of `x` (f64).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn sqrt(x: f64) -> f64 {
+    select_implementation! {
+        name: sqrt,
+        use_arch: any(
+            all(target_arch = "aarch64", target_feature = "neon"),
+            all(target_arch = "wasm32", intrinsics_enabled),
+            target_feature = "sse2"
+        ),
+        args: x,
+    }
+
+    super::generic::sqrt(x)
+}
+
+/// The square root of `x` (f128).
+#[cfg(f128_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn sqrtf128(x: f128) -> f128 {
+    return super::generic::sqrt(x);
+}
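+
+// A brief illustrative check (a sketch of IEEE 754 sqrt behavior: exact results for
+// exactly representable squares and NaN for negative inputs).
+#[cfg(test)]
+mod tests {
+    use super::{sqrt, sqrtf};
+
+    #[test]
+    fn exact_and_special_values() {
+        assert_eq!(sqrt(4.0), 2.0);
+        assert_eq!(sqrt(2.25), 1.5);
+        assert!(sqrt(-1.0).is_nan());
+        assert_eq!(sqrtf(9.0), 3.0);
+    }
+}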
diff --git a/library/compiler-builtins/libm/src/math/sqrtf.rs b/library/compiler-builtins/libm/src/math/sqrtf.rs
new file mode 100644
index 00000000000..c28a705e378
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/sqrtf.rs
@@ -0,0 +1,15 @@
+/// The square root of `x` (f32).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn sqrtf(x: f32) -> f32 {
+    select_implementation! {
+        name: sqrtf,
+        use_arch: any(
+            all(target_arch = "aarch64", target_feature = "neon"),
+            all(target_arch = "wasm32", intrinsics_enabled),
+            target_feature = "sse2"
+        ),
+        args: x,
+    }
+
+    super::generic::sqrt(x)
+}
diff --git a/library/compiler-builtins/libm/src/math/sqrtf128.rs b/library/compiler-builtins/libm/src/math/sqrtf128.rs
new file mode 100644
index 00000000000..eaef6ae0c1c
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/sqrtf128.rs
@@ -0,0 +1,5 @@
+/// The square root of `x` (f128).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn sqrtf128(x: f128) -> f128 {
+    return super::generic::sqrt(x);
+}
diff --git a/library/compiler-builtins/libm/src/math/sqrtf16.rs b/library/compiler-builtins/libm/src/math/sqrtf16.rs
new file mode 100644
index 00000000000..7bedb7f8bbb
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/sqrtf16.rs
@@ -0,0 +1,11 @@
+/// The square root of `x` (f16).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn sqrtf16(x: f16) -> f16 {
+    select_implementation! {
+        name: sqrtf16,
+        use_arch: all(target_arch = "aarch64", target_feature = "fp16"),
+        args: x,
+    }
+
+    return super::generic::sqrt(x);
+}
diff --git a/library/compiler-builtins/libm/src/math/support/big.rs b/library/compiler-builtins/libm/src/math/support/big.rs
new file mode 100644
index 00000000000..8a52d86cc98
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/support/big.rs
@@ -0,0 +1,257 @@
+//! Integers used for wide operations, larger than `u128`.
+
+#[cfg(test)]
+mod tests;
+
+use core::ops;
+
+use super::{DInt, HInt, Int, MinInt};
+
+const U128_LO_MASK: u128 = u64::MAX as u128;
+
+/// A 256-bit unsigned integer represented as two 128-bit native-endian limbs.
+#[allow(non_camel_case_types)]
+#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
+pub struct u256 {
+    pub lo: u128,
+    pub hi: u128,
+}
+
+impl u256 {
+    #[cfg(any(test, feature = "unstable-public-internals"))]
+    pub const MAX: Self = Self {
+        lo: u128::MAX,
+        hi: u128::MAX,
+    };
+
+    /// Reinterpret as a signed integer
+    pub fn signed(self) -> i256 {
+        i256 {
+            lo: self.lo,
+            hi: self.hi,
+        }
+    }
+}
+
+/// A 256-bit signed integer represented as two 128-bit native-endian limbs.
+#[allow(non_camel_case_types)]
+#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
+pub struct i256 {
+    pub lo: u128,
+    pub hi: u128,
+}
+
+impl i256 {
+    /// Reinterpret as an unsigned integer
+    #[cfg(any(test, feature = "unstable-public-internals"))]
+    pub fn unsigned(self) -> u256 {
+        u256 {
+            lo: self.lo,
+            hi: self.hi,
+        }
+    }
+}
+
+impl MinInt for u256 {
+    type OtherSign = i256;
+
+    type Unsigned = u256;
+
+    const SIGNED: bool = false;
+    const BITS: u32 = 256;
+    const ZERO: Self = Self { lo: 0, hi: 0 };
+    const ONE: Self = Self { lo: 1, hi: 0 };
+    const MIN: Self = Self { lo: 0, hi: 0 };
+    const MAX: Self = Self {
+        lo: u128::MAX,
+        hi: u128::MAX,
+    };
+}
+
+impl MinInt for i256 {
+    type OtherSign = u256;
+
+    type Unsigned = u256;
+
+    const SIGNED: bool = true;
+    const BITS: u32 = 256;
+    const ZERO: Self = Self { lo: 0, hi: 0 };
+    const ONE: Self = Self { lo: 1, hi: 0 };
+    const MIN: Self = Self {
+        lo: 0,
+        hi: 1 << 127,
+    };
+    const MAX: Self = Self {
+        lo: u128::MAX,
+        hi: u128::MAX >> 1,
+    };
+}
+
+macro_rules! impl_common {
+    ($ty:ty) => {
+        impl ops::BitOr for $ty {
+            type Output = Self;
+
+            fn bitor(mut self, rhs: Self) -> Self::Output {
+                self.lo |= rhs.lo;
+                self.hi |= rhs.hi;
+                self
+            }
+        }
+
+        impl ops::Not for $ty {
+            type Output = Self;
+
+            fn not(mut self) -> Self::Output {
+                self.lo = !self.lo;
+                self.hi = !self.hi;
+                self
+            }
+        }
+
+        impl ops::Shl<u32> for $ty {
+            type Output = Self;
+
+            fn shl(self, _rhs: u32) -> Self::Output {
+                unimplemented!("only used to meet trait bounds")
+            }
+        }
+    };
+}
+
+impl_common!(i256);
+impl_common!(u256);
+
+impl ops::Add<Self> for u256 {
+    type Output = Self;
+
+    fn add(self, rhs: Self) -> Self::Output {
+        let (lo, carry) = self.lo.overflowing_add(rhs.lo);
+        let hi = self.hi.wrapping_add(carry as u128).wrapping_add(rhs.hi);
+
+        Self { lo, hi }
+    }
+}
+
+impl ops::Shr<u32> for u256 {
+    type Output = Self;
+
+    fn shr(mut self, rhs: u32) -> Self::Output {
+        debug_assert!(rhs < Self::BITS, "attempted to shift right with overflow");
+        if rhs >= Self::BITS {
+            return Self::ZERO;
+        }
+
+        if rhs == 0 {
+            return self;
+        }
+
+        if rhs < 128 {
+            self.lo >>= rhs;
+            self.lo |= self.hi << (128 - rhs);
+        } else {
+            self.lo = self.hi >> (rhs - 128);
+        }
+
+        if rhs < 128 {
+            self.hi >>= rhs;
+        } else {
+            self.hi = 0;
+        }
+
+        self
+    }
+}
+
+impl HInt for u128 {
+    type D = u256;
+
+    fn widen(self) -> Self::D {
+        u256 { lo: self, hi: 0 }
+    }
+
+    fn zero_widen(self) -> Self::D {
+        self.widen()
+    }
+
+    fn zero_widen_mul(self, rhs: Self) -> Self::D {
+        let l0 = self & U128_LO_MASK;
+        let l1 = rhs & U128_LO_MASK;
+        let h0 = self >> 64;
+        let h1 = rhs >> 64;
+
+        let p_ll: u128 = l0.overflowing_mul(l1).0;
+        let p_lh: u128 = l0.overflowing_mul(h1).0;
+        let p_hl: u128 = h0.overflowing_mul(l1).0;
+        let p_hh: u128 = h0.overflowing_mul(h1).0;
+
+        let s0 = p_hl + (p_ll >> 64);
+        let s1 = (p_ll & U128_LO_MASK) + (s0 << 64);
+        let s2 = p_lh + (s1 >> 64);
+
+        let lo = (p_ll & U128_LO_MASK) + (s2 << 64);
+        let hi = p_hh + (s0 >> 64) + (s2 >> 64);
+
+        u256 { lo, hi }
+    }
+
+    fn widen_mul(self, rhs: Self) -> Self::D {
+        self.zero_widen_mul(rhs)
+    }
+
+    fn widen_hi(self) -> Self::D {
+        self.widen() << <Self as MinInt>::BITS
+    }
+}
+
+impl HInt for i128 {
+    type D = i256;
+
+    fn widen(self) -> Self::D {
+        let mut ret = self.unsigned().zero_widen().signed();
+        if self.is_negative() {
+            ret.hi = u128::MAX;
+        }
+        ret
+    }
+
+    fn zero_widen(self) -> Self::D {
+        self.unsigned().zero_widen().signed()
+    }
+
+    fn zero_widen_mul(self, rhs: Self) -> Self::D {
+        self.unsigned().zero_widen_mul(rhs.unsigned()).signed()
+    }
+
+    fn widen_mul(self, _rhs: Self) -> Self::D {
+        unimplemented!("signed i128 widening multiply is not used")
+    }
+
+    fn widen_hi(self) -> Self::D {
+        self.widen() << <Self as MinInt>::BITS
+    }
+}
+
+impl DInt for u256 {
+    type H = u128;
+
+    fn lo(self) -> Self::H {
+        self.lo
+    }
+
+    fn hi(self) -> Self::H {
+        self.hi
+    }
+}
+
+impl DInt for i256 {
+    type H = i128;
+
+    fn lo(self) -> Self::H {
+        self.lo as i128
+    }
+
+    fn hi(self) -> Self::H {
+        self.hi as i128
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/support/big/tests.rs b/library/compiler-builtins/libm/src/math/support/big/tests.rs
new file mode 100644
index 00000000000..d2010f0216e
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/support/big/tests.rs
@@ -0,0 +1,277 @@
+extern crate std;
+use std::string::String;
+use std::{eprintln, format};
+
+use super::{HInt, MinInt, i256, u256};
+
+const LOHI_SPLIT: u128 = 0xaaaaaaaaaaaaaaaaffffffffffffffff;
+
+/// Print a `u256` as hex since we can't add format implementations
+fn hexu(v: u256) -> String {
+    format!("0x{:032x}{:032x}", v.hi, v.lo)
+}
+
+#[test]
+fn widen_u128() {
+    assert_eq!(
+        u128::MAX.widen(),
+        u256 {
+            lo: u128::MAX,
+            hi: 0
+        }
+    );
+    assert_eq!(
+        LOHI_SPLIT.widen(),
+        u256 {
+            lo: LOHI_SPLIT,
+            hi: 0
+        }
+    );
+}
+
+#[test]
+fn widen_i128() {
+    assert_eq!((-1i128).widen(), u256::MAX.signed());
+    assert_eq!(
+        (LOHI_SPLIT as i128).widen(),
+        i256 {
+            lo: LOHI_SPLIT,
+            hi: u128::MAX
+        }
+    );
+    assert_eq!((-1i128).zero_widen().unsigned(), (u128::MAX).widen());
+}
+
+#[test]
+fn widen_mul_u128() {
+    let tests = [
+        (
+            u128::MAX / 2,
+            2_u128,
+            u256 {
+                lo: u128::MAX - 1,
+                hi: 0,
+            },
+        ),
+        (
+            u128::MAX,
+            2_u128,
+            u256 {
+                lo: u128::MAX - 1,
+                hi: 1,
+            },
+        ),
+        (
+            u128::MAX,
+            u128::MAX,
+            u256 {
+                lo: 1,
+                hi: u128::MAX - 1,
+            },
+        ),
+        (0, 0, u256::ZERO),
+        (1234u128, 0, u256::ZERO),
+        (0, 1234, u256::ZERO),
+    ];
+
+    let mut has_errors = false;
+    let mut add_error = |i, a, b, expected, actual| {
+        has_errors = true;
+        eprintln!(
+            "\
+            FAILURE ({i}): {a:#034x} * {b:#034x}\n\
+            expected: {}\n\
+            got:      {}\
+            ",
+            hexu(expected),
+            hexu(actual)
+        );
+    };
+
+    for (i, (a, b, exp)) in tests.iter().copied().enumerate() {
+        let res = a.widen_mul(b);
+        let res_z = a.zero_widen_mul(b);
+        assert_eq!(res, res_z);
+        if res != exp {
+            add_error(i, a, b, exp, res);
+        }
+    }
+
+    assert!(!has_errors);
+}
+
+#[test]
+fn not_u256() {
+    assert_eq!(!u256::ZERO, u256::MAX);
+}
+
+#[test]
+fn shr_u256() {
+    let only_low = [
+        1,
+        u16::MAX.into(),
+        u32::MAX.into(),
+        u64::MAX.into(),
+        u128::MAX,
+    ];
+    let mut has_errors = false;
+
+    let mut add_error = |a, b, expected, actual| {
+        has_errors = true;
+        eprintln!(
+            "\
+            FAILURE:  {} >> {b}\n\
+            expected: {}\n\
+            actual:   {}\
+            ",
+            hexu(a),
+            hexu(expected),
+            hexu(actual),
+        );
+    };
+
+    for a in only_low {
+        for perturb in 0..10 {
+            let a = a.saturating_add(perturb);
+            for shift in 0..128 {
+                let res = a.widen() >> shift;
+                let expected = (a >> shift).widen();
+                if res != expected {
+                    add_error(a.widen(), shift, expected, res);
+                }
+            }
+        }
+    }
+
+    let check = [
+        (
+            u256::MAX,
+            1,
+            u256 {
+                lo: u128::MAX,
+                hi: u128::MAX >> 1,
+            },
+        ),
+        (
+            u256::MAX,
+            5,
+            u256 {
+                lo: u128::MAX,
+                hi: u128::MAX >> 5,
+            },
+        ),
+        (
+            u256::MAX,
+            63,
+            u256 {
+                lo: u128::MAX,
+                hi: u64::MAX as u128 | (1 << 64),
+            },
+        ),
+        (
+            u256::MAX,
+            64,
+            u256 {
+                lo: u128::MAX,
+                hi: u64::MAX as u128,
+            },
+        ),
+        (
+            u256::MAX,
+            65,
+            u256 {
+                lo: u128::MAX,
+                hi: (u64::MAX >> 1) as u128,
+            },
+        ),
+        (
+            u256::MAX,
+            127,
+            u256 {
+                lo: u128::MAX,
+                hi: 1,
+            },
+        ),
+        (
+            u256::MAX,
+            128,
+            u256 {
+                lo: u128::MAX,
+                hi: 0,
+            },
+        ),
+        (
+            u256::MAX,
+            129,
+            u256 {
+                lo: u128::MAX >> 1,
+                hi: 0,
+            },
+        ),
+        (
+            u256::MAX,
+            191,
+            u256 {
+                lo: u64::MAX as u128 | 1 << 64,
+                hi: 0,
+            },
+        ),
+        (
+            u256::MAX,
+            192,
+            u256 {
+                lo: u64::MAX as u128,
+                hi: 0,
+            },
+        ),
+        (
+            u256::MAX,
+            193,
+            u256 {
+                lo: u64::MAX as u128 >> 1,
+                hi: 0,
+            },
+        ),
+        (u256::MAX, 254, u256 { lo: 0b11, hi: 0 }),
+        (u256::MAX, 255, u256 { lo: 1, hi: 0 }),
+        (
+            u256 {
+                hi: LOHI_SPLIT,
+                lo: 0,
+            },
+            64,
+            u256 {
+                lo: 0xffffffffffffffff0000000000000000,
+                hi: 0xaaaaaaaaaaaaaaaa,
+            },
+        ),
+    ];
+
+    for (input, shift, expected) in check {
+        let res = input >> shift;
+        if res != expected {
+            add_error(input, shift, expected, res);
+        }
+    }
+
+    assert!(!has_errors);
+}
+
+#[test]
+#[should_panic]
+#[cfg(debug_assertions)]
+// FIXME(ppc): ppc64le seems to have issues with `should_panic` tests.
+#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
+fn shr_u256_overflow() {
+    // Like regular shr, panic on overflow with debug assertions
+    let _ = u256::MAX >> 256;
+}
+
+#[test]
+#[cfg(not(debug_assertions))]
+fn shr_u256_overflow() {
+    // No panic without debug assertions
+    assert_eq!(u256::MAX >> 256, u256::ZERO);
+    assert_eq!(u256::MAX >> 257, u256::ZERO);
+    assert_eq!(u256::MAX >> u32::MAX, u256::ZERO);
+}
diff --git a/library/compiler-builtins/libm/src/math/support/env.rs b/library/compiler-builtins/libm/src/math/support/env.rs
new file mode 100644
index 00000000000..53ae32f658d
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/support/env.rs
@@ -0,0 +1,130 @@
+//! Support for rounding directions and status flags as specified by IEEE 754.
+//!
+//! Rust does not support the floating point environment so rounding mode is passed as an argument
+//! and status flags are returned as part of the result. There is currently not much support for
+//! this; most existing ports from musl use a form of `force_eval!` to raise exceptions, but this
+//! has no side effects in Rust. Further, correct behavior relies on elementary operations making
+//! use of the correct rounding and raising relevant exceptions, which is not the case for Rust.
+//!
+//! This module exists so no functionality is lost when porting algorithms that respect floating
+//! point environment, and so that some functionality may be tested (that which does not rely on
+//! side effects from elementary operations). Full support would require wrappers around basic
+//! operations, but there is no plan to add this at the current time.
+
+/// A value combined with a floating point status.
+pub struct FpResult<T> {
+    pub val: T,
+    #[cfg_attr(not(feature = "unstable-public-internals"), allow(dead_code))]
+    pub status: Status,
+}
+
+impl<T> FpResult<T> {
+    pub fn new(val: T, status: Status) -> Self {
+        Self { val, status }
+    }
+
+    /// Return `val` with `Status::OK`.
+    pub fn ok(val: T) -> Self {
+        Self {
+            val,
+            status: Status::OK,
+        }
+    }
+}
+
+/// IEEE 754 rounding mode, excluding the optional `roundTiesToAway` version of nearest.
+///
+/// Integer representation comes from what CORE-MATH uses for indexing.
+#[cfg_attr(not(feature = "unstable-public-internals"), allow(dead_code))]
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub enum Round {
+    /// IEEE 754 nearest, `roundTiesToEven`.
+    Nearest = 0,
+    /// IEEE 754 `roundTowardNegative`.
+    Negative = 1,
+    /// IEEE 754 `roundTowardPositive`.
+    Positive = 2,
+    /// IEEE 754 `roundTowardZero`.
+    Zero = 3,
+}
+
+/// IEEE 754 exception status flags.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub struct Status(u8);
+
+impl Status {
+    /// Default status indicating no errors.
+    pub const OK: Self = Self(0);
+
+    /// No definable result.
+    ///
+    /// Includes:
+    /// - Any ops on sNaN, with a few exceptions.
+    /// - `0 * inf`, `inf * 0`.
+    /// - `fma(0, inf, c)` or `fma(inf, 0, c)`, possibly excluding `c = qNaN`.
+    /// - `+inf + -inf` and similar (includes subtraction and fma).
+    /// - `0.0 / 0.0`, `inf / inf`
+    /// - `remainder(x, y)` if `y == 0.0` or `x == inf`, and neither is NaN.
+    /// - `sqrt(x)` with `x < 0.0`.
+    pub const INVALID: Self = Self(1);
+
+    /// Division by zero.
+    ///
+    /// Raised by `x / y` when `x != 0.0` and `y == 0.0`. The default result for division is
+    /// +/-inf based on operand signs; for `logB`, the default result is -inf.
+    #[cfg_attr(not(feature = "unstable-public-internals"), allow(dead_code))]
+    pub const DIVIDE_BY_ZERO: Self = Self(1 << 2);
+
+    /// The result exceeds the maximum finite value.
+    ///
+    /// The default result depends on rounding mode. `Nearest*` rounds to +/- infinity, sign based
+    /// on the intermediate result. `Zero` rounds to the signed maximum finite. `Positive` and
+    /// `Negative` round to signed maximum finite in one direction, signed infinity in the other.
+    #[cfg_attr(not(feature = "unstable-public-internals"), allow(dead_code))]
+    pub const OVERFLOW: Self = Self(1 << 3);
+
+    /// The result is subnormal and lost precision.
+    pub const UNDERFLOW: Self = Self(1 << 4);
+
+    /// The finite-precision result does not match that of infinite precision, and the reason
+    /// is not represented by one of the other flags.
+    pub const INEXACT: Self = Self(1 << 5);
+
+    /// True if `UNDERFLOW` is set.
+    #[cfg_attr(not(feature = "unstable-public-internals"), allow(dead_code))]
+    pub const fn underflow(self) -> bool {
+        self.0 & Self::UNDERFLOW.0 != 0
+    }
+
+    /// True if `OVERFLOW` is set.
+    #[cfg_attr(not(feature = "unstable-public-internals"), allow(dead_code))]
+    pub const fn overflow(self) -> bool {
+        self.0 & Self::OVERFLOW.0 != 0
+    }
+
+    pub fn set_underflow(&mut self, val: bool) {
+        self.set_flag(val, Self::UNDERFLOW);
+    }
+
+    /// True if `INEXACT` is set.
+    pub const fn inexact(self) -> bool {
+        self.0 & Self::INEXACT.0 != 0
+    }
+
+    pub fn set_inexact(&mut self, val: bool) {
+        self.set_flag(val, Self::INEXACT);
+    }
+
+    fn set_flag(&mut self, val: bool, mask: Self) {
+        if val {
+            self.0 |= mask.0;
+        } else {
+            self.0 &= !mask.0;
+        }
+    }
+
+    pub(crate) const fn with(self, rhs: Self) -> Self {
+        Self(self.0 | rhs.0)
+    }
+}
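+
+// A small usage sketch: status flags accumulate via `with` and travel with a value in
+// `FpResult`. Only the API defined above is exercised here.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn status_flags_combine() {
+        let mut status = Status::OK;
+        status = status.with(Status::INEXACT);
+        status = status.with(Status::UNDERFLOW);
+        assert!(status.inexact());
+        assert!(status.underflow());
+        assert!(!status.overflow());
+
+        // Flags can also be cleared individually.
+        status.set_underflow(false);
+        assert!(!status.underflow());
+
+        // A value travels together with its status in `FpResult`.
+        let res = FpResult::new(1.0f64, status);
+        assert!(res.status.inexact());
+        assert_eq!(res.val, 1.0);
+    }
+}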
diff --git a/library/compiler-builtins/libm/src/math/support/feature_detect.rs b/library/compiler-builtins/libm/src/math/support/feature_detect.rs
new file mode 100644
index 00000000000..9ebd434a5f8
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/support/feature_detect.rs
@@ -0,0 +1,211 @@
+//! Helpers for runtime target feature detection that are shared across architectures.
+
+// `AtomicU32` is preferred for a consistent size across targets.
+#[cfg(all(target_has_atomic = "ptr", not(target_has_atomic = "32")))]
+compile_error!("currently all targets that support `AtomicPtr` also support `AtomicU32`");
+
+use core::sync::atomic::{AtomicU32, Ordering};
+
+/// Given a list of identifiers, assign each one a unique sequential single-bit mask.
+#[allow(unused_macros)]
+macro_rules! unique_masks {
+    ($ty:ty, $($name:ident,)+) => {
+        #[cfg(test)]
+        pub const ALL: &[$ty] = &[$($name),+];
+        #[cfg(test)]
+        pub const NAMES: &[&str] = &[$(stringify!($name)),+];
+
+        unique_masks!(@one; $ty; 0; $($name,)+);
+    };
+    // Matcher for a single value
+    (@one; $_ty:ty; $_idx:expr;) => {};
+    (@one; $ty:ty; $shift:expr; $name:ident, $($tail:tt)*) => {
+        pub const $name: $ty = 1 << $shift;
+        // Ensure the top bit is not used since it stores initialized state.
+        const _: () = assert!($name != (1 << (<$ty>::BITS - 1)));
+        // Increment the shift and invoke the next
+        unique_masks!(@one; $ty; $shift + 1; $($tail)*);
+    };
+}
+
+/// Call `init` once to choose an implementation, then use it for the rest of the program.
+///
+/// - `sig` is the function type.
+/// - `init` is an expression called at startup that chooses an implementation and returns a
+///   function pointer.
+/// - `call` is an expression to call a function returned by `init`, encapsulating any safety
+///   preconditions.
+///
+/// The type `Func` is available in `init` and `call`.
+///
+/// This is effectively our version of an ifunc without linker support. Note that `init` may be
+/// called more than once until one completes.
+#[allow(unused_macros)] // only used on some architectures
+macro_rules! select_once {
+    (
+        sig: fn($($arg:ident: $ArgTy:ty),*) -> $RetTy:ty,
+        init: $init:expr,
+        call: $call:expr,
+    ) => {{
+        use core::mem;
+        use core::sync::atomic::{AtomicPtr, Ordering};
+
+        type Func = unsafe fn($($arg: $ArgTy),*) -> $RetTy;
+
+        /// Stores a pointer that is immediately jumped to. By default it is an init function
+        /// that sets FUNC to something else.
+        static FUNC: AtomicPtr<()> = AtomicPtr::new((initializer as Func) as *mut ());
+
+        /// Run once to set the function that will be used for all subsequent calls.
+        fn initializer($($arg: $ArgTy),*) -> $RetTy {
+            // Select an implementation, ensuring a 'static lifetime.
+            let fn_ptr: Func = $init();
+            FUNC.store(fn_ptr as *mut (), Ordering::Relaxed);
+
+            // Forward the call to the selected function.
+            $call(fn_ptr)
+        }
+
+        let raw: *mut () = FUNC.load(Ordering::Relaxed);
+
+        // SAFETY: will only ever be `initializer` or another function pointer that has the
+        // 'static lifetime.
+        let fn_ptr: Func = unsafe { mem::transmute::<*mut (), Func>(raw) };
+
+        $call(fn_ptr)
+    }}
+}
+
+#[allow(unused_imports)]
+pub(crate) use {select_once, unique_masks};
+
+use crate::support::cold_path;
+
+/// Helper for working with bit flags, based on `bitflags`.
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct Flags(u32);
+
+#[allow(dead_code)] // only used on some architectures
+impl Flags {
+    /// No bits set.
+    pub const fn empty() -> Self {
+        Self(0)
+    }
+
+    /// Create with bits already set.
+    pub const fn from_bits(val: u32) -> Self {
+        Self(val)
+    }
+
+    /// Get the integer representation.
+    pub fn bits(&self) -> u32 {
+        self.0
+    }
+
+    /// Set any bits in `mask`.
+    pub fn insert(&mut self, mask: u32) {
+        self.0 |= mask;
+    }
+
+    /// Check whether the mask is set.
+    pub fn contains(&self, mask: u32) -> bool {
+        self.0 & mask == mask
+    }
+
+    /// Check whether the nth bit is set.
+    pub fn test_nth(&self, bit: u32) -> bool {
+        debug_assert!(bit < u32::BITS, "bit index out-of-bounds");
+        self.0 & (1 << bit) != 0
+    }
+}
+
+/// Load flags from an atomic value. If the flags have not yet been initialized, call `init`
+/// to do so.
+///
+/// Note that `init` may run more than once.
+#[allow(dead_code)] // only used on some architectures
+pub fn get_or_init_flags_cache(cache: &AtomicU32, init: impl FnOnce() -> Flags) -> Flags {
+    // The top bit is used to indicate that the values have already been set once.
+    const INITIALIZED: u32 = 1 << 31;
+
+    // Relaxed ops are sufficient since the result should always be the same.
+    let mut flags = Flags::from_bits(cache.load(Ordering::Relaxed));
+
+    if !flags.contains(INITIALIZED) {
+        // Without this, `init` is inlined and the bit check gets wrapped in `init`'s lengthy
+        // prologue/epilogue. Cold pathing gives a preferable load->test->?jmp->ret.
+        cold_path();
+
+        flags = init();
+        debug_assert!(
+            !flags.contains(INITIALIZED),
+            "initialized bit shouldn't be set"
+        );
+        flags.insert(INITIALIZED);
+        cache.store(flags.bits(), Ordering::Relaxed);
+    }
+
+    flags
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn unique_masks() {
+        unique_masks! {
+            u32,
+            V0,
+            V1,
+            V2,
+        }
+        assert_eq!(V0, 1u32 << 0);
+        assert_eq!(V1, 1u32 << 1);
+        assert_eq!(V2, 1u32 << 2);
+        assert_eq!(ALL, [V0, V1, V2]);
+        assert_eq!(NAMES, ["V0", "V1", "V2"]);
+    }
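+
+    // Sketch of the intended per-architecture usage: declare masks, detect once, then query the
+    // cached flags. `HAS_FAST_IMPL` and the `detect` body are placeholders, not a real detector.
+    #[test]
+    fn flags_usage_sketch() {
+        unique_masks! {
+            u32,
+            HAS_FAST_IMPL,
+        }
+
+        static CACHE: AtomicU32 = AtomicU32::new(0);
+
+        fn detect() -> Flags {
+            // A real implementation would query the CPU here (e.g. CPUID or HWCAP).
+            let mut flags = Flags::empty();
+            flags.insert(HAS_FAST_IMPL);
+            flags
+        }
+
+        let flags = get_or_init_flags_cache(&CACHE, detect);
+        assert!(flags.contains(HAS_FAST_IMPL));
+        assert!(flags.test_nth(0));
+        assert_eq!(ALL, [HAS_FAST_IMPL]);
+        assert_eq!(NAMES, ["HAS_FAST_IMPL"]);
+    }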
+
+    #[test]
+    fn flag_cache_is_used() {
+        // Sanity check that flags are only ever set once
+        static CACHE: AtomicU32 = AtomicU32::new(0);
+
+        let mut f1 = Flags::from_bits(0x1);
+        let f2 = Flags::from_bits(0x2);
+
+        let r1 = get_or_init_flags_cache(&CACHE, || f1);
+        let r2 = get_or_init_flags_cache(&CACHE, || f2);
+
+        f1.insert(1 << 31); // init bit
+
+        assert_eq!(r1, f1);
+        assert_eq!(r2, f1);
+    }
+
+    #[test]
+    fn select_cache_is_used() {
+        // Sanity check that cache is used
+        static CALLED: AtomicU32 = AtomicU32::new(0);
+
+        fn inner() {
+            fn nop() {}
+
+            select_once! {
+                sig: fn() -> (),
+                init: || {
+                    CALLED.fetch_add(1, Ordering::Relaxed);
+                    nop
+                },
+                call: |fn_ptr: Func| unsafe { fn_ptr() },
+            }
+        }
+
+        // `init` should only have been called once.
+        inner();
+        assert_eq!(CALLED.load(Ordering::Relaxed), 1);
+        inner();
+        assert_eq!(CALLED.load(Ordering::Relaxed), 1);
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/support/float_traits.rs b/library/compiler-builtins/libm/src/math/support/float_traits.rs
new file mode 100644
index 00000000000..4c866ef10bd
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/support/float_traits.rs
@@ -0,0 +1,551 @@
+#![allow(unknown_lints)] // FIXME(msrv) we shouldn't need this
+
+use core::{fmt, mem, ops};
+
+use super::int_traits::{CastFrom, Int, MinInt};
+
+/// Trait for some basic operations on floats
+// #[allow(dead_code)]
+pub trait Float:
+    Copy
+    + fmt::Debug
+    + PartialEq
+    + PartialOrd
+    + ops::AddAssign
+    + ops::MulAssign
+    + ops::Add<Output = Self>
+    + ops::Sub<Output = Self>
+    + ops::Mul<Output = Self>
+    + ops::Div<Output = Self>
+    + ops::Rem<Output = Self>
+    + ops::Neg<Output = Self>
+    + 'static
+{
+    /// A uint of the same width as the float
+    type Int: Int<OtherSign = Self::SignedInt, Unsigned = Self::Int>;
+
+    /// An int of the same width as the float
+    type SignedInt: Int
+        + MinInt<OtherSign = Self::Int, Unsigned = Self::Int>
+        + ops::Neg<Output = Self::SignedInt>;
+
+    const ZERO: Self;
+    const NEG_ZERO: Self;
+    const ONE: Self;
+    const NEG_ONE: Self;
+    const INFINITY: Self;
+    const NEG_INFINITY: Self;
+    const NAN: Self;
+    const NEG_NAN: Self;
+    const MAX: Self;
+    const MIN: Self;
+    const EPSILON: Self;
+    const PI: Self;
+    const NEG_PI: Self;
+    const FRAC_PI_2: Self;
+
+    const MIN_POSITIVE_NORMAL: Self;
+
+    /// The bitwidth of the float type
+    const BITS: u32;
+
+    /// The bitwidth of the significand
+    const SIG_BITS: u32;
+
+    /// The bitwidth of the exponent
+    const EXP_BITS: u32 = Self::BITS - Self::SIG_BITS - 1;
+
+    /// The saturated (maximum bitpattern) value of the exponent, i.e. the infinite
+    /// representation.
+    ///
+    /// This value is fully right-shifted; use `EXP_MASK` for the value shifted into place.
+    const EXP_SAT: u32 = (1 << Self::EXP_BITS) - 1;
+
+    /// The exponent bias value
+    const EXP_BIAS: u32 = Self::EXP_SAT >> 1;
+
+    /// Maximum unbiased exponent value.
+    const EXP_MAX: i32 = Self::EXP_BIAS as i32;
+
+    /// Minimum *NORMAL* unbiased exponent value.
+    const EXP_MIN: i32 = -(Self::EXP_MAX - 1);
+
+    /// Minimum subnormal exponent value.
+    const EXP_MIN_SUBNORM: i32 = Self::EXP_MIN - Self::SIG_BITS as i32;
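+
+    // For example, for `f32` these work out to `EXP_BITS = 8`, `EXP_SAT = 255`,
+    // `EXP_BIAS = 127`, `EXP_MAX = 127`, `EXP_MIN = -126`, and `EXP_MIN_SUBNORM = -149`.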
+
+    /// A mask for the sign bit
+    const SIGN_MASK: Self::Int;
+
+    /// A mask for the significand
+    const SIG_MASK: Self::Int;
+
+    /// A mask for the exponent
+    const EXP_MASK: Self::Int;
+
+    /// The implicit bit of the float format
+    const IMPLICIT_BIT: Self::Int;
+
+    /// Returns `self` transmuted to `Self::Int`
+    fn to_bits(self) -> Self::Int;
+
+    /// Returns `self` transmuted to `Self::SignedInt`
+    #[allow(dead_code)]
+    fn to_bits_signed(self) -> Self::SignedInt {
+        self.to_bits().signed()
+    }
+
+    /// Check bitwise equality.
+    #[allow(dead_code)]
+    fn biteq(self, rhs: Self) -> bool {
+        self.to_bits() == rhs.to_bits()
+    }
+
+    /// Checks if two floats have the same bit representation. *Except* for NaNs! NaN can be
+    /// represented in multiple different ways.
+    ///
+    /// This method returns `true` if two NaNs are compared. Use [`biteq`](Self::biteq) instead
+    /// if `NaN` should not be treated separately.
+    #[allow(dead_code)]
+    fn eq_repr(self, rhs: Self) -> bool {
+        if self.is_nan() && rhs.is_nan() {
+            true
+        } else {
+            self.biteq(rhs)
+        }
+    }
+
+    /// Returns true if the value is NaN.
+    fn is_nan(self) -> bool;
+
+    /// Returns true if the value is +inf or -inf.
+    fn is_infinite(self) -> bool;
+
+    /// Returns true if the sign is negative. Extracts the sign bit regardless of zero or NaN.
+    fn is_sign_negative(self) -> bool;
+
+    /// Returns true if the sign is positive. Extracts the sign bit regardless of zero or NaN.
+    fn is_sign_positive(self) -> bool {
+        !self.is_sign_negative()
+    }
+
+    /// Returns if `self` is subnormal.
+    #[allow(dead_code)]
+    fn is_subnormal(self) -> bool {
+        (self.to_bits() & Self::EXP_MASK) == Self::Int::ZERO
+    }
+
+    /// Returns the exponent, not adjusting for bias, not accounting for subnormals or zero.
+    fn ex(self) -> u32 {
+        u32::cast_from(self.to_bits() >> Self::SIG_BITS) & Self::EXP_SAT
+    }
+
+    /// Extract the exponent and adjust it for bias, not accounting for subnormals or zero.
+    fn exp_unbiased(self) -> i32 {
+        self.ex().signed() - (Self::EXP_BIAS as i32)
+    }
+
+    /// Returns the significand with no implicit bit (or the "fractional" part)
+    #[allow(dead_code)]
+    fn frac(self) -> Self::Int {
+        self.to_bits() & Self::SIG_MASK
+    }
+
+    /// Returns a `Self::Int` transmuted back to `Self`
+    fn from_bits(a: Self::Int) -> Self;
+
+    /// Constructs a `Self` from its parts. Inputs are treated as bits and shifted into position.
+    fn from_parts(negative: bool, exponent: u32, significand: Self::Int) -> Self {
+        let sign = if negative {
+            Self::Int::ONE
+        } else {
+            Self::Int::ZERO
+        };
+        Self::from_bits(
+            (sign << (Self::BITS - 1))
+                | (Self::Int::cast_from(exponent & Self::EXP_SAT) << Self::SIG_BITS)
+                | (significand & Self::SIG_MASK),
+        )
+    }
+
+    #[allow(dead_code)]
+    fn abs(self) -> Self;
+
+    /// Returns a number composed of the magnitude of `self` and the sign of `other`.
+    fn copysign(self, other: Self) -> Self;
+
+    /// Fused multiply add, rounding once.
+    fn fma(self, y: Self, z: Self) -> Self;
+
+    /// Returns (normalized exponent, normalized significand)
+    #[allow(dead_code)]
+    fn normalize(significand: Self::Int) -> (i32, Self::Int);
+
+    /// Returns a number that represents the sign of self.
+    #[allow(dead_code)]
+    fn signum(self) -> Self {
+        if self.is_nan() {
+            self
+        } else {
+            Self::ONE.copysign(self)
+        }
+    }
+}
+
+/// Access the associated `Int` type from a float (helper to avoid ambiguous associated types).
+pub type IntTy<F> = <F as Float>::Int;
+
+macro_rules! float_impl {
+    (
+        $ty:ident,
+        $ity:ident,
+        $sity:ident,
+        $bits:expr,
+        $significand_bits:expr,
+        $from_bits:path,
+        $to_bits:path,
+        $fma_fn:ident,
+        $fma_intrinsic:ident
+    ) => {
+        impl Float for $ty {
+            type Int = $ity;
+            type SignedInt = $sity;
+
+            const ZERO: Self = 0.0;
+            const NEG_ZERO: Self = -0.0;
+            const ONE: Self = 1.0;
+            const NEG_ONE: Self = -1.0;
+            const INFINITY: Self = Self::INFINITY;
+            const NEG_INFINITY: Self = Self::NEG_INFINITY;
+            const NAN: Self = Self::NAN;
+            // NAN isn't guaranteed to be positive but it usually is. We only use this for
+            // tests.
+            const NEG_NAN: Self = $from_bits($to_bits(Self::NAN) | Self::SIGN_MASK);
+            const MAX: Self = -Self::MIN;
+            // Sign bit set, saturated significand, saturated exponent with its lowest bit cleared
+            const MIN: Self = $from_bits(Self::Int::MAX & !(1 << Self::SIG_BITS));
+            const EPSILON: Self = <$ty>::EPSILON;
+
+            // Exponent is a 1 in the LSB
+            const MIN_POSITIVE_NORMAL: Self = $from_bits(1 << Self::SIG_BITS);
+
+            const PI: Self = core::$ty::consts::PI;
+            const NEG_PI: Self = -Self::PI;
+            const FRAC_PI_2: Self = core::$ty::consts::FRAC_PI_2;
+
+            const BITS: u32 = $bits;
+            const SIG_BITS: u32 = $significand_bits;
+
+            const SIGN_MASK: Self::Int = 1 << (Self::BITS - 1);
+            const SIG_MASK: Self::Int = (1 << Self::SIG_BITS) - 1;
+            const EXP_MASK: Self::Int = !(Self::SIGN_MASK | Self::SIG_MASK);
+            const IMPLICIT_BIT: Self::Int = 1 << Self::SIG_BITS;
+
+            fn to_bits(self) -> Self::Int {
+                self.to_bits()
+            }
+            fn is_nan(self) -> bool {
+                self.is_nan()
+            }
+            fn is_infinite(self) -> bool {
+                self.is_infinite()
+            }
+            fn is_sign_negative(self) -> bool {
+                self.is_sign_negative()
+            }
+            fn from_bits(a: Self::Int) -> Self {
+                Self::from_bits(a)
+            }
+            fn abs(self) -> Self {
+                cfg_if! {
+                    // FIXME(msrv): `abs` is available in `core` starting with 1.85.
+                    if #[cfg(intrinsics_enabled)] {
+                        self.abs()
+                    } else {
+                        super::super::generic::fabs(self)
+                    }
+                }
+            }
+            fn copysign(self, other: Self) -> Self {
+                cfg_if! {
+                    // FIXME(msrv): `copysign` is available in `core` starting with 1.85.
+                    if #[cfg(intrinsics_enabled)] {
+                        self.copysign(other)
+                    } else {
+                        super::super::generic::copysign(self, other)
+                    }
+                }
+            }
+            fn fma(self, y: Self, z: Self) -> Self {
+                cfg_if! {
+                    // fma is not yet available in `core`
+                    if #[cfg(intrinsics_enabled)] {
+                        unsafe{ core::intrinsics::$fma_intrinsic(self, y, z) }
+                    } else {
+                        super::super::$fma_fn(self, y, z)
+                    }
+                }
+            }
+            fn normalize(significand: Self::Int) -> (i32, Self::Int) {
+                let shift = significand.leading_zeros().wrapping_sub(Self::EXP_BITS);
+                (
+                    1i32.wrapping_sub(shift as i32),
+                    significand << shift as Self::Int,
+                )
+            }
+        }
+    };
+}
+
+#[cfg(f16_enabled)]
+float_impl!(
+    f16,
+    u16,
+    i16,
+    16,
+    10,
+    f16::from_bits,
+    f16::to_bits,
+    fmaf16,
+    fmaf16
+);
+float_impl!(
+    f32,
+    u32,
+    i32,
+    32,
+    23,
+    f32_from_bits,
+    f32_to_bits,
+    fmaf,
+    fmaf32
+);
+float_impl!(
+    f64,
+    u64,
+    i64,
+    64,
+    52,
+    f64_from_bits,
+    f64_to_bits,
+    fma,
+    fmaf64
+);
+#[cfg(f128_enabled)]
+float_impl!(
+    f128,
+    u128,
+    i128,
+    128,
+    112,
+    f128::from_bits,
+    f128::to_bits,
+    fmaf128,
+    fmaf128
+);
+
+/* FIXME(msrv): vendor some things that are not const stable at our MSRV */
+
+/// `f32::from_bits`
+#[allow(unnecessary_transmutes)] // lint appears in newer versions of Rust
+pub const fn f32_from_bits(bits: u32) -> f32 {
+    // SAFETY: POD cast with no preconditions
+    unsafe { mem::transmute::<u32, f32>(bits) }
+}
+
+/// `f32::to_bits`
+#[allow(unnecessary_transmutes)] // lint appears in newer versions of Rust
+pub const fn f32_to_bits(x: f32) -> u32 {
+    // SAFETY: POD cast with no preconditions
+    unsafe { mem::transmute::<f32, u32>(x) }
+}
+
+/// `f64::from_bits`
+#[allow(unnecessary_transmutes)] // lint appears in newer versions of Rust
+pub const fn f64_from_bits(bits: u64) -> f64 {
+    // SAFETY: POD cast with no preconditions
+    unsafe { mem::transmute::<u64, f64>(bits) }
+}
+
+/// `f64::to_bits`
+#[allow(unnecessary_transmutes)] // lint appears in newer versions of Rust
+pub const fn f64_to_bits(x: f64) -> u64 {
+    // SAFETY: POD cast with no preconditions
+    unsafe { mem::transmute::<f64, u64>(x) }
+}
+
+/// Trait for floats twice the bit width of another integer.
+pub trait DFloat: Float {
+    /// Float that is half the bit width of the float this trait is implemented for.
+    type H: HFloat<D = Self>;
+
+    /// Narrow the float type.
+    fn narrow(self) -> Self::H;
+}
+
+/// Trait for floats half the bit width of another float.
+pub trait HFloat: Float {
+    /// Float that is double the bit width of the float this trait is implemented for.
+    type D: DFloat<H = Self>;
+
+    /// Widen the float type.
+    fn widen(self) -> Self::D;
+}
+
+macro_rules! impl_d_float {
+    ($($X:ident $D:ident),*) => {
+        $(
+            impl DFloat for $D {
+                type H = $X;
+
+                fn narrow(self) -> Self::H {
+                    self as $X
+                }
+            }
+        )*
+    };
+}
+
+macro_rules! impl_h_float {
+    ($($H:ident $X:ident),*) => {
+        $(
+            impl HFloat for $H {
+                type D = $X;
+
+                fn widen(self) -> Self::D {
+                    self as $X
+                }
+            }
+        )*
+    };
+}
+
+impl_d_float!(f32 f64);
+#[cfg(f16_enabled)]
+impl_d_float!(f16 f32);
+#[cfg(f128_enabled)]
+impl_d_float!(f64 f128);
+
+impl_h_float!(f32 f64);
+#[cfg(f16_enabled)]
+impl_h_float!(f16 f32);
+#[cfg(f128_enabled)]
+impl_h_float!(f64 f128);
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    #[cfg(f16_enabled)]
+    fn check_f16() {
+        // Constants
+        assert_eq!(f16::EXP_SAT, 0b11111);
+        assert_eq!(f16::EXP_BIAS, 15);
+        assert_eq!(f16::EXP_MAX, 15);
+        assert_eq!(f16::EXP_MIN, -14);
+        assert_eq!(f16::EXP_MIN_SUBNORM, -24);
+
+        // `exp_unbiased`
+        assert_eq!(f16::FRAC_PI_2.exp_unbiased(), 0);
+        assert_eq!((1.0f16 / 2.0).exp_unbiased(), -1);
+        assert_eq!(f16::MAX.exp_unbiased(), 15);
+        assert_eq!(f16::MIN.exp_unbiased(), 15);
+        assert_eq!(f16::MIN_POSITIVE.exp_unbiased(), -14);
+        // This is a convenience method and not ldexp, `exp_unbiased` does not return correct
+        // results for zero and subnormals.
+        assert_eq!(f16::ZERO.exp_unbiased(), -15);
+        assert_eq!(f16::from_bits(0x1).exp_unbiased(), -15);
+        assert_eq!(f16::MIN_POSITIVE, f16::MIN_POSITIVE_NORMAL);
+
+        // `from_parts`
+        assert_biteq!(f16::from_parts(true, f16::EXP_BIAS, 0), -1.0f16);
+        assert_biteq!(f16::from_parts(false, 0, 1), f16::from_bits(0x1));
+    }
+
+    #[test]
+    fn check_f32() {
+        // Constants
+        assert_eq!(f32::EXP_SAT, 0b11111111);
+        assert_eq!(f32::EXP_BIAS, 127);
+        assert_eq!(f32::EXP_MAX, 127);
+        assert_eq!(f32::EXP_MIN, -126);
+        assert_eq!(f32::EXP_MIN_SUBNORM, -149);
+
+        // `exp_unbiased`
+        assert_eq!(f32::FRAC_PI_2.exp_unbiased(), 0);
+        assert_eq!((1.0f32 / 2.0).exp_unbiased(), -1);
+        assert_eq!(f32::MAX.exp_unbiased(), 127);
+        assert_eq!(f32::MIN.exp_unbiased(), 127);
+        assert_eq!(f32::MIN_POSITIVE.exp_unbiased(), -126);
+        // This is a convenience method and not ldexp, `exp_unbiased` does not return correct
+        // results for zero and subnormals.
+        assert_eq!(f32::ZERO.exp_unbiased(), -127);
+        assert_eq!(f32::from_bits(0x1).exp_unbiased(), -127);
+        assert_eq!(f32::MIN_POSITIVE, f32::MIN_POSITIVE_NORMAL);
+
+        // `from_parts`
+        assert_biteq!(f32::from_parts(true, f32::EXP_BIAS, 0), -1.0f32);
+        assert_biteq!(
+            f32::from_parts(false, 10 + f32::EXP_BIAS, 0),
+            hf32!("0x1p10")
+        );
+        assert_biteq!(f32::from_parts(false, 0, 1), f32::from_bits(0x1));
+    }
+
+    #[test]
+    fn check_f64() {
+        // Constants
+        assert_eq!(f64::EXP_SAT, 0b11111111111);
+        assert_eq!(f64::EXP_BIAS, 1023);
+        assert_eq!(f64::EXP_MAX, 1023);
+        assert_eq!(f64::EXP_MIN, -1022);
+        assert_eq!(f64::EXP_MIN_SUBNORM, -1074);
+
+        // `exp_unbiased`
+        assert_eq!(f64::FRAC_PI_2.exp_unbiased(), 0);
+        assert_eq!((1.0f64 / 2.0).exp_unbiased(), -1);
+        assert_eq!(f64::MAX.exp_unbiased(), 1023);
+        assert_eq!(f64::MIN.exp_unbiased(), 1023);
+        assert_eq!(f64::MIN_POSITIVE.exp_unbiased(), -1022);
+        // This is a convenience method and not ldexp, `exp_unbiased` does not return correct
+        // results for zero and subnormals.
+        assert_eq!(f64::ZERO.exp_unbiased(), -1023);
+        assert_eq!(f64::from_bits(0x1).exp_unbiased(), -1023);
+        assert_eq!(f64::MIN_POSITIVE, f64::MIN_POSITIVE_NORMAL);
+
+        // `from_parts`
+        assert_biteq!(f64::from_parts(true, f64::EXP_BIAS, 0), -1.0f64);
+        assert_biteq!(
+            f64::from_parts(false, 10 + f64::EXP_BIAS, 0),
+            hf64!("0x1p10")
+        );
+        assert_biteq!(f64::from_parts(false, 0, 1), f64::from_bits(0x1));
+    }
+
+    #[test]
+    #[cfg(f128_enabled)]
+    fn check_f128() {
+        // Constants
+        assert_eq!(f128::EXP_SAT, 0b111111111111111);
+        assert_eq!(f128::EXP_BIAS, 16383);
+        assert_eq!(f128::EXP_MAX, 16383);
+        assert_eq!(f128::EXP_MIN, -16382);
+        assert_eq!(f128::EXP_MIN_SUBNORM, -16494);
+
+        // `exp_unbiased`
+        assert_eq!(f128::FRAC_PI_2.exp_unbiased(), 0);
+        assert_eq!((1.0f128 / 2.0).exp_unbiased(), -1);
+        assert_eq!(f128::MAX.exp_unbiased(), 16383);
+        assert_eq!(f128::MIN.exp_unbiased(), 16383);
+        assert_eq!(f128::MIN_POSITIVE.exp_unbiased(), -16382);
+        // This is a convenience method and not ldexp, `exp_unbiased` does not return correct
+        // results for zero and subnormals.
+        assert_eq!(f128::ZERO.exp_unbiased(), -16383);
+        assert_eq!(f128::from_bits(0x1).exp_unbiased(), -16383);
+        assert_eq!(f128::MIN_POSITIVE, f128::MIN_POSITIVE_NORMAL);
+
+        // `from_parts`
+        assert_biteq!(f128::from_parts(true, f128::EXP_BIAS, 0), -1.0f128);
+        assert_biteq!(f128::from_parts(false, 0, 1), f128::from_bits(0x1));
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/support/hex_float.rs b/library/compiler-builtins/libm/src/math/support/hex_float.rs
new file mode 100644
index 00000000000..85569d98aef
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/support/hex_float.rs
@@ -0,0 +1,1181 @@
+//! Utilities for working with hex float formats.
+
+use core::fmt;
+
+use super::{Float, Round, Status, f32_from_bits, f64_from_bits};
+
+/// Construct a 16-bit float from hex float representation (C-style)
+#[cfg(f16_enabled)]
+pub const fn hf16(s: &str) -> f16 {
+    match parse_hex_exact(s, 16, 10) {
+        Ok(bits) => f16::from_bits(bits as u16),
+        Err(HexFloatParseError(s)) => panic!("{}", s),
+    }
+}
+
+/// Construct a 32-bit float from hex float representation (C-style)
+#[allow(unused)]
+pub const fn hf32(s: &str) -> f32 {
+    match parse_hex_exact(s, 32, 23) {
+        Ok(bits) => f32_from_bits(bits as u32),
+        Err(HexFloatParseError(s)) => panic!("{}", s),
+    }
+}
+
+/// Construct a 64-bit float from hex float representation (C-style)
+pub const fn hf64(s: &str) -> f64 {
+    match parse_hex_exact(s, 64, 52) {
+        Ok(bits) => f64_from_bits(bits as u64),
+        Err(HexFloatParseError(s)) => panic!("{}", s),
+    }
+}
+
+/// Construct a 128-bit float from hex float representation (C-style)
+#[cfg(f128_enabled)]
+pub const fn hf128(s: &str) -> f128 {
+    match parse_hex_exact(s, 128, 112) {
+        Ok(bits) => f128::from_bits(bits),
+        Err(HexFloatParseError(s)) => panic!("{}", s),
+    }
+}
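+
+// As a usage sketch, `hf32("0x1.99999ap-4")` gives the nearest `f32` to 0.1; a string that the
+// target width cannot represent exactly makes `parse_hex_exact` (and these constructors) panic.
+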
+#[derive(Copy, Clone, Debug)]
+pub struct HexFloatParseError(&'static str);
+
+/// Parses any float to its bitwise representation, returning an error if it cannot be represented exactly
+pub const fn parse_hex_exact(
+    s: &str,
+    bits: u32,
+    sig_bits: u32,
+) -> Result<u128, HexFloatParseError> {
+    match parse_any(s, bits, sig_bits, Round::Nearest) {
+        Err(e) => Err(e),
+        Ok((bits, Status::OK)) => Ok(bits),
+        Ok((_, status)) if status.overflow() => Err(HexFloatParseError("the value is too huge")),
+        Ok((_, status)) if status.underflow() => Err(HexFloatParseError("the value is too tiny")),
+        Ok((_, status)) if status.inexact() => Err(HexFloatParseError("the value is too precise")),
+        Ok(_) => unreachable!(),
+    }
+}
+
+/// Parse any float from hex to its bitwise representation.
+pub const fn parse_any(
+    s: &str,
+    bits: u32,
+    sig_bits: u32,
+    round: Round,
+) -> Result<(u128, Status), HexFloatParseError> {
+    let mut b = s.as_bytes();
+
+    if sig_bits > 119 || bits > 128 || bits < sig_bits + 3 || bits > sig_bits + 30 {
+        return Err(HexFloatParseError("unsupported target float configuration"));
+    }
+
+    let neg = matches!(b, [b'-', ..]);
+    if let &[b'-' | b'+', ref rest @ ..] = b {
+        b = rest;
+    }
+
+    let sign_bit = 1 << (bits - 1);
+    let quiet_bit = 1 << (sig_bits - 1);
+    let nan = sign_bit - quiet_bit;
+    let inf = nan - quiet_bit;
+
+    let (mut x, status) = match *b {
+        [b'i' | b'I', b'n' | b'N', b'f' | b'F'] => (inf, Status::OK),
+        [b'n' | b'N', b'a' | b'A', b'n' | b'N'] => (nan, Status::OK),
+        [b'0', b'x' | b'X', ref rest @ ..] => {
+            let round = match (neg, round) {
+                // parse("-x", Round::Positive) == -parse("x", Round::Negative)
+                (true, Round::Positive) => Round::Negative,
+                (true, Round::Negative) => Round::Positive,
+                // rounding toward nearest or zero are symmetric
+                (true, Round::Nearest | Round::Zero) | (false, _) => round,
+            };
+            match parse_finite(rest, bits, sig_bits, round) {
+                Err(e) => return Err(e),
+                Ok(res) => res,
+            }
+        }
+        _ => return Err(HexFloatParseError("no hex indicator")),
+    };
+
+    if neg {
+        x ^= sign_bit;
+    }
+
+    Ok((x, status))
+}
+
+const fn parse_finite(
+    b: &[u8],
+    bits: u32,
+    sig_bits: u32,
+    rounding_mode: Round,
+) -> Result<(u128, Status), HexFloatParseError> {
+    let exp_bits: u32 = bits - sig_bits - 1;
+    let max_msb: i32 = (1 << (exp_bits - 1)) - 1;
+    // The exponent of one ULP in the subnormals
+    let min_lsb: i32 = 1 - max_msb - sig_bits as i32;
+
+    let (mut sig, mut exp) = match parse_hex(b) {
+        Err(e) => return Err(e),
+        Ok(Parsed { sig: 0, .. }) => return Ok((0, Status::OK)),
+        Ok(Parsed { sig, exp }) => (sig, exp),
+    };
+
+    let mut round_bits = u128_ilog2(sig) as i32 - sig_bits as i32;
+
+    // Round at least up to min_lsb
+    if exp < min_lsb - round_bits {
+        round_bits = min_lsb - exp;
+    }
+
+    let mut status = Status::OK;
+
+    exp += round_bits;
+
+    if round_bits > 0 {
+        // first, prepare for rounding exactly two bits
+        if round_bits == 1 {
+            sig <<= 1;
+        } else if round_bits > 2 {
+            sig = shr_odd_rounding(sig, (round_bits - 2) as u32);
+        }
+
+        if sig & 0b11 != 0 {
+            status = Status::INEXACT;
+        }
+
+        sig = shr2_round(sig, rounding_mode);
+    } else if round_bits < 0 {
+        sig <<= -round_bits;
+    }
+
+    // The parsed value is X = sig * 2^exp
+    // Expressed as a multiple U of the smallest subnormal value:
+    // X = U * 2^min_lsb, so U = sig * 2^(exp-min_lsb)
+    let uexp = (exp - min_lsb) as u128;
+    let uexp = uexp << sig_bits;
+
+    // Note that the bits of `sig` above the mantissa may be 2 (rather than 1) here if the
+    // value rounded up, but in that case the mantissa is all zeroes so the value is still
+    // correct.
+    debug_assert!(sig <= 2 << sig_bits);
+
+    let inf = ((1 << exp_bits) - 1) << sig_bits;
+
+    let bits = match sig.checked_add(uexp) {
+        Some(bits) if bits < inf => {
+            // inexact subnormal or zero?
+            if status.inexact() && bits < (1 << sig_bits) {
+                status = status.with(Status::UNDERFLOW);
+            }
+            bits
+        }
+        _ => {
+            // overflow to infinity
+            status = status.with(Status::OVERFLOW).with(Status::INEXACT);
+            match rounding_mode {
+                Round::Positive | Round::Nearest => inf,
+                Round::Negative | Round::Zero => inf - 1,
+            }
+        }
+    };
+    Ok((bits, status))
+}
+
+/// Shift right, rounding all inexact divisions to the nearest odd number
+/// E.g. (0 >> 4) -> 0, (1..=31 >> 4) -> 1, (32 >> 4) -> 2, ...
+///
+/// Useful for reducing a number before rounding the last two bits, since
+/// the result of the final rounding is preserved for all rounding modes.
+const fn shr_odd_rounding(x: u128, k: u32) -> u128 {
+    if k < 128 {
+        let inexact = x.trailing_zeros() < k;
+        (x >> k) | (inexact as u128)
+    } else {
+        (x != 0) as u128
+    }
+}
+
+/// Divide by 4, rounding with the given mode
+const fn shr2_round(mut x: u128, round: Round) -> u128 {
+    let t = (x as u32) & 0b111;
+    x >>= 2;
+    match round {
+        // Look-up-table on the last three bits for when to round up
+        Round::Nearest => x + ((0b11001000_u8 >> t) & 1) as u128,
+
+        Round::Negative => x,
+        Round::Zero => x,
+        Round::Positive => x + (t & 0b11 != 0) as u128,
+    }
+}
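+
+// Worked example for the nearest-mode table above: `t` holds the new LSB in bit 2 and the two
+// discarded bits below it. `0b11001000` has bits 3, 6 and 7 set, so the result is incremented
+// for t = 0b011 and t = 0b111 (discarded bits 0b11, remainder above one half) and for t = 0b110
+// (discarded bits 0b10, an exact tie, with an odd result LSB). For instance
+// `shr2_round(6, Round::Nearest) == 2` (1.5 ties to even) while
+// `shr2_round(5, Round::Nearest) == 1` (1.25 rounds down).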
+
+/// A parsed finite and unsigned floating point number.
+struct Parsed {
+    /// Absolute value sig * 2^exp
+    sig: u128,
+    exp: i32,
+}
+
+/// Parse a hexadecimal float x
+const fn parse_hex(mut b: &[u8]) -> Result<Parsed, HexFloatParseError> {
+    let mut sig: u128 = 0;
+    let mut exp: i32 = 0;
+
+    let mut seen_point = false;
+    let mut some_digits = false;
+    let mut inexact = false;
+
+    while let &[c, ref rest @ ..] = b {
+        b = rest;
+
+        match c {
+            b'.' => {
+                if seen_point {
+                    return Err(HexFloatParseError(
+                        "unexpected '.' parsing fractional digits",
+                    ));
+                }
+                seen_point = true;
+                continue;
+            }
+            b'p' | b'P' => break,
+            c => {
+                let digit = match hex_digit(c) {
+                    Some(d) => d,
+                    None => return Err(HexFloatParseError("expected hexadecimal digit")),
+                };
+                some_digits = true;
+
+                if (sig >> 124) == 0 {
+                    sig <<= 4;
+                    sig |= digit as u128;
+                } else {
+                    // FIXME: it is technically possible for exp to overflow if parsing a string with >500M digits
+                    exp += 4;
+                    inexact |= digit != 0;
+                }
+                // Up until the fractional point, the value grows
+                // with more digits, but after it the exponent is
+                // compensated to match.
+                if seen_point {
+                    exp -= 4;
+                }
+            }
+        }
+    }
+    // If we've set inexact, the exact value has more than 125
+    // significant bits, and lies somewhere between sig and sig + 1.
+    // Because we'll round off at least two of the trailing bits,
+    // setting the last bit gives correct rounding for inexact values.
+    sig |= inexact as u128;
+
+    if !some_digits {
+        return Err(HexFloatParseError("at least one digit is required"));
+    };
+
+    some_digits = false;
+
+    let negate_exp = matches!(b, [b'-', ..]);
+    if let &[b'-' | b'+', ref rest @ ..] = b {
+        b = rest;
+    }
+
+    let mut pexp: u32 = 0;
+    while let &[c, ref rest @ ..] = b {
+        b = rest;
+        let digit = match dec_digit(c) {
+            Some(d) => d,
+            None => return Err(HexFloatParseError("expected decimal digit")),
+        };
+        some_digits = true;
+        pexp = pexp.saturating_mul(10);
+        pexp += digit as u32;
+    }
+
+    if !some_digits {
+        return Err(HexFloatParseError(
+            "at least one exponent digit is required",
+        ));
+    };
+
+    {
+        let e;
+        if negate_exp {
+            e = (exp as i64) - (pexp as i64);
+        } else {
+            e = (exp as i64) + (pexp as i64);
+        };
+
+        exp = if e < i32::MIN as i64 {
+            i32::MIN
+        } else if e > i32::MAX as i64 {
+            i32::MAX
+        } else {
+            e as i32
+        };
+    }
+    /* FIXME(msrv): once MSRV >= 1.66, replace the above workaround block with:
+    if negate_exp {
+        exp = exp.saturating_sub_unsigned(pexp);
+    } else {
+        exp = exp.saturating_add_unsigned(pexp);
+    };
+    */
+
+    Ok(Parsed { sig, exp })
+}
+
+const fn dec_digit(c: u8) -> Option<u8> {
+    match c {
+        b'0'..=b'9' => Some(c - b'0'),
+        _ => None,
+    }
+}
+
+const fn hex_digit(c: u8) -> Option<u8> {
+    match c {
+        b'0'..=b'9' => Some(c - b'0'),
+        b'a'..=b'f' => Some(c - b'a' + 10),
+        b'A'..=b'F' => Some(c - b'A' + 10),
+        _ => None,
+    }
+}
+
+/* FIXME(msrv): vendor some things that are not const stable at our MSRV */
+
+/// `u128::ilog2`
+const fn u128_ilog2(v: u128) -> u32 {
+    assert!(v != 0);
+    u128::BITS - 1 - v.leading_zeros()
+}
+
+/// Format a floating point number as its IEEE hex (`%a`) representation.
+pub struct Hexf<F>(pub F);
+
+// Adapted from https://github.com/ericseppanen/hexfloat2/blob/a5c27932f0ff/src/format.rs
+#[cfg(not(feature = "compiler-builtins"))]
+fn fmt_any_hex<F: Float>(x: &F, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+    if x.is_sign_negative() {
+        write!(f, "-")?;
+    }
+
+    if x.is_nan() {
+        return write!(f, "NaN");
+    } else if x.is_infinite() {
+        return write!(f, "inf");
+    } else if *x == F::ZERO {
+        return write!(f, "0x0p+0");
+    }
+
+    let mut exponent = x.exp_unbiased();
+    let sig = x.to_bits() & F::SIG_MASK;
+
+    let bias = F::EXP_BIAS as i32;
+    // The mantissa MSB needs to be shifted up to the nearest nibble.
+    let mshift = (4 - (F::SIG_BITS % 4)) % 4;
+    let sig = sig << mshift;
+    // The width is rounded up to the nearest char (4 bits)
+    let mwidth = (F::SIG_BITS as usize + 3) / 4;
+    let leading = if exponent == -bias {
+        // subnormal number means we shift our output by 1 bit.
+        exponent += 1;
+        "0."
+    } else {
+        "1."
+    };
+
+    write!(f, "0x{leading}{sig:0mwidth$x}p{exponent:+}")
+}
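+
+// As a usage sketch: `format!("{}", Hexf(0.1f32))` produces "0x1.99999ap-4", zero prints as
+// "0x0p+0", and infinities and NaN print as "inf" and "NaN" (with a leading '-' when the sign
+// bit is set).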
+
+#[cfg(feature = "compiler-builtins")]
+fn fmt_any_hex<F: Float>(_x: &F, _f: &mut fmt::Formatter<'_>) -> fmt::Result {
+    unimplemented!()
+}
+
+impl<F: Float> fmt::LowerHex for Hexf<F> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        cfg_if! {
+            if #[cfg(feature = "compiler-builtins")] {
+                let _ = f;
+                unimplemented!()
+            } else {
+                fmt_any_hex(&self.0, f)
+            }
+        }
+    }
+}
+
+impl<F: Float> fmt::LowerHex for Hexf<(F, F)> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        cfg_if! {
+            if #[cfg(feature = "compiler-builtins")] {
+                let _ = f;
+                unimplemented!()
+            } else {
+                write!(f, "({:x}, {:x})", Hexf(self.0.0), Hexf(self.0.1))
+            }
+        }
+    }
+}
+
+impl<F: Float> fmt::LowerHex for Hexf<(F, i32)> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        cfg_if! {
+            if #[cfg(feature = "compiler-builtins")] {
+                let _ = f;
+                unimplemented!()
+            } else {
+                write!(f, "({:x}, {:x})", Hexf(self.0.0), Hexf(self.0.1))
+            }
+        }
+    }
+}
+
+impl fmt::LowerHex for Hexf<i32> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        cfg_if! {
+            if #[cfg(feature = "compiler-builtins")] {
+                let _ = f;
+                unimplemented!()
+            } else {
+                fmt::LowerHex::fmt(&self.0, f)
+            }
+        }
+    }
+}
+
+impl<T> fmt::Debug for Hexf<T>
+where
+    Hexf<T>: fmt::LowerHex,
+{
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        cfg_if! {
+            if #[cfg(feature = "compiler-builtins")] {
+                let _ = f;
+                unimplemented!()
+            } else {
+                fmt::LowerHex::fmt(self, f)
+            }
+        }
+    }
+}
+
+impl<T> fmt::Display for Hexf<T>
+where
+    Hexf<T>: fmt::LowerHex,
+{
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        cfg_if! {
+            if #[cfg(feature = "compiler-builtins")] {
+                let _ = f;
+                unimplemented!()
+            } else {
+                fmt::LowerHex::fmt(self, f)
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod parse_tests {
+    extern crate std;
+    use std::{format, println};
+
+    use super::*;
+
+    #[cfg(f16_enabled)]
+    fn rounding_properties(s: &str) -> Result<(), HexFloatParseError> {
+        let (xd, s0) = parse_any(s, 16, 10, Round::Negative)?;
+        let (xu, s1) = parse_any(s, 16, 10, Round::Positive)?;
+        let (xz, s2) = parse_any(s, 16, 10, Round::Zero)?;
+        let (xn, s3) = parse_any(s, 16, 10, Round::Nearest)?;
+
+        // FIXME: A value between the largest subnormal and the least normal
+        // could have its underflow status depend on the rounding mode.
+
+        if let Status::OK = s0 {
+            // an exact result is the same for all rounding modes
+            assert_eq!(s0, s1);
+            assert_eq!(s0, s2);
+            assert_eq!(s0, s3);
+
+            assert_eq!(xd, xu);
+            assert_eq!(xd, xz);
+            assert_eq!(xd, xn);
+        } else {
+            assert!([s0, s1, s2, s3].into_iter().all(Status::inexact));
+
+            let xd = f16::from_bits(xd as u16);
+            let xu = f16::from_bits(xu as u16);
+            let xz = f16::from_bits(xz as u16);
+            let xn = f16::from_bits(xn as u16);
+
+            assert_biteq!(xd.next_up(), xu, "s={s}, xd={xd:?}, xu={xu:?}");
+
+            let signs = [xd, xu, xz, xn].map(f16::is_sign_negative);
+
+            if signs == [true; 4] {
+                assert_biteq!(xz, xu);
+            } else {
+                assert_eq!(signs, [false; 4]);
+                assert_biteq!(xz, xd);
+            }
+
+            if xn.to_bits() != xd.to_bits() {
+                assert_biteq!(xn, xu);
+            }
+        }
+        Ok(())
+    }
+    #[test]
+    #[cfg(f16_enabled)]
+    fn test_rounding() {
+        let n = 1_i32 << 14;
+        for i in -n..n {
+            let u = i.rotate_right(11) as u32;
+            let s = format!("{}", Hexf(f32::from_bits(u)));
+            assert!(rounding_properties(&s).is_ok());
+        }
+    }
+
+    #[test]
+    fn test_parse_any() {
+        for k in -149..=127 {
+            let s = format!("0x1p{k}");
+            let x = hf32(&s);
+            let y = if k < 0 {
+                0.5f32.powi(-k)
+            } else {
+                2.0f32.powi(k)
+            };
+            assert_eq!(x, y);
+        }
+
+        let mut s = *b"0x.0000000p-121";
+        for e in 0..40 {
+            for k in 0..(1 << 15) {
+                let expected = f32::from_bits(k) * 2.0f32.powi(e);
+                let x = hf32(std::str::from_utf8(&s).unwrap());
+                assert_eq!(
+                    x.to_bits(),
+                    expected.to_bits(),
+                    "\
+                    e={e}\n\
+                    k={k}\n\
+                    x={x}\n\
+                    expected={expected}\n\
+                    s={}\n\
+                    f32::from_bits(k)={}\n\
+                    2.0f32.powi(e)={}\
+                    ",
+                    std::str::from_utf8(&s).unwrap(),
+                    f32::from_bits(k),
+                    2.0f32.powi(e),
+                );
+                for i in (3..10).rev() {
+                    if s[i] == b'f' {
+                        s[i] = b'0';
+                    } else if s[i] == b'9' {
+                        s[i] = b'a';
+                        break;
+                    } else {
+                        s[i] += 1;
+                        break;
+                    }
+                }
+            }
+            for i in (12..15).rev() {
+                if s[i] == b'0' {
+                    s[i] = b'9';
+                } else {
+                    s[i] -= 1;
+                    break;
+                }
+            }
+            for i in (3..10).rev() {
+                s[i] = b'0';
+            }
+        }
+    }
+
+    // FIXME: this test is causing failures that are likely UB on various platforms
+    #[cfg(all(target_arch = "x86_64", target_os = "linux"))]
+    #[test]
+    #[cfg(f128_enabled)]
+    fn rounding() {
+        let pi = std::f128::consts::PI;
+        let s = format!("{}", Hexf(pi));
+
+        for k in 0..=111 {
+            let (bits, status) = parse_any(&s, 128 - k, 112 - k, Round::Nearest).unwrap();
+            let scale = (1u128 << (112 - k - 1)) as f128;
+            let expected = (pi * scale).round_ties_even() / scale;
+            assert_eq!(bits << k, expected.to_bits(), "k = {k}, s = {s}");
+            assert_eq!(expected != pi, status.inexact());
+        }
+    }
+    #[test]
+    fn rounding_extreme_underflow() {
+        for k in 1..1000 {
+            let s = format!("0x1p{}", -149 - k);
+            let Ok((bits, status)) = parse_any(&s, 32, 23, Round::Nearest) else {
+                unreachable!()
+            };
+            assert_eq!(bits, 0, "{s} should round to zero, got bits={bits}");
+            assert!(
+                status.underflow(),
+                "should indicate underflow when parsing {s}"
+            );
+            assert!(status.inexact(), "should indicate inexact when parsing {s}");
+        }
+    }
+    #[test]
+    fn long_tail() {
+        for k in 1..1000 {
+            let s = format!("0x1.{}p0", "0".repeat(k));
+            let Ok(bits) = parse_hex_exact(&s, 32, 23) else {
+                panic!("parsing {s} failed")
+            };
+            assert_eq!(f32::from_bits(bits as u32), 1.0);
+
+            let s = format!("0x1.{}1p0", "0".repeat(k));
+            let Ok((bits, status)) = parse_any(&s, 32, 23, Round::Nearest) else {
+                unreachable!()
+            };
+            if status.inexact() {
+                assert!(1.0 == f32::from_bits(bits as u32));
+            } else {
+                assert!(1.0 < f32::from_bits(bits as u32));
+            }
+        }
+    }
+    // HACK(msrv): 1.63 rejects unknown width float literals at an AST level, so use a macro to
+    // hide them from the AST.
+    #[cfg(f16_enabled)]
+    macro_rules! f16_tests {
+        () => {
+            #[test]
+            fn test_f16() {
+                let checks = [
+                    ("0x.1234p+16", (0x1234 as f16).to_bits()),
+                    ("0x1.234p+12", (0x1234 as f16).to_bits()),
+                    ("0x12.34p+8", (0x1234 as f16).to_bits()),
+                    ("0x123.4p+4", (0x1234 as f16).to_bits()),
+                    ("0x1234p+0", (0x1234 as f16).to_bits()),
+                    ("0x1234.p+0", (0x1234 as f16).to_bits()),
+                    ("0x1234.0p+0", (0x1234 as f16).to_bits()),
+                    ("0x1.ffcp+15", f16::MAX.to_bits()),
+                    ("0x1.0p+1", 2.0f16.to_bits()),
+                    ("0x1.0p+0", 1.0f16.to_bits()),
+                    ("0x1.ffp+8", 0x5ffc),
+                    ("+0x1.ffp+8", 0x5ffc),
+                    ("0x1p+0", 0x3c00),
+                    ("0x1.998p-4", 0x2e66),
+                    ("0x1.9p+6", 0x5640),
+                    ("0x0.0p0", 0.0f16.to_bits()),
+                    ("-0x0.0p0", (-0.0f16).to_bits()),
+                    ("0x1.0p0", 1.0f16.to_bits()),
+                    ("0x1.998p-4", (0.1f16).to_bits()),
+                    ("-0x1.998p-4", (-0.1f16).to_bits()),
+                    ("0x0.123p-12", 0x0123),
+                    ("0x1p-24", 0x0001),
+                    ("nan", f16::NAN.to_bits()),
+                    ("-nan", (-f16::NAN).to_bits()),
+                    ("inf", f16::INFINITY.to_bits()),
+                    ("-inf", f16::NEG_INFINITY.to_bits()),
+                ];
+                for (s, exp) in checks {
+                    println!("parsing {s}");
+                    assert!(rounding_properties(s).is_ok());
+                    let act = hf16(s).to_bits();
+                    assert_eq!(
+                        act, exp,
+                        "parsing {s}: {act:#06x} != {exp:#06x}\nact: {act:#018b}\nexp: {exp:#018b}"
+                    );
+                }
+            }
+
+            #[test]
+            fn test_macros_f16() {
+                assert_eq!(hf16!("0x1.ffp+8").to_bits(), 0x5ffc_u16);
+            }
+        };
+    }
+
+    #[cfg(f16_enabled)]
+    f16_tests!();
+
+    #[test]
+    fn test_f32() {
+        let checks = [
+            ("0x.1234p+16", (0x1234 as f32).to_bits()),
+            ("0x1.234p+12", (0x1234 as f32).to_bits()),
+            ("0x12.34p+8", (0x1234 as f32).to_bits()),
+            ("0x123.4p+4", (0x1234 as f32).to_bits()),
+            ("0x1234p+0", (0x1234 as f32).to_bits()),
+            ("0x1234.p+0", (0x1234 as f32).to_bits()),
+            ("0x1234.0p+0", (0x1234 as f32).to_bits()),
+            ("0x1.fffffep+127", f32::MAX.to_bits()),
+            ("0x1.0p+1", 2.0f32.to_bits()),
+            ("0x1.0p+0", 1.0f32.to_bits()),
+            ("0x1.ffep+8", 0x43fff000),
+            ("+0x1.ffep+8", 0x43fff000),
+            ("0x1p+0", 0x3f800000),
+            ("0x1.99999ap-4", 0x3dcccccd),
+            ("0x1.9p+6", 0x42c80000),
+            ("0x1.2d5ed2p+20", 0x4996af69),
+            ("-0x1.348eb8p+10", 0xc49a475c),
+            ("-0x1.33dcfep-33", 0xaf19ee7f),
+            ("0x0.0p0", 0.0f32.to_bits()),
+            ("-0x0.0p0", (-0.0f32).to_bits()),
+            ("0x1.0p0", 1.0f32.to_bits()),
+            ("0x1.99999ap-4", (0.1f32).to_bits()),
+            ("-0x1.99999ap-4", (-0.1f32).to_bits()),
+            ("0x1.111114p-127", 0x00444445),
+            ("0x1.23456p-130", 0x00091a2b),
+            ("0x1p-149", 0x00000001),
+            ("nan", f32::NAN.to_bits()),
+            ("-nan", (-f32::NAN).to_bits()),
+            ("inf", f32::INFINITY.to_bits()),
+            ("-inf", f32::NEG_INFINITY.to_bits()),
+        ];
+        for (s, exp) in checks {
+            println!("parsing {s}");
+            let act = hf32(s).to_bits();
+            assert_eq!(
+                act, exp,
+                "parsing {s}: {act:#010x} != {exp:#010x}\nact: {act:#034b}\nexp: {exp:#034b}"
+            );
+        }
+    }
+
+    #[test]
+    fn test_f64() {
+        let checks = [
+            ("0x.1234p+16", (0x1234 as f64).to_bits()),
+            ("0x1.234p+12", (0x1234 as f64).to_bits()),
+            ("0x12.34p+8", (0x1234 as f64).to_bits()),
+            ("0x123.4p+4", (0x1234 as f64).to_bits()),
+            ("0x1234p+0", (0x1234 as f64).to_bits()),
+            ("0x1234.p+0", (0x1234 as f64).to_bits()),
+            ("0x1234.0p+0", (0x1234 as f64).to_bits()),
+            ("0x1.ffep+8", 0x407ffe0000000000),
+            ("0x1p+0", 0x3ff0000000000000),
+            ("0x1.999999999999ap-4", 0x3fb999999999999a),
+            ("0x1.9p+6", 0x4059000000000000),
+            ("0x1.2d5ed1fe1da7bp+20", 0x4132d5ed1fe1da7b),
+            ("-0x1.348eb851eb852p+10", 0xc09348eb851eb852),
+            ("-0x1.33dcfe54a3803p-33", 0xbde33dcfe54a3803),
+            ("0x1.0p0", 1.0f64.to_bits()),
+            ("0x0.0p0", 0.0f64.to_bits()),
+            ("-0x0.0p0", (-0.0f64).to_bits()),
+            ("0x1.999999999999ap-4", 0.1f64.to_bits()),
+            ("0x1.999999999998ap-4", (0.1f64 - f64::EPSILON).to_bits()),
+            ("-0x1.999999999999ap-4", (-0.1f64).to_bits()),
+            ("-0x1.999999999998ap-4", (-0.1f64 + f64::EPSILON).to_bits()),
+            ("0x0.8000000000001p-1022", 0x0008000000000001),
+            ("0x0.123456789abcdp-1022", 0x000123456789abcd),
+            ("0x0.0000000000002p-1022", 0x0000000000000002),
+            ("nan", f64::NAN.to_bits()),
+            ("-nan", (-f64::NAN).to_bits()),
+            ("inf", f64::INFINITY.to_bits()),
+            ("-inf", f64::NEG_INFINITY.to_bits()),
+        ];
+        for (s, exp) in checks {
+            println!("parsing {s}");
+            let act = hf64(s).to_bits();
+            assert_eq!(
+                act, exp,
+                "parsing {s}: {act:#018x} != {exp:#018x}\nact: {act:#066b}\nexp: {exp:#066b}"
+            );
+        }
+    }
+
+    // HACK(msrv): 1.63 rejects unknown width float literals at an AST level, so use a macro to
+    // hide them from the AST.
+    #[cfg(f128_enabled)]
+    macro_rules! f128_tests {
+        () => {
+            #[test]
+            fn test_f128() {
+                let checks = [
+                    ("0x.1234p+16", (0x1234 as f128).to_bits()),
+                    ("0x1.234p+12", (0x1234 as f128).to_bits()),
+                    ("0x12.34p+8", (0x1234 as f128).to_bits()),
+                    ("0x123.4p+4", (0x1234 as f128).to_bits()),
+                    ("0x1234p+0", (0x1234 as f128).to_bits()),
+                    ("0x1234.p+0", (0x1234 as f128).to_bits()),
+                    ("0x1234.0p+0", (0x1234 as f128).to_bits()),
+                    ("0x1.ffffffffffffffffffffffffffffp+16383", f128::MAX.to_bits()),
+                    ("0x1.0p+1", 2.0f128.to_bits()),
+                    ("0x1.0p+0", 1.0f128.to_bits()),
+                    ("0x1.ffep+8", 0x4007ffe0000000000000000000000000),
+                    ("+0x1.ffep+8", 0x4007ffe0000000000000000000000000),
+                    ("0x1p+0", 0x3fff0000000000000000000000000000),
+                    ("0x1.999999999999999999999999999ap-4", 0x3ffb999999999999999999999999999a),
+                    ("0x1.9p+6", 0x40059000000000000000000000000000),
+                    ("0x0.0p0", 0.0f128.to_bits()),
+                    ("-0x0.0p0", (-0.0f128).to_bits()),
+                    ("0x1.0p0", 1.0f128.to_bits()),
+                    ("0x1.999999999999999999999999999ap-4", (0.1f128).to_bits()),
+                    ("-0x1.999999999999999999999999999ap-4", (-0.1f128).to_bits()),
+                    ("0x0.abcdef0123456789abcdef012345p-16382", 0x0000abcdef0123456789abcdef012345),
+                    ("0x1p-16494", 0x00000000000000000000000000000001),
+                    ("nan", f128::NAN.to_bits()),
+                    ("-nan", (-f128::NAN).to_bits()),
+                    ("inf", f128::INFINITY.to_bits()),
+                    ("-inf", f128::NEG_INFINITY.to_bits()),
+                ];
+                for (s, exp) in checks {
+                    println!("parsing {s}");
+                    let act = hf128(s).to_bits();
+                    assert_eq!(
+                        act, exp,
+                        "parsing {s}: {act:#034x} != {exp:#034x}\nact: {act:#0130b}\nexp: {exp:#0130b}"
+                    );
+                }
+            }
+
+            #[test]
+            fn test_macros_f128() {
+                assert_eq!(hf128!("0x1.ffep+8").to_bits(), 0x4007ffe0000000000000000000000000_u128);
+            }
+        }
+    }
+
+    #[cfg(f128_enabled)]
+    f128_tests!();
+
+    #[test]
+    fn test_macros() {
+        #[cfg(f16_enabled)]
+        assert_eq!(hf16!("0x1.ffp+8").to_bits(), 0x5ffc_u16);
+        assert_eq!(hf32!("0x1.ffep+8").to_bits(), 0x43fff000_u32);
+        assert_eq!(hf64!("0x1.ffep+8").to_bits(), 0x407ffe0000000000_u64);
+        #[cfg(f128_enabled)]
+        assert_eq!(
+            hf128!("0x1.ffep+8").to_bits(),
+            0x4007ffe0000000000000000000000000_u128
+        );
+    }
+}
+
+#[cfg(test)]
+// FIXME(ppc): something with `should_panic` tests causes a SIGILL with ppc64le
+#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))]
+mod tests_panicking {
+    extern crate std;
+    use super::*;
+
+    // HACK(msrv): 1.63 rejects unknown width float literals at an AST level, so use a macro to
+    // hide them from the AST.
+    #[cfg(f16_enabled)]
+    macro_rules! f16_tests {
+        () => {
+            #[test]
+            fn test_f16_almost_extra_precision() {
+                // Exact maximum precision allowed
+                hf16("0x1.ffcp+0");
+            }
+
+            #[test]
+            #[should_panic(expected = "the value is too precise")]
+            fn test_f16_extra_precision() {
+                // One bit more than the above.
+                hf16("0x1.ffdp+0");
+            }
+
+            #[test]
+            #[should_panic(expected = "the value is too huge")]
+            fn test_f16_overflow() {
+                // One bit more than the above.
+                hf16("0x1p+16");
+            }
+
+            #[test]
+            fn test_f16_tiniest() {
+                let x = hf16("0x1.p-24");
+                let y = hf16("0x0.001p-12");
+                let z = hf16("0x0.8p-23");
+                assert_eq!(x, y);
+                assert_eq!(x, z);
+            }
+
+            #[test]
+            #[should_panic(expected = "the value is too tiny")]
+            fn test_f16_too_tiny() {
+                hf16("0x1.p-25");
+            }
+
+            #[test]
+            #[should_panic(expected = "the value is too tiny")]
+            fn test_f16_also_too_tiny() {
+                hf16("0x0.8p-24");
+            }
+
+            #[test]
+            #[should_panic(expected = "the value is too tiny")]
+            fn test_f16_again_too_tiny() {
+                hf16("0x0.001p-13");
+            }
+        };
+    }
+
+    #[cfg(f16_enabled)]
+    f16_tests!();
+
+    #[test]
+    fn test_f32_almost_extra_precision() {
+        // Exact maximum precision allowed
+        hf32("0x1.abcdeep+0");
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_f32_extra_precision2() {
+        // One bit more than the above.
+        hf32("0x1.ffffffp+127");
+    }
+
+    #[test]
+    #[should_panic(expected = "the value is too huge")]
+    fn test_f32_overflow() {
+        // One bit more than the above.
+        hf32("0x1p+128");
+    }
+
+    #[test]
+    #[should_panic(expected = "the value is too precise")]
+    fn test_f32_extra_precision() {
+        // One bit more than the above.
+        hf32("0x1.abcdefp+0");
+    }
+
+    #[test]
+    fn test_f32_tiniest() {
+        let x = hf32("0x1.p-149");
+        let y = hf32("0x0.0000000000000001p-85");
+        let z = hf32("0x0.8p-148");
+        assert_eq!(x, y);
+        assert_eq!(x, z);
+    }
+
+    #[test]
+    #[should_panic(expected = "the value is too tiny")]
+    fn test_f32_too_tiny() {
+        hf32("0x1.p-150");
+    }
+
+    #[test]
+    #[should_panic(expected = "the value is too tiny")]
+    fn test_f32_also_too_tiny() {
+        hf32("0x0.8p-149");
+    }
+
+    #[test]
+    #[should_panic(expected = "the value is too tiny")]
+    fn test_f32_again_too_tiny() {
+        hf32("0x0.0000000000000001p-86");
+    }
+
+    #[test]
+    fn test_f64_almost_extra_precision() {
+        // Exact maximum precision allowed
+        hf64("0x1.abcdabcdabcdfp+0");
+    }
+
+    #[test]
+    #[should_panic(expected = "the value is too precise")]
+    fn test_f64_extra_precision() {
+        // One bit more than the above.
+        hf64("0x1.abcdabcdabcdf8p+0");
+    }
+
+    // HACK(msrv): 1.63 rejects unknown width float literals at an AST level, so use a macro to
+    // hide them from the AST.
+    #[cfg(f128_enabled)]
+    macro_rules! f128_tests {
+        () => {
+            #[test]
+            fn test_f128_almost_extra_precision() {
+                // Exact maximum precision allowed
+                hf128("0x1.ffffffffffffffffffffffffffffp+16383");
+            }
+
+            #[test]
+            #[should_panic(expected = "the value is too precise")]
+            fn test_f128_extra_precision() {
+                // Just below the maximum finite.
+                hf128("0x1.fffffffffffffffffffffffffffe8p+16383");
+            }
+            #[test]
+            #[should_panic(expected = "the value is too huge")]
+            fn test_f128_extra_precision_overflow() {
+                // One bit more than the above. Should overflow.
+                hf128("0x1.ffffffffffffffffffffffffffff8p+16383");
+            }
+
+            #[test]
+            #[should_panic(expected = "the value is too huge")]
+            fn test_f128_overflow() {
+                // One bit more than the above.
+                hf128("0x1p+16384");
+            }
+
+            #[test]
+            fn test_f128_tiniest() {
+                let x = hf128("0x1.p-16494");
+                let y = hf128("0x0.0000000000000001p-16430");
+                let z = hf128("0x0.8p-16493");
+                assert_eq!(x, y);
+                assert_eq!(x, z);
+            }
+
+            #[test]
+            #[should_panic(expected = "the value is too tiny")]
+            fn test_f128_too_tiny() {
+                hf128("0x1.p-16495");
+            }
+
+            #[test]
+            #[should_panic(expected = "the value is too tiny")]
+            fn test_f128_again_too_tiny() {
+                hf128("0x0.0000000000000001p-16431");
+            }
+
+            #[test]
+            #[should_panic(expected = "the value is too tiny")]
+            fn test_f128_also_too_tiny() {
+                hf128("0x0.8p-16494");
+            }
+        };
+    }
+
+    #[cfg(f128_enabled)]
+    f128_tests!();
+}
+
+#[cfg(test)]
+mod print_tests {
+    extern crate std;
+    use std::string::ToString;
+
+    use super::*;
+
+    #[test]
+    #[cfg(f16_enabled)]
+    fn test_f16() {
+        use std::format;
+        // Exhaustively check that `f16` roundtrips.
+        for x in 0..=u16::MAX {
+            let f = f16::from_bits(x);
+            let s = format!("{}", Hexf(f));
+            let from_s = hf16(&s);
+
+            if f.is_nan() && from_s.is_nan() {
+                continue;
+            }
+
+            assert_eq!(
+                f.to_bits(),
+                from_s.to_bits(),
+                "{f:?} formatted as {s} but parsed as {from_s:?}"
+            );
+        }
+    }
+
+    #[test]
+    #[cfg(f16_enabled)]
+    fn test_f16_to_f32() {
+        use std::format;
+        // Exhaustively check that these are equivalent for all `f16`:
+        //  - `f16 -> f32`
+        //  - `f16 -> str -> f32`
+        //  - `f16 -> f32 -> str -> f32`
+        //  - `f16 -> f32 -> str -> f16 -> f32`
+        for x in 0..=u16::MAX {
+            let f16 = f16::from_bits(x);
+            let s16 = format!("{}", Hexf(f16));
+            let f32 = f16 as f32;
+            let s32 = format!("{}", Hexf(f32));
+
+            let a = hf32(&s16);
+            let b = hf32(&s32);
+            let c = hf16(&s32);
+
+            if f32.is_nan() && a.is_nan() && b.is_nan() && c.is_nan() {
+                continue;
+            }
+
+            assert_eq!(
+                f32.to_bits(),
+                a.to_bits(),
+                "{f16:?} : f16 formatted as {s16} which parsed as {a:?} : f16"
+            );
+            assert_eq!(
+                f32.to_bits(),
+                b.to_bits(),
+                "{f32:?} : f32 formatted as {s32} which parsed as {b:?} : f32"
+            );
+            assert_eq!(
+                f32.to_bits(),
+                (c as f32).to_bits(),
+                "{f32:?} : f32 formatted as {s32} which parsed as {c:?} : f16"
+            );
+        }
+    }
+    #[test]
+    fn spot_checks() {
+        assert_eq!(Hexf(f32::MAX).to_string(), "0x1.fffffep+127");
+        assert_eq!(Hexf(f64::MAX).to_string(), "0x1.fffffffffffffp+1023");
+
+        assert_eq!(Hexf(f32::MIN).to_string(), "-0x1.fffffep+127");
+        assert_eq!(Hexf(f64::MIN).to_string(), "-0x1.fffffffffffffp+1023");
+
+        assert_eq!(Hexf(f32::ZERO).to_string(), "0x0p+0");
+        assert_eq!(Hexf(f64::ZERO).to_string(), "0x0p+0");
+
+        assert_eq!(Hexf(f32::NEG_ZERO).to_string(), "-0x0p+0");
+        assert_eq!(Hexf(f64::NEG_ZERO).to_string(), "-0x0p+0");
+
+        assert_eq!(Hexf(f32::NAN).to_string(), "NaN");
+        assert_eq!(Hexf(f64::NAN).to_string(), "NaN");
+
+        assert_eq!(Hexf(f32::INFINITY).to_string(), "inf");
+        assert_eq!(Hexf(f64::INFINITY).to_string(), "inf");
+
+        assert_eq!(Hexf(f32::NEG_INFINITY).to_string(), "-inf");
+        assert_eq!(Hexf(f64::NEG_INFINITY).to_string(), "-inf");
+
+        #[cfg(f16_enabled)]
+        {
+            assert_eq!(Hexf(f16::MAX).to_string(), "0x1.ffcp+15");
+            assert_eq!(Hexf(f16::MIN).to_string(), "-0x1.ffcp+15");
+            assert_eq!(Hexf(f16::ZERO).to_string(), "0x0p+0");
+            assert_eq!(Hexf(f16::NEG_ZERO).to_string(), "-0x0p+0");
+            assert_eq!(Hexf(f16::NAN).to_string(), "NaN");
+            assert_eq!(Hexf(f16::INFINITY).to_string(), "inf");
+            assert_eq!(Hexf(f16::NEG_INFINITY).to_string(), "-inf");
+        }
+
+        #[cfg(f128_enabled)]
+        {
+            assert_eq!(
+                Hexf(f128::MAX).to_string(),
+                "0x1.ffffffffffffffffffffffffffffp+16383"
+            );
+            assert_eq!(
+                Hexf(f128::MIN).to_string(),
+                "-0x1.ffffffffffffffffffffffffffffp+16383"
+            );
+            assert_eq!(Hexf(f128::ZERO).to_string(), "0x0p+0");
+            assert_eq!(Hexf(f128::NEG_ZERO).to_string(), "-0x0p+0");
+            assert_eq!(Hexf(f128::NAN).to_string(), "NaN");
+            assert_eq!(Hexf(f128::INFINITY).to_string(), "inf");
+            assert_eq!(Hexf(f128::NEG_INFINITY).to_string(), "-inf");
+        }
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/support/int_traits.rs b/library/compiler-builtins/libm/src/math/support/int_traits.rs
new file mode 100644
index 00000000000..3ec1faba170
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/support/int_traits.rs
@@ -0,0 +1,455 @@
+use core::{cmp, fmt, ops};
+
+/// Minimal integer implementations needed on all integer types, including wide integers.
+pub trait MinInt:
+    Copy
+    + fmt::Debug
+    + ops::BitOr<Output = Self>
+    + ops::Not<Output = Self>
+    + ops::Shl<u32, Output = Self>
+{
+    /// Type with the same width but other signedness
+    type OtherSign: MinInt;
+    /// Unsigned version of Self
+    type Unsigned: MinInt;
+
+    /// If `Self` is a signed integer
+    const SIGNED: bool;
+
+    /// The bitwidth of the int type
+    const BITS: u32;
+
+    const ZERO: Self;
+    const ONE: Self;
+    const MIN: Self;
+    const MAX: Self;
+}
+
+/// Access the associated `OtherSign` type from an int (helper to avoid ambiguous associated
+/// types).
+pub type OtherSign<I> = <I as MinInt>::OtherSign;
+
+/// Trait for some basic operations on integers
+#[allow(dead_code)]
+pub trait Int:
+    MinInt
+    + fmt::Display
+    + fmt::Binary
+    + fmt::LowerHex
+    + PartialEq
+    + PartialOrd
+    + ops::AddAssign
+    + ops::SubAssign
+    + ops::MulAssign
+    + ops::DivAssign
+    + ops::RemAssign
+    + ops::BitAndAssign
+    + ops::BitOrAssign
+    + ops::BitXorAssign
+    + ops::ShlAssign<i32>
+    + ops::ShlAssign<u32>
+    + ops::ShrAssign<u32>
+    + ops::ShrAssign<i32>
+    + ops::Add<Output = Self>
+    + ops::Sub<Output = Self>
+    + ops::Mul<Output = Self>
+    + ops::Div<Output = Self>
+    + ops::Rem<Output = Self>
+    + ops::Shl<i32, Output = Self>
+    + ops::Shl<u32, Output = Self>
+    + ops::Shr<i32, Output = Self>
+    + ops::Shr<u32, Output = Self>
+    + ops::BitXor<Output = Self>
+    + ops::BitAnd<Output = Self>
+    + cmp::Ord
+    + From<bool>
+    + CastFrom<i32>
+    + CastFrom<u16>
+    + CastFrom<u32>
+    + CastFrom<u8>
+    + CastFrom<usize>
+    + CastInto<i32>
+    + CastInto<u16>
+    + CastInto<u32>
+    + CastInto<u8>
+    + CastInto<usize>
+{
+    fn signed(self) -> OtherSign<Self::Unsigned>;
+    fn unsigned(self) -> Self::Unsigned;
+    fn from_unsigned(unsigned: Self::Unsigned) -> Self;
+    fn abs(self) -> Self;
+
+    fn from_bool(b: bool) -> Self;
+
+    /// Prevents the need for excessive conversions between signed and unsigned
+    fn logical_shr(self, other: u32) -> Self;
+
+    /// Absolute difference between two integers.
+    fn abs_diff(self, other: Self) -> Self::Unsigned;
+
+    // copied from primitive integers, but put in a trait
+    fn is_zero(self) -> bool;
+    fn checked_add(self, other: Self) -> Option<Self>;
+    fn checked_sub(self, other: Self) -> Option<Self>;
+    fn wrapping_neg(self) -> Self;
+    fn wrapping_add(self, other: Self) -> Self;
+    fn wrapping_mul(self, other: Self) -> Self;
+    fn wrapping_sub(self, other: Self) -> Self;
+    fn wrapping_shl(self, other: u32) -> Self;
+    fn wrapping_shr(self, other: u32) -> Self;
+    fn rotate_left(self, other: u32) -> Self;
+    fn overflowing_add(self, other: Self) -> (Self, bool);
+    fn overflowing_sub(self, other: Self) -> (Self, bool);
+    fn leading_zeros(self) -> u32;
+    fn ilog2(self) -> u32;
+}
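+
+// A minimal usage sketch, assuming only the bounds declared above: generic helpers can lean on
+// the `MinInt` constants and the operator bounds instead of naming a concrete width.
+#[test]
+fn int_trait_usage_sketch() {
+    fn is_even<I: Int>(x: I) -> bool {
+        (x & I::ONE) == I::ZERO
+    }
+    assert!(is_even(4u32));
+    assert!(!is_even(5i64));
+}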
+
+macro_rules! int_impl_common {
+    ($ty:ty) => {
+        fn from_bool(b: bool) -> Self {
+            b as $ty
+        }
+
+        fn logical_shr(self, other: u32) -> Self {
+            Self::from_unsigned(self.unsigned().wrapping_shr(other))
+        }
+
+        fn is_zero(self) -> bool {
+            self == Self::ZERO
+        }
+
+        fn checked_add(self, other: Self) -> Option<Self> {
+            self.checked_add(other)
+        }
+
+        fn checked_sub(self, other: Self) -> Option<Self> {
+            self.checked_sub(other)
+        }
+
+        fn wrapping_neg(self) -> Self {
+            <Self>::wrapping_neg(self)
+        }
+
+        fn wrapping_add(self, other: Self) -> Self {
+            <Self>::wrapping_add(self, other)
+        }
+
+        fn wrapping_mul(self, other: Self) -> Self {
+            <Self>::wrapping_mul(self, other)
+        }
+
+        fn wrapping_sub(self, other: Self) -> Self {
+            <Self>::wrapping_sub(self, other)
+        }
+
+        fn wrapping_shl(self, other: u32) -> Self {
+            <Self>::wrapping_shl(self, other)
+        }
+
+        fn wrapping_shr(self, other: u32) -> Self {
+            <Self>::wrapping_shr(self, other)
+        }
+
+        fn rotate_left(self, other: u32) -> Self {
+            <Self>::rotate_left(self, other)
+        }
+
+        fn overflowing_add(self, other: Self) -> (Self, bool) {
+            <Self>::overflowing_add(self, other)
+        }
+
+        fn overflowing_sub(self, other: Self) -> (Self, bool) {
+            <Self>::overflowing_sub(self, other)
+        }
+
+        fn leading_zeros(self) -> u32 {
+            <Self>::leading_zeros(self)
+        }
+
+        fn ilog2(self) -> u32 {
+            // On our older MSRV, this resolves to the trait method, which won't actually work;
+            // however, this is only called behind other gates.
+            #[allow(clippy::incompatible_msrv)]
+            <Self>::ilog2(self)
+        }
+    };
+}
+
+macro_rules! int_impl {
+    ($ity:ty, $uty:ty) => {
+        impl MinInt for $uty {
+            type OtherSign = $ity;
+            type Unsigned = $uty;
+
+            const BITS: u32 = <Self as MinInt>::ZERO.count_zeros();
+            const SIGNED: bool = Self::MIN != Self::ZERO;
+
+            const ZERO: Self = 0;
+            const ONE: Self = 1;
+            const MIN: Self = <Self>::MIN;
+            const MAX: Self = <Self>::MAX;
+        }
+
+        impl Int for $uty {
+            fn signed(self) -> $ity {
+                self as $ity
+            }
+
+            fn unsigned(self) -> Self {
+                self
+            }
+
+            fn abs(self) -> Self {
+                unimplemented!()
+            }
+
+            // It makes writing macros easier if this is implemented for both signed and unsigned
+            #[allow(clippy::wrong_self_convention)]
+            fn from_unsigned(me: $uty) -> Self {
+                me
+            }
+
+            fn abs_diff(self, other: Self) -> Self {
+                self.abs_diff(other)
+            }
+
+            int_impl_common!($uty);
+        }
+
+        impl MinInt for $ity {
+            type OtherSign = $uty;
+            type Unsigned = $uty;
+
+            const BITS: u32 = <Self as MinInt>::ZERO.count_zeros();
+            const SIGNED: bool = Self::MIN != Self::ZERO;
+
+            const ZERO: Self = 0;
+            const ONE: Self = 1;
+            const MIN: Self = <Self>::MIN;
+            const MAX: Self = <Self>::MAX;
+        }
+
+        impl Int for $ity {
+            fn signed(self) -> Self {
+                self
+            }
+
+            fn unsigned(self) -> $uty {
+                self as $uty
+            }
+
+            fn abs(self) -> Self {
+                self.abs()
+            }
+
+            fn from_unsigned(me: $uty) -> Self {
+                me as $ity
+            }
+
+            fn abs_diff(self, other: Self) -> $uty {
+                self.abs_diff(other)
+            }
+
+            int_impl_common!($ity);
+        }
+    };
+}
+
+int_impl!(isize, usize);
+int_impl!(i8, u8);
+int_impl!(i16, u16);
+int_impl!(i32, u32);
+int_impl!(i64, u64);
+int_impl!(i128, u128);
+
+/// Trait for integers twice the bit width of another integer. This is implemented for all
+/// primitives except for `u8`, because there is no smaller primitive.
+pub trait DInt: MinInt {
+    /// Integer that is half the bit width of the integer this trait is implemented for
+    type H: HInt<D = Self>;
+
+    /// Returns the low half of `self`
+    fn lo(self) -> Self::H;
+    /// Returns the high half of `self`
+    fn hi(self) -> Self::H;
+    /// Returns the low and high halves of `self` as a tuple
+    fn lo_hi(self) -> (Self::H, Self::H) {
+        (self.lo(), self.hi())
+    }
+    /// Constructs an integer using lower and higher half parts
+    #[allow(unused)]
+    fn from_lo_hi(lo: Self::H, hi: Self::H) -> Self {
+        lo.zero_widen() | hi.widen_hi()
+    }
+}
+
+/// Trait for integers half the bit width of another integer. This is implemented for all
+/// primitives except for `u128`, because there is no larger primitive.
+pub trait HInt: Int {
+    /// Integer that is double the bit width of the integer this trait is implemented for
+    type D: DInt<H = Self> + MinInt;
+
+    // NB: some of the below methods could have default implementations (e.g. `widen_hi`), but for
+    // unknown reasons this can cause infinite recursion when optimizations are disabled. See
+    // <https://github.com/rust-lang/compiler-builtins/pull/707> for context.
+
+    /// Widens (using default extension) the integer to have double bit width
+    fn widen(self) -> Self::D;
+    /// Widens (zero extension only) the integer to have double bit width. This is needed to get
+    /// around problems with associated type bounds (such as `Int<OtherSign: DInt>`) being unstable
+    fn zero_widen(self) -> Self::D;
+    /// Widens the integer to have double bit width and shifts the integer into the higher bits
+    #[allow(unused)]
+    fn widen_hi(self) -> Self::D;
+    /// Widening multiplication with zero widening. This cannot overflow.
+    fn zero_widen_mul(self, rhs: Self) -> Self::D;
+    /// Widening multiplication. This cannot overflow.
+    fn widen_mul(self, rhs: Self) -> Self::D;
+}
+
+macro_rules! impl_d_int {
+    ($($X:ident $D:ident),*) => {
+        $(
+            impl DInt for $D {
+                type H = $X;
+
+                fn lo(self) -> Self::H {
+                    self as $X
+                }
+                fn hi(self) -> Self::H {
+                    (self >> <$X as MinInt>::BITS) as $X
+                }
+            }
+        )*
+    };
+}
+
+macro_rules! impl_h_int {
+    ($($H:ident $uH:ident $X:ident),*) => {
+        $(
+            impl HInt for $H {
+                type D = $X;
+
+                fn widen(self) -> Self::D {
+                    self as $X
+                }
+                fn zero_widen(self) -> Self::D {
+                    (self as $uH) as $X
+                }
+                fn zero_widen_mul(self, rhs: Self) -> Self::D {
+                    self.zero_widen().wrapping_mul(rhs.zero_widen())
+                }
+                fn widen_mul(self, rhs: Self) -> Self::D {
+                    self.widen().wrapping_mul(rhs.widen())
+                }
+                fn widen_hi(self) -> Self::D {
+                    (self as $X) << <Self as MinInt>::BITS
+                }
+            }
+        )*
+    };
+}
+
+impl_d_int!(u8 u16, u16 u32, u32 u64, u64 u128, i8 i16, i16 i32, i32 i64, i64 i128);
+impl_h_int!(
+    u8 u8 u16,
+    u16 u16 u32,
+    u32 u32 u64,
+    u64 u64 u128,
+    i8 u8 i16,
+    i16 u16 i32,
+    i32 u32 i64,
+    i64 u64 i128
+);
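+
+// A minimal usage sketch, assuming the impls generated above: widening moves to a type with
+// twice the bits, so `widen_mul` can never overflow, and the halves can be split back apart.
+#[test]
+fn widening_usage_sketch() {
+    let wide: u64 = u32::MAX.widen_mul(u32::MAX);
+    assert_eq!(wide, 0xffff_fffe_0000_0001);
+    assert_eq!(wide.lo_hi(), (0x0000_0001_u32, 0xffff_fffe_u32));
+}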
+
+/// Trait to express (possibly lossy) casting of integers
+pub trait CastInto<T: Copy>: Copy {
+    /// By default, casts should be exact.
+    fn cast(self) -> T;
+
+    /// Call for casts that are expected to truncate.
+    fn cast_lossy(self) -> T;
+}
+
+pub trait CastFrom<T: Copy>: Copy {
+    /// By default, casts should be exact.
+    fn cast_from(value: T) -> Self;
+
+    /// Call for casts that are expected to truncate.
+    fn cast_from_lossy(value: T) -> Self;
+}
+
+impl<T: Copy, U: CastInto<T> + Copy> CastFrom<U> for T {
+    fn cast_from(value: U) -> Self {
+        value.cast()
+    }
+
+    fn cast_from_lossy(value: U) -> Self {
+        value.cast_lossy()
+    }
+}
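+
+// A minimal usage sketch, assuming the impls generated by `cast_into!` below: `cast` checks (in
+// debug builds) that the value is representable, while `cast_lossy` truncates explicitly.
+#[test]
+fn cast_usage_sketch() {
+    let exact: u16 = 1000u32.cast();
+    assert_eq!(exact, 1000);
+    let truncated: u8 = 1000u32.cast_lossy();
+    assert_eq!(truncated, 1000u32 as u8); // 0xe8 == 232
+}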
+
+macro_rules! cast_into {
+    ($ty:ty) => {
+        cast_into!($ty; usize, isize, u8, i8, u16, i16, u32, i32, u64, i64, u128, i128);
+    };
+    ($ty:ty; $($into:ty),*) => {$(
+        impl CastInto<$into> for $ty {
+            fn cast(self) -> $into {
+                // All we can really do to enforce casting rules is check the rules when in
+                // debug mode.
+                #[cfg(not(feature = "compiler-builtins"))]
+                debug_assert!(<$into>::try_from(self).is_ok(), "failed cast from {self}");
+                self as $into
+            }
+
+            fn cast_lossy(self) -> $into {
+                self as $into
+            }
+        }
+    )*};
+}
+
+macro_rules! cast_into_float {
+    ($ty:ty) => {
+        #[cfg(f16_enabled)]
+        cast_into_float!($ty; f16);
+
+        cast_into_float!($ty; f32, f64);
+
+        #[cfg(f128_enabled)]
+        cast_into_float!($ty; f128);
+    };
+    ($ty:ty; $($into:ty),*) => {$(
+        impl CastInto<$into> for $ty {
+            fn cast(self) -> $into {
+                #[cfg(not(feature = "compiler-builtins"))]
+                debug_assert_eq!(self as $into as $ty, self, "inexact float cast");
+                self as $into
+            }
+
+            fn cast_lossy(self) -> $into {
+                self as $into
+            }
+        }
+    )*};
+}
+
+cast_into!(usize);
+cast_into!(isize);
+cast_into!(u8);
+cast_into!(i8);
+cast_into!(u16);
+cast_into!(i16);
+cast_into!(u32);
+cast_into!(i32);
+cast_into!(u64);
+cast_into!(i64);
+cast_into!(u128);
+cast_into!(i128);
+
+cast_into_float!(i8);
+cast_into_float!(i16);
+cast_into_float!(i32);
+cast_into_float!(i64);
+cast_into_float!(i128);
diff --git a/library/compiler-builtins/libm/src/math/support/macros.rs b/library/compiler-builtins/libm/src/math/support/macros.rs
new file mode 100644
index 00000000000..0b72db0e46e
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/support/macros.rs
@@ -0,0 +1,157 @@
+/// `libm` cannot have dependencies, so this is vendored directly from the `cfg-if` crate
+/// (with some comments stripped for compactness).
+macro_rules! cfg_if {
+    // match if/else chains with a final `else`
+    ($(
+        if #[cfg($meta:meta)] { $($tokens:tt)* }
+    ) else * else {
+        $($tokens2:tt)*
+    }) => {
+        cfg_if! { @__items () ; $( ( ($meta) ($($tokens)*) ), )* ( () ($($tokens2)*) ), }
+    };
+
+    // match if/else chains lacking a final `else`
+    (
+        if #[cfg($i_met:meta)] { $($i_tokens:tt)* }
+        $( else if #[cfg($e_met:meta)] { $($e_tokens:tt)* } )*
+    ) => {
+        cfg_if! {
+            @__items
+            () ;
+            ( ($i_met) ($($i_tokens)*) ),
+            $( ( ($e_met) ($($e_tokens)*) ), )*
+            ( () () ),
+        }
+    };
+
+    // Internal and recursive macro to emit all the items
+    //
+    // Collects all the negated cfgs in a list at the beginning and after the
+    // semicolon is all the remaining items
+    (@__items ($($not:meta,)*) ; ) => {};
+    (@__items ($($not:meta,)*) ; ( ($($m:meta),*) ($($tokens:tt)*) ), $($rest:tt)*) => {
+        #[cfg(all($($m,)* not(any($($not),*))))] cfg_if! { @__identity $($tokens)* }
+        cfg_if! { @__items ($($not,)* $($m,)*) ; $($rest)* }
+    };
+
+    // Internal macro to make `@__identity` work out right for different match types,
+    // because of how macro matching/expansion works.
+    (@__identity $($tokens:tt)*) => { $($tokens)* };
+}
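+
+// A minimal usage sketch of the vendored macro, using a predicate that also appears elsewhere in
+// this module tree: exactly one branch survives, mirroring an `if`/`else` chain over `#[cfg]`s.
+cfg_if! {
+    if #[cfg(target_has_atomic = "ptr")] {
+        #[allow(dead_code)]
+        const HAS_PTR_ATOMICS_SKETCH: bool = true;
+    } else {
+        #[allow(dead_code)]
+        const HAS_PTR_ATOMICS_SKETCH: bool = false;
+    }
+}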
+
+/// Choose between using an arch-specific implementation and the function body. Returns directly
+/// if the arch implementation is used; otherwise, continues with the rest of the function.
+///
+/// Specify a `use_arch` meta field if an architecture-specific implementation is provided.
+/// These live in the `math::arch::some_target_arch` module.
+///
+/// Specify a `use_arch_required` meta field if something architecture-specific must be used
+/// regardless of feature configuration (`force-soft-floats`).
+///
+/// The passed meta options do not need to account for the `arch` target feature.
+macro_rules! select_implementation {
+    (
+        name: $fn_name:ident,
+        // Configuration meta for when to use arch-specific implementation that requires hard
+        // float ops
+        $( use_arch: $use_arch:meta, )?
+        // Configuration meta for when to use the arch module regardless of whether softfloats
+        // have been requested.
+        $( use_arch_required: $use_arch_required:meta, )?
+        args: $($arg:ident),+ ,
+    ) => {
+        // FIXME: these use paths that are pretty fragile (`super`). We should figure out
+        // something better w.r.t. how this is vendored into compiler-builtins.
+
+        // However, we do need a few things from `arch` that are used even with soft floats.
+        select_implementation! {
+            @cfg $($use_arch_required)?;
+            if true {
+                return  super::arch::$fn_name( $($arg),+ );
+            }
+        }
+
+        // By default, never use arch-specific implementations if we have force-soft-floats
+        #[cfg(arch_enabled)]
+        select_implementation! {
+            @cfg $($use_arch)?;
+            // Wrap in `if true` to avoid unused warnings
+            if true {
+                return  super::arch::$fn_name( $($arg),+ );
+            }
+        }
+    };
+
+    // Coalesce helper to construct an expression only if a config is provided
+    (@cfg ; $ex:expr) => { };
+    (@cfg $provided:meta; $ex:expr) => { #[cfg($provided)] $ex };
+}
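+
+// A minimal usage sketch, mirroring the call in `truncf` later in this tree: when the `use_arch`
+// cfg matches, the arch routine is returned early and the generic body after the invocation is
+// never reached.
+//
+//     select_implementation! {
+//         name: truncf,
+//         use_arch: all(target_arch = "wasm32", intrinsics_enabled),
+//         args: x,
+//     }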
+
+/// Construct a 16-bit float from hex float representation (C-style), guaranteed to
+/// evaluate at compile time.
+#[cfg(f16_enabled)]
+#[cfg_attr(feature = "unstable-public-internals", macro_export)]
+#[allow(unused_macros)]
+macro_rules! hf16 {
+    ($s:literal) => {{
+        const X: f16 = $crate::support::hf16($s);
+        X
+    }};
+}
+
+/// Construct a 32-bit float from hex float representation (C-style), guaranteed to
+/// evaluate at compile time.
+#[allow(unused_macros)]
+#[cfg_attr(feature = "unstable-public-internals", macro_export)]
+macro_rules! hf32 {
+    ($s:literal) => {{
+        const X: f32 = $crate::support::hf32($s);
+        X
+    }};
+}
+
+/// Construct a 64-bit float from hex float representation (C-style), guaranteed to
+/// evaluate at compile time.
+#[allow(unused_macros)]
+#[cfg_attr(feature = "unstable-public-internals", macro_export)]
+macro_rules! hf64 {
+    ($s:literal) => {{
+        const X: f64 = $crate::support::hf64($s);
+        X
+    }};
+}
+
+/// Construct a 128-bit float from hex float representation (C-style), guaranteed to
+/// evaluate at compile time.
+#[cfg(f128_enabled)]
+#[allow(unused_macros)]
+#[cfg_attr(feature = "unstable-public-internals", macro_export)]
+macro_rules! hf128 {
+    ($s:literal) => {{
+        const X: f128 = $crate::support::hf128($s);
+        X
+    }};
+}
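+
+// A minimal usage sketch, assuming the macros above are in scope; the expected bit patterns match
+// the `test_macros` check in the hex float support module. Because the expansion is a `const`,
+// the parse is guaranteed to happen at compile time.
+#[test]
+fn hex_macro_usage_sketch() {
+    assert_eq!(hf32!("0x1.ffep+8").to_bits(), 0x43fff000_u32);
+    assert_eq!(hf64!("0x1.ffep+8").to_bits(), 0x407ffe0000000000_u64);
+}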
+
+/// Assert `F::biteq` with better messages.
+#[cfg(test)]
+macro_rules! assert_biteq {
+    ($left:expr, $right:expr, $($tt:tt)*) => {{
+        use $crate::support::Int;
+        let l = $left;
+        let r = $right;
+        let bits = Int::leading_zeros(l.to_bits() - l.to_bits()); // hack to get the width from the value
+        assert!(
+            l.biteq(r),
+            "{}\nl: {l:?} ({lb:#0width$x})\nr: {r:?} ({rb:#0width$x})",
+            format_args!($($tt)*),
+            lb = l.to_bits(),
+            rb = r.to_bits(),
+            width = ((bits / 4) + 2) as usize,
+
+        );
+    }};
+    ($left:expr, $right:expr $(,)?) => {
+        assert_biteq!($left, $right, "")
+    };
+}
diff --git a/library/compiler-builtins/libm/src/math/support/mod.rs b/library/compiler-builtins/libm/src/math/support/mod.rs
new file mode 100644
index 00000000000..a4f596ab844
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/support/mod.rs
@@ -0,0 +1,32 @@
+#[macro_use]
+pub mod macros;
+mod big;
+mod env;
+// Runtime feature detection requires atomics.
+#[cfg(target_has_atomic = "ptr")]
+pub(crate) mod feature_detect;
+mod float_traits;
+pub mod hex_float;
+mod int_traits;
+
+#[allow(unused_imports)]
+pub use big::{i256, u256};
+pub use env::{FpResult, Round, Status};
+#[allow(unused_imports)]
+pub use float_traits::{DFloat, Float, HFloat, IntTy};
+pub(crate) use float_traits::{f32_from_bits, f64_from_bits};
+#[cfg(f16_enabled)]
+#[allow(unused_imports)]
+pub use hex_float::hf16;
+#[cfg(f128_enabled)]
+#[allow(unused_imports)]
+pub use hex_float::hf128;
+#[allow(unused_imports)]
+pub use hex_float::{Hexf, hf32, hf64};
+pub use int_traits::{CastFrom, CastInto, DInt, HInt, Int, MinInt};
+
+/// Hint to the compiler that the current path is cold.
+pub fn cold_path() {
+    #[cfg(intrinsics_enabled)]
+    core::intrinsics::cold_path();
+}
diff --git a/library/compiler-builtins/libm/src/math/tan.rs b/library/compiler-builtins/libm/src/math/tan.rs
new file mode 100644
index 00000000000..a072bdec56e
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/tan.rs
@@ -0,0 +1,74 @@
+// origin: FreeBSD /usr/src/lib/msun/src/s_tan.c
+//
+// ====================================================
+// Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+//
+// Developed at SunPro, a Sun Microsystems, Inc. business.
+// Permission to use, copy, modify, and distribute this
+// software is freely granted, provided that this notice
+// is preserved.
+// ====================================================
+
+use super::{k_tan, rem_pio2};
+
+// tan(x)
+// Return tangent function of x.
+//
+// kernel function:
+//      k_tan           ... tangent function on [-pi/4,pi/4]
+//      rem_pio2        ... argument reduction routine
+//
+// Method.
+//      Let S,C and T denote the sin, cos and tan respectively on
+//      [-PI/4, +PI/4]. Reduce the argument x to y1+y2 = x-k*pi/2
+//      in [-pi/4 , +pi/4], and let n = k mod 4.
+//      We have
+//
+//          n        sin(x)      cos(x)        tan(x)
+//     ----------------------------------------------------------
+//          0          S           C             T
+//          1          C          -S            -1/T
+//          2         -S          -C             T
+//          3         -C           S            -1/T
+//     ----------------------------------------------------------
+//
+// Special cases:
+//      Let trig be any of sin, cos, or tan.
+//      trig(+-INF)  is NaN, with signals;
+//      trig(NaN)    is that NaN;
+//
+// Accuracy:
+//      TRIG(x) returns trig(x) nearly rounded
+
+/// The tangent of `x` (f64).
+///
+/// `x` is specified in radians.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn tan(x: f64) -> f64 {
+    let x1p120 = f32::from_bits(0x7b800000); // 0x1p120f === 2 ^ 120
+
+    let ix = (f64::to_bits(x) >> 32) as u32 & 0x7fffffff;
+    /* |x| ~< pi/4 */
+    if ix <= 0x3fe921fb {
+        if ix < 0x3e400000 {
+            /* |x| < 2**-27 */
+            /* raise inexact if x!=0 and underflow if subnormal */
+            force_eval!(if ix < 0x00100000 {
+                x / x1p120 as f64
+            } else {
+                x + x1p120 as f64
+            });
+            return x;
+        }
+        return k_tan(x, 0.0, 0);
+    }
+
+    /* tan(Inf or NaN) is NaN */
+    if ix >= 0x7ff00000 {
+        return x - x;
+    }
+
+    /* argument reduction */
+    let (n, y0, y1) = rem_pio2(x);
+    k_tan(y0, y1, n & 1)
+}
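+
+// A minimal sanity-check sketch, not an exhaustive test: tan(pi/4) should land within a few ULP
+// of 1, and the small-argument path above returns `x` unchanged.
+#[cfg(test)]
+mod sketch_tests {
+    use super::tan;
+
+    #[test]
+    fn spot_checks() {
+        let t = tan(core::f64::consts::FRAC_PI_4);
+        assert!(t > 0.999_999_999_999_99 && t < 1.000_000_000_000_01);
+        let tiny = 1e-10;
+        assert_eq!(tan(tiny), tiny);
+    }
+}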
diff --git a/library/compiler-builtins/libm/src/math/tanf.rs b/library/compiler-builtins/libm/src/math/tanf.rs
new file mode 100644
index 00000000000..8bcf9581ff6
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/tanf.rs
@@ -0,0 +1,81 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/s_tanf.c */
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
+ * Optimized by Bruce D. Evans.
+ */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+use core::f64::consts::FRAC_PI_2;
+
+use super::{k_tanf, rem_pio2f};
+
+/* Small multiples of pi/2 rounded to double precision. */
+const T1_PIO2: f64 = 1. * FRAC_PI_2; /* 0x3FF921FB, 0x54442D18 */
+const T2_PIO2: f64 = 2. * FRAC_PI_2; /* 0x400921FB, 0x54442D18 */
+const T3_PIO2: f64 = 3. * FRAC_PI_2; /* 0x4012D97C, 0x7F3321D2 */
+const T4_PIO2: f64 = 4. * FRAC_PI_2; /* 0x401921FB, 0x54442D18 */
+
+/// The tangent of `x` (f32).
+///
+/// `x` is specified in radians.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn tanf(x: f32) -> f32 {
+    let x64 = x as f64;
+
+    let x1p120 = f32::from_bits(0x7b800000); // 0x1p120f === 2 ^ 120
+
+    let mut ix = x.to_bits();
+    let sign = (ix >> 31) != 0;
+    ix &= 0x7fffffff;
+
+    if ix <= 0x3f490fda {
+        /* |x| ~<= pi/4 */
+        if ix < 0x39800000 {
+            /* |x| < 2**-12 */
+            /* raise inexact if x!=0 and underflow if subnormal */
+            force_eval!(if ix < 0x00800000 {
+                x / x1p120
+            } else {
+                x + x1p120
+            });
+            return x;
+        }
+        return k_tanf(x64, false);
+    }
+    if ix <= 0x407b53d1 {
+        /* |x| ~<= 5*pi/4 */
+        if ix <= 0x4016cbe3 {
+            /* |x| ~<= 3pi/4 */
+            return k_tanf(if sign { x64 + T1_PIO2 } else { x64 - T1_PIO2 }, true);
+        } else {
+            return k_tanf(if sign { x64 + T2_PIO2 } else { x64 - T2_PIO2 }, false);
+        }
+    }
+    if ix <= 0x40e231d5 {
+        /* |x| ~<= 9*pi/4 */
+        if ix <= 0x40afeddf {
+            /* |x| ~<= 7*pi/4 */
+            return k_tanf(if sign { x64 + T3_PIO2 } else { x64 - T3_PIO2 }, true);
+        } else {
+            return k_tanf(if sign { x64 + T4_PIO2 } else { x64 - T4_PIO2 }, false);
+        }
+    }
+
+    /* tan(Inf or NaN) is NaN */
+    if ix >= 0x7f800000 {
+        return x - x;
+    }
+
+    /* argument reduction */
+    let (n, y) = rem_pio2f(x);
+    k_tanf(y, n & 1 != 0)
+}
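+
+// A minimal sanity-check sketch: zero takes the small-argument path above, and tan(1) falls in
+// the first reduction branch (|x| ~<= 3*pi/4) with the expected value of about 1.5574.
+#[cfg(test)]
+mod sketch_tests {
+    use super::tanf;
+
+    #[test]
+    fn spot_checks() {
+        assert_eq!(tanf(0.0), 0.0);
+        let t = tanf(1.0);
+        assert!(t > 1.557 && t < 1.558);
+    }
+}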
diff --git a/library/compiler-builtins/libm/src/math/tanh.rs b/library/compiler-builtins/libm/src/math/tanh.rs
new file mode 100644
index 00000000000..cc0abe4fcb2
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/tanh.rs
@@ -0,0 +1,53 @@
+use super::expm1;
+
+/* tanh(x) = (exp(x) - exp(-x))/(exp(x) + exp(-x))
+ *         = (exp(2*x) - 1)/(exp(2*x) - 1 + 2)
+ *         = (1 - exp(-2*x))/(exp(-2*x) - 1 + 2)
+ */
+
+/// The hyperbolic tangent of `x` (f64).
+///
+/// This is equivalent to `(exp(x) - exp(-x)) / (exp(x) + exp(-x))`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn tanh(mut x: f64) -> f64 {
+    let mut uf: f64 = x;
+    let mut ui: u64 = f64::to_bits(uf);
+
+    let w: u32;
+    let sign: bool;
+    let mut t: f64;
+
+    /* x = |x| */
+    sign = ui >> 63 != 0;
+    ui &= !1 / 2;
+    uf = f64::from_bits(ui);
+    x = uf;
+    w = (ui >> 32) as u32;
+
+    if w > 0x3fe193ea {
+        /* |x| > log(3)/2 ~= 0.5493 or nan */
+        if w > 0x40340000 {
+            /* |x| > 20 or nan */
+            /* note: this branch avoids raising overflow */
+            t = 1.0 - 0.0 / x;
+        } else {
+            t = expm1(2.0 * x);
+            t = 1.0 - 2.0 / (t + 2.0);
+        }
+    } else if w > 0x3fd058ae {
+        /* |x| > log(5/3)/2 ~= 0.2554 */
+        t = expm1(2.0 * x);
+        t = t / (t + 2.0);
+    } else if w >= 0x00100000 {
+        /* |x| >= 0x1p-1022, up to 2ulp error in [0.1,0.2554] */
+        t = expm1(-2.0 * x);
+        t = -t / (t + 2.0);
+    } else {
+        /* |x| is subnormal */
+        /* note: the branch above would not raise underflow in [0x1p-1023,0x1p-1022) */
+        force_eval!(x as f32);
+        t = x;
+    }
+
+    if sign { -t } else { t }
+}
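+
+// A minimal sanity-check sketch: tanh is odd, and the |x| > 20 branch above saturates exactly to
+// +-1 without raising overflow.
+#[cfg(test)]
+mod sketch_tests {
+    use super::tanh;
+
+    #[test]
+    fn spot_checks() {
+        assert_eq!(tanh(0.0), 0.0);
+        assert_eq!(tanh(21.0), 1.0);
+        assert_eq!(tanh(-21.0), -1.0);
+    }
+}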
diff --git a/library/compiler-builtins/libm/src/math/tanhf.rs b/library/compiler-builtins/libm/src/math/tanhf.rs
new file mode 100644
index 00000000000..fffbba6c6ec
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/tanhf.rs
@@ -0,0 +1,38 @@
+use super::expm1f;
+
+/// The hyperbolic tangent of `x` (f32).
+///
+/// This is equivalent to `(exp(x) - exp(-x)) / (exp(x) + exp(-x))`.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn tanhf(mut x: f32) -> f32 {
+    /* x = |x| */
+    let mut ix = x.to_bits();
+    let sign = (ix >> 31) != 0;
+    ix &= 0x7fffffff;
+    x = f32::from_bits(ix);
+    let w = ix;
+
+    let tt = if w > 0x3f0c9f54 {
+        /* |x| > log(3)/2 ~= 0.5493 or nan */
+        if w > 0x41200000 {
+            /* |x| > 10 */
+            1. + 0. / x
+        } else {
+            let t = expm1f(2. * x);
+            1. - 2. / (t + 2.)
+        }
+    } else if w > 0x3e82c578 {
+        /* |x| > log(5/3)/2 ~= 0.2554 */
+        let t = expm1f(2. * x);
+        t / (t + 2.)
+    } else if w >= 0x00800000 {
+        /* |x| >= 0x1p-126 */
+        let t = expm1f(-2. * x);
+        -t / (t + 2.)
+    } else {
+        /* |x| is subnormal */
+        force_eval!(x * x);
+        x
+    };
+    if sign { -tt } else { tt }
+}
diff --git a/library/compiler-builtins/libm/src/math/tgamma.rs b/library/compiler-builtins/libm/src/math/tgamma.rs
new file mode 100644
index 00000000000..3059860646a
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/tgamma.rs
@@ -0,0 +1,209 @@
+/*
+"A Precision Approximation of the Gamma Function" - Cornelius Lanczos (1964)
+"Lanczos Implementation of the Gamma Function" - Paul Godfrey (2001)
+"An Analysis of the Lanczos Gamma Approximation" - Glendon Ralph Pugh (2004)
+
+approximation method:
+
+                        (x - 0.5)         S(x)
+Gamma(x) = (x + g - 0.5)         *  ----------------
+                                    exp(x + g - 0.5)
+
+with
+                 a1      a2      a3            aN
+S(x) ~= [ a0 + ----- + ----- + ----- + ... + ----- ]
+               x + 1   x + 2   x + 3         x + N
+
+with a0, a1, a2, a3,.. aN constants which depend on g.
+
+for x < 0 the following reflection formula is used:
+
+Gamma(x)*Gamma(-x) = -pi/(x sin(pi x))
+
+most ideas and constants are from boost and python
+*/
+use super::{exp, floor, k_cos, k_sin, pow};
+
+const PI: f64 = 3.141592653589793238462643383279502884;
+
+/* sin(pi x) with x > 0x1p-100, if sin(pi*x)==0 the sign is arbitrary */
+fn sinpi(mut x: f64) -> f64 {
+    let mut n: isize;
+
+    /* argument reduction: x = |x| mod 2 */
+    /* spurious inexact when x is odd int */
+    x = x * 0.5;
+    x = 2.0 * (x - floor(x));
+
+    /* reduce x into [-.25,.25] */
+    n = (4.0 * x) as isize;
+    n = div!(n + 1, 2);
+    x -= (n as f64) * 0.5;
+
+    x *= PI;
+    match n {
+        1 => k_cos(x, 0.0),
+        2 => k_sin(-x, 0.0, 0),
+        3 => -k_cos(x, 0.0),
+        // 0
+        _ => k_sin(x, 0.0, 0),
+    }
+}
+
+const N: usize = 12;
+//static const double g = 6.024680040776729583740234375;
+const GMHALF: f64 = 5.524680040776729583740234375;
+const SNUM: [f64; N + 1] = [
+    23531376880.410759688572007674451636754734846804940,
+    42919803642.649098768957899047001988850926355848959,
+    35711959237.355668049440185451547166705960488635843,
+    17921034426.037209699919755754458931112671403265390,
+    6039542586.3520280050642916443072979210699388420708,
+    1439720407.3117216736632230727949123939715485786772,
+    248874557.86205415651146038641322942321632125127801,
+    31426415.585400194380614231628318205362874684987640,
+    2876370.6289353724412254090516208496135991145378768,
+    186056.26539522349504029498971604569928220784236328,
+    8071.6720023658162106380029022722506138218516325024,
+    210.82427775157934587250973392071336271166969580291,
+    2.5066282746310002701649081771338373386264310793408,
+];
+const SDEN: [f64; N + 1] = [
+    0.0,
+    39916800.0,
+    120543840.0,
+    150917976.0,
+    105258076.0,
+    45995730.0,
+    13339535.0,
+    2637558.0,
+    357423.0,
+    32670.0,
+    1925.0,
+    66.0,
+    1.0,
+];
+/* n! for small integer n */
+const FACT: [f64; 23] = [
+    1.0,
+    1.0,
+    2.0,
+    6.0,
+    24.0,
+    120.0,
+    720.0,
+    5040.0,
+    40320.0,
+    362880.0,
+    3628800.0,
+    39916800.0,
+    479001600.0,
+    6227020800.0,
+    87178291200.0,
+    1307674368000.0,
+    20922789888000.0,
+    355687428096000.0,
+    6402373705728000.0,
+    121645100408832000.0,
+    2432902008176640000.0,
+    51090942171709440000.0,
+    1124000727777607680000.0,
+];
+
+/* S(x) rational function for positive x */
+fn s(x: f64) -> f64 {
+    let mut num: f64 = 0.0;
+    let mut den: f64 = 0.0;
+
+    /* to avoid overflow handle large x differently */
+    if x < 8.0 {
+        for i in (0..=N).rev() {
+            num = num * x + i!(SNUM, i);
+            den = den * x + i!(SDEN, i);
+        }
+    } else {
+        for i in 0..=N {
+            num = num / x + i!(SNUM, i);
+            den = den / x + i!(SDEN, i);
+        }
+    }
+    return num / den;
+}
+
+/// The [Gamma function](https://en.wikipedia.org/wiki/Gamma_function) (f64).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn tgamma(mut x: f64) -> f64 {
+    let u: u64 = x.to_bits();
+    let absx: f64;
+    let mut y: f64;
+    let mut dy: f64;
+    let mut z: f64;
+    let mut r: f64;
+    let ix: u32 = ((u >> 32) as u32) & 0x7fffffff;
+    let sign: bool = (u >> 63) != 0;
+
+    /* special cases */
+    if ix >= 0x7ff00000 {
+        /* tgamma(nan)=nan, tgamma(inf)=inf, tgamma(-inf)=nan with invalid */
+        return x + f64::INFINITY;
+    }
+    if ix < ((0x3ff - 54) << 20) {
+        /* |x| < 2^-54: tgamma(x) ~ 1/x, +-0 raises div-by-zero */
+        return 1.0 / x;
+    }
+
+    /* integer arguments */
+    /* raise inexact when non-integer */
+    if x == floor(x) {
+        if sign {
+            return 0.0 / 0.0;
+        }
+        if x <= FACT.len() as f64 {
+            return i!(FACT, (x as usize) - 1);
+        }
+    }
+
+    /* x >= 172: tgamma(x)=inf with overflow */
+    /* x <= -184: tgamma(x)=+-0 with underflow */
+    if ix >= 0x40670000 {
+        /* |x| >= 184 */
+        if sign {
+            let x1p_126 = f64::from_bits(0x3810000000000000); // 0x1p-126 == 2^-126
+            force_eval!((x1p_126 / x) as f32);
+            if floor(x) * 0.5 == floor(x * 0.5) {
+                return 0.0;
+            } else {
+                return -0.0;
+            }
+        }
+        let x1p1023 = f64::from_bits(0x7fe0000000000000); // 0x1p1023 == 2^1023
+        x *= x1p1023;
+        return x;
+    }
+
+    absx = if sign { -x } else { x };
+
+    /* handle the error of x + g - 0.5 */
+    y = absx + GMHALF;
+    if absx > GMHALF {
+        dy = y - absx;
+        dy -= GMHALF;
+    } else {
+        dy = y - GMHALF;
+        dy -= absx;
+    }
+
+    z = absx - 0.5;
+    r = s(absx) * exp(-y);
+    if x < 0.0 {
+        /* reflection formula for negative x */
+        /* sinpi(absx) is not 0, integers are already handled */
+        r = -PI / (sinpi(absx) * absx * r);
+        dy = -dy;
+        z = -z;
+    }
+    r += dy * (GMHALF + 0.5) * r / y;
+    z = pow(y, 0.5 * z);
+    y = r * z * z;
+    return y;
+}
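+
+// A minimal sanity-check sketch: small positive integers hit the `FACT` table above exactly, and
+// non-positive integers are a domain error (NaN).
+#[cfg(test)]
+mod sketch_tests {
+    use super::tgamma;
+
+    #[test]
+    fn spot_checks() {
+        assert_eq!(tgamma(1.0), 1.0);
+        assert_eq!(tgamma(5.0), 24.0);
+        assert!(tgamma(-2.0).is_nan());
+    }
+}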
diff --git a/library/compiler-builtins/libm/src/math/tgammaf.rs b/library/compiler-builtins/libm/src/math/tgammaf.rs
new file mode 100644
index 00000000000..fe178f7a3c0
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/tgammaf.rs
@@ -0,0 +1,7 @@
+use super::tgamma;
+
+/// The [Gamma function](https://en.wikipedia.org/wiki/Gamma_function) (f32).
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn tgammaf(x: f32) -> f32 {
+    tgamma(x as f64) as f32
+}
diff --git a/library/compiler-builtins/libm/src/math/trunc.rs b/library/compiler-builtins/libm/src/math/trunc.rs
new file mode 100644
index 00000000000..fa50d55e136
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/trunc.rs
@@ -0,0 +1,53 @@
+/// Rounds the number toward 0 to the closest integral value (f16).
+///
+/// This effectively removes the fractional part of the number, leaving the integral part.
+#[cfg(f16_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn truncf16(x: f16) -> f16 {
+    super::generic::trunc(x)
+}
+
+/// Rounds the number toward 0 to the closest integral value (f32).
+///
+/// This effectively removes the fractional part of the number, leaving the integral part.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn truncf(x: f32) -> f32 {
+    select_implementation! {
+        name: truncf,
+        use_arch: all(target_arch = "wasm32", intrinsics_enabled),
+        args: x,
+    }
+
+    super::generic::trunc(x)
+}
+
+/// Rounds the number toward 0 to the closest integral value (f64).
+///
+/// This effectively removes the fractional part of the number, leaving the integral part.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn trunc(x: f64) -> f64 {
+    select_implementation! {
+        name: trunc,
+        use_arch: all(target_arch = "wasm32", intrinsics_enabled),
+        args: x,
+    }
+
+    super::generic::trunc(x)
+}
+
+/// Rounds the number toward 0 to the closest integral value (f128).
+///
+/// This effectively removes the fractional part of the number, leaving the integral part.
+#[cfg(f128_enabled)]
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn truncf128(x: f128) -> f128 {
+    super::generic::trunc(x)
+}
+
+#[cfg(test)]
+mod tests {
+    #[test]
+    fn sanity_check() {
+        assert_eq!(super::truncf(1.1), 1.0);
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/truncf.rs b/library/compiler-builtins/libm/src/math/truncf.rs
new file mode 100644
index 00000000000..14533a26706
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/truncf.rs
@@ -0,0 +1,23 @@
+/// Rounds the number toward 0 to the closest integral value (f32).
+///
+/// This effectively removes the fractional part of the number, leaving the integral part.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn truncf(x: f32) -> f32 {
+    select_implementation! {
+        name: truncf,
+        use_arch: all(target_arch = "wasm32", intrinsics_enabled),
+        args: x,
+    }
+
+    super::generic::trunc(x)
+}
+
+// PowerPC tests are failing on LLVM 13: https://github.com/rust-lang/rust/issues/88520
+#[cfg(not(target_arch = "powerpc64"))]
+#[cfg(test)]
+mod tests {
+    #[test]
+    fn sanity_check() {
+        assert_eq!(super::truncf(1.1), 1.0);
+    }
+}
diff --git a/library/compiler-builtins/libm/src/math/truncf128.rs b/library/compiler-builtins/libm/src/math/truncf128.rs
new file mode 100644
index 00000000000..9dccc0d0e9d
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/truncf128.rs
@@ -0,0 +1,7 @@
+/// Rounds the number toward 0 to the closest integral value (f128).
+///
+/// This effectively removes the fractional part of the number, leaving the integral part.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn truncf128(x: f128) -> f128 {
+    super::generic::trunc(x)
+}
diff --git a/library/compiler-builtins/libm/src/math/truncf16.rs b/library/compiler-builtins/libm/src/math/truncf16.rs
new file mode 100644
index 00000000000..d7c3d225cf9
--- /dev/null
+++ b/library/compiler-builtins/libm/src/math/truncf16.rs
@@ -0,0 +1,7 @@
+/// Rounds the number toward 0 to the closest integral value (f16).
+///
+/// This effectively removes the fractional part of the number, leaving the integral part.
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn truncf16(x: f16) -> f16 {
+    super::generic::trunc(x)
+}
diff --git a/library/compiler-builtins/thumbv6m-linux-eabi.json b/library/compiler-builtins/thumbv6m-linux-eabi.json
new file mode 100644
index 00000000000..ac736eae686
--- /dev/null
+++ b/library/compiler-builtins/thumbv6m-linux-eabi.json
@@ -0,0 +1,28 @@
+{
+    "abi-blacklist": [
+        "stdcall",
+        "fastcall",
+        "vectorcall",
+        "win64",
+        "sysv64"
+    ],
+    "arch": "arm",
+    "data-layout": "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64",
+    "env": "",
+    "executables": true,
+    "features": "+strict-align",
+    "linker": "arm-none-eabi-gcc",
+    "linker-flavor": "gcc",
+    "llvm-target": "thumbv6m-none-eabi",
+    "max-atomic-width": 0,
+    "os": "linux",
+    "panic-strategy": "abort",
+    "pre-link-args": {
+        "gcc": ["-nostartfiles"]
+    },
+    "relocation-model": "static",
+    "target-endian": "little",
+    "target-pointer-width": "32",
+    "target-c-int-width": "32",
+    "vendor": ""
+}
diff --git a/library/compiler-builtins/thumbv7em-linux-eabi.json b/library/compiler-builtins/thumbv7em-linux-eabi.json
new file mode 100644
index 00000000000..b6d4a6bda7b
--- /dev/null
+++ b/library/compiler-builtins/thumbv7em-linux-eabi.json
@@ -0,0 +1,27 @@
+{
+    "abi-blacklist": [
+        "stdcall",
+        "fastcall",
+        "vectorcall",
+        "win64",
+        "sysv64"
+    ],
+    "arch": "arm",
+    "data-layout": "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64",
+    "env": "",
+    "executables": true,
+    "linker": "arm-none-eabi-gcc",
+    "linker-flavor": "gcc",
+    "llvm-target": "thumbv7em-none-eabi",
+    "max-atomic-width": 32,
+    "os": "linux",
+    "panic-strategy": "abort",
+    "pre-link-args": {
+        "gcc": ["-nostartfiles"]
+    },
+    "relocation-model": "static",
+    "target-endian": "little",
+    "target-pointer-width": "32",
+    "target-c-int-width": "32",
+    "vendor": ""
+}
diff --git a/library/compiler-builtins/thumbv7em-linux-eabihf.json b/library/compiler-builtins/thumbv7em-linux-eabihf.json
new file mode 100644
index 00000000000..81cfcd48d56
--- /dev/null
+++ b/library/compiler-builtins/thumbv7em-linux-eabihf.json
@@ -0,0 +1,28 @@
+{
+    "abi-blacklist": [
+        "stdcall",
+        "fastcall",
+        "vectorcall",
+        "win64",
+        "sysv64"
+    ],
+    "arch": "arm",
+    "data-layout": "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64",
+    "env": "",
+    "executables": true,
+    "features": "+vfp4,+d16,+fp-only-sp",
+    "linker": "arm-none-eabi-gcc",
+    "linker-flavor": "gcc",
+    "llvm-target": "thumbv7em-none-eabihf",
+    "max-atomic-width": 32,
+    "os": "linux",
+    "panic-strategy": "abort",
+    "pre-link-args": {
+        "gcc": ["-nostartfiles"]
+    },
+    "relocation-model": "static",
+    "target-endian": "little",
+    "target-pointer-width": "32",
+    "target-c-int-width": "32",
+    "vendor": ""
+}
diff --git a/library/compiler-builtins/thumbv7m-linux-eabi.json b/library/compiler-builtins/thumbv7m-linux-eabi.json
new file mode 100644
index 00000000000..abe037c5bef
--- /dev/null
+++ b/library/compiler-builtins/thumbv7m-linux-eabi.json
@@ -0,0 +1,27 @@
+{
+    "abi-blacklist": [
+        "stdcall",
+        "fastcall",
+        "vectorcall",
+        "win64",
+        "sysv64"
+    ],
+    "arch": "arm",
+    "data-layout": "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64",
+    "env": "",
+    "executables": true,
+    "linker": "arm-none-eabi-gcc",
+    "linker-flavor": "gcc",
+    "llvm-target": "thumbv7m-none-eabi",
+    "max-atomic-width": 32,
+    "os": "linux",
+    "panic-strategy": "abort",
+    "pre-link-args": {
+        "gcc": ["-nostartfiles"]
+    },
+    "relocation-model": "static",
+    "target-endian": "little",
+    "target-pointer-width": "32",
+    "target-c-int-width": "32",
+    "vendor": ""
+}