about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--compiler/rustc_codegen_cranelift/Cargo.lock60
-rw-r--r--compiler/rustc_codegen_cranelift/Cargo.toml12
-rw-r--r--compiler/rustc_codegen_cranelift/build_system/tests.rs5
-rw-r--r--compiler/rustc_codegen_cranelift/config.txt1
-rw-r--r--compiler/rustc_codegen_cranelift/example/polymorphize_coroutine.rs16
-rw-r--r--compiler/rustc_codegen_cranelift/rust-toolchain2
-rwxr-xr-xcompiler/rustc_codegen_cranelift/scripts/test_rustc_tests.sh5
-rw-r--r--compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs372
-rw-r--r--compiler/rustc_codegen_cranelift/src/pretty_clif.rs16
-rw-r--r--compiler/rustc_codegen_cranelift/src/value_and_place.rs34
10 files changed, 452 insertions, 71 deletions
diff --git a/compiler/rustc_codegen_cranelift/Cargo.lock b/compiler/rustc_codegen_cranelift/Cargo.lock
index dcb6cc57584..901d1dbea66 100644
--- a/compiler/rustc_codegen_cranelift/Cargo.lock
+++ b/compiler/rustc_codegen_cranelift/Cargo.lock
@@ -21,9 +21,9 @@ checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6"
 
 [[package]]
 name = "arbitrary"
-version = "1.3.0"
+version = "1.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e2d098ff73c1ca148721f37baad5ea6a465a13f9573aba8641fbbbae8164a54e"
+checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110"
 
 [[package]]
 name = "bitflags"
@@ -45,18 +45,18 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 
 [[package]]
 name = "cranelift-bforest"
-version = "0.101.2"
+version = "0.102.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f773437307980ac0f424bf9b9a5d0cd21a0f17248c6860c9a65bec8b5975f3fe"
+checksum = "76eb38f2af690b5a4411d9a8782b6d77dabff3ca939e0518453ab9f9a4392d41"
 dependencies = [
  "cranelift-entity",
 ]
 
 [[package]]
 name = "cranelift-codegen"
-version = "0.101.2"
+version = "0.102.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "443c2ac50e97fb7de1a0f862753fce3f27215558811a6fcee508eb0c3747fa79"
+checksum = "39526c036b92912417e8931f52c1e235796688068d3efdbbd8b164f299d19156"
 dependencies = [
  "bumpalo",
  "cranelift-bforest",
@@ -75,39 +75,39 @@ dependencies = [
 
 [[package]]
 name = "cranelift-codegen-meta"
-version = "0.101.2"
+version = "0.102.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c5b174c411480c79ce0793c55042fa51bec27e486381d103a53cab3b480cb2db"
+checksum = "fdb0deedc9fccf2db53a5a3c9c9d0163e44143b0d004dca9bf6ab6a0024cd79a"
 dependencies = [
  "cranelift-codegen-shared",
 ]
 
 [[package]]
 name = "cranelift-codegen-shared"
-version = "0.101.2"
+version = "0.102.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "73fa0151a528066a369de6debeea4d4b23a32aba68b5add8c46d3dc8091ff434"
+checksum = "cea2d1b274e45aa8e61e9103efa1ba82d4b5a19d12bd1fd10744c3b7380ba3ff"
 
 [[package]]
 name = "cranelift-control"
-version = "0.101.2"
+version = "0.102.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b8adf1e6398493c9bea1190e37d28a0eb0eca5fddbc80e01e506cda34db92b1f"
+checksum = "6ea5977559a71e63db79a263f0e81a89b996e8a38212c4281e37dd1dbaa8b65c"
 dependencies = [
  "arbitrary",
 ]
 
 [[package]]
 name = "cranelift-entity"
-version = "0.101.2"
+version = "0.102.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4917e2ed3bb5fe87d0ed88395ca6d644018d119a034faedd1f3e1f2c33cd52b2"
+checksum = "2f871ada808b58158d84dfc43a6a2e2d2756baaf4ed1c51fd969ca8330e6ca5c"
 
 [[package]]
 name = "cranelift-frontend"
-version = "0.101.2"
+version = "0.102.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9aaadf1e7cf28886bbf046eaf7ef538997bc8a7e020e578ea4957b39da87d5a1"
+checksum = "e8e6890f587ef59824b3debe577e68fdf9b307b3808c54b8d93a18fd0b70941b"
 dependencies = [
  "cranelift-codegen",
  "log",
@@ -117,15 +117,15 @@ dependencies = [
 
 [[package]]
 name = "cranelift-isle"
-version = "0.101.2"
+version = "0.102.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a67fda31b9d69eaa1c49a2081939454c45857596a9d45af6744680541c628b4c"
+checksum = "a8d5fc6d5d3b52d1917002b17a8ecce448c2621b5bf394bb4e77e2f676893537"
 
 [[package]]
 name = "cranelift-jit"
-version = "0.101.2"
+version = "0.102.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d6bf32710628e7ff298739f1ed80a0bfdafc0c6a3e284c4540b23f18e8889d4b"
+checksum = "e8a2d7744f743f59d9646d7589ad22ea17ed0d71e04906eb77c31e99bc13bd8b"
 dependencies = [
  "anyhow",
  "cranelift-codegen",
@@ -143,9 +143,9 @@ dependencies = [
 
 [[package]]
 name = "cranelift-module"
-version = "0.101.2"
+version = "0.102.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4d693e93a0fbf56b4bc93cffe6b107c2e52f070e1111950505fc8c83ac440b9d"
+checksum = "b96cb196334698e612c197d7d0ae59af5e07667306ec20d7be414717db400873"
 dependencies = [
  "anyhow",
  "cranelift-codegen",
@@ -154,9 +154,9 @@ dependencies = [
 
 [[package]]
 name = "cranelift-native"
-version = "0.101.2"
+version = "0.102.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "76fb52ba71be98312f35e798d9e98e45ab2586f27584231bf7c644fa9501e8af"
+checksum = "3e10c2e7faa65d4ae7de9a83b44f2c31aca7dc638e17d0a79572fdf8103d720b"
 dependencies = [
  "cranelift-codegen",
  "libc",
@@ -165,9 +165,9 @@ dependencies = [
 
 [[package]]
 name = "cranelift-object"
-version = "0.101.2"
+version = "0.102.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2551b2e185022b89e9efa5e04c0f17f679b86ef73d9f7feabc48b608ff23120d"
+checksum = "83ce94e18756058af8a66e3c0ba1123ae15517c72162d8060d0cb0974642adf2"
 dependencies = [
  "anyhow",
  "cranelift-codegen",
@@ -295,9 +295,9 @@ checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
 
 [[package]]
 name = "regalloc2"
-version = "0.9.2"
+version = "0.9.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b4dcbd3a2ae7fb94b5813fa0e957c6ab51bf5d0a8ee1b69e0c2d0f1e6eb8485"
+checksum = "ad156d539c879b7a24a363a2016d77961786e71f48f2e2fc8302a92abd2429a6"
 dependencies = [
  "hashbrown 0.13.2",
  "log",
@@ -374,9 +374,9 @@ checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
 
 [[package]]
 name = "wasmtime-jit-icache-coherence"
-version = "14.0.2"
+version = "15.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0980a96b16abbdaf829858d2389697b1d6cfc6a903873fd74b7e47a6b1045584"
+checksum = "b73ad1395eda136baec5ece7e079e0536a82ef73488e345456cc9b89858ad0ec"
 dependencies = [
  "cfg-if",
  "libc",
diff --git a/compiler/rustc_codegen_cranelift/Cargo.toml b/compiler/rustc_codegen_cranelift/Cargo.toml
index 30db10f7457..20fcd222732 100644
--- a/compiler/rustc_codegen_cranelift/Cargo.toml
+++ b/compiler/rustc_codegen_cranelift/Cargo.toml
@@ -8,12 +8,12 @@ crate-type = ["dylib"]
 
 [dependencies]
 # These have to be in sync with each other
-cranelift-codegen = { version = "0.101.2", default-features = false, features = ["std", "unwind", "all-arch"] }
-cranelift-frontend = { version = "0.101.2" }
-cranelift-module = { version = "0.101.2" }
-cranelift-native = { version = "0.101.2" }
-cranelift-jit = { version = "0.101.2", optional = true }
-cranelift-object = { version = "0.101.2" }
+cranelift-codegen = { version = "0.102", default-features = false, features = ["std", "unwind", "all-arch"] }
+cranelift-frontend = { version = "0.102" }
+cranelift-module = { version = "0.102" }
+cranelift-native = { version = "0.102" }
+cranelift-jit = { version = "0.102", optional = true }
+cranelift-object = { version = "0.102" }
 target-lexicon = "0.12.0"
 gimli = { version = "0.28", default-features = false, features = ["write"]}
 object = { version = "0.32", default-features = false, features = ["std", "read_core", "write", "archive", "coff", "elf", "macho", "pe"] }
diff --git a/compiler/rustc_codegen_cranelift/build_system/tests.rs b/compiler/rustc_codegen_cranelift/build_system/tests.rs
index aa50dbfdf35..3309a0a6abd 100644
--- a/compiler/rustc_codegen_cranelift/build_system/tests.rs
+++ b/compiler/rustc_codegen_cranelift/build_system/tests.rs
@@ -99,6 +99,10 @@ const BASE_SYSROOT_SUITE: &[TestCase] = &[
     TestCase::build_bin_and_run("aot.mod_bench", "example/mod_bench.rs", &[]),
     TestCase::build_bin_and_run("aot.issue-72793", "example/issue-72793.rs", &[]),
     TestCase::build_bin("aot.issue-59326", "example/issue-59326.rs"),
+    TestCase::custom("aot.polymorphize_coroutine", &|runner| {
+        runner.run_rustc(&["example/polymorphize_coroutine.rs", "-Zpolymorphize"]);
+        runner.run_out_command("polymorphize_coroutine", &[]);
+    }),
     TestCase::build_bin_and_run("aot.neon", "example/neon.rs", &[]),
     TestCase::custom("aot.gen_block_iterate", &|runner| {
         runner.run_rustc([
@@ -466,6 +470,7 @@ impl<'a> TestRunner<'a> {
         cmd.arg("--target");
         cmd.arg(&self.target_compiler.triple);
         cmd.arg("-Cpanic=abort");
+        cmd.arg("-Zunstable-options");
         cmd.arg("--check-cfg=cfg(no_unstable_features)");
         cmd.arg("--check-cfg=cfg(jit)");
         cmd.args(args);
diff --git a/compiler/rustc_codegen_cranelift/config.txt b/compiler/rustc_codegen_cranelift/config.txt
index 3cf295c003e..0b7cac18837 100644
--- a/compiler/rustc_codegen_cranelift/config.txt
+++ b/compiler/rustc_codegen_cranelift/config.txt
@@ -42,6 +42,7 @@ aot.float-minmax-pass
 aot.mod_bench
 aot.issue-72793
 aot.issue-59326
+aot.polymorphize_coroutine
 aot.neon
 aot.gen_block_iterate
 
diff --git a/compiler/rustc_codegen_cranelift/example/polymorphize_coroutine.rs b/compiler/rustc_codegen_cranelift/example/polymorphize_coroutine.rs
new file mode 100644
index 00000000000..c965b34e13b
--- /dev/null
+++ b/compiler/rustc_codegen_cranelift/example/polymorphize_coroutine.rs
@@ -0,0 +1,16 @@
+#![feature(coroutines, coroutine_trait)]
+
+use std::ops::Coroutine;
+use std::pin::Pin;
+
+fn main() {
+    run_coroutine::<i32>();
+}
+
+fn run_coroutine<T>() {
+    let mut coroutine = || {
+        yield;
+        return;
+    };
+    Pin::new(&mut coroutine).resume(());
+}
diff --git a/compiler/rustc_codegen_cranelift/rust-toolchain b/compiler/rustc_codegen_cranelift/rust-toolchain
index 80ef1e49f23..2997816d96c 100644
--- a/compiler/rustc_codegen_cranelift/rust-toolchain
+++ b/compiler/rustc_codegen_cranelift/rust-toolchain
@@ -1,3 +1,3 @@
 [toolchain]
-channel = "nightly-2023-11-16"
+channel = "nightly-2023-11-25"
 components = ["rust-src", "rustc-dev", "llvm-tools"]
diff --git a/compiler/rustc_codegen_cranelift/scripts/test_rustc_tests.sh b/compiler/rustc_codegen_cranelift/scripts/test_rustc_tests.sh
index cdc78adcf85..a299b6de6b1 100755
--- a/compiler/rustc_codegen_cranelift/scripts/test_rustc_tests.sh
+++ b/compiler/rustc_codegen_cranelift/scripts/test_rustc_tests.sh
@@ -146,11 +146,6 @@ rm tests/ui/process/nofile-limit.rs # TODO some AArch64 linking issue
 
 rm tests/ui/stdio-is-blocking.rs # really slow with unoptimized libstd
 
-# rustc bugs
-# ==========
-# https://github.com/rust-lang/rust/pull/116447#issuecomment-1790451463
-rm tests/ui/coroutine/gen_block_*.rs
-
 cp ../dist/bin/rustdoc-clif ../dist/bin/rustdoc # some tests expect bin/rustdoc to exist
 
 # prevent $(RUSTDOC) from picking up the sysroot built by x.py. It conflicts with the one used by
diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs
index 8dd2b6ed014..07b95b7933d 100644
--- a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs
+++ b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs
@@ -22,6 +22,11 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             // Spin loop hint
         }
 
+        "llvm.x86.avx.vzeroupper" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroupper&ig_expand=7218
+            // Do nothing. It is a perf hint anyway.
+        }
+
         // Used by is_x86_feature_detected!();
         "llvm.x86.xgetbv" => {
             intrinsic_args!(fx, args => (xcr_no); intrinsic);
@@ -69,6 +74,103 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             ret.write_cvalue(fx, val);
         }
 
+        "llvm.x86.avx2.gather.d.d"
+        | "llvm.x86.avx2.gather.d.q"
+        | "llvm.x86.avx2.gather.d.ps"
+        | "llvm.x86.avx2.gather.d.pd"
+        | "llvm.x86.avx2.gather.d.d.256"
+        | "llvm.x86.avx2.gather.d.q.256"
+        | "llvm.x86.avx2.gather.d.ps.256"
+        | "llvm.x86.avx2.gather.d.pd.256"
+        | "llvm.x86.avx2.gather.q.d"
+        | "llvm.x86.avx2.gather.q.q"
+        | "llvm.x86.avx2.gather.q.ps"
+        | "llvm.x86.avx2.gather.q.pd"
+        | "llvm.x86.avx2.gather.q.d.256"
+        | "llvm.x86.avx2.gather.q.q.256"
+        | "llvm.x86.avx2.gather.q.ps.256"
+        | "llvm.x86.avx2.gather.q.pd.256" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_pd&ig_expand=3818
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_pd&ig_expand=3819
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_pd&ig_expand=3821
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_pd&ig_expand=3822
+            // ...
+
+            intrinsic_args!(fx, args => (src, ptr, index, mask, scale); intrinsic);
+
+            let (src_lane_count, src_lane_ty) = src.layout().ty.simd_size_and_type(fx.tcx);
+            let (index_lane_count, index_lane_ty) = index.layout().ty.simd_size_and_type(fx.tcx);
+            let (mask_lane_count, mask_lane_ty) = mask.layout().ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(src_lane_ty, ret_lane_ty);
+            assert!(index_lane_ty.is_integral());
+            assert_eq!(src_lane_count, mask_lane_count);
+            assert_eq!(src_lane_count, ret_lane_count);
+
+            let lane_clif_ty = fx.clif_type(ret_lane_ty).unwrap();
+            let index_lane_clif_ty = fx.clif_type(index_lane_ty).unwrap();
+            let mask_lane_clif_ty = fx.clif_type(mask_lane_ty).unwrap();
+            let ret_lane_layout = fx.layout_of(ret_lane_ty);
+
+            let ptr = ptr.load_scalar(fx);
+            let scale = scale.load_scalar(fx);
+            let scale = fx.bcx.ins().uextend(types::I64, scale);
+            for lane_idx in 0..std::cmp::min(src_lane_count, index_lane_count) {
+                let src_lane = src.value_lane(fx, lane_idx).load_scalar(fx);
+                let index_lane = index.value_lane(fx, lane_idx).load_scalar(fx);
+                let mask_lane = mask.value_lane(fx, lane_idx).load_scalar(fx);
+                let mask_lane =
+                    fx.bcx.ins().bitcast(mask_lane_clif_ty.as_int(), MemFlags::new(), mask_lane);
+
+                let if_enabled = fx.bcx.create_block();
+                let if_disabled = fx.bcx.create_block();
+                let next = fx.bcx.create_block();
+                let res_lane = fx.bcx.append_block_param(next, lane_clif_ty);
+
+                let mask_lane = match mask_lane_clif_ty {
+                    types::I32 | types::F32 => {
+                        fx.bcx.ins().band_imm(mask_lane, 0x8000_0000u64 as i64)
+                    }
+                    types::I64 | types::F64 => {
+                        fx.bcx.ins().band_imm(mask_lane, 0x8000_0000_0000_0000u64 as i64)
+                    }
+                    _ => unreachable!(),
+                };
+                fx.bcx.ins().brif(mask_lane, if_enabled, &[], if_disabled, &[]);
+                fx.bcx.seal_block(if_enabled);
+                fx.bcx.seal_block(if_disabled);
+
+                fx.bcx.switch_to_block(if_enabled);
+                let index_lane = if index_lane_clif_ty != types::I64 {
+                    fx.bcx.ins().sextend(types::I64, index_lane)
+                } else {
+                    index_lane
+                };
+                let offset = fx.bcx.ins().imul(index_lane, scale);
+                let lane_ptr = fx.bcx.ins().iadd(ptr, offset);
+                let res = fx.bcx.ins().load(lane_clif_ty, MemFlags::trusted(), lane_ptr, 0);
+                fx.bcx.ins().jump(next, &[res]);
+
+                fx.bcx.switch_to_block(if_disabled);
+                fx.bcx.ins().jump(next, &[src_lane]);
+
+                fx.bcx.seal_block(next);
+                fx.bcx.switch_to_block(next);
+
+                fx.bcx.ins().nop();
+
+                ret.place_lane(fx, lane_idx)
+                    .write_cvalue(fx, CValue::by_val(res_lane, ret_lane_layout));
+            }
+
+            for lane_idx in std::cmp::min(src_lane_count, index_lane_count)..ret_lane_count {
+                let zero_lane = fx.bcx.ins().iconst(mask_lane_clif_ty.as_int(), 0);
+                let zero_lane = fx.bcx.ins().bitcast(mask_lane_clif_ty, MemFlags::new(), zero_lane);
+                ret.place_lane(fx, lane_idx)
+                    .write_cvalue(fx, CValue::by_val(zero_lane, ret_lane_layout));
+            }
+        }
+
         "llvm.x86.sse.cmp.ps" | "llvm.x86.sse2.cmp.pd" => {
             let (x, y, kind) = match args {
                 [x, y, kind] => (x, y, kind),
@@ -273,16 +375,31 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             );
         }
         "llvm.x86.ssse3.pabs.b.128" | "llvm.x86.ssse3.pabs.w.128" | "llvm.x86.ssse3.pabs.d.128" => {
-            let a = match args {
-                [a] => a,
-                _ => bug!("wrong number of args for intrinsic {intrinsic}"),
-            };
-            let a = codegen_operand(fx, a);
+            intrinsic_args!(fx, args => (a); intrinsic);
 
             simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| {
                 fx.bcx.ins().iabs(lane)
             });
         }
+        "llvm.x86.sse2.cvttps2dq" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32&ig_expand=2429
+            intrinsic_args!(fx, args => (a); intrinsic);
+            let a = a.load_scalar(fx);
+
+            // Using inline asm instead of fcvt_to_sint_sat as unrepresentable values are turned
+            // into 0x80000000 for which Cranelift doesn't have a native instruction.
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String(format!("cvttps2dq xmm0, xmm0"))],
+                &[CInlineAsmOperand::InOut {
+                    reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                    _late: true,
+                    in_value: a,
+                    out_place: Some(ret),
+                }],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
         "llvm.x86.addcarry.32" | "llvm.x86.addcarry.64" => {
             intrinsic_args!(fx, args => (c_in, a, b); intrinsic);
             let c_in = c_in.load_scalar(fx);
@@ -364,9 +481,11 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             for out_lane_idx in 0..lane_count / 8 {
                 let mut lane_diff_acc = fx.bcx.ins().iconst(types::I64, 0);
 
-                for lane_idx in out_lane_idx * 8..out_lane_idx * 8 + 1 {
+                for lane_idx in out_lane_idx * 8..out_lane_idx * 8 + 8 {
                     let a_lane = a.value_lane(fx, lane_idx).load_scalar(fx);
+                    let a_lane = fx.bcx.ins().uextend(types::I16, a_lane);
                     let b_lane = b.value_lane(fx, lane_idx).load_scalar(fx);
+                    let b_lane = fx.bcx.ins().uextend(types::I16, b_lane);
 
                     let lane_diff = fx.bcx.ins().isub(a_lane, b_lane);
                     let abs_lane_diff = fx.bcx.ins().iabs(lane_diff);
@@ -437,12 +556,12 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             let ret_lane_layout = fx.layout_of(fx.tcx.types.i32);
             for out_lane_idx in 0..lane_count / 2 {
                 let a_lane0 = a.value_lane(fx, out_lane_idx * 2).load_scalar(fx);
-                let a_lane0 = fx.bcx.ins().uextend(types::I32, a_lane0);
+                let a_lane0 = fx.bcx.ins().sextend(types::I32, a_lane0);
                 let b_lane0 = b.value_lane(fx, out_lane_idx * 2).load_scalar(fx);
                 let b_lane0 = fx.bcx.ins().sextend(types::I32, b_lane0);
 
                 let a_lane1 = a.value_lane(fx, out_lane_idx * 2 + 1).load_scalar(fx);
-                let a_lane1 = fx.bcx.ins().uextend(types::I32, a_lane1);
+                let a_lane1 = fx.bcx.ins().sextend(types::I32, a_lane1);
                 let b_lane1 = b.value_lane(fx, out_lane_idx * 2 + 1).load_scalar(fx);
                 let b_lane1 = fx.bcx.ins().sextend(types::I32, b_lane1);
 
@@ -597,14 +716,14 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             assert_eq!(ret_lane_ty, fx.tcx.types.i16);
             assert_eq!(lane_count * 2, ret_lane_count);
 
-            let min_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MIN as u16));
-            let max_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MAX as u16));
+            let min_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MIN) as u32 as i64);
+            let max_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MAX) as u32 as i64);
             let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
 
             for idx in 0..lane_count {
                 let lane = a.value_lane(fx, idx).load_scalar(fx);
                 let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let sat = fx.bcx.ins().smin(sat, max_i16);
                 let res = fx.bcx.ins().ireduce(types::I16, sat);
 
                 let res_lane = CValue::by_val(res, ret_lane_layout);
@@ -614,7 +733,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             for idx in 0..lane_count {
                 let lane = b.value_lane(fx, idx).load_scalar(fx);
                 let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let sat = fx.bcx.ins().smin(sat, max_i16);
                 let res = fx.bcx.ins().ireduce(types::I16, sat);
 
                 let res_lane = CValue::by_val(res, ret_lane_layout);
@@ -641,8 +760,8 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
 
             for idx in 0..lane_count {
                 let lane = a.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().umax(lane, min_u16);
-                let sat = fx.bcx.ins().umin(sat, max_u16);
+                let sat = fx.bcx.ins().smax(lane, min_u16);
+                let sat = fx.bcx.ins().smin(sat, max_u16);
                 let res = fx.bcx.ins().ireduce(types::I16, sat);
 
                 let res_lane = CValue::by_val(res, ret_lane_layout);
@@ -651,8 +770,8 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
 
             for idx in 0..lane_count {
                 let lane = b.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().umax(lane, min_u16);
-                let sat = fx.bcx.ins().umin(sat, max_u16);
+                let sat = fx.bcx.ins().smax(lane, min_u16);
+                let sat = fx.bcx.ins().smin(sat, max_u16);
                 let res = fx.bcx.ins().ireduce(types::I16, sat);
 
                 let res_lane = CValue::by_val(res, ret_lane_layout);
@@ -673,14 +792,14 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             assert_eq!(ret_lane_ty, fx.tcx.types.i16);
             assert_eq!(lane_count * 2, ret_lane_count);
 
-            let min_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MIN as u16));
-            let max_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MAX as u16));
+            let min_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MIN) as u32 as i64);
+            let max_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MAX) as u32 as i64);
             let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
 
             for idx in 0..lane_count / 2 {
                 let lane = a.value_lane(fx, idx).load_scalar(fx);
                 let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let sat = fx.bcx.ins().smin(sat, max_i16);
                 let res = fx.bcx.ins().ireduce(types::I16, sat);
 
                 let res_lane = CValue::by_val(res, ret_lane_layout);
@@ -690,7 +809,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             for idx in 0..lane_count / 2 {
                 let lane = b.value_lane(fx, idx).load_scalar(fx);
                 let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let sat = fx.bcx.ins().smin(sat, max_i16);
                 let res = fx.bcx.ins().ireduce(types::I16, sat);
 
                 let res_lane = CValue::by_val(res, ret_lane_layout);
@@ -700,7 +819,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             for idx in 0..lane_count / 2 {
                 let lane = a.value_lane(fx, idx).load_scalar(fx);
                 let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let sat = fx.bcx.ins().smin(sat, max_i16);
                 let res = fx.bcx.ins().ireduce(types::I16, sat);
 
                 let res_lane = CValue::by_val(res, ret_lane_layout);
@@ -710,7 +829,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             for idx in 0..lane_count / 2 {
                 let lane = b.value_lane(fx, idx).load_scalar(fx);
                 let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let sat = fx.bcx.ins().smin(sat, max_i16);
                 let res = fx.bcx.ins().ireduce(types::I16, sat);
 
                 let res_lane = CValue::by_val(res, ret_lane_layout);
@@ -718,6 +837,215 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             }
         }
 
+        "llvm.x86.fma.vfmaddsub.ps"
+        | "llvm.x86.fma.vfmaddsub.pd"
+        | "llvm.x86.fma.vfmaddsub.ps.256"
+        | "llvm.x86.fma.vfmaddsub.pd.256" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmaddsub_ps&ig_expand=3205
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmaddsub_pd&ig_expand=3181
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmaddsub_ps&ig_expand=3209
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmaddsub_pd&ig_expand=3185
+            intrinsic_args!(fx, args => (a, b, c); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            assert_eq!(a.layout(), c.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert!(lane_ty.is_floating_point());
+            assert!(ret_lane_ty.is_floating_point());
+            assert_eq!(lane_count, ret_lane_count);
+            let ret_lane_layout = fx.layout_of(ret_lane_ty);
+
+            for idx in 0..lane_count {
+                let a_lane = a.value_lane(fx, idx).load_scalar(fx);
+                let b_lane = b.value_lane(fx, idx).load_scalar(fx);
+                let c_lane = c.value_lane(fx, idx).load_scalar(fx);
+
+                let mul = fx.bcx.ins().fmul(a_lane, b_lane);
+                let res = if idx & 1 == 0 {
+                    fx.bcx.ins().fsub(mul, c_lane)
+                } else {
+                    fx.bcx.ins().fadd(mul, c_lane)
+                };
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+        }
+
+        "llvm.x86.fma.vfmsubadd.ps"
+        | "llvm.x86.fma.vfmsubadd.pd"
+        | "llvm.x86.fma.vfmsubadd.ps.256"
+        | "llvm.x86.fma.vfmsubadd.pd.256" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsubadd_ps&ig_expand=3325
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsubadd_pd&ig_expand=3301
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsubadd_ps&ig_expand=3329
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsubadd_pd&ig_expand=3305
+            intrinsic_args!(fx, args => (a, b, c); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            assert_eq!(a.layout(), c.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert!(lane_ty.is_floating_point());
+            assert!(ret_lane_ty.is_floating_point());
+            assert_eq!(lane_count, ret_lane_count);
+            let ret_lane_layout = fx.layout_of(ret_lane_ty);
+
+            for idx in 0..lane_count {
+                let a_lane = a.value_lane(fx, idx).load_scalar(fx);
+                let b_lane = b.value_lane(fx, idx).load_scalar(fx);
+                let c_lane = c.value_lane(fx, idx).load_scalar(fx);
+
+                let mul = fx.bcx.ins().fmul(a_lane, b_lane);
+                let res = if idx & 1 == 0 {
+                    fx.bcx.ins().fadd(mul, c_lane)
+                } else {
+                    fx.bcx.ins().fsub(mul, c_lane)
+                };
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+        }
+
+        "llvm.x86.fma.vfnmadd.ps"
+        | "llvm.x86.fma.vfnmadd.pd"
+        | "llvm.x86.fma.vfnmadd.ps.256"
+        | "llvm.x86.fma.vfnmadd.pd.256" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_ps&ig_expand=3391
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_pd&ig_expand=3367
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmadd_ps&ig_expand=3395
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmadd_pd&ig_expand=3371
+            intrinsic_args!(fx, args => (a, b, c); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            assert_eq!(a.layout(), c.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert!(lane_ty.is_floating_point());
+            assert!(ret_lane_ty.is_floating_point());
+            assert_eq!(lane_count, ret_lane_count);
+            let ret_lane_layout = fx.layout_of(ret_lane_ty);
+
+            for idx in 0..lane_count {
+                let a_lane = a.value_lane(fx, idx).load_scalar(fx);
+                let b_lane = b.value_lane(fx, idx).load_scalar(fx);
+                let c_lane = c.value_lane(fx, idx).load_scalar(fx);
+
+                let mul = fx.bcx.ins().fmul(a_lane, b_lane);
+                let neg_mul = fx.bcx.ins().fneg(mul);
+                let res = fx.bcx.ins().fadd(neg_mul, c_lane);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+        }
+
+        "llvm.x86.sse42.pcmpestri128" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri&ig_expand=939
+            intrinsic_args!(fx, args => (a, la, b, lb, _imm8); intrinsic);
+
+            let a = a.load_scalar(fx);
+            let la = la.load_scalar(fx);
+            let b = b.load_scalar(fx);
+            let lb = lb.load_scalar(fx);
+
+            let imm8 = if let Some(imm8) = crate::constant::mir_operand_get_const_val(fx, &args[4])
+            {
+                imm8
+            } else {
+                fx.tcx.sess.span_fatal(span, "Index argument for `_mm_cmpestri` is not a constant");
+            };
+
+            let imm8 = imm8.try_to_u8().unwrap_or_else(|_| panic!("kind not scalar: {:?}", imm8));
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String(format!("pcmpestri xmm0, xmm1, {imm8}"))],
+                &[
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                        value: a,
+                    },
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),
+                        value: b,
+                    },
+                    // Implicit argument to the pcmpestri intrinsic
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)),
+                        value: la,
+                    },
+                    // Implicit argument to the pcmpestri intrinsic
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)),
+                        value: lb,
+                    },
+                    // Implicit result of the pcmpestri intrinsic
+                    CInlineAsmOperand::Out {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::cx)),
+                        late: true,
+                        place: Some(ret),
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
+
+        "llvm.x86.sse42.pcmpestrm128" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm&ig_expand=940
+            intrinsic_args!(fx, args => (a, la, b, lb, _imm8); intrinsic);
+
+            let a = a.load_scalar(fx);
+            let la = la.load_scalar(fx);
+            let b = b.load_scalar(fx);
+            let lb = lb.load_scalar(fx);
+
+            let imm8 = if let Some(imm8) = crate::constant::mir_operand_get_const_val(fx, &args[4])
+            {
+                imm8
+            } else {
+                fx.tcx.sess.span_fatal(span, "Index argument for `_mm_cmpestrm` is not a constant");
+            };
+
+            let imm8 = imm8.try_to_u8().unwrap_or_else(|_| panic!("kind not scalar: {:?}", imm8));
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String(format!("pcmpestrm xmm0, xmm1, {imm8}"))],
+                &[
+                    CInlineAsmOperand::InOut {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                        _late: true,
+                        in_value: a,
+                        out_place: Some(ret),
+                    },
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),
+                        value: b,
+                    },
+                    // Implicit argument to the pcmpestri intrinsic
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)),
+                        value: la,
+                    },
+                    // Implicit argument to the pcmpestri intrinsic
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)),
+                        value: lb,
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
+
         "llvm.x86.pclmulqdq" => {
             // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128&ig_expand=772
             intrinsic_args!(fx, args => (a, b, _imm8); intrinsic);
diff --git a/compiler/rustc_codegen_cranelift/src/pretty_clif.rs b/compiler/rustc_codegen_cranelift/src/pretty_clif.rs
index da84e54a916..02c0dcb8b1b 100644
--- a/compiler/rustc_codegen_cranelift/src/pretty_clif.rs
+++ b/compiler/rustc_codegen_cranelift/src/pretty_clif.rs
@@ -58,11 +58,10 @@
 use std::fmt;
 use std::io::Write;
 
-use cranelift_codegen::{
-    entity::SecondaryMap,
-    ir::entities::AnyEntity,
-    write::{FuncWriter, PlainWriter},
-};
+use cranelift_codegen::entity::SecondaryMap;
+use cranelift_codegen::ir::entities::AnyEntity;
+use cranelift_codegen::ir::Fact;
+use cranelift_codegen::write::{FuncWriter, PlainWriter};
 use rustc_middle::ty::layout::FnAbiOf;
 use rustc_middle::ty::print::with_no_trimmed_paths;
 use rustc_session::config::{OutputFilenames, OutputType};
@@ -155,8 +154,13 @@ impl FuncWriter for &'_ CommentWriter {
         _func: &Function,
         entity: AnyEntity,
         value: &dyn fmt::Display,
+        maybe_fact: Option<&Fact>,
     ) -> fmt::Result {
-        write!(w, "    {} = {}", entity, value)?;
+        if let Some(fact) = maybe_fact {
+            write!(w, "    {} ! {} = {}", entity, fact, value)?;
+        } else {
+            write!(w, "    {} = {}", entity, value)?;
+        }
 
         if let Some(comment) = self.entity_comments.get(&entity) {
             writeln!(w, " ; {}", comment.replace('\n', "\n; "))
diff --git a/compiler/rustc_codegen_cranelift/src/value_and_place.rs b/compiler/rustc_codegen_cranelift/src/value_and_place.rs
index 21ad2a835fc..f52f59716a8 100644
--- a/compiler/rustc_codegen_cranelift/src/value_and_place.rs
+++ b/compiler/rustc_codegen_cranelift/src/value_and_place.rs
@@ -329,7 +329,13 @@ impl<'tcx> CValue<'tcx> {
                 let msb = fx.bcx.ins().iconst(types::I64, (const_val >> 64) as u64 as i64);
                 fx.bcx.ins().iconcat(lsb, msb)
             }
-            ty::Bool | ty::Char | ty::Uint(_) | ty::Int(_) | ty::Ref(..) | ty::RawPtr(..) => {
+            ty::Bool
+            | ty::Char
+            | ty::Uint(_)
+            | ty::Int(_)
+            | ty::Ref(..)
+            | ty::RawPtr(..)
+            | ty::FnPtr(..) => {
                 let raw_val = const_val.size().truncate(const_val.to_bits(layout.size).unwrap());
                 fx.bcx.ins().iconst(clif_ty, raw_val as i64)
             }
@@ -971,6 +977,32 @@ pub(crate) fn assert_assignable<'tcx>(
                 }
             }
         }
+        (&ty::Coroutine(def_id_a, args_a, mov_a), &ty::Coroutine(def_id_b, args_b, mov_b))
+            if def_id_a == def_id_b && mov_a == mov_b =>
+        {
+            let mut types_a = args_a.types();
+            let mut types_b = args_b.types();
+            loop {
+                match (types_a.next(), types_b.next()) {
+                    (Some(a), Some(b)) => assert_assignable(fx, a, b, limit - 1),
+                    (None, None) => return,
+                    (Some(_), None) | (None, Some(_)) => panic!("{:#?}/{:#?}", from_ty, to_ty),
+                }
+            }
+        }
+        (&ty::CoroutineWitness(def_id_a, args_a), &ty::CoroutineWitness(def_id_b, args_b))
+            if def_id_a == def_id_b =>
+        {
+            let mut types_a = args_a.types();
+            let mut types_b = args_b.types();
+            loop {
+                match (types_a.next(), types_b.next()) {
+                    (Some(a), Some(b)) => assert_assignable(fx, a, b, limit - 1),
+                    (None, None) => return,
+                    (Some(_), None) | (None, Some(_)) => panic!("{:#?}/{:#?}", from_ty, to_ty),
+                }
+            }
+        }
         (ty::Param(_), _) | (_, ty::Param(_)) if fx.tcx.sess.opts.unstable_opts.polymorphize => {
             // No way to check if it is correct or not with polymorphization enabled
         }