about summary refs log tree commit diff
diff options
context:
space:
mode:
authorbors <bors@rust-lang.org>2023-10-24 13:09:19 +0000
committerbors <bors@rust-lang.org>2023-10-24 13:09:19 +0000
commit642bfb254a851d0deca01d6e2f4db13b60536210 (patch)
tree10bb7574dc53427e34d33fdb5d2eb25c695bc6fb
parentcee6db171d03c22f1cbf98a72dffde858a49fbd8 (diff)
parent484bc7fc8885cc580c50be4c74b800ff82451613 (diff)
downloadrust-642bfb254a851d0deca01d6e2f4db13b60536210.tar.gz
rust-642bfb254a851d0deca01d6e2f4db13b60536210.zip
Auto merge of #117124 - bjorn3:sync_cg_clif-2023-10-24, r=bjorn3
Sync rustc_codegen_cranelift

This contains fixes for the last two remaining known miscompilations. One is the lack of stack alignment support in cranelift which has been worked around by dynamically realigning at runtime. This fixed rayon and by extension Wasmtime. And the other is lack of zero/sign extending of small arguments when the ABI requires this. This is completely fine when only using cg_clif compiled code, but LLVM depends on this resulting in weird behavior of mixed LLVM, Cranelift binaries. The update to Cranelift 0.101.1 fixes this. In addition I have implemented all x86_64 SIMD intrinsics required by the image and rav1e crates.

r? `@ghost`

`@rustbot` label +A-codegen +A-cranelift +T-compiler
-rw-r--r--compiler/rustc_codegen_cranelift/.vscode/settings.json2
-rw-r--r--compiler/rustc_codegen_cranelift/Cargo.lock52
-rw-r--r--compiler/rustc_codegen_cranelift/Cargo.toml12
-rw-r--r--compiler/rustc_codegen_cranelift/Readme.md6
-rw-r--r--compiler/rustc_codegen_cranelift/example/mini_core_hello_world.rs11
-rw-r--r--compiler/rustc_codegen_cranelift/patches/0001-regex-Ignore-test-which-gets-miscompiled-with-llvm-sysroot.patch25
-rw-r--r--compiler/rustc_codegen_cranelift/src/abi/mod.rs35
-rw-r--r--compiler/rustc_codegen_cranelift/src/abi/pass_mode.rs11
-rw-r--r--compiler/rustc_codegen_cranelift/src/cast.rs6
-rw-r--r--compiler/rustc_codegen_cranelift/src/common.rs19
-rw-r--r--compiler/rustc_codegen_cranelift/src/driver/aot.rs23
-rw-r--r--compiler/rustc_codegen_cranelift/src/inline_asm.rs22
-rw-r--r--compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs137
-rw-r--r--compiler/rustc_codegen_cranelift/src/value_and_place.rs37
14 files changed, 260 insertions, 138 deletions
diff --git a/compiler/rustc_codegen_cranelift/.vscode/settings.json b/compiler/rustc_codegen_cranelift/.vscode/settings.json
index 60cb51d5663..834a1362caf 100644
--- a/compiler/rustc_codegen_cranelift/.vscode/settings.json
+++ b/compiler/rustc_codegen_cranelift/.vscode/settings.json
@@ -33,7 +33,7 @@
             ]
         },
         {
-            "sysroot_src": "./download/sysroot/sysroot_src/library",
+            "sysroot_src": "./build/stdlib/library",
             "crates": [
                 {
                     "root_module": "./example/std_example.rs",
diff --git a/compiler/rustc_codegen_cranelift/Cargo.lock b/compiler/rustc_codegen_cranelift/Cargo.lock
index 8079913cb0f..c716d501173 100644
--- a/compiler/rustc_codegen_cranelift/Cargo.lock
+++ b/compiler/rustc_codegen_cranelift/Cargo.lock
@@ -45,18 +45,18 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 
 [[package]]
 name = "cranelift-bforest"
-version = "0.101.0"
+version = "0.101.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e5e1df0da8488dd03b34afc134ba84b754d61862cc465932a9e5d07952f661e"
+checksum = "c1512c3bb6b13018e7109fc3ac964bc87b329eaf3a77825d337558d0c7f6f1be"
 dependencies = [
  "cranelift-entity",
 ]
 
 [[package]]
 name = "cranelift-codegen"
-version = "0.101.0"
+version = "0.101.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "77a17ca4e699a0aaf49a0c88f6311a864f321048aa63f6b787cab20eb5f93f10"
+checksum = "16cb8fb9220a6ea7a226705a273ab905309ee546267bdf34948d57932d7f0396"
 dependencies = [
  "bumpalo",
  "cranelift-bforest",
@@ -75,39 +75,39 @@ dependencies = [
 
 [[package]]
 name = "cranelift-codegen-meta"
-version = "0.101.0"
+version = "0.101.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "022f2793cdade1d37a1f755ac42938a3f832f533eac6cafc8b26b209544c3c06"
+checksum = "ab3a8d3b0d4745b183da5ea0792b13d79f5c23d6e69ac04761728e2532b56649"
 dependencies = [
  "cranelift-codegen-shared",
 ]
 
 [[package]]
 name = "cranelift-codegen-shared"
-version = "0.101.0"
+version = "0.101.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a4d72dbb83c2ad788dec4ad0843070973cb48c35a3ca19b1e7437ac40834fd9c"
+checksum = "524141c8e68f2abc2043de4c2b31f6d9dd42432738c246431d0572a1422a4a84"
 
 [[package]]
 name = "cranelift-control"
-version = "0.101.0"
+version = "0.101.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ae07cf26dcc90d546826d747ac63b6c40c916f34b03e92a6ae0422c28d771b8a"
+checksum = "97513b57c961c713789a03886a57b43e14ebcd204cbaa8ae50ca6c70a8e716b3"
 dependencies = [
  "arbitrary",
 ]
 
 [[package]]
 name = "cranelift-entity"
-version = "0.101.0"
+version = "0.101.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c2fe6b7e49820893691aea497f36257e9d6f52061d8c4758d61d802d5f101a3d"
+checksum = "e3f23d3cf3afa7e45f239702612c76d87964f652a55e28d13ed6d7e20f3479dd"
 
 [[package]]
 name = "cranelift-frontend"
-version = "0.101.0"
+version = "0.101.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "44f497576ca3674581581601b6a55ccc1b43447217648c880e5bce70db3cf659"
+checksum = "554cd4947ec9209b58bf9ae5bf83581b5ddf9128bd967208e334b504a57db54e"
 dependencies = [
  "cranelift-codegen",
  "log",
@@ -117,15 +117,15 @@ dependencies = [
 
 [[package]]
 name = "cranelift-isle"
-version = "0.101.0"
+version = "0.101.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b96aa02eac00fffee13b0cd37d17874ccdb3d5458983041accd825ef78ce6454"
+checksum = "6c1892a439696b6413cb54083806f5fd9fc431768b8de74864b3d9e8b93b124f"
 
 [[package]]
 name = "cranelift-jit"
-version = "0.101.0"
+version = "0.101.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b1d6e0e308c873eefc185745a6b21daec2a10f7554c9fb67e334c2d7d756d979"
+checksum = "32209252fb38acaf1662ccd0397907bbe0e92bdb13b6ddbfd2f74e437f83e685"
 dependencies = [
  "anyhow",
  "cranelift-codegen",
@@ -143,9 +143,9 @@ dependencies = [
 
 [[package]]
 name = "cranelift-module"
-version = "0.101.0"
+version = "0.101.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1aa8ebb06eced4e478c3f94f1d65d4e7c93493f4640057912b27a3e34b84841"
+checksum = "bf42656f5f6df7bfafc4dd7b63a1888b0627c07b43b2cb9aa54e13843fed39eb"
 dependencies = [
  "anyhow",
  "cranelift-codegen",
@@ -154,9 +154,9 @@ dependencies = [
 
 [[package]]
 name = "cranelift-native"
-version = "0.101.0"
+version = "0.101.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2870170ca44054b202c737626607b87be6e35655084bd94a6ff807a5812ba7df"
+checksum = "e0c2d3badd4b9690865f5bb68a71fa94de592fa2df3f3d11a5a062c60c0a107a"
 dependencies = [
  "cranelift-codegen",
  "libc",
@@ -165,9 +165,9 @@ dependencies = [
 
 [[package]]
 name = "cranelift-object"
-version = "0.101.0"
+version = "0.101.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "20647761742d17dabac8205da958910ede78599550e06418a16711a3ee2fc897"
+checksum = "88eca54bbecea3170035168357306e9c779d4a63d8bf036c9e16bd21fdaa69b5"
 dependencies = [
  "anyhow",
  "cranelift-codegen",
@@ -374,9 +374,9 @@ checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
 
 [[package]]
 name = "wasmtime-jit-icache-coherence"
-version = "14.0.0"
+version = "14.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a3a5dda53ad6993f9b0a2d65fb49e0348a7232a27a8794064122870d6ee19eb2"
+checksum = "9aaf2fa8fd2d6b65abae9b92edfe69254cc5d6b166e342364036c3e347de8da9"
 dependencies = [
  "cfg-if",
  "libc",
diff --git a/compiler/rustc_codegen_cranelift/Cargo.toml b/compiler/rustc_codegen_cranelift/Cargo.toml
index 5ad5e0e8140..f2ce714e8ff 100644
--- a/compiler/rustc_codegen_cranelift/Cargo.toml
+++ b/compiler/rustc_codegen_cranelift/Cargo.toml
@@ -8,12 +8,12 @@ crate-type = ["dylib"]
 
 [dependencies]
 # These have to be in sync with each other
-cranelift-codegen = { version = "0.101", features = ["unwind", "all-arch"] }
-cranelift-frontend = { version = "0.101" }
-cranelift-module = { version = "0.101" }
-cranelift-native = { version = "0.101" }
-cranelift-jit = { version = "0.101", optional = true }
-cranelift-object = { version = "0.101" }
+cranelift-codegen = { version = "0.101.1", features = ["unwind", "all-arch"] }
+cranelift-frontend = { version = "0.101.1" }
+cranelift-module = { version = "0.101.1" }
+cranelift-native = { version = "0.101.1" }
+cranelift-jit = { version = "0.101.1", optional = true }
+cranelift-object = { version = "0.101.1" }
 target-lexicon = "0.12.0"
 gimli = { version = "0.28", default-features = false, features = ["write"]}
 object = { version = "0.32", default-features = false, features = ["std", "read_core", "write", "archive", "coff", "elf", "macho", "pe"] }
diff --git a/compiler/rustc_codegen_cranelift/Readme.md b/compiler/rustc_codegen_cranelift/Readme.md
index 6f2027be96d..5664cbe7d4f 100644
--- a/compiler/rustc_codegen_cranelift/Readme.md
+++ b/compiler/rustc_codegen_cranelift/Readme.md
@@ -8,7 +8,7 @@ If not please open an issue.
 ## Building and testing
 
 ```bash
-$ git clone https://github.com/bjorn3/rustc_codegen_cranelift
+$ git clone https://github.com/rust-lang/rustc_codegen_cranelift
 $ cd rustc_codegen_cranelift
 $ ./y.sh prepare
 $ ./y.sh build
@@ -29,7 +29,7 @@ Extract the `dist` directory in the archive anywhere you want.
 If you want to use `cargo clif build` instead of having to specify the full path to the `cargo-clif` executable, you can add the `bin` subdirectory of the extracted `dist` directory to your `PATH`.
 (tutorial [for Windows](https://stackoverflow.com/a/44272417), and [for Linux/MacOS](https://unix.stackexchange.com/questions/26047/how-to-correctly-add-a-path-to-path/26059#26059)).
 
-[releases]: https://github.com/bjorn3/rustc_codegen_cranelift/releases/tag/dev
+[releases]: https://github.com/rust-lang/rustc_codegen_cranelift/releases/tag/dev
 
 ## Usage
 
@@ -78,7 +78,7 @@ configuration options.
 
 * Inline assembly ([no cranelift support](https://github.com/bytecodealliance/wasmtime/issues/1041))
     * On UNIX there is support for invoking an external assembler for `global_asm!` and `asm!`.
-* SIMD ([tracked here](https://github.com/bjorn3/rustc_codegen_cranelift/issues/171), `std::simd` fully works, `std::arch` is partially supported)
+* SIMD ([tracked here](https://github.com/rust-lang/rustc_codegen_cranelift/issues/171), `std::simd` fully works, `std::arch` is partially supported)
 * Unwinding on panics ([no cranelift support](https://github.com/bytecodealliance/wasmtime/issues/1677), `-Cpanic=abort` is enabled by default)
 
 ## License
diff --git a/compiler/rustc_codegen_cranelift/example/mini_core_hello_world.rs b/compiler/rustc_codegen_cranelift/example/mini_core_hello_world.rs
index 91de04d9770..afc51a47f14 100644
--- a/compiler/rustc_codegen_cranelift/example/mini_core_hello_world.rs
+++ b/compiler/rustc_codegen_cranelift/example/mini_core_hello_world.rs
@@ -353,6 +353,17 @@ fn main() {
 
     let f = V([0.0, 1.0]);
     let _a = f.0[0];
+
+    stack_val_align();
+}
+
+#[inline(never)]
+fn stack_val_align() {
+    #[repr(align(8192))]
+    struct Foo(u8);
+
+    let a = Foo(0);
+    assert_eq!(&a as *const Foo as usize % 8192, 0);
 }
 
 #[cfg(all(
diff --git a/compiler/rustc_codegen_cranelift/patches/0001-regex-Ignore-test-which-gets-miscompiled-with-llvm-sysroot.patch b/compiler/rustc_codegen_cranelift/patches/0001-regex-Ignore-test-which-gets-miscompiled-with-llvm-sysroot.patch
deleted file mode 100644
index e6ebdcec783..00000000000
--- a/compiler/rustc_codegen_cranelift/patches/0001-regex-Ignore-test-which-gets-miscompiled-with-llvm-sysroot.patch
+++ /dev/null
@@ -1,25 +0,0 @@
-From 5d4afb8d807d181038b6a004d17ed055a8d191b2 Mon Sep 17 00:00:00 2001
-From: bjorn3 <17426603+bjorn3@users.noreply.github.com>
-Date: Mon, 2 Oct 2023 13:59:00 +0000
-Subject: [PATCH] Ignore test which gets miscompiled with llvm sysroot
-
----
- regex-automata/src/util/pool.rs | 2 ++
- 1 file changed, 2 insertions(+)
-
-diff --git a/regex-automata/src/util/pool.rs b/regex-automata/src/util/pool.rs
-index c03d7b0..28b233b 100644
---- a/regex-automata/src/util/pool.rs
-+++ b/regex-automata/src/util/pool.rs
-@@ -1081,6 +1081,8 @@ mod tests {
-     // into the pool. This in turn resulted in this test producing a data race.
-     #[cfg(feature = "std")]
-     #[test]
-+    // FIXME(rustc_codegen_cranelift#1395) miscompilation of thread::scope with LLVM sysroot
-+    #[ignore]
-     fn thread_owner_sync() {
-         let pool = Pool::new(|| vec!['a']);
-         {
--- 
-2.34.1
-
diff --git a/compiler/rustc_codegen_cranelift/src/abi/mod.rs b/compiler/rustc_codegen_cranelift/src/abi/mod.rs
index c75ad852f82..c4572e03525 100644
--- a/compiler/rustc_codegen_cranelift/src/abi/mod.rs
+++ b/compiler/rustc_codegen_cranelift/src/abi/mod.rs
@@ -120,32 +120,25 @@ impl<'tcx> FunctionCx<'_, '_, 'tcx> {
         args: &[Value],
     ) -> Cow<'_, [Value]> {
         if self.tcx.sess.target.is_like_windows {
-            let (mut params, mut args): (Vec<_>, Vec<_>) =
-                params
-                    .into_iter()
-                    .zip(args)
-                    .map(|(param, &arg)| {
-                        if param.value_type == types::I128 {
-                            let arg_ptr = Pointer::stack_slot(self.bcx.create_sized_stack_slot(
-                                StackSlotData { kind: StackSlotKind::ExplicitSlot, size: 16 },
-                            ));
-                            arg_ptr.store(self, arg, MemFlags::trusted());
-                            (AbiParam::new(self.pointer_type), arg_ptr.get_addr(self))
-                        } else {
-                            (param, arg)
-                        }
-                    })
-                    .unzip();
+            let (mut params, mut args): (Vec<_>, Vec<_>) = params
+                .into_iter()
+                .zip(args)
+                .map(|(param, &arg)| {
+                    if param.value_type == types::I128 {
+                        let arg_ptr = self.create_stack_slot(16, 16);
+                        arg_ptr.store(self, arg, MemFlags::trusted());
+                        (AbiParam::new(self.pointer_type), arg_ptr.get_addr(self))
+                    } else {
+                        (param, arg)
+                    }
+                })
+                .unzip();
 
             let indirect_ret_val = returns.len() == 1 && returns[0].value_type == types::I128;
 
             if indirect_ret_val {
                 params.insert(0, AbiParam::new(self.pointer_type));
-                let ret_ptr =
-                    Pointer::stack_slot(self.bcx.create_sized_stack_slot(StackSlotData {
-                        kind: StackSlotKind::ExplicitSlot,
-                        size: 16,
-                    }));
+                let ret_ptr = self.create_stack_slot(16, 16);
                 args.insert(0, ret_ptr.get_addr(self));
                 self.lib_call_unadjusted(name, params, vec![], &args);
                 return Cow::Owned(vec![ret_ptr.load(self, types::I128, MemFlags::trusted())]);
diff --git a/compiler/rustc_codegen_cranelift/src/abi/pass_mode.rs b/compiler/rustc_codegen_cranelift/src/abi/pass_mode.rs
index 7c9f8c1051c..06522670029 100644
--- a/compiler/rustc_codegen_cranelift/src/abi/pass_mode.rs
+++ b/compiler/rustc_codegen_cranelift/src/abi/pass_mode.rs
@@ -189,16 +189,13 @@ pub(super) fn from_casted_value<'tcx>(
     let abi_params = cast_target_to_abi_params(cast);
     let abi_param_size: u32 = abi_params.iter().map(|param| param.value_type.bytes()).sum();
     let layout_size = u32::try_from(layout.size.bytes()).unwrap();
-    let stack_slot = fx.bcx.create_sized_stack_slot(StackSlotData {
-        kind: StackSlotKind::ExplicitSlot,
-        // FIXME Don't force the size to a multiple of 16 bytes once Cranelift gets a way to
-        // specify stack slot alignment.
+    let ptr = fx.create_stack_slot(
         // Stack slot size may be bigger for example `[u8; 3]` which is packed into an `i32`.
         // It may also be smaller for example when the type is a wrapper around an integer with a
         // larger alignment than the integer.
-        size: (std::cmp::max(abi_param_size, layout_size) + 15) / 16 * 16,
-    });
-    let ptr = Pointer::stack_slot(stack_slot);
+        std::cmp::max(abi_param_size, layout_size),
+        u32::try_from(layout.align.pref.bytes()).unwrap(),
+    );
     let mut offset = 0;
     let mut block_params_iter = block_params.iter().copied();
     for param in abi_params {
diff --git a/compiler/rustc_codegen_cranelift/src/cast.rs b/compiler/rustc_codegen_cranelift/src/cast.rs
index 7e027c5f1b3..0b5cb1547fc 100644
--- a/compiler/rustc_codegen_cranelift/src/cast.rs
+++ b/compiler/rustc_codegen_cranelift/src/cast.rs
@@ -104,11 +104,7 @@ pub(crate) fn clif_int_or_float_cast(
                     &[from],
                 )[0];
                 // FIXME(bytecodealliance/wasmtime#6104) use bitcast instead of store to get from i64x2 to i128
-                let stack_slot = fx.bcx.create_sized_stack_slot(StackSlotData {
-                    kind: StackSlotKind::ExplicitSlot,
-                    size: 16,
-                });
-                let ret_ptr = Pointer::stack_slot(stack_slot);
+                let ret_ptr = fx.create_stack_slot(16, 16);
                 ret_ptr.store(fx, ret, MemFlags::trusted());
                 ret_ptr.load(fx, types::I128, MemFlags::trusted())
             } else {
diff --git a/compiler/rustc_codegen_cranelift/src/common.rs b/compiler/rustc_codegen_cranelift/src/common.rs
index 7a3ae6ebf52..9771f44f62c 100644
--- a/compiler/rustc_codegen_cranelift/src/common.rs
+++ b/compiler/rustc_codegen_cranelift/src/common.rs
@@ -383,6 +383,25 @@ impl<'tcx> FunctionCx<'_, '_, 'tcx> {
         })
     }
 
+    pub(crate) fn create_stack_slot(&mut self, size: u32, align: u32) -> Pointer {
+        if align <= 16 {
+            let stack_slot = self.bcx.create_sized_stack_slot(StackSlotData {
+                kind: StackSlotKind::ExplicitSlot,
+                // FIXME Don't force the size to a multiple of 16 bytes once Cranelift gets a way to
+                // specify stack slot alignment.
+                size: (size + 15) / 16 * 16,
+            });
+            Pointer::stack_slot(stack_slot)
+        } else {
+            // Alignment is too big to handle using the above hack. Dynamically realign a stack slot
+            // instead. This wastes some space for the realignment.
+            let base_ptr = self.create_stack_slot(size + align, 16).get_addr(self);
+            let misalign_offset = self.bcx.ins().urem_imm(base_ptr, i64::from(align));
+            let realign_offset = self.bcx.ins().irsub_imm(misalign_offset, i64::from(align));
+            Pointer::new(self.bcx.ins().iadd(base_ptr, realign_offset))
+        }
+    }
+
     pub(crate) fn set_debug_loc(&mut self, source_info: mir::SourceInfo) {
         if let Some(debug_context) = &mut self.cx.debug_context {
             let (file, line, column) =
diff --git a/compiler/rustc_codegen_cranelift/src/driver/aot.rs b/compiler/rustc_codegen_cranelift/src/driver/aot.rs
index d3a7decc543..11229dd421e 100644
--- a/compiler/rustc_codegen_cranelift/src/driver/aot.rs
+++ b/compiler/rustc_codegen_cranelift/src/driver/aot.rs
@@ -361,12 +361,26 @@ pub(crate) fn run_aot(
     metadata: EncodedMetadata,
     need_metadata_module: bool,
 ) -> Box<OngoingCodegen> {
+    // FIXME handle `-Ctarget-cpu=native`
+    let target_cpu = match tcx.sess.opts.cg.target_cpu {
+        Some(ref name) => name,
+        None => tcx.sess.target.cpu.as_ref(),
+    }
+    .to_owned();
+
     let cgus = if tcx.sess.opts.output_types.should_codegen() {
         tcx.collect_and_partition_mono_items(()).1
     } else {
         // If only `--emit metadata` is used, we shouldn't perform any codegen.
         // Also `tcx.collect_and_partition_mono_items` may panic in that case.
-        &[]
+        return Box::new(OngoingCodegen {
+            modules: vec![],
+            allocator_module: None,
+            metadata_module: None,
+            metadata,
+            crate_info: CrateInfo::new(tcx, target_cpu),
+            concurrency_limiter: ConcurrencyLimiter::new(tcx.sess, 0),
+        });
     };
 
     if tcx.dep_graph.is_fully_enabled() {
@@ -481,13 +495,6 @@ pub(crate) fn run_aot(
         None
     };
 
-    // FIXME handle `-Ctarget-cpu=native`
-    let target_cpu = match tcx.sess.opts.cg.target_cpu {
-        Some(ref name) => name,
-        None => tcx.sess.target.cpu.as_ref(),
-    }
-    .to_owned();
-
     Box::new(OngoingCodegen {
         modules,
         allocator_module,
diff --git a/compiler/rustc_codegen_cranelift/src/inline_asm.rs b/compiler/rustc_codegen_cranelift/src/inline_asm.rs
index ed077234254..0517c609337 100644
--- a/compiler/rustc_codegen_cranelift/src/inline_asm.rs
+++ b/compiler/rustc_codegen_cranelift/src/inline_asm.rs
@@ -878,13 +878,7 @@ fn call_inline_asm<'tcx>(
     inputs: Vec<(Size, Value)>,
     outputs: Vec<(Size, CPlace<'tcx>)>,
 ) {
-    let stack_slot = fx.bcx.func.create_sized_stack_slot(StackSlotData {
-        kind: StackSlotKind::ExplicitSlot,
-        size: u32::try_from(slot_size.bytes()).unwrap(),
-    });
-    if fx.clif_comments.enabled() {
-        fx.add_comment(stack_slot, "inline asm scratch slot");
-    }
+    let stack_slot = fx.create_stack_slot(u32::try_from(slot_size.bytes()).unwrap(), 16);
 
     let inline_asm_func = fx
         .module
@@ -904,15 +898,23 @@ fn call_inline_asm<'tcx>(
     }
 
     for (offset, value) in inputs {
-        fx.bcx.ins().stack_store(value, stack_slot, i32::try_from(offset.bytes()).unwrap());
+        stack_slot.offset(fx, i32::try_from(offset.bytes()).unwrap().into()).store(
+            fx,
+            value,
+            MemFlags::trusted(),
+        );
     }
 
-    let stack_slot_addr = fx.bcx.ins().stack_addr(fx.pointer_type, stack_slot, 0);
+    let stack_slot_addr = stack_slot.get_addr(fx);
     fx.bcx.ins().call(inline_asm_func, &[stack_slot_addr]);
 
     for (offset, place) in outputs {
         let ty = fx.clif_type(place.layout().ty).unwrap();
-        let value = fx.bcx.ins().stack_load(ty, stack_slot, i32::try_from(offset.bytes()).unwrap());
+        let value = stack_slot.offset(fx, i32::try_from(offset.bytes()).unwrap().into()).load(
+            fx,
+            ty,
+            MemFlags::trusted(),
+        );
         place.write_cvalue(fx, CValue::by_val(value, place.layout()));
     }
 }
diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs
index 0c9a94e1c23..35f144d7dad 100644
--- a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs
+++ b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs
@@ -310,6 +310,143 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             let val = CValue::by_val_pair(cb_out, c, layout);
             ret.write_cvalue(fx, val);
         }
+        "llvm.x86.sse2.pavg.b" | "llvm.x86.sse2.pavg.w" => {
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            // FIXME use vector instructions when possible
+            simd_pair_for_each_lane(
+                fx,
+                a,
+                b,
+                ret,
+                &|fx, _lane_ty, _res_lane_ty, a_lane, b_lane| {
+                    // (a + b + 1) >> 1
+                    let lane_ty = fx.bcx.func.dfg.value_type(a_lane);
+                    let a_lane = fx.bcx.ins().uextend(lane_ty.double_width().unwrap(), a_lane);
+                    let b_lane = fx.bcx.ins().uextend(lane_ty.double_width().unwrap(), b_lane);
+                    let sum = fx.bcx.ins().iadd(a_lane, b_lane);
+                    let num_plus_one = fx.bcx.ins().iadd_imm(sum, 1);
+                    let res = fx.bcx.ins().ushr_imm(num_plus_one, 1);
+                    fx.bcx.ins().ireduce(lane_ty, res)
+                },
+            );
+        }
+        "llvm.x86.sse2.psra.w" => {
+            intrinsic_args!(fx, args => (a, count); intrinsic);
+
+            let count_lane = count.force_stack(fx).0.load(fx, types::I64, MemFlags::trusted());
+            let lane_ty = fx.clif_type(a.layout().ty.simd_size_and_type(fx.tcx).1).unwrap();
+            let max_count = fx.bcx.ins().iconst(types::I64, i64::from(lane_ty.bits() - 1));
+            let saturated_count = fx.bcx.ins().umin(count_lane, max_count);
+
+            // FIXME use vector instructions when possible
+            simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, a_lane| {
+                fx.bcx.ins().sshr(a_lane, saturated_count)
+            });
+        }
+        "llvm.x86.sse2.psad.bw" => {
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.u8);
+            assert_eq!(ret_lane_ty, fx.tcx.types.u64);
+            assert_eq!(lane_count, ret_lane_count * 8);
+
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.u64);
+            for out_lane_idx in 0..lane_count / 8 {
+                let mut lane_diff_acc = fx.bcx.ins().iconst(types::I64, 0);
+
+                for lane_idx in out_lane_idx * 8..out_lane_idx * 8 + 1 {
+                    let a_lane = a.value_lane(fx, lane_idx).load_scalar(fx);
+                    let b_lane = b.value_lane(fx, lane_idx).load_scalar(fx);
+
+                    let lane_diff = fx.bcx.ins().isub(a_lane, b_lane);
+                    let abs_lane_diff = fx.bcx.ins().iabs(lane_diff);
+                    let abs_lane_diff = fx.bcx.ins().uextend(types::I64, abs_lane_diff);
+                    lane_diff_acc = fx.bcx.ins().iadd(lane_diff_acc, abs_lane_diff);
+                }
+
+                let res_lane = CValue::by_val(lane_diff_acc, ret_lane_layout);
+
+                ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane);
+            }
+        }
+        "llvm.x86.ssse3.pmadd.ub.sw.128" => {
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            let (lane_count, lane_ty) = a.layout().ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.u8);
+            assert_eq!(ret_lane_ty, fx.tcx.types.i16);
+            assert_eq!(lane_count, ret_lane_count * 2);
+
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
+            for out_lane_idx in 0..lane_count / 2 {
+                let a_lane0 = a.value_lane(fx, out_lane_idx * 2).load_scalar(fx);
+                let a_lane0 = fx.bcx.ins().uextend(types::I16, a_lane0);
+                let b_lane0 = b.value_lane(fx, out_lane_idx * 2).load_scalar(fx);
+                let b_lane0 = fx.bcx.ins().sextend(types::I16, b_lane0);
+
+                let a_lane1 = a.value_lane(fx, out_lane_idx * 2 + 1).load_scalar(fx);
+                let a_lane1 = fx.bcx.ins().uextend(types::I16, a_lane1);
+                let b_lane1 = b.value_lane(fx, out_lane_idx * 2 + 1).load_scalar(fx);
+                let b_lane1 = fx.bcx.ins().sextend(types::I16, b_lane1);
+
+                let mul0: Value = fx.bcx.ins().imul(a_lane0, b_lane0);
+                let mul1 = fx.bcx.ins().imul(a_lane1, b_lane1);
+
+                let (val, has_overflow) = fx.bcx.ins().sadd_overflow(mul0, mul1);
+
+                let rhs_ge_zero = fx.bcx.ins().icmp_imm(IntCC::SignedGreaterThanOrEqual, mul1, 0);
+
+                let min = fx.bcx.ins().iconst(types::I16, i64::from(i16::MIN as u16));
+                let max = fx.bcx.ins().iconst(types::I16, i64::from(i16::MAX as u16));
+
+                let sat_val = fx.bcx.ins().select(rhs_ge_zero, max, min);
+                let res_lane = fx.bcx.ins().select(has_overflow, sat_val, val);
+
+                let res_lane = CValue::by_val(res_lane, ret_lane_layout);
+
+                ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane);
+            }
+        }
+        "llvm.x86.sse2.pmadd.wd" => {
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i16);
+            assert_eq!(ret_lane_ty, fx.tcx.types.i32);
+            assert_eq!(lane_count, ret_lane_count * 2);
+
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.i32);
+            for out_lane_idx in 0..lane_count / 2 {
+                let a_lane0 = a.value_lane(fx, out_lane_idx * 2).load_scalar(fx);
+                let a_lane0 = fx.bcx.ins().uextend(types::I32, a_lane0);
+                let b_lane0 = b.value_lane(fx, out_lane_idx * 2).load_scalar(fx);
+                let b_lane0 = fx.bcx.ins().sextend(types::I32, b_lane0);
+
+                let a_lane1 = a.value_lane(fx, out_lane_idx * 2 + 1).load_scalar(fx);
+                let a_lane1 = fx.bcx.ins().uextend(types::I32, a_lane1);
+                let b_lane1 = b.value_lane(fx, out_lane_idx * 2 + 1).load_scalar(fx);
+                let b_lane1 = fx.bcx.ins().sextend(types::I32, b_lane1);
+
+                let mul0: Value = fx.bcx.ins().imul(a_lane0, b_lane0);
+                let mul1 = fx.bcx.ins().imul(a_lane1, b_lane1);
+
+                let res_lane = fx.bcx.ins().iadd(mul0, mul1);
+                let res_lane = CValue::by_val(res_lane, ret_lane_layout);
+
+                ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane);
+            }
+        }
         _ => {
             fx.tcx
                 .sess
diff --git a/compiler/rustc_codegen_cranelift/src/value_and_place.rs b/compiler/rustc_codegen_cranelift/src/value_and_place.rs
index 3a6a6c9e3f5..5f0aa6c5581 100644
--- a/compiler/rustc_codegen_cranelift/src/value_and_place.rs
+++ b/compiler/rustc_codegen_cranelift/src/value_and_place.rs
@@ -132,18 +132,11 @@ impl<'tcx> CValue<'tcx> {
                 (ptr.get_addr(fx), vtable)
             }
             CValueInner::ByValPair(data, vtable) => {
-                let stack_slot = fx.bcx.create_sized_stack_slot(StackSlotData {
-                    kind: StackSlotKind::ExplicitSlot,
-                    // FIXME Don't force the size to a multiple of 16 bytes once Cranelift gets a way to
-                    // specify stack slot alignment.
-                    size: (u32::try_from(fx.target_config.pointer_type().bytes()).unwrap() + 15)
-                        / 16
-                        * 16,
-                });
-                let data_ptr = Pointer::stack_slot(stack_slot);
-                let mut flags = MemFlags::new();
-                flags.set_notrap();
-                data_ptr.store(fx, data, flags);
+                let data_ptr = fx.create_stack_slot(
+                    u32::try_from(fx.target_config.pointer_type().bytes()).unwrap(),
+                    u32::try_from(fx.target_config.pointer_type().bytes()).unwrap(),
+                );
+                data_ptr.store(fx, data, MemFlags::trusted());
 
                 (data_ptr.get_addr(fx), vtable)
             }
@@ -372,13 +365,11 @@ impl<'tcx> CPlace<'tcx> {
                 .fatal(format!("values of type {} are too big to store on the stack", layout.ty));
         }
 
-        let stack_slot = fx.bcx.create_sized_stack_slot(StackSlotData {
-            kind: StackSlotKind::ExplicitSlot,
-            // FIXME Don't force the size to a multiple of 16 bytes once Cranelift gets a way to
-            // specify stack slot alignment.
-            size: (u32::try_from(layout.size.bytes()).unwrap() + 15) / 16 * 16,
-        });
-        CPlace { inner: CPlaceInner::Addr(Pointer::stack_slot(stack_slot), None), layout }
+        let stack_slot = fx.create_stack_slot(
+            u32::try_from(layout.size.bytes()).unwrap(),
+            u32::try_from(layout.align.pref.bytes()).unwrap(),
+        );
+        CPlace { inner: CPlaceInner::Addr(stack_slot, None), layout }
     }
 
     pub(crate) fn new_var(
@@ -543,13 +534,7 @@ impl<'tcx> CPlace<'tcx> {
                 _ if src_ty.is_vector() && dst_ty.is_vector() => codegen_bitcast(fx, dst_ty, data),
                 _ if src_ty.is_vector() || dst_ty.is_vector() => {
                     // FIXME(bytecodealliance/wasmtime#6104) do something more efficient for transmutes between vectors and integers.
-                    let stack_slot = fx.bcx.create_sized_stack_slot(StackSlotData {
-                        kind: StackSlotKind::ExplicitSlot,
-                        // FIXME Don't force the size to a multiple of 16 bytes once Cranelift gets a way to
-                        // specify stack slot alignment.
-                        size: (src_ty.bytes() + 15) / 16 * 16,
-                    });
-                    let ptr = Pointer::stack_slot(stack_slot);
+                    let ptr = fx.create_stack_slot(src_ty.bytes(), src_ty.bytes());
                     ptr.store(fx, data, MemFlags::trusted());
                     ptr.load(fx, dst_ty, MemFlags::trusted())
                 }