Diffstat (limited to 'compiler/rustc_codegen_llvm/src')
 compiler/rustc_codegen_llvm/src/abi.rs | 79
 compiler/rustc_codegen_llvm/src/asm.rs | 122
 compiler/rustc_codegen_llvm/src/attributes.rs | 24
 compiler/rustc_codegen_llvm/src/back/lto.rs | 69
 compiler/rustc_codegen_llvm/src/back/write.rs | 24
 compiler/rustc_codegen_llvm/src/base.rs | 28
 compiler/rustc_codegen_llvm/src/builder.rs | 41
 compiler/rustc_codegen_llvm/src/builder/autodiff.rs | 7
 compiler/rustc_codegen_llvm/src/callee.rs | 2
 compiler/rustc_codegen_llvm/src/common.rs | 6
 compiler/rustc_codegen_llvm/src/consts.rs | 68
 compiler/rustc_codegen_llvm/src/context.rs | 48
 compiler/rustc_codegen_llvm/src/coverageinfo/mapgen.rs | 9
 compiler/rustc_codegen_llvm/src/coverageinfo/mapgen/covfun.rs | 6
 compiler/rustc_codegen_llvm/src/coverageinfo/mapgen/unused.rs | 2
 compiler/rustc_codegen_llvm/src/coverageinfo/mod.rs | 2
 compiler/rustc_codegen_llvm/src/debuginfo/gdb.rs | 6
 compiler/rustc_codegen_llvm/src/debuginfo/metadata/enums/cpp_like.rs | 17
 compiler/rustc_codegen_llvm/src/debuginfo/metadata/enums/native.rs | 8
 compiler/rustc_codegen_llvm/src/debuginfo/mod.rs | 10
 compiler/rustc_codegen_llvm/src/errors.rs | 12
 compiler/rustc_codegen_llvm/src/intrinsic.rs | 206
 compiler/rustc_codegen_llvm/src/lib.rs | 22
 compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs | 13
 compiler/rustc_codegen_llvm/src/llvm/ffi.rs | 54
 compiler/rustc_codegen_llvm/src/llvm/mod.rs | 53
 compiler/rustc_codegen_llvm/src/llvm_util.rs | 151
 compiler/rustc_codegen_llvm/src/mono_item.rs | 4
 compiler/rustc_codegen_llvm/src/type_.rs | 4
 compiler/rustc_codegen_llvm/src/va_arg.rs | 596
 30 files changed, 1219 insertions(+), 474 deletions(-)
diff --git a/compiler/rustc_codegen_llvm/src/abi.rs b/compiler/rustc_codegen_llvm/src/abi.rs
index 8294e29d07d..119cd634f98 100644
--- a/compiler/rustc_codegen_llvm/src/abi.rs
+++ b/compiler/rustc_codegen_llvm/src/abi.rs
@@ -2,7 +2,10 @@ use std::borrow::Borrow;
 use std::cmp;
 
 use libc::c_uint;
-use rustc_abi::{BackendRepr, HasDataLayout, Primitive, Reg, RegKind, Size};
+use rustc_abi::{
+    ArmCall, BackendRepr, CanonAbi, HasDataLayout, InterruptKind, Primitive, Reg, RegKind, Size,
+    X86Call,
+};
 use rustc_codegen_ssa::MemFlags;
 use rustc_codegen_ssa::mir::operand::{OperandRef, OperandValue};
 use rustc_codegen_ssa::mir::place::{PlaceRef, PlaceValue};
@@ -12,7 +15,7 @@ use rustc_middle::ty::layout::LayoutOf;
 use rustc_middle::{bug, ty};
 use rustc_session::config;
 use rustc_target::callconv::{
-    ArgAbi, ArgAttribute, ArgAttributes, ArgExtension, CastTarget, Conv, FnAbi, PassMode,
+    ArgAbi, ArgAttribute, ArgAttributes, ArgExtension, CastTarget, FnAbi, PassMode,
 };
 use rustc_target::spec::SanitizerSet;
 use smallvec::SmallVec;
@@ -172,7 +175,6 @@ impl LlvmType for CastTarget {
 }
 
 trait ArgAbiExt<'ll, 'tcx> {
-    fn memory_ty(&self, cx: &CodegenCx<'ll, 'tcx>) -> &'ll Type;
     fn store(
         &self,
         bx: &mut Builder<'_, 'll, 'tcx>,
@@ -188,12 +190,6 @@ trait ArgAbiExt<'ll, 'tcx> {
 }
 
 impl<'ll, 'tcx> ArgAbiExt<'ll, 'tcx> for ArgAbi<'tcx, Ty<'tcx>> {
-    /// Gets the LLVM type for a place of the original Rust type of
-    /// this argument/return, i.e., the result of `type_of::type_of`.
-    fn memory_ty(&self, cx: &CodegenCx<'ll, 'tcx>) -> &'ll Type {
-        self.layout.llvm_type(cx)
-    }
-
     /// Stores a direct/indirect value described by this ArgAbi into a
     /// place for the original Rust type of this argument/return.
     /// Can be used for both storing formal arguments into Rust variables
@@ -302,9 +298,6 @@ impl<'ll, 'tcx> ArgAbiBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
     ) {
         arg_abi.store(self, val, dst)
     }
-    fn arg_memory_ty(&self, arg_abi: &ArgAbi<'tcx, Ty<'tcx>>) -> &'ll Type {
-        arg_abi.memory_ty(self)
-    }
 }
 
 pub(crate) trait FnAbiLlvmExt<'ll, 'tcx> {
@@ -419,11 +412,17 @@ impl<'ll, 'tcx> FnAbiLlvmExt<'ll, 'tcx> for FnAbi<'tcx, Ty<'tcx>> {
         if !self.can_unwind {
             func_attrs.push(llvm::AttributeKind::NoUnwind.create_attr(cx.llcx));
         }
-        if let Conv::RiscvInterrupt { kind } = self.conv {
-            func_attrs.push(llvm::CreateAttrStringValue(cx.llcx, "interrupt", kind.as_str()));
-        }
-        if let Conv::CCmseNonSecureEntry = self.conv {
-            func_attrs.push(llvm::CreateAttrString(cx.llcx, "cmse_nonsecure_entry"))
+        match self.conv {
+            CanonAbi::Interrupt(InterruptKind::RiscvMachine) => {
+                func_attrs.push(llvm::CreateAttrStringValue(cx.llcx, "interrupt", "machine"))
+            }
+            CanonAbi::Interrupt(InterruptKind::RiscvSupervisor) => {
+                func_attrs.push(llvm::CreateAttrStringValue(cx.llcx, "interrupt", "supervisor"))
+            }
+            CanonAbi::Arm(ArmCall::CCmseNonSecureEntry) => {
+                func_attrs.push(llvm::CreateAttrString(cx.llcx, "cmse_nonsecure_entry"))
+            }
+            _ => (),
         }
         attributes::apply_to_llfn(llfn, llvm::AttributePlace::Function, &{ func_attrs });
 
@@ -610,7 +609,7 @@ impl<'ll, 'tcx> FnAbiLlvmExt<'ll, 'tcx> for FnAbi<'tcx, Ty<'tcx>> {
             llvm::SetInstructionCallConv(callsite, cconv);
         }
 
-        if self.conv == Conv::CCmseNonSecureCall {
+        if self.conv == CanonAbi::Arm(ArmCall::CCmseNonSecureCall) {
             // This will probably get ignored on all targets but those supporting the TrustZone-M
             // extension (thumbv8m targets).
             let cmse_nonsecure_call = llvm::CreateAttrString(bx.cx.llcx, "cmse_nonsecure_call");
@@ -646,17 +645,11 @@ impl AbiBuilderMethods for Builder<'_, '_, '_> {
 }
 
 impl llvm::CallConv {
-    pub(crate) fn from_conv(conv: Conv, arch: &str) -> Self {
+    pub(crate) fn from_conv(conv: CanonAbi, arch: &str) -> Self {
         match conv {
-            Conv::C
-            | Conv::Rust
-            | Conv::CCmseNonSecureCall
-            | Conv::CCmseNonSecureEntry
-            | Conv::RiscvInterrupt { .. } => llvm::CCallConv,
-            Conv::Cold => llvm::ColdCallConv,
-            Conv::PreserveMost => llvm::PreserveMost,
-            Conv::PreserveAll => llvm::PreserveAll,
-            Conv::GpuKernel => {
+            CanonAbi::C | CanonAbi::Rust => llvm::CCallConv,
+            CanonAbi::RustCold => llvm::PreserveMost,
+            CanonAbi::GpuKernel => {
                 if arch == "amdgpu" {
                     llvm::AmdgpuKernel
                 } else if arch == "nvptx64" {
@@ -665,17 +658,25 @@ impl llvm::CallConv {
                     panic!("Architecture {arch} does not support GpuKernel calling convention");
                 }
             }
-            Conv::AvrInterrupt => llvm::AvrInterrupt,
-            Conv::AvrNonBlockingInterrupt => llvm::AvrNonBlockingInterrupt,
-            Conv::ArmAapcs => llvm::ArmAapcsCallConv,
-            Conv::Msp430Intr => llvm::Msp430Intr,
-            Conv::X86Fastcall => llvm::X86FastcallCallConv,
-            Conv::X86Intr => llvm::X86_Intr,
-            Conv::X86Stdcall => llvm::X86StdcallCallConv,
-            Conv::X86ThisCall => llvm::X86_ThisCall,
-            Conv::X86VectorCall => llvm::X86_VectorCall,
-            Conv::X86_64SysV => llvm::X86_64_SysV,
-            Conv::X86_64Win64 => llvm::X86_64_Win64,
+            CanonAbi::Interrupt(interrupt_kind) => match interrupt_kind {
+                InterruptKind::Avr => llvm::AvrInterrupt,
+                InterruptKind::AvrNonBlocking => llvm::AvrNonBlockingInterrupt,
+                InterruptKind::Msp430 => llvm::Msp430Intr,
+                InterruptKind::RiscvMachine | InterruptKind::RiscvSupervisor => llvm::CCallConv,
+                InterruptKind::X86 => llvm::X86_Intr,
+            },
+            CanonAbi::Arm(arm_call) => match arm_call {
+                ArmCall::Aapcs => llvm::ArmAapcsCallConv,
+                ArmCall::CCmseNonSecureCall | ArmCall::CCmseNonSecureEntry => llvm::CCallConv,
+            },
+            CanonAbi::X86(x86_call) => match x86_call {
+                X86Call::Fastcall => llvm::X86FastcallCallConv,
+                X86Call::Stdcall => llvm::X86StdcallCallConv,
+                X86Call::SysV64 => llvm::X86_64_SysV,
+                X86Call::Thiscall => llvm::X86_ThisCall,
+                X86Call::Vectorcall => llvm::X86_VectorCall,
+                X86Call::Win64 => llvm::X86_64_Win64,
+            },
         }
     }
 }
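Note: as a rough standalone sketch of the shape this refactor gives the calling-convention mapping, the following uses simplified stand-in enums (not the real `rustc_abi::CanonAbi` or `llvm::CallConv` definitions, and only a subset of variants); the point is that RISC-V interrupt and CMSE ABIs now stay on the plain C convention and are expressed via the string attributes added in the hunk above.

    // Illustrative mock types only; the real definitions live in `rustc_abi`
    // and `rustc_codegen_llvm::llvm` and carry more variants.
    #[derive(Clone, Copy, PartialEq, Eq, Debug)]
    enum InterruptKind { Msp430, RiscvMachine, RiscvSupervisor, X86 }

    #[derive(Clone, Copy, PartialEq, Eq, Debug)]
    enum ArmCall { Aapcs, CCmseNonSecureCall, CCmseNonSecureEntry }

    #[derive(Clone, Copy, PartialEq, Eq, Debug)]
    enum CanonAbi { C, Rust, RustCold, Interrupt(InterruptKind), Arm(ArmCall) }

    #[derive(Clone, Copy, PartialEq, Eq, Debug)]
    enum CallConv { C, PreserveMost, Msp430Intr, X86Intr, ArmAapcs }

    fn from_conv(conv: CanonAbi) -> CallConv {
        match conv {
            CanonAbi::C | CanonAbi::Rust => CallConv::C,
            CanonAbi::RustCold => CallConv::PreserveMost,
            CanonAbi::Interrupt(kind) => match kind {
                InterruptKind::Msp430 => CallConv::Msp430Intr,
                // RISC-V interrupts keep the C convention; the "interrupt"
                // string attribute carries the machine/supervisor distinction.
                InterruptKind::RiscvMachine | InterruptKind::RiscvSupervisor => CallConv::C,
                InterruptKind::X86 => CallConv::X86Intr,
            },
            CanonAbi::Arm(arm) => match arm {
                ArmCall::Aapcs => CallConv::ArmAapcs,
                // CMSE entry points/calls likewise stay on the C convention
                // and are expressed through string attributes.
                ArmCall::CCmseNonSecureCall | ArmCall::CCmseNonSecureEntry => CallConv::C,
            },
        }
    }

    fn main() {
        assert_eq!(from_conv(CanonAbi::Interrupt(InterruptKind::RiscvMachine)), CallConv::C);
        assert_eq!(from_conv(CanonAbi::Arm(ArmCall::Aapcs)), CallConv::ArmAapcs);
    }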
diff --git a/compiler/rustc_codegen_llvm/src/asm.rs b/compiler/rustc_codegen_llvm/src/asm.rs
index 88daa025740..4185aef8b31 100644
--- a/compiler/rustc_codegen_llvm/src/asm.rs
+++ b/compiler/rustc_codegen_llvm/src/asm.rs
@@ -14,7 +14,7 @@ use smallvec::SmallVec;
 use tracing::debug;
 
 use crate::builder::Builder;
-use crate::common::{AsCCharPtr, Funclet};
+use crate::common::Funclet;
 use crate::context::CodegenCx;
 use crate::type_::Type;
 use crate::type_of::LayoutLlvmExt;
@@ -251,7 +251,7 @@ impl<'ll, 'tcx> AsmBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
                 InlineAsmArch::Nvptx64 => {}
                 InlineAsmArch::PowerPC | InlineAsmArch::PowerPC64 => {}
                 InlineAsmArch::Hexagon => {}
-                InlineAsmArch::LoongArch64 => {
+                InlineAsmArch::LoongArch32 | InlineAsmArch::LoongArch64 => {
                     constraints.extend_from_slice(&[
                         "~{$fcc0}".to_string(),
                         "~{$fcc1}".to_string(),
@@ -376,7 +376,7 @@ impl<'ll, 'tcx> AsmBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
 
 impl<'tcx> AsmCodegenMethods<'tcx> for CodegenCx<'_, 'tcx> {
     fn codegen_global_asm(
-        &self,
+        &mut self,
         template: &[InlineAsmTemplatePiece],
         operands: &[GlobalAsmOperandRef<'tcx>],
         options: InlineAsmOptions,
@@ -435,13 +435,7 @@ impl<'tcx> AsmCodegenMethods<'tcx> for CodegenCx<'_, 'tcx> {
             template_str.push_str("\n.att_syntax\n");
         }
 
-        unsafe {
-            llvm::LLVMAppendModuleInlineAsm(
-                self.llmod,
-                template_str.as_c_char_ptr(),
-                template_str.len(),
-            );
-        }
+        llvm::append_module_inline_asm(self.llmod, template_str.as_bytes());
     }
 
     fn mangled_name(&self, instance: Instance<'tcx>) -> String {
@@ -482,67 +476,67 @@ pub(crate) fn inline_asm_call<'ll>(
 
     debug!("Asm Output Type: {:?}", output);
     let fty = bx.cx.type_func(&argtys, output);
+
     // Ask LLVM to verify that the constraints are well-formed.
-    let constraints_ok =
-        unsafe { llvm::LLVMRustInlineAsmVerify(fty, cons.as_c_char_ptr(), cons.len()) };
+    let constraints_ok = unsafe { llvm::LLVMRustInlineAsmVerify(fty, cons.as_ptr(), cons.len()) };
     debug!("constraint verification result: {:?}", constraints_ok);
-    if constraints_ok {
-        let v = unsafe {
-            llvm::LLVMRustInlineAsm(
-                fty,
-                asm.as_c_char_ptr(),
-                asm.len(),
-                cons.as_c_char_ptr(),
-                cons.len(),
-                volatile,
-                alignstack,
-                dia,
-                can_throw,
-            )
-        };
+    if !constraints_ok {
+        // LLVM has detected an issue with our constraints, so bail out.
+        return None;
+    }
 
-        let call = if !labels.is_empty() {
-            assert!(catch_funclet.is_none());
-            bx.callbr(fty, None, None, v, inputs, dest.unwrap(), labels, None, None)
-        } else if let Some((catch, funclet)) = catch_funclet {
-            bx.invoke(fty, None, None, v, inputs, dest.unwrap(), catch, funclet, None)
-        } else {
-            bx.call(fty, None, None, v, inputs, None, None)
-        };
+    let v = unsafe {
+        llvm::LLVMGetInlineAsm(
+            fty,
+            asm.as_ptr(),
+            asm.len(),
+            cons.as_ptr(),
+            cons.len(),
+            volatile,
+            alignstack,
+            dia,
+            can_throw,
+        )
+    };
 
-        // Store mark in a metadata node so we can map LLVM errors
-        // back to source locations. See #17552.
-        let key = "srcloc";
-        let kind = bx.get_md_kind_id(key);
+    let call = if !labels.is_empty() {
+        assert!(catch_funclet.is_none());
+        bx.callbr(fty, None, None, v, inputs, dest.unwrap(), labels, None, None)
+    } else if let Some((catch, funclet)) = catch_funclet {
+        bx.invoke(fty, None, None, v, inputs, dest.unwrap(), catch, funclet, None)
+    } else {
+        bx.call(fty, None, None, v, inputs, None, None)
+    };
 
-        // `srcloc` contains one 64-bit integer for each line of assembly code,
-        // where the lower 32 bits hold the lo byte position and the upper 32 bits
-        // hold the hi byte position.
-        let mut srcloc = vec![];
-        if dia == llvm::AsmDialect::Intel && line_spans.len() > 1 {
-            // LLVM inserts an extra line to add the ".intel_syntax", so add
-            // a dummy srcloc entry for it.
-            //
-            // Don't do this if we only have 1 line span since that may be
-            // due to the asm template string coming from a macro. LLVM will
-            // default to the first srcloc for lines that don't have an
-            // associated srcloc.
-            srcloc.push(llvm::LLVMValueAsMetadata(bx.const_u64(0)));
-        }
-        srcloc.extend(line_spans.iter().map(|span| {
-            llvm::LLVMValueAsMetadata(
-                bx.const_u64(u64::from(span.lo().to_u32()) | (u64::from(span.hi().to_u32()) << 32)),
-            )
-        }));
-        let md = unsafe { llvm::LLVMMDNodeInContext2(bx.llcx, srcloc.as_ptr(), srcloc.len()) };
-        let md = bx.get_metadata_value(md);
-        llvm::LLVMSetMetadata(call, kind, md);
+    // Store mark in a metadata node so we can map LLVM errors
+    // back to source locations. See #17552.
+    let key = "srcloc";
+    let kind = bx.get_md_kind_id(key);
 
-        Some(call)
-    } else {
-        // LLVM has detected an issue with our constraints, bail out
-        None
+    // `srcloc` contains one 64-bit integer for each line of assembly code,
+    // where the lower 32 bits hold the lo byte position and the upper 32 bits
+    // hold the hi byte position.
+    let mut srcloc = vec![];
+    if dia == llvm::AsmDialect::Intel && line_spans.len() > 1 {
+        // LLVM inserts an extra line to add the ".intel_syntax", so add
+        // a dummy srcloc entry for it.
+        //
+        // Don't do this if we only have 1 line span since that may be
+        // due to the asm template string coming from a macro. LLVM will
+        // default to the first srcloc for lines that don't have an
+        // associated srcloc.
+        srcloc.push(llvm::LLVMValueAsMetadata(bx.const_u64(0)));
     }
+    srcloc.extend(line_spans.iter().map(|span| {
+        llvm::LLVMValueAsMetadata(
+            bx.const_u64(u64::from(span.lo().to_u32()) | (u64::from(span.hi().to_u32()) << 32)),
+        )
+    }));
+    let md = unsafe { llvm::LLVMMDNodeInContext2(bx.llcx, srcloc.as_ptr(), srcloc.len()) };
+    let md = bx.get_metadata_value(md);
+    llvm::LLVMSetMetadata(call, kind, md);
+
+    Some(call)
 }
 
 /// If the register is an xmm/ymm/zmm register then return its index.
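Note: the `srcloc` encoding kept above packs each inline-asm line's span into a single u64 (lo byte offset in the low 32 bits, hi byte offset in the high 32 bits). A minimal standalone sketch of that packing; the helper names are illustrative, not rustc API:

    /// Pack a line's lo/hi byte offsets into one u64: lo in the low 32 bits,
    /// hi in the high 32 bits, matching the comment on `srcloc` above.
    fn pack_srcloc(lo: u32, hi: u32) -> u64 {
        u64::from(lo) | (u64::from(hi) << 32)
    }

    /// Recover the two offsets from a packed value.
    fn unpack_srcloc(packed: u64) -> (u32, u32) {
        (packed as u32, (packed >> 32) as u32)
    }

    fn main() {
        let packed = pack_srcloc(17, 42);
        assert_eq!(unpack_srcloc(packed), (17, 42));
    }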
diff --git a/compiler/rustc_codegen_llvm/src/attributes.rs b/compiler/rustc_codegen_llvm/src/attributes.rs
index e8c42d16733..27fd09745ff 100644
--- a/compiler/rustc_codegen_llvm/src/attributes.rs
+++ b/compiler/rustc_codegen_llvm/src/attributes.rs
@@ -1,11 +1,11 @@
 //! Set and unset common attributes on LLVM values.
-
-use rustc_attr_parsing::{InlineAttr, InstructionSetAttr, OptimizeAttr};
+use rustc_attr_data_structures::{InlineAttr, InstructionSetAttr, OptimizeAttr};
 use rustc_codegen_ssa::traits::*;
 use rustc_hir::def_id::DefId;
 use rustc_middle::middle::codegen_fn_attrs::{CodegenFnAttrFlags, PatchableFunctionEntry};
 use rustc_middle::ty::{self, TyCtxt};
 use rustc_session::config::{BranchProtection, FunctionReturn, OptLevel, PAuthKey, PacRet};
+use rustc_symbol_mangling::mangle_internal_symbol;
 use rustc_target::spec::{FramePointer, SanitizerSet, StackProbeType, StackProtector};
 use smallvec::SmallVec;
 
@@ -28,6 +28,22 @@ pub(crate) fn apply_to_callsite(callsite: &Value, idx: AttributePlace, attrs: &[
     }
 }
 
+pub(crate) fn has_attr(llfn: &Value, idx: AttributePlace, attr: AttributeKind) -> bool {
+    llvm::HasAttributeAtIndex(llfn, idx, attr)
+}
+
+pub(crate) fn has_string_attr(llfn: &Value, name: &str) -> bool {
+    llvm::HasStringAttribute(llfn, name)
+}
+
+pub(crate) fn remove_from_llfn(llfn: &Value, place: AttributePlace, kind: AttributeKind) {
+    llvm::RemoveRustEnumAttributeAtIndex(llfn, place, kind);
+}
+
+pub(crate) fn remove_string_attr_from_llfn(llfn: &Value, name: &str) {
+    llvm::RemoveStringAttrFromFn(llfn, name);
+}
+
 /// Get LLVM attribute for the provided inline heuristic.
 #[inline]
 fn inline_attr<'ll>(cx: &CodegenCx<'ll, '_>, inline: InlineAttr) -> Option<&'ll Attribute> {
@@ -241,11 +257,11 @@ fn probestack_attr<'ll>(cx: &CodegenCx<'ll, '_>) -> Option<&'ll Attribute> {
         StackProbeType::Inline => "inline-asm",
         // Flag our internal `__rust_probestack` function as the stack probe symbol.
         // This is defined in the `compiler-builtins` crate for each architecture.
-        StackProbeType::Call => "__rust_probestack",
+        StackProbeType::Call => &mangle_internal_symbol(cx.tcx, "__rust_probestack"),
         // Pick from the two above based on the LLVM version.
         StackProbeType::InlineOrCall { min_llvm_version_for_inline } => {
             if llvm_util::get_version() < min_llvm_version_for_inline {
-                "__rust_probestack"
+                &mangle_internal_symbol(cx.tcx, "__rust_probestack")
             } else {
                 "inline-asm"
             }
diff --git a/compiler/rustc_codegen_llvm/src/back/lto.rs b/compiler/rustc_codegen_llvm/src/back/lto.rs
index a8b49e9552c..ee46b49a094 100644
--- a/compiler/rustc_codegen_llvm/src/back/lto.rs
+++ b/compiler/rustc_codegen_llvm/src/back/lto.rs
@@ -28,8 +28,9 @@ use crate::back::write::{
 use crate::errors::{
     DynamicLinkingWithLTO, LlvmError, LtoBitcodeFromRlib, LtoDisallowed, LtoDylib, LtoProcMacro,
 };
+use crate::llvm::AttributePlace::Function;
 use crate::llvm::{self, build_string};
-use crate::{LlvmCodegenBackend, ModuleLlvm};
+use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx, attributes};
 
 /// We keep track of the computed LTO cache keys from the previous
 /// session to determine which CGUs we can reuse.
@@ -41,7 +42,8 @@ fn crate_type_allows_lto(crate_type: CrateType) -> bool {
         | CrateType::Dylib
         | CrateType::Staticlib
         | CrateType::Cdylib
-        | CrateType::ProcMacro => true,
+        | CrateType::ProcMacro
+        | CrateType::Sdylib => true,
         CrateType::Rlib => false,
     }
 }
@@ -584,12 +586,10 @@ fn thin_lto(
     }
 }
 
-fn enable_autodiff_settings(ad: &[config::AutoDiff], module: &mut ModuleCodegen<ModuleLlvm>) {
+fn enable_autodiff_settings(ad: &[config::AutoDiff]) {
     for &val in ad {
+        // We intentionally don't use a wildcard, to not forget handling anything new.
         match val {
-            config::AutoDiff::PrintModBefore => {
-                unsafe { llvm::LLVMDumpModule(module.module_llvm.llmod()) };
-            }
             config::AutoDiff::PrintPerf => {
                 llvm::set_print_perf(true);
             }
@@ -603,17 +603,23 @@ fn enable_autodiff_settings(ad: &[config::AutoDiff], module: &mut ModuleCodegen<
                 llvm::set_inline(true);
             }
             config::AutoDiff::LooseTypes => {
-                llvm::set_loose_types(false);
+                llvm::set_loose_types(true);
             }
             config::AutoDiff::PrintSteps => {
                 llvm::set_print(true);
             }
-            // We handle this below
+            // We handle this in the PassWrapper.cpp
+            config::AutoDiff::PrintPasses => {}
+            // We handle this in the PassWrapper.cpp
+            config::AutoDiff::PrintModBefore => {}
+            // We handle this in the PassWrapper.cpp
             config::AutoDiff::PrintModAfter => {}
-            // We handle this below
+            // We handle this in the PassWrapper.cpp
             config::AutoDiff::PrintModFinal => {}
             // This is required and already checked
             config::AutoDiff::Enable => {}
+            // We handle this below
+            config::AutoDiff::NoPostopt => {}
         }
     }
     // This helps with handling enums for now.
@@ -647,27 +653,52 @@ pub(crate) fn run_pass_manager(
     // We then run the llvm_optimize function a second time, to optimize the code which we generated
     // in the enzyme differentiation pass.
     let enable_ad = config.autodiff.contains(&config::AutoDiff::Enable);
-    let stage =
-        if enable_ad { write::AutodiffStage::DuringAD } else { write::AutodiffStage::PostAD };
+    let stage = if thin {
+        write::AutodiffStage::PreAD
+    } else {
+        if enable_ad { write::AutodiffStage::DuringAD } else { write::AutodiffStage::PostAD }
+    };
 
     if enable_ad {
-        enable_autodiff_settings(&config.autodiff, module);
+        enable_autodiff_settings(&config.autodiff);
     }
 
     unsafe {
         write::llvm_optimize(cgcx, dcx, module, None, config, opt_level, opt_stage, stage)?;
     }
 
-    if cfg!(llvm_enzyme) && enable_ad {
-        // This is the post-autodiff IR, mainly used for testing and educational purposes.
-        if config.autodiff.contains(&config::AutoDiff::PrintModAfter) {
-            unsafe { llvm::LLVMDumpModule(module.module_llvm.llmod()) };
+    if cfg!(llvm_enzyme) && enable_ad && !thin {
+        let cx =
+            SimpleCx::new(module.module_llvm.llmod(), &module.module_llvm.llcx, cgcx.pointer_size);
+
+        for function in cx.get_functions() {
+            let enzyme_marker = "enzyme_marker";
+            if attributes::has_string_attr(function, enzyme_marker) {
+                // Sanity check: Ensure 'noinline' is present before replacing it.
+                assert!(
+                    attributes::has_attr(function, Function, llvm::AttributeKind::NoInline),
+                    "Expected __enzyme function to have 'noinline' before adding 'alwaysinline'"
+                );
+
+                attributes::remove_from_llfn(function, Function, llvm::AttributeKind::NoInline);
+                attributes::remove_string_attr_from_llfn(function, enzyme_marker);
+
+                assert!(
+                    !attributes::has_string_attr(function, enzyme_marker),
+                    "Expected function to not have 'enzyme_marker'"
+                );
+
+                let always_inline = llvm::AttributeKind::AlwaysInline.create_attr(cx.llcx);
+                attributes::apply_to_llfn(function, Function, &[always_inline]);
+            }
         }
 
         let opt_stage = llvm::OptStage::FatLTO;
         let stage = write::AutodiffStage::PostAD;
-        unsafe {
-            write::llvm_optimize(cgcx, dcx, module, None, config, opt_level, opt_stage, stage)?;
+        if !config.autodiff.contains(&config::AutoDiff::NoPostopt) {
+            unsafe {
+                write::llvm_optimize(cgcx, dcx, module, None, config, opt_level, opt_stage, stage)?;
+            }
         }
 
         // This is the final IR, so people should be able to inspect the optimized autodiff output,
@@ -768,7 +799,7 @@ impl Drop for ThinBuffer {
     }
 }
 
-pub(crate) unsafe fn optimize_thin_module(
+pub(crate) fn optimize_thin_module(
     thin_module: ThinModule<LlvmCodegenBackend>,
     cgcx: &CodegenContext<LlvmCodegenBackend>,
 ) -> Result<ModuleCodegen<ModuleLlvm>, FatalError> {
diff --git a/compiler/rustc_codegen_llvm/src/back/write.rs b/compiler/rustc_codegen_llvm/src/back/write.rs
index 18d221d232e..bde6a9cf4bc 100644
--- a/compiler/rustc_codegen_llvm/src/back/write.rs
+++ b/compiler/rustc_codegen_llvm/src/back/write.rs
@@ -572,6 +572,10 @@ pub(crate) unsafe fn llvm_optimize(
 
     let consider_ad = cfg!(llvm_enzyme) && config.autodiff.contains(&config::AutoDiff::Enable);
     let run_enzyme = autodiff_stage == AutodiffStage::DuringAD;
+    let print_before_enzyme = config.autodiff.contains(&config::AutoDiff::PrintModBefore);
+    let print_after_enzyme = config.autodiff.contains(&config::AutoDiff::PrintModAfter);
+    let print_passes = config.autodiff.contains(&config::AutoDiff::PrintPasses);
+    let merge_functions;
     let unroll_loops;
     let vectorize_slp;
     let vectorize_loop;
@@ -579,13 +583,20 @@ pub(crate) unsafe fn llvm_optimize(
     // When we build rustc with enzyme/autodiff support, we want to postpone size-increasing
     // optimizations until after differentiation. Our pipeline is thus: (opt + enzyme), (full opt).
     // We therefore have two calls to llvm_optimize, if autodiff is used.
+    //
+    // We also must disable merge_functions, since autodiff placeholder/dummy bodies tend to be
+    // identical. We run opts before AD, so there is a chance that LLVM will merge our dummies.
+    // In that case, we lack some dummy bodies and can't replace them with the real AD code anymore.
+    // We then would need to abort compilation. This was especially common in test cases.
     if consider_ad && autodiff_stage != AutodiffStage::PostAD {
+        merge_functions = false;
         unroll_loops = false;
         vectorize_slp = false;
         vectorize_loop = false;
     } else {
         unroll_loops =
             opt_level != config::OptLevel::Size && opt_level != config::OptLevel::SizeMin;
+        merge_functions = config.merge_functions;
         vectorize_slp = config.vectorize_slp;
         vectorize_loop = config.vectorize_loop;
     }
@@ -663,13 +674,16 @@ pub(crate) unsafe fn llvm_optimize(
             thin_lto_buffer,
             config.emit_thin_lto,
             config.emit_thin_lto_summary,
-            config.merge_functions,
+            merge_functions,
             unroll_loops,
             vectorize_slp,
             vectorize_loop,
             config.no_builtins,
             config.emit_lifetime_markers,
             run_enzyme,
+            print_before_enzyme,
+            print_after_enzyme,
+            print_passes,
             sanitizer_options.as_ref(),
             pgo_gen_path.as_ref().map_or(std::ptr::null(), |s| s.as_ptr()),
             pgo_use_path.as_ref().map_or(std::ptr::null(), |s| s.as_ptr()),
@@ -690,7 +704,7 @@ pub(crate) unsafe fn llvm_optimize(
 }
 
 // Unsafe due to LLVM calls.
-pub(crate) unsafe fn optimize(
+pub(crate) fn optimize(
     cgcx: &CodegenContext<LlvmCodegenBackend>,
     dcx: DiagCtxtHandle<'_>,
     module: &mut ModuleCodegen<ModuleLlvm>,
@@ -801,7 +815,7 @@ pub(crate) fn link(
     Ok(modules.remove(0))
 }
 
-pub(crate) unsafe fn codegen(
+pub(crate) fn codegen(
     cgcx: &CodegenContext<LlvmCodegenBackend>,
     dcx: DiagCtxtHandle<'_>,
     module: ModuleCodegen<ModuleLlvm>,
@@ -1134,9 +1148,9 @@ unsafe fn embed_bitcode(
             // We need custom section flags, so emit module-level inline assembly.
             let section_flags = if cgcx.is_pe_coff { "n" } else { "e" };
             let asm = create_section_with_flags_asm(".llvmbc", section_flags, bitcode);
-            llvm::LLVMAppendModuleInlineAsm(llmod, asm.as_c_char_ptr(), asm.len());
+            llvm::append_module_inline_asm(llmod, &asm);
             let asm = create_section_with_flags_asm(".llvmcmd", section_flags, cmdline.as_bytes());
-            llvm::LLVMAppendModuleInlineAsm(llmod, asm.as_c_char_ptr(), asm.len());
+            llvm::append_module_inline_asm(llmod, &asm);
         }
     }
 }
diff --git a/compiler/rustc_codegen_llvm/src/base.rs b/compiler/rustc_codegen_llvm/src/base.rs
index 6bd27914dbd..5dda836988c 100644
--- a/compiler/rustc_codegen_llvm/src/base.rs
+++ b/compiler/rustc_codegen_llvm/src/base.rs
@@ -83,20 +83,27 @@ pub(crate) fn compile_codegen_unit(
         // Instantiate monomorphizations without filling out definitions yet...
         let llvm_module = ModuleLlvm::new(tcx, cgu_name.as_str());
         {
-            let cx = CodegenCx::new(tcx, cgu, &llvm_module);
+            let mut cx = CodegenCx::new(tcx, cgu, &llvm_module);
             let mono_items = cx.codegen_unit.items_in_deterministic_order(cx.tcx);
             for &(mono_item, data) in &mono_items {
-                mono_item.predefine::<Builder<'_, '_, '_>>(&cx, data.linkage, data.visibility);
+                mono_item.predefine::<Builder<'_, '_, '_>>(
+                    &mut cx,
+                    cgu_name.as_str(),
+                    data.linkage,
+                    data.visibility,
+                );
             }
 
             // ... and now that we have everything pre-defined, fill out those definitions.
-            for &(mono_item, _) in &mono_items {
-                mono_item.define::<Builder<'_, '_, '_>>(&cx);
+            for &(mono_item, item_data) in &mono_items {
+                mono_item.define::<Builder<'_, '_, '_>>(&mut cx, cgu_name.as_str(), item_data);
             }
 
             // If this codegen unit contains the main function, also create the
             // wrapper here
-            if let Some(entry) = maybe_create_entry_wrapper::<Builder<'_, '_, '_>>(&cx) {
+            if let Some(entry) =
+                maybe_create_entry_wrapper::<Builder<'_, '_, '_>>(&cx, cx.codegen_unit)
+            {
                 let attrs = attributes::sanitize_attrs(&cx, SanitizerSet::empty());
                 attributes::apply_to_llfn(entry, llvm::AttributePlace::Function, &attrs);
             }
@@ -108,14 +115,11 @@ pub(crate) fn compile_codegen_unit(
             }
 
             // Create the llvm.used and llvm.compiler.used variables.
-            if !cx.used_statics.borrow().is_empty() {
-                cx.create_used_variable_impl(c"llvm.used", &*cx.used_statics.borrow());
+            if !cx.used_statics.is_empty() {
+                cx.create_used_variable_impl(c"llvm.used", &cx.used_statics);
             }
-            if !cx.compiler_used_statics.borrow().is_empty() {
-                cx.create_used_variable_impl(
-                    c"llvm.compiler.used",
-                    &*cx.compiler_used_statics.borrow(),
-                );
+            if !cx.compiler_used_statics.is_empty() {
+                cx.create_used_variable_impl(c"llvm.compiler.used", &cx.compiler_used_statics);
             }
 
             // Run replace-all-uses-with for statics that need it. This must
diff --git a/compiler/rustc_codegen_llvm/src/builder.rs b/compiler/rustc_codegen_llvm/src/builder.rs
index 04c8118b616..ec006b59192 100644
--- a/compiler/rustc_codegen_llvm/src/builder.rs
+++ b/compiler/rustc_codegen_llvm/src/builder.rs
@@ -361,7 +361,7 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
 
         // Emit KCFI operand bundle
         let kcfi_bundle = self.kcfi_operand_bundle(fn_attrs, fn_abi, instance, llfn);
-        if let Some(kcfi_bundle) = kcfi_bundle.as_ref().map(|b| b.raw()) {
+        if let Some(kcfi_bundle) = kcfi_bundle.as_ref().map(|b| b.as_ref()) {
             bundles.push(kcfi_bundle);
         }
 
@@ -612,7 +612,7 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
         &mut self,
         ty: &'ll Type,
         ptr: &'ll Value,
-        order: rustc_codegen_ssa::common::AtomicOrdering,
+        order: rustc_middle::ty::AtomicOrdering,
         size: Size,
     ) -> &'ll Value {
         unsafe {
@@ -851,7 +851,7 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
         &mut self,
         val: &'ll Value,
         ptr: &'ll Value,
-        order: rustc_codegen_ssa::common::AtomicOrdering,
+        order: rustc_middle::ty::AtomicOrdering,
         size: Size,
     ) {
         debug!("Store {:?} -> {:?}", val, ptr);
@@ -1307,8 +1307,8 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
         dst: &'ll Value,
         cmp: &'ll Value,
         src: &'ll Value,
-        order: rustc_codegen_ssa::common::AtomicOrdering,
-        failure_order: rustc_codegen_ssa::common::AtomicOrdering,
+        order: rustc_middle::ty::AtomicOrdering,
+        failure_order: rustc_middle::ty::AtomicOrdering,
         weak: bool,
     ) -> (&'ll Value, &'ll Value) {
         let weak = if weak { llvm::True } else { llvm::False };
@@ -1334,7 +1334,7 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
         op: rustc_codegen_ssa::common::AtomicRmwBinOp,
         dst: &'ll Value,
         mut src: &'ll Value,
-        order: rustc_codegen_ssa::common::AtomicOrdering,
+        order: rustc_middle::ty::AtomicOrdering,
     ) -> &'ll Value {
         // The only RMW operation that LLVM supports on pointers is compare-exchange.
         let requires_cast_to_int = self.val_ty(src) == self.type_ptr()
@@ -1360,7 +1360,7 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
 
     fn atomic_fence(
         &mut self,
-        order: rustc_codegen_ssa::common::AtomicOrdering,
+        order: rustc_middle::ty::AtomicOrdering,
         scope: SynchronizationScope,
     ) {
         let single_threaded = match scope {
@@ -1416,7 +1416,7 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
 
         // Emit KCFI operand bundle
         let kcfi_bundle = self.kcfi_operand_bundle(fn_attrs, fn_abi, instance, llfn);
-        if let Some(kcfi_bundle) = kcfi_bundle.as_ref().map(|b| b.raw()) {
+        if let Some(kcfi_bundle) = kcfi_bundle.as_ref().map(|b| b.as_ref()) {
             bundles.push(kcfi_bundle);
         }
 
@@ -1452,9 +1452,15 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
 impl<'ll> StaticBuilderMethods for Builder<'_, 'll, '_> {
     fn get_static(&mut self, def_id: DefId) -> &'ll Value {
         // Forward to the `get_static` method of `CodegenCx`
-        let s = self.cx().get_static(def_id);
-        // Cast to default address space if globals are in a different addrspace
-        self.cx().const_pointercast(s, self.type_ptr())
+        let global = self.cx().get_static(def_id);
+        if self.cx().tcx.is_thread_local_static(def_id) {
+            let pointer = self.call_intrinsic("llvm.threadlocal.address", &[global]);
+            // Cast to default address space if globals are in a different addrspace
+            self.pointercast(pointer, self.type_ptr())
+        } else {
+            // Cast to default address space if globals are in a different addrspace
+            self.cx().const_pointercast(global, self.type_ptr())
+        }
     }
 }
 
@@ -1749,7 +1755,7 @@ impl<'a, 'll, 'tcx> Builder<'a, 'll, 'tcx> {
 
         // Emit KCFI operand bundle
         let kcfi_bundle = self.kcfi_operand_bundle(fn_attrs, fn_abi, instance, llfn);
-        if let Some(kcfi_bundle) = kcfi_bundle.as_ref().map(|b| b.raw()) {
+        if let Some(kcfi_bundle) = kcfi_bundle.as_ref().map(|b| b.as_ref()) {
             bundles.push(kcfi_bundle);
         }
 
@@ -1809,8 +1815,11 @@ impl<'a, 'll, 'tcx> Builder<'a, 'll, 'tcx> {
             let typeid_metadata = self.cx.typeid_metadata(typeid).unwrap();
             let dbg_loc = self.get_dbg_loc();
 
-            // Test whether the function pointer is associated with the type identifier.
-            let cond = self.type_test(llfn, typeid_metadata);
+            // Test whether the function pointer is associated with the type identifier using the
+            // llvm.type.test intrinsic. The LowerTypeTests link-time optimization pass replaces
+            // calls to this intrinsic with code to test type membership.
+            let typeid = self.get_metadata_value(typeid_metadata);
+            let cond = self.call_intrinsic("llvm.type.test", &[llfn, typeid]);
             let bb_pass = self.append_sibling_block("type_test.pass");
             let bb_fail = self.append_sibling_block("type_test.fail");
             self.cond_br(cond, bb_pass, bb_fail);
@@ -1836,7 +1845,7 @@ impl<'a, 'll, 'tcx> Builder<'a, 'll, 'tcx> {
         fn_abi: Option<&FnAbi<'tcx, Ty<'tcx>>>,
         instance: Option<Instance<'tcx>>,
         llfn: &'ll Value,
-    ) -> Option<llvm::OperandBundleOwned<'ll>> {
+    ) -> Option<llvm::OperandBundleBox<'ll>> {
         let is_indirect_call = unsafe { llvm::LLVMRustIsNonGVFunctionPointerTy(llfn) };
         let kcfi_bundle = if self.tcx.sess.is_sanitizer_kcfi_enabled()
             && let Some(fn_abi) = fn_abi
@@ -1862,7 +1871,7 @@ impl<'a, 'll, 'tcx> Builder<'a, 'll, 'tcx> {
                 kcfi::typeid_for_fnabi(self.tcx, fn_abi, options)
             };
 
-            Some(llvm::OperandBundleOwned::new("kcfi", &[self.const_u32(kcfi_typeid)]))
+            Some(llvm::OperandBundleBox::new("kcfi", &[self.const_u32(kcfi_typeid)]))
         } else {
             None
         };
diff --git a/compiler/rustc_codegen_llvm/src/builder/autodiff.rs b/compiler/rustc_codegen_llvm/src/builder/autodiff.rs
index e7c071d05aa..c5c13ac097a 100644
--- a/compiler/rustc_codegen_llvm/src/builder/autodiff.rs
+++ b/compiler/rustc_codegen_llvm/src/builder/autodiff.rs
@@ -361,6 +361,11 @@ fn generate_enzyme_call<'ll>(
         let attr = llvm::AttributeKind::NoInline.create_attr(cx.llcx);
         attributes::apply_to_llfn(ad_fn, Function, &[attr]);
 
+        // We add a made-up attribute just such that we can recognize it after AD to update
+        // (no)-inline attributes. We'll then also remove this attribute.
+        let enzyme_marker_attr = llvm::CreateAttrString(cx.llcx, "enzyme_marker");
+        attributes::apply_to_llfn(outer_fn, Function, &[enzyme_marker_attr]);
+
         // first, remove all calls from fnc
         let entry = llvm::LLVMGetFirstBasicBlock(outer_fn);
         let br = llvm::LLVMRustGetTerminator(entry);
@@ -473,7 +478,7 @@ pub(crate) fn differentiate<'ll>(
         return Err(diag_handler.handle().emit_almost_fatal(AutoDiffWithoutEnable));
     }
 
-    // Before dumping the module, we want all the TypeTrees to become part of the module.
+    // Here we replace the placeholder code with the actual autodiff code, which calls Enzyme.
     for item in diff_items.iter() {
         let name = item.source.clone();
         let fn_def: Option<&llvm::Value> = cx.get_function(&name);
diff --git a/compiler/rustc_codegen_llvm/src/callee.rs b/compiler/rustc_codegen_llvm/src/callee.rs
index ea9ab5c02bd..6d68eca60af 100644
--- a/compiler/rustc_codegen_llvm/src/callee.rs
+++ b/compiler/rustc_codegen_llvm/src/callee.rs
@@ -103,7 +103,7 @@ pub(crate) fn get_fn<'ll, 'tcx>(cx: &CodegenCx<'ll, 'tcx>, instance: Instance<'t
             // This is a monomorphization of a generic function.
             if !(cx.tcx.sess.opts.share_generics()
                 || tcx.codegen_fn_attrs(instance_def_id).inline
-                    == rustc_attr_parsing::InlineAttr::Never)
+                    == rustc_attr_data_structures::InlineAttr::Never)
             {
                 // When not sharing generics, all instances are in the same
                 // crate and have hidden visibility.
diff --git a/compiler/rustc_codegen_llvm/src/common.rs b/compiler/rustc_codegen_llvm/src/common.rs
index a6f277e4455..3cfa96393e9 100644
--- a/compiler/rustc_codegen_llvm/src/common.rs
+++ b/compiler/rustc_codegen_llvm/src/common.rs
@@ -67,12 +67,12 @@ use crate::value::Value;
 /// the `OperandBundleDef` value created for MSVC landing pads.
 pub(crate) struct Funclet<'ll> {
     cleanuppad: &'ll Value,
-    operand: llvm::OperandBundleOwned<'ll>,
+    operand: llvm::OperandBundleBox<'ll>,
 }
 
 impl<'ll> Funclet<'ll> {
     pub(crate) fn new(cleanuppad: &'ll Value) -> Self {
-        Funclet { cleanuppad, operand: llvm::OperandBundleOwned::new("funclet", &[cleanuppad]) }
+        Funclet { cleanuppad, operand: llvm::OperandBundleBox::new("funclet", &[cleanuppad]) }
     }
 
     pub(crate) fn cleanuppad(&self) -> &'ll Value {
@@ -80,7 +80,7 @@ impl<'ll> Funclet<'ll> {
     }
 
     pub(crate) fn bundle(&self) -> &llvm::OperandBundle<'ll> {
-        self.operand.raw()
+        self.operand.as_ref()
     }
 }
 
diff --git a/compiler/rustc_codegen_llvm/src/consts.rs b/compiler/rustc_codegen_llvm/src/consts.rs
index bf81eb648f8..a4492d76c3c 100644
--- a/compiler/rustc_codegen_llvm/src/consts.rs
+++ b/compiler/rustc_codegen_llvm/src/consts.rs
@@ -1,8 +1,6 @@
 use std::ops::Range;
 
-use rustc_abi::{
-    Align, AlignFromBytesError, HasDataLayout, Primitive, Scalar, Size, WrappingRange,
-};
+use rustc_abi::{Align, HasDataLayout, Primitive, Scalar, Size, WrappingRange};
 use rustc_codegen_ssa::common;
 use rustc_codegen_ssa::traits::*;
 use rustc_hir::LangItem;
@@ -20,9 +18,7 @@ use rustc_middle::{bug, span_bug};
 use tracing::{debug, instrument, trace};
 
 use crate::common::{AsCCharPtr, CodegenCx};
-use crate::errors::{
-    InvalidMinimumAlignmentNotPowerOfTwo, InvalidMinimumAlignmentTooLarge, SymbolAlreadyDefined,
-};
+use crate::errors::SymbolAlreadyDefined;
 use crate::llvm::{self, True};
 use crate::type_::Type;
 use crate::type_of::LayoutLlvmExt;
@@ -149,22 +145,10 @@ fn set_global_alignment<'ll>(cx: &CodegenCx<'ll, '_>, gv: &'ll Value, mut align:
     // The target may require greater alignment for globals than the type does.
     // Note: GCC and Clang also allow `__attribute__((aligned))` on variables,
     // which can force it to be smaller. Rust doesn't support this yet.
-    if let Some(min) = cx.sess().target.min_global_align {
-        match Align::from_bits(min) {
-            Ok(min) => align = align.max(min),
-            Err(err) => match err {
-                AlignFromBytesError::NotPowerOfTwo(align) => {
-                    cx.sess().dcx().emit_err(InvalidMinimumAlignmentNotPowerOfTwo { align });
-                }
-                AlignFromBytesError::TooLarge(align) => {
-                    cx.sess().dcx().emit_err(InvalidMinimumAlignmentTooLarge { align });
-                }
-            },
-        }
-    }
-    unsafe {
-        llvm::LLVMSetAlignment(gv, align.bytes() as u32);
+    if let Some(min_global) = cx.sess().target.min_global_align {
+        align = Ord::max(align, min_global);
     }
+    llvm::set_alignment(gv, align);
 }
 
 fn check_and_apply_linkage<'ll, 'tcx>(
@@ -411,7 +395,7 @@ impl<'ll> CodegenCx<'ll, '_> {
         g
     }
 
-    fn codegen_static_item(&self, def_id: DefId) {
+    fn codegen_static_item(&mut self, def_id: DefId) {
         unsafe {
             assert!(
                 llvm::LLVMGetInitializer(
@@ -527,7 +511,7 @@ impl<'ll> CodegenCx<'ll, '_> {
 
             base::set_variable_sanitizer_attrs(g, attrs);
 
-            if attrs.flags.contains(CodegenFnAttrFlags::USED) {
+            if attrs.flags.contains(CodegenFnAttrFlags::USED_COMPILER) {
                 // `USED` and `USED_LINKER` can't be used together.
                 assert!(!attrs.flags.contains(CodegenFnAttrFlags::USED_LINKER));
 
@@ -541,22 +525,33 @@ impl<'ll> CodegenCx<'ll, '_> {
                 // in the handling of `.init_array` (the static constructor list) in versions of
                 // the gold linker (prior to the one released with binutils 2.36).
                 //
-                // That said, we only ever emit these when compiling for ELF targets, unless
-                // `#[used(compiler)]` is explicitly requested. This is to avoid similar breakage
-                // on other targets, in particular MachO targets have *their* static constructor
-                // lists broken if `llvm.compiler.used` is emitted rather than `llvm.used`. However,
-                // that check happens when assigning the `CodegenFnAttrFlags` in
-                // `rustc_hir_analysis`, so we don't need to take care of it here.
+                // That said, we only ever emit these when `#[used(compiler)]` is explicitly
+                // requested. This is to avoid similar breakage on other targets, in particular
+                // MachO targets have *their* static constructor lists broken if `llvm.compiler.used`
+                // is emitted rather than `llvm.used`. However, that check happens when assigning
+                // the `CodegenFnAttrFlags` in the `codegen_fn_attrs` query, so we don't need to
+                // take care of it here.
                 self.add_compiler_used_global(g);
             }
             if attrs.flags.contains(CodegenFnAttrFlags::USED_LINKER) {
                 // `USED` and `USED_LINKER` can't be used together.
-                assert!(!attrs.flags.contains(CodegenFnAttrFlags::USED));
+                assert!(!attrs.flags.contains(CodegenFnAttrFlags::USED_COMPILER));
 
                 self.add_used_global(g);
             }
         }
     }
+
+    /// Add a global value to a list to be stored in the `llvm.used` variable, an array of ptr.
+    pub(crate) fn add_used_global(&mut self, global: &'ll Value) {
+        self.used_statics.push(global);
+    }
+
+    /// Add a global value to a list to be stored in the `llvm.compiler.used` variable,
+    /// an array of ptr.
+    pub(crate) fn add_compiler_used_global(&mut self, global: &'ll Value) {
+        self.compiler_used_statics.push(global);
+    }
 }
 
 impl<'ll> StaticCodegenMethods for CodegenCx<'ll, '_> {
@@ -571,18 +566,7 @@ impl<'ll> StaticCodegenMethods for CodegenCx<'ll, '_> {
         self.const_pointercast(gv, self.type_ptr())
     }
 
-    fn codegen_static(&self, def_id: DefId) {
+    fn codegen_static(&mut self, def_id: DefId) {
         self.codegen_static_item(def_id)
     }
-
-    /// Add a global value to a list to be stored in the `llvm.used` variable, an array of ptr.
-    fn add_used_global(&self, global: &'ll Value) {
-        self.used_statics.borrow_mut().push(global);
-    }
-
-    /// Add a global value to a list to be stored in the `llvm.compiler.used` variable,
-    /// an array of ptr.
-    fn add_compiler_used_global(&self, global: &'ll Value) {
-        self.compiler_used_statics.borrow_mut().push(global);
-    }
 }
diff --git a/compiler/rustc_codegen_llvm/src/context.rs b/compiler/rustc_codegen_llvm/src/context.rs
index 4ec69995518..8d6e1d8941b 100644
--- a/compiler/rustc_codegen_llvm/src/context.rs
+++ b/compiler/rustc_codegen_llvm/src/context.rs
@@ -2,7 +2,7 @@ use std::borrow::Borrow;
 use std::cell::{Cell, RefCell};
 use std::ffi::{CStr, c_char, c_uint};
 use std::marker::PhantomData;
-use std::ops::Deref;
+use std::ops::{Deref, DerefMut};
 use std::str;
 
 use rustc_abi::{HasDataLayout, Size, TargetDataLayout, VariantIdx};
@@ -77,6 +77,13 @@ impl<'ll, T: Borrow<SCx<'ll>>> Deref for GenericCx<'ll, T> {
     }
 }
 
+impl<'ll, T: Borrow<SCx<'ll>>> DerefMut for GenericCx<'ll, T> {
+    #[inline]
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.0
+    }
+}
+
 pub(crate) type SimpleCx<'ll> = GenericCx<'ll, SCx<'ll>>;
 
 /// There is one `CodegenCx` per codegen unit. Each one has its own LLVM
@@ -110,11 +117,11 @@ pub(crate) struct FullCx<'ll, 'tcx> {
 
     /// Statics that will be placed in the llvm.used variable
     /// See <https://llvm.org/docs/LangRef.html#the-llvm-used-global-variable> for details
-    pub used_statics: RefCell<Vec<&'ll Value>>,
+    pub used_statics: Vec<&'ll Value>,
 
     /// Statics that will be placed in the llvm.compiler.used variable
     /// See <https://llvm.org/docs/LangRef.html#the-llvm-compiler-used-global-variable> for details
-    pub compiler_used_statics: RefCell<Vec<&'ll Value>>,
+    pub compiler_used_statics: Vec<&'ll Value>,
 
     /// Mapping of non-scalar types to llvm types.
     pub type_lowering: RefCell<FxHashMap<(Ty<'tcx>, Option<VariantIdx>), &'ll Type>>,
@@ -606,8 +613,8 @@ impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> {
                 const_str_cache: Default::default(),
                 const_globals: Default::default(),
                 statics_to_rauw: RefCell::new(Vec::new()),
-                used_statics: RefCell::new(Vec::new()),
-                compiler_used_statics: RefCell::new(Vec::new()),
+                used_statics: Vec::new(),
+                compiler_used_statics: Vec::new(),
                 type_lowering: Default::default(),
                 scalar_lltypes: Default::default(),
                 coverage_cx,
@@ -698,6 +705,16 @@ impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> {
             llvm::LLVMMDStringInContext2(self.llcx(), name.as_ptr() as *const c_char, name.len())
         })
     }
+
+    pub(crate) fn get_functions(&self) -> Vec<&'ll Value> {
+        let mut functions = vec![];
+        let mut func = unsafe { llvm::LLVMGetFirstFunction(self.llmod()) };
+        while let Some(f) = func {
+            functions.push(f);
+            func = unsafe { llvm::LLVMGetNextFunction(f) }
+        }
+        functions
+    }
 }
 
 impl<'ll, 'tcx> MiscCodegenMethods<'tcx> for CodegenCx<'ll, 'tcx> {
@@ -791,10 +808,6 @@ impl<'ll, 'tcx> MiscCodegenMethods<'tcx> for CodegenCx<'ll, 'tcx> {
         self.tcx.sess
     }
 
-    fn codegen_unit(&self) -> &'tcx CodegenUnit<'tcx> {
-        self.codegen_unit
-    }
-
     fn set_frame_pointer_type(&self, llfn: &'ll Value) {
         if let Some(attr) = attributes::frame_pointer_type_attr(self) {
             attributes::apply_to_llfn(llfn, llvm::AttributePlace::Function, &[attr]);
@@ -999,11 +1012,27 @@ impl<'ll> CodegenCx<'ll, '_> {
         ifn!("llvm.minnum.f64", fn(t_f64, t_f64) -> t_f64);
         ifn!("llvm.minnum.f128", fn(t_f128, t_f128) -> t_f128);
 
+        ifn!("llvm.minimum.f16", fn(t_f16, t_f16) -> t_f16);
+        ifn!("llvm.minimum.f32", fn(t_f32, t_f32) -> t_f32);
+        ifn!("llvm.minimum.f64", fn(t_f64, t_f64) -> t_f64);
+        // There are issues on x86_64 and aarch64 with the f128 variant.
+        //  - https://github.com/llvm/llvm-project/issues/139380
+        //  - https://github.com/llvm/llvm-project/issues/139381
+        // ifn!("llvm.minimum.f128", fn(t_f128, t_f128) -> t_f128);
+
         ifn!("llvm.maxnum.f16", fn(t_f16, t_f16) -> t_f16);
         ifn!("llvm.maxnum.f32", fn(t_f32, t_f32) -> t_f32);
         ifn!("llvm.maxnum.f64", fn(t_f64, t_f64) -> t_f64);
         ifn!("llvm.maxnum.f128", fn(t_f128, t_f128) -> t_f128);
 
+        ifn!("llvm.maximum.f16", fn(t_f16, t_f16) -> t_f16);
+        ifn!("llvm.maximum.f32", fn(t_f32, t_f32) -> t_f32);
+        ifn!("llvm.maximum.f64", fn(t_f64, t_f64) -> t_f64);
+        // There are issues on x86_64 and aarch64 with the f128 variant.
+        //  - https://github.com/llvm/llvm-project/issues/139380
+        //  - https://github.com/llvm/llvm-project/issues/139381
+        // ifn!("llvm.maximum.f128", fn(t_f128, t_f128) -> t_f128);
+
         ifn!("llvm.floor.f16", fn(t_f16) -> t_f16);
         ifn!("llvm.floor.f32", fn(t_f32) -> t_f32);
         ifn!("llvm.floor.f64", fn(t_f64) -> t_f64);
@@ -1214,6 +1243,7 @@ impl<'ll> CodegenCx<'ll, '_> {
         }
 
         ifn!("llvm.ptrmask", fn(ptr, t_isize) -> ptr);
+        ifn!("llvm.threadlocal.address", fn(ptr) -> ptr);
 
         None
     }
diff --git a/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen.rs b/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen.rs
index 55b1e728b70..a9be833a643 100644
--- a/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen.rs
+++ b/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen.rs
@@ -2,9 +2,7 @@ use std::sync::Arc;
 
 use itertools::Itertools;
 use rustc_abi::Align;
-use rustc_codegen_ssa::traits::{
-    BaseTypeCodegenMethods, ConstCodegenMethods, StaticCodegenMethods,
-};
+use rustc_codegen_ssa::traits::{BaseTypeCodegenMethods, ConstCodegenMethods};
 use rustc_data_structures::fx::FxIndexMap;
 use rustc_index::IndexVec;
 use rustc_middle::ty::TyCtxt;
@@ -27,7 +25,7 @@ mod unused;
 ///
 /// Those sections are then read and understood by LLVM's `llvm-cov` tool,
 /// which is distributed in the `llvm-tools` rustup component.
-pub(crate) fn finalize(cx: &CodegenCx<'_, '_>) {
+pub(crate) fn finalize(cx: &mut CodegenCx<'_, '_>) {
     let tcx = cx.tcx;
 
     // Ensure that LLVM is using a version of the coverage mapping format that
@@ -62,6 +60,7 @@ pub(crate) fn finalize(cx: &CodegenCx<'_, '_>) {
         .sorted_by_cached_key(|&instance| tcx.symbol_name(instance).name)
         .filter_map(|instance| prepare_covfun_record(tcx, instance, true))
         .collect::<Vec<_>>();
+    drop(instances_used);
 
     // In a single designated CGU, also prepare covfun records for functions
     // in this crate that were instrumented for coverage, but are unused.
@@ -206,7 +205,7 @@ impl VirtualFileMapping {
 /// Generates the contents of the covmap record for this CGU, which mostly
 /// consists of a header and a list of filenames. The record is then stored
 /// as a global variable in the `__llvm_covmap` section.
-fn generate_covmap_record<'ll>(cx: &CodegenCx<'ll, '_>, version: u32, filenames_buffer: &[u8]) {
+fn generate_covmap_record<'ll>(cx: &mut CodegenCx<'ll, '_>, version: u32, filenames_buffer: &[u8]) {
     // A covmap record consists of four target-endian u32 values, followed by
     // the encoded filenames table. Two of the header fields are unused in
     // modern versions of the LLVM coverage mapping format, and are always 0.
diff --git a/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen/covfun.rs b/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen/covfun.rs
index 7bdbc685952..b704cf2b1cd 100644
--- a/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen/covfun.rs
+++ b/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen/covfun.rs
@@ -8,9 +8,7 @@ use std::ffi::CString;
 use std::sync::Arc;
 
 use rustc_abi::Align;
-use rustc_codegen_ssa::traits::{
-    BaseTypeCodegenMethods as _, ConstCodegenMethods, StaticCodegenMethods,
-};
+use rustc_codegen_ssa::traits::{BaseTypeCodegenMethods as _, ConstCodegenMethods};
 use rustc_middle::mir::coverage::{
     BasicCoverageBlock, CovTerm, CoverageIdsInfo, Expression, FunctionCoverageInfo, Mapping,
     MappingKind, Op,
@@ -181,7 +179,7 @@ fn fill_region_tables<'tcx>(
 /// contains the function's coverage mapping data. The record is then stored
 /// as a global variable in the `__llvm_covfun` section.
 pub(crate) fn generate_covfun_record<'tcx>(
-    cx: &CodegenCx<'_, 'tcx>,
+    cx: &mut CodegenCx<'_, 'tcx>,
     global_file_table: &GlobalFileTable,
     covfun: &CovfunRecord<'tcx>,
 ) {
diff --git a/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen/unused.rs b/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen/unused.rs
index 68f60f169b5..fe3a7a1580b 100644
--- a/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen/unused.rs
+++ b/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen/unused.rs
@@ -157,7 +157,7 @@ fn make_dummy_instance<'tcx>(tcx: TyCtxt<'tcx>, local_def_id: LocalDefId) -> ty:
     let def_id = local_def_id.to_def_id();
 
     // Make a dummy instance that fills in all generics with placeholders.
-    ty::Instance::new(
+    ty::Instance::new_raw(
         def_id,
         ty::GenericArgs::for_item(tcx, def_id, |param, _| {
             if let ty::GenericParamDefKind::Lifetime = param.kind {
diff --git a/compiler/rustc_codegen_llvm/src/coverageinfo/mod.rs b/compiler/rustc_codegen_llvm/src/coverageinfo/mod.rs
index ea7f581a3cb..eefbd7cf6c4 100644
--- a/compiler/rustc_codegen_llvm/src/coverageinfo/mod.rs
+++ b/compiler/rustc_codegen_llvm/src/coverageinfo/mod.rs
@@ -56,7 +56,7 @@ impl<'ll, 'tcx> CguCoverageContext<'ll, 'tcx> {
 }
 
 impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> {
-    pub(crate) fn coverageinfo_finalize(&self) {
+    pub(crate) fn coverageinfo_finalize(&mut self) {
         mapgen::finalize(self)
     }
 
diff --git a/compiler/rustc_codegen_llvm/src/debuginfo/gdb.rs b/compiler/rustc_codegen_llvm/src/debuginfo/gdb.rs
index 4ffe551df09..8f0948b8183 100644
--- a/compiler/rustc_codegen_llvm/src/debuginfo/gdb.rs
+++ b/compiler/rustc_codegen_llvm/src/debuginfo/gdb.rs
@@ -95,7 +95,11 @@ pub(crate) fn needs_gdb_debug_scripts_section(cx: &CodegenCx<'_, '_>) -> bool {
     // in the `.debug_gdb_scripts` section. For that reason, we make sure that the
     // section is only emitted for leaf crates.
     let embed_visualizers = cx.tcx.crate_types().iter().any(|&crate_type| match crate_type {
-        CrateType::Executable | CrateType::Dylib | CrateType::Cdylib | CrateType::Staticlib => {
+        CrateType::Executable
+        | CrateType::Dylib
+        | CrateType::Cdylib
+        | CrateType::Staticlib
+        | CrateType::Sdylib => {
             // These are crate types for which we will embed pretty printers since they
             // are treated as leaf crates.
             true
diff --git a/compiler/rustc_codegen_llvm/src/debuginfo/metadata/enums/cpp_like.rs b/compiler/rustc_codegen_llvm/src/debuginfo/metadata/enums/cpp_like.rs
index 07075be55fa..a5c80895741 100644
--- a/compiler/rustc_codegen_llvm/src/debuginfo/metadata/enums/cpp_like.rs
+++ b/compiler/rustc_codegen_llvm/src/debuginfo/metadata/enums/cpp_like.rs
@@ -1,7 +1,7 @@
 use std::borrow::Cow;
 
 use libc::c_uint;
-use rustc_abi::{Align, Endian, Size, TagEncoding, VariantIdx, Variants};
+use rustc_abi::{Align, Endian, FieldIdx, Size, TagEncoding, VariantIdx, Variants};
 use rustc_codegen_ssa::debuginfo::type_names::compute_debuginfo_type_name;
 use rustc_codegen_ssa::debuginfo::{tag_base_type, wants_c_like_enum_debuginfo};
 use rustc_codegen_ssa::traits::{ConstCodegenMethods, MiscCodegenMethods};
@@ -401,7 +401,7 @@ fn build_union_fields_for_enum<'ll, 'tcx>(
     enum_type_and_layout: TyAndLayout<'tcx>,
     enum_type_di_node: &'ll DIType,
     variant_indices: impl Iterator<Item = VariantIdx> + Clone,
-    tag_field: usize,
+    tag_field: FieldIdx,
     untagged_variant_index: Option<VariantIdx>,
 ) -> SmallVec<&'ll DIType> {
     let tag_base_type = tag_base_type(cx.tcx, enum_type_and_layout);
@@ -721,8 +721,7 @@ fn build_union_fields_for_direct_tag_coroutine<'ll, 'tcx>(
         _ => unreachable!(),
     };
 
-    let coroutine_layout =
-        cx.tcx.coroutine_layout(coroutine_def_id, coroutine_args.kind_ty()).unwrap();
+    let coroutine_layout = cx.tcx.coroutine_layout(coroutine_def_id, coroutine_args.args).unwrap();
 
     let common_upvar_names = cx.tcx.closure_saved_names_of_captured_variables(coroutine_def_id);
     let variant_range = coroutine_args.variant_range(coroutine_def_id, cx.tcx);
@@ -806,7 +805,7 @@ fn build_union_fields_for_direct_tag_enum_or_coroutine<'ll, 'tcx>(
     variant_field_infos: &[VariantFieldInfo<'ll>],
     discr_type_di_node: &'ll DIType,
     tag_base_type: Ty<'tcx>,
-    tag_field: usize,
+    tag_field: FieldIdx,
     untagged_variant_index: Option<VariantIdx>,
     di_flags: DIFlags,
 ) -> SmallVec<&'ll DIType> {
@@ -859,7 +858,7 @@ fn build_union_fields_for_direct_tag_enum_or_coroutine<'ll, 'tcx>(
     }));
 
     assert_eq!(
-        cx.size_and_align_of(enum_type_and_layout.field(cx, tag_field).ty),
+        cx.size_and_align_of(enum_type_and_layout.field(cx, tag_field.as_usize()).ty),
         cx.size_and_align_of(self::tag_base_type(cx.tcx, enum_type_and_layout))
     );
 
@@ -876,7 +875,7 @@ fn build_union_fields_for_direct_tag_enum_or_coroutine<'ll, 'tcx>(
             Endian::Big => (8, 0),
         };
 
-        let tag_field_offset = enum_type_and_layout.fields.offset(tag_field).bytes();
+        let tag_field_offset = enum_type_and_layout.fields.offset(tag_field.as_usize()).bytes();
         let lo_offset = Size::from_bytes(tag_field_offset + lo_offset);
         let hi_offset = Size::from_bytes(tag_field_offset + hi_offset);
 
@@ -906,8 +905,8 @@ fn build_union_fields_for_direct_tag_enum_or_coroutine<'ll, 'tcx>(
             cx,
             enum_type_di_node,
             TAG_FIELD_NAME,
-            enum_type_and_layout.field(cx, tag_field),
-            enum_type_and_layout.fields.offset(tag_field),
+            enum_type_and_layout.field(cx, tag_field.as_usize()),
+            enum_type_and_layout.fields.offset(tag_field.as_usize()),
             di_flags,
             tag_base_type_di_node,
             None,
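To make the endianness handling in the hunk above concrete: the (assumed) 128-bit tag is described to the debugger as two 64-bit halves, and the byte offset of each half within the tag field depends on the target's endianness. A hedged sketch follows; `tag_half_offsets` is a hypothetical helper, and the little-endian arm is inferred from the `Endian::Big => (8, 0)` arm shown above.

    // Hypothetical helper illustrating the lo/hi offsets used above.
    fn tag_half_offsets(endian: Endian, tag_field_offset: u64) -> (Size, Size) {
        // (lo_offset, hi_offset) in bytes, relative to the start of the tag field.
        let (lo, hi) = match endian {
            Endian::Little => (0, 8), // low half sits first in memory
            Endian::Big => (8, 0),    // high half sits first in memory
        };
        (Size::from_bytes(tag_field_offset + lo), Size::from_bytes(tag_field_offset + hi))
    }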
diff --git a/compiler/rustc_codegen_llvm/src/debuginfo/metadata/enums/native.rs b/compiler/rustc_codegen_llvm/src/debuginfo/metadata/enums/native.rs
index bfd131cfd3d..62d38d463ab 100644
--- a/compiler/rustc_codegen_llvm/src/debuginfo/metadata/enums/native.rs
+++ b/compiler/rustc_codegen_llvm/src/debuginfo/metadata/enums/native.rs
@@ -174,10 +174,8 @@ pub(super) fn build_coroutine_di_node<'ll, 'tcx>(
             DIFlags::FlagZero,
         ),
         |cx, coroutine_type_di_node| {
-            let coroutine_layout = cx
-                .tcx
-                .coroutine_layout(coroutine_def_id, coroutine_args.as_coroutine().kind_ty())
-                .unwrap();
+            let coroutine_layout =
+                cx.tcx.coroutine_layout(coroutine_def_id, coroutine_args).unwrap();
 
             let Variants::Multiple { tag_encoding: TagEncoding::Direct, ref variants, .. } =
                 coroutine_type_and_layout.variants
@@ -375,7 +373,7 @@ fn build_discr_member_di_node<'ll, 'tcx>(
                 file,
                 UNKNOWN_LINE_NUMBER,
                 layout,
-                enum_or_coroutine_type_and_layout.fields.offset(tag_field),
+                enum_or_coroutine_type_and_layout.fields.offset(tag_field.as_usize()),
                 DIFlags::FlagArtificial,
                 ty,
             ))
diff --git a/compiler/rustc_codegen_llvm/src/debuginfo/mod.rs b/compiler/rustc_codegen_llvm/src/debuginfo/mod.rs
index c5085927923..5ca2505cec4 100644
--- a/compiler/rustc_codegen_llvm/src/debuginfo/mod.rs
+++ b/compiler/rustc_codegen_llvm/src/debuginfo/mod.rs
@@ -147,6 +147,12 @@ pub(crate) fn finalize(cx: &CodegenCx<'_, '_>) {
     }
 }
 
+impl<'ll> Builder<'_, 'll, '_> {
+    pub(crate) fn get_dbg_loc(&self) -> Option<&'ll DILocation> {
+        unsafe { llvm::LLVMGetCurrentDebugLocation2(self.llbuilder) }
+    }
+}
+
 impl<'ll> DebugInfoBuilderMethods for Builder<'_, 'll, '_> {
     // FIXME(eddyb) find a common convention for all of the debuginfo-related
     // names (choose between `dbg`, `debug`, `debuginfo`, `debug_info` etc.).
@@ -209,10 +215,6 @@ impl<'ll> DebugInfoBuilderMethods for Builder<'_, 'll, '_> {
         }
     }
 
-    fn get_dbg_loc(&self) -> Option<&'ll DILocation> {
-        unsafe { llvm::LLVMGetCurrentDebugLocation2(self.llbuilder) }
-    }
-
     fn insert_reference_to_gdb_debug_scripts_section_global(&mut self) {
         gdb::insert_reference_to_gdb_debug_scripts_section_global(self)
     }
diff --git a/compiler/rustc_codegen_llvm/src/errors.rs b/compiler/rustc_codegen_llvm/src/errors.rs
index ecf108f988f..eaafc680712 100644
--- a/compiler/rustc_codegen_llvm/src/errors.rs
+++ b/compiler/rustc_codegen_llvm/src/errors.rs
@@ -58,18 +58,6 @@ pub(crate) struct SymbolAlreadyDefined<'a> {
 }
 
 #[derive(Diagnostic)]
-#[diag(codegen_llvm_invalid_minimum_alignment_not_power_of_two)]
-pub(crate) struct InvalidMinimumAlignmentNotPowerOfTwo {
-    pub align: u64,
-}
-
-#[derive(Diagnostic)]
-#[diag(codegen_llvm_invalid_minimum_alignment_too_large)]
-pub(crate) struct InvalidMinimumAlignmentTooLarge {
-    pub align: u64,
-}
-
-#[derive(Diagnostic)]
 #[diag(codegen_llvm_sanitizer_memtag_requires_mte)]
 pub(crate) struct SanitizerMemtagRequiresMte;
 
diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs
index ffeab59b05c..10697b9a71f 100644
--- a/compiler/rustc_codegen_llvm/src/intrinsic.rs
+++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs
@@ -15,11 +15,10 @@ use rustc_middle::ty::{self, GenericArgsRef, Ty};
 use rustc_middle::{bug, span_bug};
 use rustc_span::{Span, Symbol, sym};
 use rustc_symbol_mangling::mangle_internal_symbol;
-use rustc_target::callconv::{FnAbi, PassMode};
 use rustc_target::spec::{HasTargetSpec, PanicStrategy};
 use tracing::debug;
 
-use crate::abi::{FnAbiLlvmExt, LlvmType};
+use crate::abi::FnAbiLlvmExt;
 use crate::builder::Builder;
 use crate::context::CodegenCx;
 use crate::llvm::{self, Metadata};
@@ -103,11 +102,23 @@ fn get_simple_intrinsic<'ll>(
         sym::minnumf64 => "llvm.minnum.f64",
         sym::minnumf128 => "llvm.minnum.f128",
 
+        sym::minimumf16 => "llvm.minimum.f16",
+        sym::minimumf32 => "llvm.minimum.f32",
+        sym::minimumf64 => "llvm.minimum.f64",
+        // There are issues on x86_64 and aarch64 with the f128 variant,
+        // so we use the intrinsic's fallback body instead.
+        // sym::minimumf128 => "llvm.minimum.f128",
         sym::maxnumf16 => "llvm.maxnum.f16",
         sym::maxnumf32 => "llvm.maxnum.f32",
         sym::maxnumf64 => "llvm.maxnum.f64",
         sym::maxnumf128 => "llvm.maxnum.f128",
 
+        sym::maximumf16 => "llvm.maximum.f16",
+        sym::maximumf32 => "llvm.maximum.f32",
+        sym::maximumf64 => "llvm.maximum.f64",
+        // There are issues on x86_64 and aarch64 with the f128 variant,
+        // so we use the intrinsic's fallback body instead.
+        // sym::maximumf128 => "llvm.maximum.f128",
         sym::copysignf16 => "llvm.copysign.f16",
         sym::copysignf32 => "llvm.copysign.f32",
         sym::copysignf64 => "llvm.copysign.f64",
@@ -153,26 +164,14 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
     fn codegen_intrinsic_call(
         &mut self,
         instance: ty::Instance<'tcx>,
-        fn_abi: &FnAbi<'tcx, Ty<'tcx>>,
         args: &[OperandRef<'tcx, &'ll Value>],
-        llresult: &'ll Value,
+        result: PlaceRef<'tcx, &'ll Value>,
         span: Span,
     ) -> Result<(), ty::Instance<'tcx>> {
         let tcx = self.tcx;
-        let callee_ty = instance.ty(tcx, self.typing_env());
 
-        let ty::FnDef(def_id, fn_args) = *callee_ty.kind() else {
-            bug!("expected fn item type, found {}", callee_ty);
-        };
-
-        let sig = callee_ty.fn_sig(tcx);
-        let sig = tcx.normalize_erasing_late_bound_regions(self.typing_env(), sig);
-        let arg_tys = sig.inputs();
-        let ret_ty = sig.output();
-        let name = tcx.item_name(def_id);
-
-        let llret_ty = self.layout_of(ret_ty).llvm_type(self);
-        let result = PlaceRef::new_sized(llresult, fn_abi.ret.layout);
+        let name = tcx.item_name(instance.def_id());
+        let fn_args = instance.args;
 
         let simple = get_simple_intrinsic(self, name);
         let llval = match name {
@@ -243,7 +242,7 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
                     args[0].immediate(),
                     args[1].immediate(),
                     args[2].immediate(),
-                    llresult,
+                    result,
                 );
                 return Ok(());
             }
@@ -252,26 +251,26 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
                 self.call_intrinsic("llvm.va_copy", &[args[0].immediate(), args[1].immediate()])
             }
             sym::va_arg => {
-                match fn_abi.ret.layout.backend_repr {
+                match result.layout.backend_repr {
                     BackendRepr::Scalar(scalar) => {
                         match scalar.primitive() {
                             Primitive::Int(..) => {
-                                if self.cx().size_of(ret_ty).bytes() < 4 {
+                                if self.cx().size_of(result.layout.ty).bytes() < 4 {
                                     // `va_arg` should not be called on an integer type
                                     // less than 4 bytes in length. If it is, promote
                                     // the integer to an `i32` and truncate the result
                                     // back to the smaller type.
                                     let promoted_result = emit_va_arg(self, args[0], tcx.types.i32);
-                                    self.trunc(promoted_result, llret_ty)
+                                    self.trunc(promoted_result, result.layout.llvm_type(self))
                                 } else {
-                                    emit_va_arg(self, args[0], ret_ty)
+                                    emit_va_arg(self, args[0], result.layout.ty)
                                 }
                             }
                             Primitive::Float(Float::F16) => {
                                 bug!("the va_arg intrinsic does not work with `f16`")
                             }
                             Primitive::Float(Float::F64) | Primitive::Pointer(_) => {
-                                emit_va_arg(self, args[0], ret_ty)
+                                emit_va_arg(self, args[0], result.layout.ty)
                             }
                             // `va_arg` should never be used with the return type f32.
                             Primitive::Float(Float::F32) => {
@@ -287,18 +286,12 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
             }
 
             sym::volatile_load | sym::unaligned_volatile_load => {
-                let tp_ty = fn_args.type_at(0);
                 let ptr = args[0].immediate();
-                let load = if let PassMode::Cast { cast: ty, pad_i32: _ } = &fn_abi.ret.mode {
-                    let llty = ty.llvm_type(self);
-                    self.volatile_load(llty, ptr)
-                } else {
-                    self.volatile_load(self.layout_of(tp_ty).llvm_type(self), ptr)
-                };
+                let load = self.volatile_load(result.layout.llvm_type(self), ptr);
                 let align = if name == sym::unaligned_volatile_load {
                     1
                 } else {
-                    self.align_of(tp_ty).bytes() as u32
+                    result.layout.align.abi.bytes() as u32
                 };
                 unsafe {
                     llvm::LLVMSetAlignment(load, align);
@@ -381,7 +374,7 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
             | sym::rotate_right
             | sym::saturating_add
             | sym::saturating_sub => {
-                let ty = arg_tys[0];
+                let ty = args[0].layout.ty;
                 if !ty.is_integral() {
                     tcx.dcx().emit_err(InvalidMonomorphization::BasicIntegerType {
                         span,
@@ -400,26 +393,26 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
                             &[args[0].immediate(), y],
                         );
 
-                        self.intcast(ret, llret_ty, false)
+                        self.intcast(ret, result.layout.llvm_type(self), false)
                     }
                     sym::ctlz_nonzero => {
                         let y = self.const_bool(true);
                         let llvm_name = &format!("llvm.ctlz.i{width}");
                         let ret = self.call_intrinsic(llvm_name, &[args[0].immediate(), y]);
-                        self.intcast(ret, llret_ty, false)
+                        self.intcast(ret, result.layout.llvm_type(self), false)
                     }
                     sym::cttz_nonzero => {
                         let y = self.const_bool(true);
                         let llvm_name = &format!("llvm.cttz.i{width}");
                         let ret = self.call_intrinsic(llvm_name, &[args[0].immediate(), y]);
-                        self.intcast(ret, llret_ty, false)
+                        self.intcast(ret, result.layout.llvm_type(self), false)
                     }
                     sym::ctpop => {
                         let ret = self.call_intrinsic(
                             &format!("llvm.ctpop.i{width}"),
                             &[args[0].immediate()],
                         );
-                        self.intcast(ret, llret_ty, false)
+                        self.intcast(ret, result.layout.llvm_type(self), false)
                     }
                     sym::bswap => {
                         if width == 8 {
@@ -551,16 +544,16 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
                 // Unpack non-power-of-2 #[repr(packed, simd)] arguments.
                 // This gives them the expected layout of a regular #[repr(simd)] vector.
                 let mut loaded_args = Vec::new();
-                for (ty, arg) in arg_tys.iter().zip(args) {
+                for arg in args {
                     loaded_args.push(
                         // #[repr(packed, simd)] vectors are passed like arrays (as references,
                         // with reduced alignment and no padding) rather than as immediates.
                         // We can use a vector load to fix the layout and turn the argument
                         // into an immediate.
-                        if ty.is_simd()
+                        if arg.layout.ty.is_simd()
                             && let OperandValue::Ref(place) = arg.val
                         {
-                            let (size, elem_ty) = ty.simd_size_and_type(self.tcx());
+                            let (size, elem_ty) = arg.layout.ty.simd_size_and_type(self.tcx());
                             let elem_ll_ty = match elem_ty.kind() {
                                 ty::Float(f) => self.type_float_from_ty(*f),
                                 ty::Int(i) => self.type_int_from_ty(*i),
@@ -577,10 +570,10 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
                     );
                 }
 
-                let llret_ty = if ret_ty.is_simd()
-                    && let BackendRepr::Memory { .. } = self.layout_of(ret_ty).layout.backend_repr
+                let llret_ty = if result.layout.ty.is_simd()
+                    && let BackendRepr::Memory { .. } = result.layout.backend_repr
                 {
-                    let (size, elem_ty) = ret_ty.simd_size_and_type(self.tcx());
+                    let (size, elem_ty) = result.layout.ty.simd_size_and_type(self.tcx());
                     let elem_ll_ty = match elem_ty.kind() {
                         ty::Float(f) => self.type_float_from_ty(*f),
                         ty::Int(i) => self.type_int_from_ty(*i),
@@ -590,16 +583,15 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
                     };
                     self.type_vector(elem_ll_ty, size)
                 } else {
-                    llret_ty
+                    result.layout.llvm_type(self)
                 };
 
                 match generic_simd_intrinsic(
                     self,
                     name,
-                    callee_ty,
                     fn_args,
                     &loaded_args,
-                    ret_ty,
+                    result.layout.ty,
                     llret_ty,
                     span,
                 ) {
@@ -613,18 +605,15 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
             _ => {
                 debug!("unknown intrinsic '{}' -- falling back to default body", name);
                 // Call the fallback body instead of generating the intrinsic code
-                return Err(ty::Instance::new(instance.def_id(), instance.args));
+                return Err(ty::Instance::new_raw(instance.def_id(), instance.args));
             }
         };
 
-        if !fn_abi.ret.is_ignore() {
-            if let PassMode::Cast { .. } = &fn_abi.ret.mode {
-                self.store(llval, result.val.llval, result.val.align);
-            } else {
-                OperandRef::from_immediate_or_packed_pair(self, llval, result.layout)
-                    .val
-                    .store(self, result);
-            }
+        if result.layout.ty.is_bool() {
+            let val = self.from_immediate(llval);
+            self.store_to_place(val, result.val);
+        } else if !result.layout.ty.is_unit() {
+            self.store_to_place(llval, result.val);
         }
         Ok(())
     }
@@ -647,13 +636,6 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
         }
     }
 
-    fn type_test(&mut self, pointer: Self::Value, typeid: Self::Metadata) -> Self::Value {
-        // Test the called operand using llvm.type.test intrinsic. The LowerTypeTests link-time
-        // optimization pass replaces calls to this intrinsic with code to test type membership.
-        let typeid = self.get_metadata_value(typeid);
-        self.call_intrinsic("llvm.type.test", &[pointer, typeid])
-    }
-
     fn type_checked_load(
         &mut self,
         llvtable: &'ll Value,
@@ -676,20 +658,19 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
     }
 }
 
-fn catch_unwind_intrinsic<'ll>(
-    bx: &mut Builder<'_, 'll, '_>,
+fn catch_unwind_intrinsic<'ll, 'tcx>(
+    bx: &mut Builder<'_, 'll, 'tcx>,
     try_func: &'ll Value,
     data: &'ll Value,
     catch_func: &'ll Value,
-    dest: &'ll Value,
+    dest: PlaceRef<'tcx, &'ll Value>,
 ) {
     if bx.sess().panic_strategy() == PanicStrategy::Abort {
         let try_func_ty = bx.type_func(&[bx.type_ptr()], bx.type_void());
         bx.call(try_func_ty, None, None, try_func, &[data], None, None);
         // Return 0 unconditionally from the intrinsic call;
         // we can never unwind.
-        let ret_align = bx.tcx().data_layout.i32_align.abi;
-        bx.store(bx.const_i32(0), dest, ret_align);
+        OperandValue::Immediate(bx.const_i32(0)).store(bx, dest);
     } else if wants_msvc_seh(bx.sess()) {
         codegen_msvc_try(bx, try_func, data, catch_func, dest);
     } else if wants_wasm_eh(bx.sess()) {
@@ -708,12 +689,12 @@ fn catch_unwind_intrinsic<'ll>(
 // instructions are meant to work for all targets, as of the time of this
 // writing, however, LLVM does not recommend the usage of these new instructions
 // as the old ones are still more optimized.
-fn codegen_msvc_try<'ll>(
-    bx: &mut Builder<'_, 'll, '_>,
+fn codegen_msvc_try<'ll, 'tcx>(
+    bx: &mut Builder<'_, 'll, 'tcx>,
     try_func: &'ll Value,
     data: &'ll Value,
     catch_func: &'ll Value,
-    dest: &'ll Value,
+    dest: PlaceRef<'tcx, &'ll Value>,
 ) {
     let (llty, llfn) = get_rust_try_fn(bx, &mut |mut bx| {
         bx.set_personality_fn(bx.eh_personality());
@@ -853,17 +834,16 @@ fn codegen_msvc_try<'ll>(
     // Note that no invoke is used here because by definition this function
     // can't panic (that's what it's catching).
     let ret = bx.call(llty, None, None, llfn, &[try_func, data, catch_func], None, None);
-    let i32_align = bx.tcx().data_layout.i32_align.abi;
-    bx.store(ret, dest, i32_align);
+    OperandValue::Immediate(ret).store(bx, dest);
 }
 
 // WASM's definition of the `rust_try` function.
-fn codegen_wasm_try<'ll>(
-    bx: &mut Builder<'_, 'll, '_>,
+fn codegen_wasm_try<'ll, 'tcx>(
+    bx: &mut Builder<'_, 'll, 'tcx>,
     try_func: &'ll Value,
     data: &'ll Value,
     catch_func: &'ll Value,
-    dest: &'ll Value,
+    dest: PlaceRef<'tcx, &'ll Value>,
 ) {
     let (llty, llfn) = get_rust_try_fn(bx, &mut |mut bx| {
         bx.set_personality_fn(bx.eh_personality());
@@ -927,8 +907,7 @@ fn codegen_wasm_try<'ll>(
     // Note that no invoke is used here because by definition this function
     // can't panic (that's what it's catching).
     let ret = bx.call(llty, None, None, llfn, &[try_func, data, catch_func], None, None);
-    let i32_align = bx.tcx().data_layout.i32_align.abi;
-    bx.store(ret, dest, i32_align);
+    OperandValue::Immediate(ret).store(bx, dest);
 }
 
 // Definition of the standard `try` function for Rust using the GNU-like model
@@ -942,12 +921,12 @@ fn codegen_wasm_try<'ll>(
 // function calling it, and that function may already have other personality
 // functions in play. By calling a shim we're guaranteed that our shim will have
 // the right personality function.
-fn codegen_gnu_try<'ll>(
-    bx: &mut Builder<'_, 'll, '_>,
+fn codegen_gnu_try<'ll, 'tcx>(
+    bx: &mut Builder<'_, 'll, 'tcx>,
     try_func: &'ll Value,
     data: &'ll Value,
     catch_func: &'ll Value,
-    dest: &'ll Value,
+    dest: PlaceRef<'tcx, &'ll Value>,
 ) {
     let (llty, llfn) = get_rust_try_fn(bx, &mut |mut bx| {
         // Codegens the shims described above:
@@ -994,19 +973,18 @@ fn codegen_gnu_try<'ll>(
     // Note that no invoke is used here because by definition this function
     // can't panic (that's what it's catching).
     let ret = bx.call(llty, None, None, llfn, &[try_func, data, catch_func], None, None);
-    let i32_align = bx.tcx().data_layout.i32_align.abi;
-    bx.store(ret, dest, i32_align);
+    OperandValue::Immediate(ret).store(bx, dest);
 }
 
 // Variant of codegen_gnu_try used for emscripten where Rust panics are
 // implemented using C++ exceptions. Here we use exceptions of a specific type
 // (`struct rust_panic`) to represent Rust panics.
-fn codegen_emcc_try<'ll>(
-    bx: &mut Builder<'_, 'll, '_>,
+fn codegen_emcc_try<'ll, 'tcx>(
+    bx: &mut Builder<'_, 'll, 'tcx>,
     try_func: &'ll Value,
     data: &'ll Value,
     catch_func: &'ll Value,
-    dest: &'ll Value,
+    dest: PlaceRef<'tcx, &'ll Value>,
 ) {
     let (llty, llfn) = get_rust_try_fn(bx, &mut |mut bx| {
         // Codegens the shims described above:
@@ -1077,8 +1055,7 @@ fn codegen_emcc_try<'ll>(
     // Note that no invoke is used here because by definition this function
     // can't panic (that's what it's catching).
     let ret = bx.call(llty, None, None, llfn, &[try_func, data, catch_func], None, None);
-    let i32_align = bx.tcx().data_layout.i32_align.abi;
-    bx.store(ret, dest, i32_align);
+    OperandValue::Immediate(ret).store(bx, dest);
 }
 
 // Helper function to give a Block to a closure to codegen a shim function.
@@ -1155,7 +1132,6 @@ fn get_rust_try_fn<'a, 'll, 'tcx>(
 fn generic_simd_intrinsic<'ll, 'tcx>(
     bx: &mut Builder<'_, 'll, 'tcx>,
     name: Symbol,
-    callee_ty: Ty<'tcx>,
     fn_args: GenericArgsRef<'tcx>,
     args: &[OperandRef<'tcx, &'ll Value>],
     ret_ty: Ty<'tcx>,
@@ -1226,26 +1202,22 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
         bx.trunc(i_xn_msb, bx.type_vector(bx.type_i1(), in_len))
     }
 
-    let tcx = bx.tcx();
-    let sig = tcx.normalize_erasing_late_bound_regions(bx.typing_env(), callee_ty.fn_sig(tcx));
-    let arg_tys = sig.inputs();
-
     // Sanity-check: all vector arguments must be immediates.
     if cfg!(debug_assertions) {
-        for (ty, arg) in arg_tys.iter().zip(args) {
-            if ty.is_simd() {
+        for arg in args {
+            if arg.layout.ty.is_simd() {
                 assert_matches!(arg.val, OperandValue::Immediate(_));
             }
         }
     }
 
     if name == sym::simd_select_bitmask {
-        let (len, _) = require_simd!(arg_tys[1], SimdArgument);
+        let (len, _) = require_simd!(args[1].layout.ty, SimdArgument);
 
         let expected_int_bits = len.max(8).next_power_of_two();
         let expected_bytes = len.div_ceil(8);
 
-        let mask_ty = arg_tys[0];
+        let mask_ty = args[0].layout.ty;
         let mask = match mask_ty.kind() {
             ty::Int(i) if i.bit_width() == Some(expected_int_bits) => args[0].immediate(),
             ty::Uint(i) if i.bit_width() == Some(expected_int_bits) => args[0].immediate(),
@@ -1279,8 +1251,8 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
     }
 
     // every intrinsic below takes a SIMD vector as its first argument
-    let (in_len, in_elem) = require_simd!(arg_tys[0], SimdInput);
-    let in_ty = arg_tys[0];
+    let (in_len, in_elem) = require_simd!(args[0].layout.ty, SimdInput);
+    let in_ty = args[0].layout.ty;
 
     let comparison = match name {
         sym::simd_eq => Some(BinOp::Eq),
@@ -1411,13 +1383,13 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
 
     if name == sym::simd_insert || name == sym::simd_insert_dyn {
         require!(
-            in_elem == arg_tys[2],
+            in_elem == args[2].layout.ty,
             InvalidMonomorphization::InsertedType {
                 span,
                 name,
                 in_elem,
                 in_ty,
-                out_ty: arg_tys[2]
+                out_ty: args[2].layout.ty
             }
         );
 
@@ -1468,7 +1440,7 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
     if name == sym::simd_select {
         let m_elem_ty = in_elem;
         let m_len = in_len;
-        let (v_len, _) = require_simd!(arg_tys[1], SimdArgument);
+        let (v_len, _) = require_simd!(args[1].layout.ty, SimdArgument);
         require!(
             m_len == v_len,
             InvalidMonomorphization::MismatchedLengths { span, name, m_len, v_len }
@@ -1669,9 +1641,9 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
         // The second argument must be a simd vector with an element type that's a pointer
         // to the element type of the first argument
         let (_, element_ty0) = require_simd!(in_ty, SimdFirst);
-        let (out_len, element_ty1) = require_simd!(arg_tys[1], SimdSecond);
+        let (out_len, element_ty1) = require_simd!(args[1].layout.ty, SimdSecond);
         // The element type of the third argument must be a signed integer type of any width:
-        let (out_len2, element_ty2) = require_simd!(arg_tys[2], SimdThird);
+        let (out_len2, element_ty2) = require_simd!(args[2].layout.ty, SimdThird);
         require_simd!(ret_ty, SimdReturn);
 
         // Of the same length:
@@ -1682,7 +1654,7 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
                 name,
                 in_len,
                 in_ty,
-                arg_ty: arg_tys[1],
+                arg_ty: args[1].layout.ty,
                 out_len
             }
         );
@@ -1693,7 +1665,7 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
                 name,
                 in_len,
                 in_ty,
-                arg_ty: arg_tys[2],
+                arg_ty: args[2].layout.ty,
                 out_len: out_len2
             }
         );
@@ -1713,7 +1685,7 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
                 span,
                 name,
                 expected_element: element_ty1,
-                second_arg: arg_tys[1],
+                second_arg: args[1].layout.ty,
                 in_elem,
                 in_ty,
                 mutability: ExpectedPointerMutability::Not,
@@ -1774,10 +1746,10 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
         let (mask_len, mask_elem) = (in_len, in_elem);
 
         // The second argument must be a pointer matching the element type
-        let pointer_ty = arg_tys[1];
+        let pointer_ty = args[1].layout.ty;
 
         // The last argument is a passthrough vector providing values for disabled lanes
-        let values_ty = arg_tys[2];
+        let values_ty = args[2].layout.ty;
         let (values_len, values_elem) = require_simd!(values_ty, SimdThird);
 
         require_simd!(ret_ty, SimdReturn);
@@ -1865,10 +1837,10 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
         let (mask_len, mask_elem) = (in_len, in_elem);
 
         // The second argument must be a pointer matching the element type
-        let pointer_ty = arg_tys[1];
+        let pointer_ty = args[1].layout.ty;
 
         // The last argument specifies the values to store to memory
-        let values_ty = arg_tys[2];
+        let values_ty = args[2].layout.ty;
         let (values_len, values_elem) = require_simd!(values_ty, SimdThird);
 
         // Of the same length:
@@ -1948,8 +1920,8 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
         // The second argument must be a simd vector with an element type that's a pointer
         // to the element type of the first argument
         let (_, element_ty0) = require_simd!(in_ty, SimdFirst);
-        let (element_len1, element_ty1) = require_simd!(arg_tys[1], SimdSecond);
-        let (element_len2, element_ty2) = require_simd!(arg_tys[2], SimdThird);
+        let (element_len1, element_ty1) = require_simd!(args[1].layout.ty, SimdSecond);
+        let (element_len2, element_ty2) = require_simd!(args[2].layout.ty, SimdThird);
 
         // Of the same length:
         require!(
@@ -1959,7 +1931,7 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
                 name,
                 in_len,
                 in_ty,
-                arg_ty: arg_tys[1],
+                arg_ty: args[1].layout.ty,
                 out_len: element_len1
             }
         );
@@ -1970,7 +1942,7 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
                 name,
                 in_len,
                 in_ty,
-                arg_ty: arg_tys[2],
+                arg_ty: args[2].layout.ty,
                 out_len: element_len2
             }
         );
@@ -1985,7 +1957,7 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
                 span,
                 name,
                 expected_element: element_ty1,
-                second_arg: arg_tys[1],
+                second_arg: args[1].layout.ty,
                 in_elem,
                 in_ty,
                 mutability: ExpectedPointerMutability::Mut,
@@ -2507,7 +2479,7 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
         let ptrs = args[0].immediate();
         // The second argument must be a ptr-sized integer.
         // (We don't care about the signedness, this is wrapping anyway.)
-        let (_offsets_len, offsets_elem) = arg_tys[1].simd_size_and_type(bx.tcx());
+        let (_offsets_len, offsets_elem) = args[1].layout.ty.simd_size_and_type(bx.tcx());
         if !matches!(offsets_elem.kind(), ty::Int(ty::IntTy::Isize) | ty::Uint(ty::UintTy::Usize)) {
             span_bug!(
                 span,
@@ -2531,8 +2503,8 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
                 return_error!(InvalidMonomorphization::ExpectedVectorElementType {
                     span,
                     name,
-                    expected_element: arg_tys[0].simd_size_and_type(bx.tcx()).1,
-                    vector_type: arg_tys[0]
+                    expected_element: args[0].layout.ty.simd_size_and_type(bx.tcx()).1,
+                    vector_type: args[0].layout.ty
                 });
             }
         };
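One detail worth spelling out in the new result-storing code: as an SSA immediate a `bool` is an `i1`, but its in-memory representation is `i8`, so the scalar value has to be widened before it is written back to the destination place. A small annotated restatement of the logic above (the comments are added here for illustration only):

    if result.layout.ty.is_bool() {
        // Immediates use `i1` for `bool`, memory uses `i8`; widen before storing.
        let val = self.from_immediate(llval);
        self.store_to_place(val, result.val);
    } else if !result.layout.ty.is_unit() {
        // Non-unit results are stored directly into the destination place.
        self.store_to_place(llval, result.val);
    }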
diff --git a/compiler/rustc_codegen_llvm/src/lib.rs b/compiler/rustc_codegen_llvm/src/lib.rs
index 425381b0ffa..6890923a594 100644
--- a/compiler/rustc_codegen_llvm/src/lib.rs
+++ b/compiler/rustc_codegen_llvm/src/lib.rs
@@ -9,13 +9,11 @@
 #![doc(html_root_url = "https://doc.rust-lang.org/nightly/nightly-rustc/")]
 #![doc(rust_logo)]
 #![feature(assert_matches)]
-#![feature(exact_size_is_empty)]
 #![feature(extern_types)]
 #![feature(file_buffered)]
 #![feature(if_let_guard)]
 #![feature(impl_trait_in_assoc_type)]
 #![feature(iter_intersperse)]
-#![feature(let_chains)]
 #![feature(rustdoc_internals)]
 #![feature(slice_as_array)]
 #![feature(try_blocks)]
@@ -29,7 +27,7 @@ use back::owned_target_machine::OwnedTargetMachine;
 use back::write::{create_informational_target_machine, create_target_machine};
 use context::SimpleCx;
 use errors::{AutoDiffWithoutLTO, ParseTargetMachineConfig};
-use llvm_util::target_features_cfg;
+use llvm_util::target_config;
 use rustc_ast::expand::allocator::AllocatorKind;
 use rustc_ast::expand::autodiff_attrs::AutoDiffItem;
 use rustc_codegen_ssa::back::lto::{LtoModuleCodegen, SerializedModule, ThinModule};
@@ -37,7 +35,7 @@ use rustc_codegen_ssa::back::write::{
     CodegenContext, FatLtoInput, ModuleConfig, TargetMachineFactoryConfig, TargetMachineFactoryFn,
 };
 use rustc_codegen_ssa::traits::*;
-use rustc_codegen_ssa::{CodegenResults, CompiledModule, ModuleCodegen};
+use rustc_codegen_ssa::{CodegenResults, CompiledModule, ModuleCodegen, TargetConfig};
 use rustc_data_structures::fx::FxIndexMap;
 use rustc_errors::{DiagCtxtHandle, FatalError};
 use rustc_metadata::EncodedMetadata;
@@ -190,13 +188,13 @@ impl WriteBackendMethods for LlvmCodegenBackend {
     ) -> Result<(Vec<LtoModuleCodegen<Self>>, Vec<WorkProduct>), FatalError> {
         back::lto::run_thin(cgcx, modules, cached_modules)
     }
-    unsafe fn optimize(
+    fn optimize(
         cgcx: &CodegenContext<Self>,
         dcx: DiagCtxtHandle<'_>,
         module: &mut ModuleCodegen<Self::Module>,
         config: &ModuleConfig,
     ) -> Result<(), FatalError> {
-        unsafe { back::write::optimize(cgcx, dcx, module, config) }
+        back::write::optimize(cgcx, dcx, module, config)
     }
     fn optimize_fat(
         cgcx: &CodegenContext<Self>,
@@ -206,19 +204,19 @@ impl WriteBackendMethods for LlvmCodegenBackend {
         let dcx = dcx.handle();
         back::lto::run_pass_manager(cgcx, dcx, module, false)
     }
-    unsafe fn optimize_thin(
+    fn optimize_thin(
         cgcx: &CodegenContext<Self>,
         thin: ThinModule<Self>,
     ) -> Result<ModuleCodegen<Self::Module>, FatalError> {
-        unsafe { back::lto::optimize_thin_module(thin, cgcx) }
+        back::lto::optimize_thin_module(thin, cgcx)
     }
-    unsafe fn codegen(
+    fn codegen(
         cgcx: &CodegenContext<Self>,
         dcx: DiagCtxtHandle<'_>,
         module: ModuleCodegen<Self::Module>,
         config: &ModuleConfig,
     ) -> Result<CompiledModule, FatalError> {
-        unsafe { back::write::codegen(cgcx, dcx, module, config) }
+        back::write::codegen(cgcx, dcx, module, config)
     }
     fn prepare_thin(
         module: ModuleCodegen<Self::Module>,
@@ -338,8 +336,8 @@ impl CodegenBackend for LlvmCodegenBackend {
         llvm_util::print_version();
     }
 
-    fn target_features_cfg(&self, sess: &Session) -> (Vec<Symbol>, Vec<Symbol>) {
-        target_features_cfg(sess)
+    fn target_config(&self, sess: &Session) -> TargetConfig {
+        target_config(sess)
     }
 
     fn codegen_crate<'tcx>(
diff --git a/compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs
index a9b3bdf7344..2ad39fc8538 100644
--- a/compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs
+++ b/compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs
@@ -19,6 +19,19 @@ unsafe extern "C" {
     pub(crate) fn LLVMRustVerifyFunction(V: &Value, action: LLVMRustVerifierFailureAction) -> Bool;
     pub(crate) fn LLVMRustHasAttributeAtIndex(V: &Value, i: c_uint, Kind: AttributeKind) -> bool;
     pub(crate) fn LLVMRustGetArrayNumElements(Ty: &Type) -> u64;
+    pub(crate) fn LLVMRustHasFnAttribute(
+        F: &Value,
+        Name: *const c_char,
+        NameLen: libc::size_t,
+    ) -> bool;
+    pub(crate) fn LLVMRustRemoveFnAttribute(F: &Value, Name: *const c_char, NameLen: libc::size_t);
+    pub(crate) fn LLVMGetFirstFunction(M: &Module) -> Option<&Value>;
+    pub(crate) fn LLVMGetNextFunction(Fn: &Value) -> Option<&Value>;
+    pub(crate) fn LLVMRustRemoveEnumAttributeAtIndex(
+        Fn: &Value,
+        index: c_uint,
+        kind: AttributeKind,
+    );
 }
 
 unsafe extern "C" {
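A hedged usage sketch for the bindings added above: walking a module's functions with `LLVMGetFirstFunction`/`LLVMGetNextFunction` and stripping a string attribute by name. `strip_string_attr_from_fns` is a hypothetical wrapper, and `as_c_char_ptr` is the crate's pointer-cast helper also used by the safe wrappers in `llvm/mod.rs`.

    // Hypothetical wrapper; not part of this change.
    fn strip_string_attr_from_fns(llmod: &Module, name: &str) {
        let mut cur = unsafe { LLVMGetFirstFunction(llmod) };
        while let Some(llfn) = cur {
            if unsafe { LLVMRustHasFnAttribute(llfn, name.as_c_char_ptr(), name.len()) } {
                unsafe { LLVMRustRemoveFnAttribute(llfn, name.as_c_char_ptr(), name.len()) };
            }
            cur = unsafe { LLVMGetNextFunction(llfn) };
        }
    }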
diff --git a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs
index 9ff04f72903..e27fbf94f34 100644
--- a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs
+++ b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs
@@ -1,7 +1,7 @@
 //! Bindings to the LLVM-C API (`LLVM*`), and to our own `extern "C"` wrapper
 //! functions around the unstable LLVM C++ API (`LLVMRust*`).
 //!
-//! ## Passing pointer/length strings as `*const c_uchar`
+//! ## Passing pointer/length strings as `*const c_uchar` (PTR_LEN_STR)
 //!
 //! Normally it's a good idea for Rust-side bindings to match the corresponding
 //! C-side function declarations as closely as possible. But when passing `&str`
@@ -415,6 +415,7 @@ impl AtomicRmwBinOp {
 pub(crate) enum AtomicOrdering {
     #[allow(dead_code)]
     NotAtomic = 0,
+    #[allow(dead_code)]
     Unordered = 1,
     Monotonic = 2,
     // Consume = 3,  // Not specified yet.
@@ -425,15 +426,14 @@ pub(crate) enum AtomicOrdering {
 }
 
 impl AtomicOrdering {
-    pub(crate) fn from_generic(ao: rustc_codegen_ssa::common::AtomicOrdering) -> Self {
-        use rustc_codegen_ssa::common::AtomicOrdering as Common;
+    pub(crate) fn from_generic(ao: rustc_middle::ty::AtomicOrdering) -> Self {
+        use rustc_middle::ty::AtomicOrdering as Common;
         match ao {
-            Common::Unordered => Self::Unordered,
             Common::Relaxed => Self::Monotonic,
             Common::Acquire => Self::Acquire,
             Common::Release => Self::Release,
-            Common::AcquireRelease => Self::AcquireRelease,
-            Common::SequentiallyConsistent => Self::SequentiallyConsistent,
+            Common::AcqRel => Self::AcquireRelease,
+            Common::SeqCst => Self::SequentiallyConsistent,
         }
     }
 }
@@ -471,7 +471,7 @@ pub(crate) enum MetadataType {
     MD_kcfi_type = 36,
 }
 
-/// LLVMRustAsmDialect
+/// Must match the layout of `LLVMInlineAsmDialect`.
 #[derive(Copy, Clone, PartialEq)]
 #[repr(C)]
 pub(crate) enum AsmDialect {
@@ -1014,8 +1014,25 @@ unsafe extern "C" {
     pub(crate) fn LLVMGetDataLayoutStr(M: &Module) -> *const c_char;
     pub(crate) fn LLVMSetDataLayout(M: &Module, Triple: *const c_char);
 
-    /// See Module::setModuleInlineAsm.
-    pub(crate) fn LLVMAppendModuleInlineAsm(M: &Module, Asm: *const c_char, Len: size_t);
+    /// Append inline assembly to a module. See `Module::appendModuleInlineAsm`.
+    pub(crate) fn LLVMAppendModuleInlineAsm(
+        M: &Module,
+        Asm: *const c_uchar, // See "PTR_LEN_STR".
+        Len: size_t,
+    );
+
+    /// Create the specified uniqued inline asm string. See `InlineAsm::get()`.
+    pub(crate) fn LLVMGetInlineAsm<'ll>(
+        Ty: &'ll Type,
+        AsmString: *const c_uchar, // See "PTR_LEN_STR".
+        AsmStringSize: size_t,
+        Constraints: *const c_uchar, // See "PTR_LEN_STR".
+        ConstraintsSize: size_t,
+        HasSideEffects: llvm::Bool,
+        IsAlignStack: llvm::Bool,
+        Dialect: AsmDialect,
+        CanThrow: llvm::Bool,
+    ) -> &'ll Value;
 
     // Operations on integer types
     pub(crate) fn LLVMInt1TypeInContext(C: &Context) -> &Type;
@@ -1766,7 +1783,7 @@ unsafe extern "C" {
     pub(crate) fn LLVMDIBuilderCreateNameSpace<'ll>(
         Builder: &DIBuilder<'ll>,
         ParentScope: Option<&'ll Metadata>,
-        Name: *const c_uchar,
+        Name: *const c_uchar, // See "PTR_LEN_STR".
         NameLen: size_t,
         ExportSymbols: llvm::Bool,
     ) -> &'ll Metadata;
@@ -1994,21 +2011,9 @@ unsafe extern "C" {
     /// Prints the statistics collected by `-Zprint-codegen-stats`.
     pub(crate) fn LLVMRustPrintStatistics(OutStr: &RustString);
 
-    /// Prepares inline assembly.
-    pub(crate) fn LLVMRustInlineAsm(
-        Ty: &Type,
-        AsmString: *const c_char,
-        AsmStringLen: size_t,
-        Constraints: *const c_char,
-        ConstraintsLen: size_t,
-        SideEffects: Bool,
-        AlignStack: Bool,
-        Dialect: AsmDialect,
-        CanThrow: Bool,
-    ) -> &Value;
     pub(crate) fn LLVMRustInlineAsmVerify(
         Ty: &Type,
-        Constraints: *const c_char,
+        Constraints: *const c_uchar, // See "PTR_LEN_STR".
         ConstraintsLen: size_t,
     ) -> bool;
 
@@ -2454,6 +2459,9 @@ unsafe extern "C" {
         DisableSimplifyLibCalls: bool,
         EmitLifetimeMarkers: bool,
         RunEnzyme: bool,
+        PrintBeforeEnzyme: bool,
+        PrintAfterEnzyme: bool,
+        PrintPasses: bool,
         SanitizerOptions: Option<&SanitizerOptions>,
         PGOGenPath: *const c_char,
         PGOUsePath: *const c_char,
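To illustrate the "PTR_LEN_STR" convention for the `LLVMGetInlineAsm` binding above: string data is passed as a `*const c_uchar` pointer plus an explicit length, with no NUL terminator, so a safe wrapper can hand `&[u8]` slices straight through. A hedged sketch, where `create_inline_asm` is a hypothetical wrapper name:

    // Hypothetical safe wrapper; not part of this change.
    fn create_inline_asm<'ll>(
        ty: &'ll Type,
        asm: &[u8],
        constraints: &[u8],
        has_side_effects: llvm::Bool,
        is_align_stack: llvm::Bool,
        dialect: AsmDialect,
        can_throw: llvm::Bool,
    ) -> &'ll Value {
        unsafe {
            LLVMGetInlineAsm(
                ty,
                asm.as_ptr(), // See "PTR_LEN_STR".
                asm.len(),
                constraints.as_ptr(), // See "PTR_LEN_STR".
                constraints.len(),
                has_side_effects,
                is_align_stack,
                dialect,
                can_throw,
            )
        }
    }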
diff --git a/compiler/rustc_codegen_llvm/src/llvm/mod.rs b/compiler/rustc_codegen_llvm/src/llvm/mod.rs
index 6ca81c651ed..ed23f911930 100644
--- a/compiler/rustc_codegen_llvm/src/llvm/mod.rs
+++ b/compiler/rustc_codegen_llvm/src/llvm/mod.rs
@@ -41,6 +41,32 @@ pub(crate) fn AddFunctionAttributes<'ll>(
     }
 }
 
+pub(crate) fn HasAttributeAtIndex<'ll>(
+    llfn: &'ll Value,
+    idx: AttributePlace,
+    kind: AttributeKind,
+) -> bool {
+    unsafe { LLVMRustHasAttributeAtIndex(llfn, idx.as_uint(), kind) }
+}
+
+pub(crate) fn HasStringAttribute<'ll>(llfn: &'ll Value, name: &str) -> bool {
+    unsafe { LLVMRustHasFnAttribute(llfn, name.as_c_char_ptr(), name.len()) }
+}
+
+pub(crate) fn RemoveStringAttrFromFn<'ll>(llfn: &'ll Value, name: &str) {
+    unsafe { LLVMRustRemoveFnAttribute(llfn, name.as_c_char_ptr(), name.len()) }
+}
+
+pub(crate) fn RemoveRustEnumAttributeAtIndex(
+    llfn: &Value,
+    place: AttributePlace,
+    kind: AttributeKind,
+) {
+    unsafe {
+        LLVMRustRemoveEnumAttributeAtIndex(llfn, place.as_uint(), kind);
+    }
+}
+
 pub(crate) fn AddCallSiteAttributes<'ll>(
     callsite: &'ll Value,
     idx: AttributePlace,
@@ -337,12 +363,13 @@ pub(crate) fn last_error() -> Option<String> {
     }
 }
 
-/// Owns an [`OperandBundle`], and will dispose of it when dropped.
-pub(crate) struct OperandBundleOwned<'a> {
+/// Owning pointer to an [`OperandBundle`] that will dispose of the bundle
+/// when dropped.
+pub(crate) struct OperandBundleBox<'a> {
     raw: ptr::NonNull<OperandBundle<'a>>,
 }
 
-impl<'a> OperandBundleOwned<'a> {
+impl<'a> OperandBundleBox<'a> {
     pub(crate) fn new(name: &str, vals: &[&'a Value]) -> Self {
         let raw = unsafe {
             LLVMCreateOperandBundle(
@@ -352,21 +379,21 @@ impl<'a> OperandBundleOwned<'a> {
                 vals.len() as c_uint,
             )
         };
-        OperandBundleOwned { raw: ptr::NonNull::new(raw).unwrap() }
+        Self { raw: ptr::NonNull::new(raw).unwrap() }
     }
 
-    /// Returns inner `OperandBundle` type.
+    /// Dereferences to the underlying `&OperandBundle`.
     ///
-    /// This could be a `Deref` implementation, but `OperandBundle` contains an extern type and
-    /// `Deref::Target: ?Sized`.
-    pub(crate) fn raw(&self) -> &OperandBundle<'a> {
+    /// This can't be a `Deref` implementation because `OperandBundle` transitively
+    /// contains an extern type, which is incompatible with `Deref::Target: ?Sized`.
+    pub(crate) fn as_ref(&self) -> &OperandBundle<'a> {
         // SAFETY: The returned reference is opaque and can only be used for FFI.
         // It is valid for as long as `&self` is.
         unsafe { self.raw.as_ref() }
     }
 }
 
-impl Drop for OperandBundleOwned<'_> {
+impl Drop for OperandBundleBox<'_> {
     fn drop(&mut self) {
         unsafe {
             LLVMDisposeOperandBundle(self.raw);
@@ -414,3 +441,11 @@ pub(crate) fn set_dso_local<'ll>(v: &'ll Value) {
         LLVMRustSetDSOLocal(v, true);
     }
 }
+
+/// Safe wrapper for `LLVMAppendModuleInlineAsm`, which delegates to
+/// `Module::appendModuleInlineAsm`.
+pub(crate) fn append_module_inline_asm<'ll>(llmod: &'ll Module, asm: &[u8]) {
+    unsafe {
+        LLVMAppendModuleInlineAsm(llmod, asm.as_ptr(), asm.len());
+    }
+}
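A hedged usage sketch for `OperandBundleBox`: because `Deref` can't be implemented, callers keep the box alive and pass the borrowed bundle via `as_ref()`. Both `cleanup_pad` and `call_with_bundle` below are hypothetical stand-ins for real call sites.

    // Hypothetical call site; not part of this change.
    let funclet = OperandBundleBox::new("funclet", &[cleanup_pad]);
    call_with_bundle(llfn, &args, funclet.as_ref());
    // Dropping `funclet` here disposes of the underlying `OperandBundle`.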
diff --git a/compiler/rustc_codegen_llvm/src/llvm_util.rs b/compiler/rustc_codegen_llvm/src/llvm_util.rs
index 36e35f81392..9718c95f38a 100644
--- a/compiler/rustc_codegen_llvm/src/llvm_util.rs
+++ b/compiler/rustc_codegen_llvm/src/llvm_util.rs
@@ -6,6 +6,7 @@ use std::sync::Once;
 use std::{ptr, slice, str};
 
 use libc::c_int;
+use rustc_codegen_ssa::TargetConfig;
 use rustc_codegen_ssa::base::wants_wasm_eh;
 use rustc_codegen_ssa::codegen_attrs::check_tied_features;
 use rustc_data_structures::fx::{FxHashMap, FxHashSet};
@@ -18,6 +19,7 @@ use rustc_session::config::{PrintKind, PrintRequest};
 use rustc_span::Symbol;
 use rustc_target::spec::{MergeFunctions, PanicStrategy, SmallDataThresholdSupport};
 use rustc_target::target_features::{RUSTC_SPECIAL_FEATURES, RUSTC_SPECIFIC_FEATURES};
+use smallvec::{SmallVec, smallvec};
 
 use crate::back::write::create_informational_target_machine;
 use crate::errors::{
@@ -179,27 +181,27 @@ impl<'a> TargetFeatureFoldStrength<'a> {
 
 pub(crate) struct LLVMFeature<'a> {
     llvm_feature_name: &'a str,
-    dependency: Option<TargetFeatureFoldStrength<'a>>,
+    dependencies: SmallVec<[TargetFeatureFoldStrength<'a>; 1]>,
 }
 
 impl<'a> LLVMFeature<'a> {
     fn new(llvm_feature_name: &'a str) -> Self {
-        Self { llvm_feature_name, dependency: None }
+        Self { llvm_feature_name, dependencies: SmallVec::new() }
     }
 
-    fn with_dependency(
+    fn with_dependencies(
         llvm_feature_name: &'a str,
-        dependency: TargetFeatureFoldStrength<'a>,
+        dependencies: SmallVec<[TargetFeatureFoldStrength<'a>; 1]>,
     ) -> Self {
-        Self { llvm_feature_name, dependency: Some(dependency) }
+        Self { llvm_feature_name, dependencies }
     }
 
-    fn contains(&self, feat: &str) -> bool {
+    fn contains(&'a self, feat: &str) -> bool {
         self.iter().any(|dep| dep == feat)
     }
 
     fn iter(&'a self) -> impl Iterator<Item = &'a str> {
-        let dependencies = self.dependency.iter().map(|feat| feat.as_str());
+        let dependencies = self.dependencies.iter().map(|feat| feat.as_str());
         std::iter::once(self.llvm_feature_name).chain(dependencies)
     }
 }
@@ -209,7 +211,7 @@ impl<'a> IntoIterator for LLVMFeature<'a> {
     type IntoIter = impl Iterator<Item = &'a str>;
 
     fn into_iter(self) -> Self::IntoIter {
-        let dependencies = self.dependency.into_iter().map(|feat| feat.as_str());
+        let dependencies = self.dependencies.into_iter().map(|feat| feat.as_str());
         std::iter::once(self.llvm_feature_name).chain(dependencies)
     }
 }
@@ -239,9 +241,9 @@ pub(crate) fn to_llvm_features<'a>(sess: &Session, s: &'a str) -> Option<LLVMFea
         &*sess.target.arch
     };
     match (arch, s) {
-        ("x86", "sse4.2") => Some(LLVMFeature::with_dependency(
+        ("x86", "sse4.2") => Some(LLVMFeature::with_dependencies(
             "sse4.2",
-            TargetFeatureFoldStrength::EnableOnly("crc32"),
+            smallvec![TargetFeatureFoldStrength::EnableOnly("crc32")],
         )),
         ("x86", "pclmulqdq") => Some(LLVMFeature::new("pclmul")),
         ("x86", "rdrand") => Some(LLVMFeature::new("rdrnd")),
@@ -261,9 +263,10 @@ pub(crate) fn to_llvm_features<'a>(sess: &Session, s: &'a str) -> Option<LLVMFea
         ("aarch64", "sme-b16b16") if get_version().0 < 20 => Some(LLVMFeature::new("b16b16")),
         ("aarch64", "flagm2") => Some(LLVMFeature::new("altnzcv")),
         // Rust ties fp and neon together.
-        ("aarch64", "neon") => {
-            Some(LLVMFeature::with_dependency("neon", TargetFeatureFoldStrength::Both("fp-armv8")))
-        }
+        ("aarch64", "neon") => Some(LLVMFeature::with_dependencies(
+            "neon",
+            smallvec![TargetFeatureFoldStrength::Both("fp-armv8")],
+        )),
         // In LLVM neon implicitly enables fp, but we manually enable
         // neon when a feature only implicitly enables fp
         ("aarch64", "fhm") => Some(LLVMFeature::new("fp16fml")),
@@ -272,11 +275,26 @@ pub(crate) fn to_llvm_features<'a>(sess: &Session, s: &'a str) -> Option<LLVMFea
         ("aarch64", "fpmr") => None, // only existed in 18
         ("arm", "fp16") => Some(LLVMFeature::new("fullfp16")),
         // Filter out features that are not supported by the current LLVM version
+        ("loongarch64", "div32" | "lam-bh" | "lamcas" | "ld-seq-sa" | "scq")
+            if get_version().0 < 20 =>
+        {
+            None
+        }
+        // Filter out features that are not supported by the current LLVM version
         ("riscv32" | "riscv64", "zacas") if get_version().0 < 20 => None,
+        (
+            "s390x",
+            "message-security-assist-extension12"
+            | "concurrent-functions"
+            | "miscellaneous-extensions-4"
+            | "vector-enhancements-3"
+            | "vector-packed-decimal-enhancement-3",
+        ) if get_version().0 < 20 => None,
         // Enable the evex512 target feature if an avx512 target feature is enabled.
-        ("x86", s) if s.starts_with("avx512") => {
-            Some(LLVMFeature::with_dependency(s, TargetFeatureFoldStrength::EnableOnly("evex512")))
-        }
+        ("x86", s) if s.starts_with("avx512") => Some(LLVMFeature::with_dependencies(
+            s,
+            smallvec![TargetFeatureFoldStrength::EnableOnly("evex512")],
+        )),
         // Support for `wide-arithmetic` will first land in LLVM 20 as part of
         // llvm/llvm-project#111598
         ("wasm32" | "wasm64", "wide-arithmetic") if get_version() < (20, 0, 0) => None,
@@ -294,6 +312,21 @@ pub(crate) fn to_llvm_features<'a>(sess: &Session, s: &'a str) -> Option<LLVMFea
             None
         }
         ("x86", "movrs") if get_version().0 < 20 => None,
+        ("x86", "avx10.1") => Some(LLVMFeature::new("avx10.1-512")),
+        ("x86", "avx10.2") if get_version().0 < 20 => None,
+        ("x86", "avx10.2") if get_version().0 >= 20 => Some(LLVMFeature::new("avx10.2-512")),
+        ("x86", "apxf") => Some(LLVMFeature::with_dependencies(
+            "egpr",
+            smallvec![
+                TargetFeatureFoldStrength::Both("push2pop2"),
+                TargetFeatureFoldStrength::Both("ppx"),
+                TargetFeatureFoldStrength::Both("ndd"),
+                TargetFeatureFoldStrength::Both("ccmp"),
+                TargetFeatureFoldStrength::Both("cf"),
+                TargetFeatureFoldStrength::Both("nf"),
+                TargetFeatureFoldStrength::Both("zu"),
+            ],
+        )),
         (_, s) => Some(LLVMFeature::new(s)),
     }
 }
@@ -302,7 +335,7 @@ pub(crate) fn to_llvm_features<'a>(sess: &Session, s: &'a str) -> Option<LLVMFea
 /// Must express features in the way Rust understands them.
 ///
 /// We do not have to worry about RUSTC_SPECIFIC_FEATURES here, those are handled outside codegen.
-pub(crate) fn target_features_cfg(sess: &Session) -> (Vec<Symbol>, Vec<Symbol>) {
+pub(crate) fn target_config(sess: &Session) -> TargetConfig {
     // Add base features for the target.
     // We do *not* add the -Ctarget-features there, and instead duplicate the logic for that below.
     // The reason is that if LLVM considers a feature implied but we do not, we don't want that to
@@ -402,7 +435,85 @@ pub(crate) fn target_features_cfg(sess: &Session) -> (Vec<Symbol>, Vec<Symbol>)
 
     let target_features = f(false);
     let unstable_target_features = f(true);
-    (target_features, unstable_target_features)
+    let mut cfg = TargetConfig {
+        target_features,
+        unstable_target_features,
+        has_reliable_f16: true,
+        has_reliable_f16_math: true,
+        has_reliable_f128: true,
+        has_reliable_f128_math: true,
+    };
+
+    update_target_reliable_float_cfg(sess, &mut cfg);
+    cfg
+}
+
+/// Determine whether or not experimental float types are reliable based on known bugs.
+fn update_target_reliable_float_cfg(sess: &Session, cfg: &mut TargetConfig) {
+    let target_arch = sess.target.arch.as_ref();
+    let target_os = sess.target.options.os.as_ref();
+    let target_env = sess.target.options.env.as_ref();
+    let target_abi = sess.target.options.abi.as_ref();
+    let target_pointer_width = sess.target.pointer_width;
+
+    cfg.has_reliable_f16 = match (target_arch, target_os) {
+        // Selection failure <https://github.com/llvm/llvm-project/issues/50374>
+        ("s390x", _) => false,
+        // Unsupported <https://github.com/llvm/llvm-project/issues/94434>
+        ("arm64ec", _) => false,
+        // MinGW ABI bugs <https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115054>
+        ("x86_64", "windows") if target_env == "gnu" && target_abi != "llvm" => false,
+        // Infinite recursion <https://github.com/llvm/llvm-project/issues/97981>
+        ("csky", _) => false,
+        ("hexagon", _) => false,
+        ("powerpc" | "powerpc64", _) => false,
+        ("sparc" | "sparc64", _) => false,
+        ("wasm32" | "wasm64", _) => false,
+        // `f16` support only requires that symbols converting to and from `f32` are available. We
+        // provide these in `compiler-builtins`, so `f16` should be available on all platforms that
+        // do not have other ABI issues or LLVM crashes.
+        _ => true,
+    };
+
+    cfg.has_reliable_f128 = match (target_arch, target_os) {
+        // Unsupported <https://github.com/llvm/llvm-project/issues/94434>
+        ("arm64ec", _) => false,
+        // Selection bug <https://github.com/llvm/llvm-project/issues/96432>
+        ("mips64" | "mips64r6", _) => false,
+        // Selection bug <https://github.com/llvm/llvm-project/issues/95471>
+        ("nvptx64", _) => false,
+        // ABI bugs <https://github.com/rust-lang/rust/issues/125109> et al. (full
+        // list at <https://github.com/rust-lang/rust/issues/116909>)
+        ("powerpc" | "powerpc64", _) => false,
+        // ABI unsupported <https://github.com/llvm/llvm-project/issues/41838>
+        ("sparc", _) => false,
+        // Stack alignment bug <https://github.com/llvm/llvm-project/issues/77401>. NB: tests may
+        // not fail if our compiler-builtins is linked.
+        ("x86", _) => false,
+        // MinGW ABI bugs <https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115054>
+        ("x86_64", "windows") if target_env == "gnu" && target_abi != "llvm" => false,
+        // There are no known problems on other platforms, so the only requirement is that symbols
+        // are available. `compiler-builtins` provides all symbols required for core `f128`
+        // support, so this should work for everything else.
+        _ => true,
+    };
+
+    // Assume that working `f16` means working `f16` math for most platforms, since
+    // operations just go through `f32`.
+    cfg.has_reliable_f16_math = cfg.has_reliable_f16;
+
+    cfg.has_reliable_f128_math = match (target_arch, target_os) {
+        // LLVM lowers `fp128` math to `long double` symbols even on platforms where
+        // `long double` is not IEEE binary128. See
+        // <https://github.com/llvm/llvm-project/issues/44744>.
+        //
+        // This rules out anything that doesn't have `long double` = `binary128`: targets that
+        // are not 64-bit (`long double` is `f64`), anything other than Linux (Windows and macOS
+        // use `f64`), and the x86 family (`long double` is 80-bit extended precision).
+        ("x86_64", _) => false,
+        (_, "linux") if target_pointer_width == 64 => true,
+        _ => false,
+    } && cfg.has_reliable_f128;
 }
 
 pub(crate) fn print_version() {
@@ -686,7 +797,7 @@ pub(crate) fn global_llvm_features(
                 )
             } else if let Some(feature) = feature.strip_prefix('-') {
                 // FIXME: Why do we not remove implied features on "-" here?
-                // We do the equivalent above in `target_features_cfg`.
+                // We do the equivalent above in `target_config`.
                 // See <https://github.com/rust-lang/rust/issues/134792>.
                 all_rust_features.push((false, feature));
             } else if !feature.is_empty() {
@@ -765,7 +876,7 @@ pub(crate) fn global_llvm_features(
                         "{}{}",
                         enable_disable, llvm_feature.llvm_feature_name
                     ))
-                    .chain(llvm_feature.dependency.into_iter().filter_map(
+                    .chain(llvm_feature.dependencies.into_iter().filter_map(
                         move |feat| match (enable, feat) {
                             (_, TargetFeatureFoldStrength::Both(f))
                             | (true, TargetFeatureFoldStrength::EnableOnly(f)) => {
diff --git a/compiler/rustc_codegen_llvm/src/mono_item.rs b/compiler/rustc_codegen_llvm/src/mono_item.rs
index fdf62a08065..3f38e1e191b 100644
--- a/compiler/rustc_codegen_llvm/src/mono_item.rs
+++ b/compiler/rustc_codegen_llvm/src/mono_item.rs
@@ -16,7 +16,7 @@ use crate::{base, llvm};
 
 impl<'tcx> PreDefineCodegenMethods<'tcx> for CodegenCx<'_, 'tcx> {
     fn predefine_static(
-        &self,
+        &mut self,
         def_id: DefId,
         linkage: Linkage,
         visibility: Visibility,
@@ -44,7 +44,7 @@ impl<'tcx> PreDefineCodegenMethods<'tcx> for CodegenCx<'_, 'tcx> {
     }
 
     fn predefine_fn(
-        &self,
+        &mut self,
         instance: Instance<'tcx>,
         linkage: Linkage,
         visibility: Visibility,
diff --git a/compiler/rustc_codegen_llvm/src/type_.rs b/compiler/rustc_codegen_llvm/src/type_.rs
index b89ce90d1a1..169036f5152 100644
--- a/compiler/rustc_codegen_llvm/src/type_.rs
+++ b/compiler/rustc_codegen_llvm/src/type_.rs
@@ -128,6 +128,10 @@ impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> {
         (**self).borrow().llcx
     }
 
+    pub(crate) fn llmod(&self) -> &'ll llvm::Module {
+        (**self).borrow().llmod
+    }
+
     pub(crate) fn isize_ty(&self) -> &'ll Type {
         (**self).borrow().isize_ty
     }
diff --git a/compiler/rustc_codegen_llvm/src/va_arg.rs b/compiler/rustc_codegen_llvm/src/va_arg.rs
index c216f0f4a09..236568590be 100644
--- a/compiler/rustc_codegen_llvm/src/va_arg.rs
+++ b/compiler/rustc_codegen_llvm/src/va_arg.rs
@@ -1,7 +1,10 @@
-use rustc_abi::{Align, Endian, HasDataLayout, Size};
+use rustc_abi::{Align, BackendRepr, Endian, HasDataLayout, Primitive, Size, TyAndLayout};
+use rustc_codegen_ssa::MemFlags;
 use rustc_codegen_ssa::common::IntPredicate;
 use rustc_codegen_ssa::mir::operand::OperandRef;
-use rustc_codegen_ssa::traits::{BaseTypeCodegenMethods, BuilderMethods, ConstCodegenMethods};
+use rustc_codegen_ssa::traits::{
+    BaseTypeCodegenMethods, BuilderMethods, ConstCodegenMethods, LayoutTypeCodegenMethods,
+};
 use rustc_middle::ty::Ty;
 use rustc_middle::ty::layout::{HasTyCtxt, LayoutOf};
 
@@ -37,6 +40,7 @@ fn emit_direct_ptr_va_arg<'ll, 'tcx>(
     align: Align,
     slot_size: Align,
     allow_higher_align: bool,
+    force_right_adjust: bool,
 ) -> (&'ll Value, Align) {
     let va_list_ty = bx.type_ptr();
     let va_list_addr = list.immediate();
@@ -54,7 +58,10 @@ fn emit_direct_ptr_va_arg<'ll, 'tcx>(
     let next = bx.inbounds_ptradd(addr, full_direct_size);
     bx.store(next, va_list_addr, bx.tcx().data_layout.pointer_align.abi);
 
-    if size.bytes() < slot_size.bytes() && bx.tcx().sess.target.endian == Endian::Big {
+    if size.bytes() < slot_size.bytes()
+        && bx.tcx().sess.target.endian == Endian::Big
+        && force_right_adjust
+    {
         let adjusted_size = bx.cx().const_i32((slot_size.bytes() - size.bytes()) as i32);
         let adjusted = bx.inbounds_ptradd(addr, adjusted_size);
         (adjusted, addr_align)
@@ -63,14 +70,40 @@ fn emit_direct_ptr_va_arg<'ll, 'tcx>(
     }
 }
 
+enum PassMode {
+    Direct,
+    Indirect,
+}
+
+enum SlotSize {
+    Bytes8 = 8,
+    Bytes4 = 4,
+}
+
+enum AllowHigherAlign {
+    No,
+    Yes,
+}
+
+enum ForceRightAdjust {
+    No,
+    Yes,
+}
+
 fn emit_ptr_va_arg<'ll, 'tcx>(
     bx: &mut Builder<'_, 'll, 'tcx>,
     list: OperandRef<'tcx, &'ll Value>,
     target_ty: Ty<'tcx>,
-    indirect: bool,
-    slot_size: Align,
-    allow_higher_align: bool,
+    pass_mode: PassMode,
+    slot_size: SlotSize,
+    allow_higher_align: AllowHigherAlign,
+    force_right_adjust: ForceRightAdjust,
 ) -> &'ll Value {
+    let indirect = matches!(pass_mode, PassMode::Indirect);
+    let allow_higher_align = matches!(allow_higher_align, AllowHigherAlign::Yes);
+    let force_right_adjust = matches!(force_right_adjust, ForceRightAdjust::Yes);
+    let slot_size = Align::from_bytes(slot_size as u64).unwrap();
+
     let layout = bx.cx.layout_of(target_ty);
     let (llty, size, align) = if indirect {
         (
@@ -81,8 +114,15 @@ fn emit_ptr_va_arg<'ll, 'tcx>(
     } else {
         (layout.llvm_type(bx.cx), layout.size, layout.align)
     };
-    let (addr, addr_align) =
-        emit_direct_ptr_va_arg(bx, list, size, align.abi, slot_size, allow_higher_align);
+    let (addr, addr_align) = emit_direct_ptr_va_arg(
+        bx,
+        list,
+        size,
+        align.abi,
+        slot_size,
+        allow_higher_align,
+        force_right_adjust,
+    );
     if indirect {
         let tmp_ret = bx.load(llty, addr, addr_align);
         bx.load(bx.cx.layout_of(target_ty).llvm_type(bx.cx), tmp_ret, align.abi)
@@ -179,8 +219,15 @@ fn emit_aapcs_va_arg<'ll, 'tcx>(
 
     // On Stack block
     bx.switch_to_block(on_stack);
-    let stack_value =
-        emit_ptr_va_arg(bx, list, target_ty, false, Align::from_bytes(8).unwrap(), true);
+    let stack_value = emit_ptr_va_arg(
+        bx,
+        list,
+        target_ty,
+        PassMode::Direct,
+        SlotSize::Bytes8,
+        AllowHigherAlign::Yes,
+        ForceRightAdjust::No,
+    );
     bx.br(end);
 
     bx.switch_to_block(end);
@@ -190,6 +237,150 @@ fn emit_aapcs_va_arg<'ll, 'tcx>(
     val
 }
 
+fn emit_powerpc_va_arg<'ll, 'tcx>(
+    bx: &mut Builder<'_, 'll, 'tcx>,
+    list: OperandRef<'tcx, &'ll Value>,
+    target_ty: Ty<'tcx>,
+) -> &'ll Value {
+    let dl = bx.cx.data_layout();
+
+    // struct __va_list_tag {
+    //   unsigned char gpr;
+    //   unsigned char fpr;
+    //   unsigned short reserved;
+    //   void *overflow_arg_area;
+    //   void *reg_save_area;
+    // };
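+    //
+    // Field offsets on 32-bit PowerPC: gpr = 0, fpr = 1, reserved = 2, overflow_arg_area = 4,
+    // reg_save_area = 8.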
+    let va_list_addr = list.immediate();
+
+    // Peel off any newtype wrappers.
+    let layout = {
+        let mut layout = bx.cx.layout_of(target_ty);
+
+        while let Some((_, inner)) = layout.non_1zst_field(bx.cx) {
+            layout = inner;
+        }
+
+        layout
+    };
+
+    // Rust does not currently support any powerpc softfloat targets.
+    let target = &bx.cx.tcx.sess.target;
+    let is_soft_float_abi = target.abi == "softfloat";
+    assert!(!is_soft_float_abi);
+
+    // All instances of VaArgSafe are passed directly.
+    let is_indirect = false;
+
+    let (is_i64, is_int, is_f64) = match layout.layout.backend_repr() {
+        BackendRepr::Scalar(scalar) => match scalar.primitive() {
+            Primitive::Int(integer, _) => (integer.size().bits() == 64, true, false),
+            Primitive::Float(float) => (false, false, float.size().bits() == 64),
+            Primitive::Pointer(_) => (false, true, false),
+        },
+        _ => unreachable!("all instances of VaArgSafe are represented as scalars"),
+    };
+
+    let num_regs_addr = if is_int || is_soft_float_abi {
+        va_list_addr // gpr
+    } else {
+        bx.inbounds_ptradd(va_list_addr, bx.const_usize(1)) // fpr
+    };
+
+    let mut num_regs = bx.load(bx.type_i8(), num_regs_addr, dl.i8_align.abi);
+
+    // "Align" the register count when the type is passed as `i64`.
+    if is_i64 || (is_f64 && is_soft_float_abi) {
+        num_regs = bx.add(num_regs, bx.const_u8(1));
+        num_regs = bx.and(num_regs, bx.const_u8(0b1111_1110));
+    }
+
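+    // At most 8 GPRs (r3..r10) and 8 FPRs (f1..f8) are used to pass arguments.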
+    let max_regs = 8u8;
+    let use_regs = bx.icmp(IntPredicate::IntULT, num_regs, bx.const_u8(max_regs));
+
+    let in_reg = bx.append_sibling_block("va_arg.in_reg");
+    let in_mem = bx.append_sibling_block("va_arg.in_mem");
+    let end = bx.append_sibling_block("va_arg.end");
+
+    bx.cond_br(use_regs, in_reg, in_mem);
+
+    let reg_addr = {
+        bx.switch_to_block(in_reg);
+
+        let reg_safe_area_ptr = bx.inbounds_ptradd(va_list_addr, bx.cx.const_usize(1 + 1 + 2 + 4));
+        let mut reg_addr = bx.load(bx.type_ptr(), reg_safe_area_ptr, dl.pointer_align.abi);
+
+        // Floating-point registers start after the general-purpose registers.
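+        // The save area holds the 8 GPRs first (8 * 4 = 32 bytes), followed by the 8 FPRs.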
+        if !is_int && !is_soft_float_abi {
+            reg_addr = bx.inbounds_ptradd(reg_addr, bx.cx.const_usize(32))
+        }
+
+        // Get the address of the saved value by scaling the number of
+        // registers we've used by the size of a register slot (4 bytes for
+        // GPRs, 8 bytes for FPRs).
+        let reg_size = if is_int || is_soft_float_abi { 4 } else { 8 };
+        let reg_offset = bx.mul(num_regs, bx.cx().const_u8(reg_size));
+        let reg_addr = bx.inbounds_ptradd(reg_addr, reg_offset);
+
+        // Increase the used-register count.
+        let reg_incr = if is_i64 || (is_f64 && is_soft_float_abi) { 2 } else { 1 };
+        let new_num_regs = bx.add(num_regs, bx.cx.const_u8(reg_incr));
+        bx.store(new_num_regs, num_regs_addr, dl.i8_align.abi);
+
+        bx.br(end);
+
+        reg_addr
+    };
+
+    let mem_addr = {
+        bx.switch_to_block(in_mem);
+
+        bx.store(bx.const_u8(max_regs), num_regs_addr, dl.i8_align.abi);
+
+        // Everything in the overflow area is rounded up to a size of at least 4.
+        let overflow_area_align = Align::from_bytes(4).unwrap();
+
+        let size = if !is_indirect {
+            layout.layout.size.align_to(overflow_area_align)
+        } else {
+            dl.pointer_size
+        };
+
+        let overflow_area_ptr = bx.inbounds_ptradd(va_list_addr, bx.cx.const_usize(1 + 1 + 2));
+        let mut overflow_area = bx.load(bx.type_ptr(), overflow_area_ptr, dl.pointer_align.abi);
+
+        // Round the address of the argument up to its required alignment.
+        if layout.layout.align.abi > overflow_area_align {
+            overflow_area = round_pointer_up_to_alignment(
+                bx,
+                overflow_area,
+                layout.layout.align.abi,
+                bx.type_ptr(),
+            );
+        }
+
+        let mem_addr = overflow_area;
+
+        // Increase the overflow area.
+        overflow_area = bx.inbounds_ptradd(overflow_area, bx.const_usize(size.bytes()));
+        bx.store(overflow_area, overflow_area_ptr, dl.pointer_align.abi);
+
+        bx.br(end);
+
+        mem_addr
+    };
+
+    // Return the appropriate result.
+    bx.switch_to_block(end);
+    let val_addr = bx.phi(bx.type_ptr(), &[reg_addr, mem_addr], &[in_reg, in_mem]);
+    let val_type = layout.llvm_type(bx);
+    let val_addr = if is_indirect {
+        bx.load(bx.cx.type_ptr(), val_addr, dl.pointer_align.abi)
+    } else {
+        val_addr
+    };
+    bx.load(val_type, val_addr, layout.align.abi)
+}
+
 fn emit_s390x_va_arg<'ll, 'tcx>(
     bx: &mut Builder<'_, 'll, 'tcx>,
     list: OperandRef<'tcx, &'ll Value>,
@@ -278,6 +469,313 @@ fn emit_s390x_va_arg<'ll, 'tcx>(
     bx.load(val_type, val_addr, layout.align.abi)
 }
 
+fn emit_x86_64_sysv64_va_arg<'ll, 'tcx>(
+    bx: &mut Builder<'_, 'll, 'tcx>,
+    list: OperandRef<'tcx, &'ll Value>,
+    target_ty: Ty<'tcx>,
+) -> &'ll Value {
+    let dl = bx.cx.data_layout();
+
+    // Implementation of the System V x86_64 ABI calling convention for va_arg, see
+    // https://gitlab.com/x86-psABIs/x86-64-ABI (section 3.5.7). This implementation is heavily
+    // based on the one in clang.
+
+    // We're able to take some shortcuts because the return type of `va_arg` must implement the
+    // `VaArgSafe` trait. Currently, only pointers, f64, i32, u32, i64 and u64 implement this trait.
+
+    // typedef struct __va_list_tag {
+    //     unsigned int gp_offset;
+    //     unsigned int fp_offset;
+    //     void *overflow_arg_area;
+    //     void *reg_save_area;
+    // } va_list[1];
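+    //
+    // Field offsets: gp_offset = 0, fp_offset = 4, overflow_arg_area = 8, reg_save_area = 16.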
+    let va_list_addr = list.immediate();
+
+    // Peel off any newtype wrappers.
+    //
+    // The "C" ABI does not unwrap newtypes (see `ReprOptions::inhibit_newtype_abi_optimization`).
+    // Here, we do actually want the unwrapped representation, because that is how LLVM/Clang
+    // pass such types to variadic functions.
+    //
+    // An example of a type that must be unwrapped is `Foo` below. Without the unwrapping, it has
+    // `BackendRepr::Memory`, but we need it to be `BackendRepr::Scalar` to generate correct code.
+    //
+    // ```
+    // #[repr(C)]
+    // struct Empty;
+    //
+    // #[repr(C)]
+    // struct Foo([Empty; 8], i32);
+    // ```
+    let layout = {
+        let mut layout = bx.cx.layout_of(target_ty);
+
+        while let Some((_, inner)) = layout.non_1zst_field(bx.cx) {
+            layout = inner;
+        }
+
+        layout
+    };
+
+    // AMD64-ABI 3.5.7p5: Step 1. Determine whether type may be passed
+    // in the registers. If not go to step 7.
+
+    // AMD64-ABI 3.5.7p5: Step 2. Compute num_gp to hold the number of
+    // general purpose registers needed to pass type and num_fp to hold
+    // the number of floating point registers needed.
+
+    let mut num_gp_registers = 0;
+    let mut num_fp_registers = 0;
+
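+    // Each integer or pointer eightbyte needs one 8-byte GPR slot; each float needs one
+    // 16-byte slot in the SSE part of the register save area.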
+    let mut registers_for_primitive = |p| match p {
+        Primitive::Int(integer, _is_signed) => {
+            num_gp_registers += integer.size().bytes().div_ceil(8) as u32;
+        }
+        Primitive::Float(float) => {
+            num_fp_registers += float.size().bytes().div_ceil(16) as u32;
+        }
+        Primitive::Pointer(_) => {
+            num_gp_registers += 1;
+        }
+    };
+
+    match layout.layout.backend_repr() {
+        BackendRepr::Scalar(scalar) => {
+            registers_for_primitive(scalar.primitive());
+        }
+        BackendRepr::ScalarPair(scalar1, scalar2) => {
+            registers_for_primitive(scalar1.primitive());
+            registers_for_primitive(scalar2.primitive());
+        }
+        BackendRepr::SimdVector { .. } => {
+            // No instance of `VaArgSafe` uses a vector `BackendRepr`.
+            unreachable!(
+                "No x86-64 SysV va_arg implementation for {:?}",
+                layout.layout.backend_repr()
+            )
+        }
+        BackendRepr::Memory { .. } => {
+            let mem_addr = x86_64_sysv64_va_arg_from_memory(bx, va_list_addr, layout);
+            return bx.load(layout.llvm_type(bx), mem_addr, layout.align.abi);
+        }
+    };
+
+    // AMD64-ABI 3.5.7p5: Step 3. Verify whether arguments fit into
+    // registers. In the case: l->gp_offset > 48 - num_gp * 8 or
+    // l->fp_offset > 176 - num_fp * 16 go to step 7.
+
+    let unsigned_int_offset = 4;
+    let ptr_offset = 8;
+    let gp_offset_ptr = va_list_addr;
+    let fp_offset_ptr = bx.inbounds_ptradd(va_list_addr, bx.cx.const_usize(unsigned_int_offset));
+
+    let gp_offset_v = bx.load(bx.type_i32(), gp_offset_ptr, Align::from_bytes(8).unwrap());
+    let fp_offset_v = bx.load(bx.type_i32(), fp_offset_ptr, Align::from_bytes(4).unwrap());
+
+    let mut use_regs = bx.const_bool(false);
+
+    if num_gp_registers > 0 {
+        let max_offset_val = 48u32 - num_gp_registers * 8;
+        let fits_in_gp = bx.icmp(IntPredicate::IntULE, gp_offset_v, bx.const_u32(max_offset_val));
+        use_regs = fits_in_gp;
+    }
+
+    if num_fp_registers > 0 {
+        let max_offset_val = 176u32 - num_fp_registers * 16;
+        let fits_in_fp = bx.icmp(IntPredicate::IntULE, fp_offset_v, bx.const_u32(max_offset_val));
+        use_regs = if num_gp_registers > 0 { bx.and(use_regs, fits_in_fp) } else { fits_in_fp };
+    }
+
+    let in_reg = bx.append_sibling_block("va_arg.in_reg");
+    let in_mem = bx.append_sibling_block("va_arg.in_mem");
+    let end = bx.append_sibling_block("va_arg.end");
+
+    bx.cond_br(use_regs, in_reg, in_mem);
+
+    // Emit code to load the value if it was passed in a register.
+    bx.switch_to_block(in_reg);
+
+    // AMD64-ABI 3.5.7p5: Step 4. Fetch type from l->reg_save_area with
+    // an offset of l->gp_offset and/or l->fp_offset. This may require
+    // copying to a temporary location in case the parameter is passed
+    // in different register classes or requires an alignment greater
+    // than 8 for general purpose registers and 16 for XMM registers.
+    //
+    // FIXME(llvm): This really results in shameful code when we end up needing to
+    // collect arguments from different places; often what should result in a
+    // simple assembling of a structure from scattered addresses has many more
+    // loads than necessary. Can we clean this up?
+    let reg_save_area_ptr =
+        bx.inbounds_ptradd(va_list_addr, bx.cx.const_usize(2 * unsigned_int_offset + ptr_offset));
+    let reg_save_area_v = bx.load(bx.type_ptr(), reg_save_area_ptr, dl.pointer_align.abi);
+
+    let reg_addr = match layout.layout.backend_repr() {
+        BackendRepr::Scalar(scalar) => match scalar.primitive() {
+            Primitive::Int(_, _) | Primitive::Pointer(_) => {
+                let reg_addr = bx.inbounds_ptradd(reg_save_area_v, gp_offset_v);
+
+                // Copy into a temporary if the type is more aligned than the register save area.
+                let gp_align = Align::from_bytes(8).unwrap();
+                copy_to_temporary_if_more_aligned(bx, reg_addr, layout, gp_align)
+            }
+            Primitive::Float(_) => bx.inbounds_ptradd(reg_save_area_v, fp_offset_v),
+        },
+        BackendRepr::ScalarPair(scalar1, scalar2) => {
+            let ty_lo = bx.cx().scalar_pair_element_backend_type(layout, 0, false);
+            let ty_hi = bx.cx().scalar_pair_element_backend_type(layout, 1, false);
+
+            let align_lo = layout.field(bx.cx, 0).layout.align().abi;
+            let align_hi = layout.field(bx.cx, 1).layout.align().abi;
+
+            match (scalar1.primitive(), scalar2.primitive()) {
+                (Primitive::Float(_), Primitive::Float(_)) => {
+                    // SSE registers are spaced 16 bytes apart in the register save
+                    // area, we need to collect the two eightbytes together.
+                    // The ABI isn't explicit about this, but it seems reasonable
+                    // to assume that the slots are 16-byte aligned, since the stack is
+                    // naturally 16-byte aligned and the prologue is expected to store
+                    // all the SSE registers to the RSA.
+                    let reg_lo_addr = bx.inbounds_ptradd(reg_save_area_v, fp_offset_v);
+                    let reg_hi_addr = bx.inbounds_ptradd(reg_lo_addr, bx.const_i32(16));
+
+                    let align = layout.layout.align().abi;
+                    let tmp = bx.alloca(layout.layout.size(), align);
+
+                    let reg_lo = bx.load(ty_lo, reg_lo_addr, align_lo);
+                    let reg_hi = bx.load(ty_hi, reg_hi_addr, align_hi);
+
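+                    // The second field starts at the first field's size rounded up to the
+                    // second field's alignment.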
+                    let offset = scalar1.size(bx.cx).align_to(align_hi).bytes();
+                    let field0 = tmp;
+                    let field1 = bx.inbounds_ptradd(tmp, bx.const_u32(offset as u32));
+
+                    bx.store(reg_lo, field0, align);
+                    bx.store(reg_hi, field1, align);
+
+                    tmp
+                }
+                (Primitive::Float(_), _) | (_, Primitive::Float(_)) => {
+                    let gp_addr = bx.inbounds_ptradd(reg_save_area_v, gp_offset_v);
+                    let fp_addr = bx.inbounds_ptradd(reg_save_area_v, fp_offset_v);
+
+                    let (reg_lo_addr, reg_hi_addr) = match scalar1.primitive() {
+                        Primitive::Float(_) => (fp_addr, gp_addr),
+                        Primitive::Int(_, _) | Primitive::Pointer(_) => (gp_addr, fp_addr),
+                    };
+
+                    let tmp = bx.alloca(layout.layout.size(), layout.layout.align().abi);
+
+                    let reg_lo = bx.load(ty_lo, reg_lo_addr, align_lo);
+                    let reg_hi = bx.load(ty_hi, reg_hi_addr, align_hi);
+
+                    let offset = scalar1.size(bx.cx).align_to(align_hi).bytes();
+                    let field0 = tmp;
+                    let field1 = bx.inbounds_ptradd(tmp, bx.const_u32(offset as u32));
+
+                    bx.store(reg_lo, field0, align_lo);
+                    bx.store(reg_hi, field1, align_hi);
+
+                    tmp
+                }
+                (_, _) => {
+                    // Two integer/pointer values are just contiguous in memory.
+                    let reg_addr = bx.inbounds_ptradd(reg_save_area_v, gp_offset_v);
+
+                    // Copy into a temporary if the type is more aligned than the register save area.
+                    let gp_align = Align::from_bytes(8).unwrap();
+                    copy_to_temporary_if_more_aligned(bx, reg_addr, layout, gp_align)
+                }
+            }
+        }
+        // The previous match on `BackendRepr` means control flow already escaped.
+        BackendRepr::SimdVector { .. } | BackendRepr::Memory { .. } => unreachable!(),
+    };
+
+    // AMD64-ABI 3.5.7p5: Step 5. Set:
+    // l->gp_offset = l->gp_offset + num_gp * 8
+    if num_gp_registers > 0 {
+        let offset = bx.const_u32(num_gp_registers * 8);
+        let sum = bx.add(gp_offset_v, offset);
+        // An alignment of 8 because `__va_list_tag` is 8-aligned and this is its first field.
+        bx.store(sum, gp_offset_ptr, Align::from_bytes(8).unwrap());
+    }
+
+    // l->fp_offset = l->fp_offset + num_fp * 16.
+    if num_fp_registers > 0 {
+        let offset = bx.const_u32(num_fp_registers * 16);
+        let sum = bx.add(fp_offset_v, offset);
+        bx.store(sum, fp_offset_ptr, Align::from_bytes(4).unwrap());
+    }
+
+    bx.br(end);
+
+    bx.switch_to_block(in_mem);
+    let mem_addr = x86_64_sysv64_va_arg_from_memory(bx, va_list_addr, layout);
+    bx.br(end);
+
+    bx.switch_to_block(end);
+
+    let val_type = layout.llvm_type(bx);
+    let val_addr = bx.phi(bx.type_ptr(), &[reg_addr, mem_addr], &[in_reg, in_mem]);
+
+    bx.load(val_type, val_addr, layout.align.abi)
+}
+
+/// Copy into a temporary if the type is more aligned than the register save area.
+fn copy_to_temporary_if_more_aligned<'ll, 'tcx>(
+    bx: &mut Builder<'_, 'll, 'tcx>,
+    reg_addr: &'ll Value,
+    layout: TyAndLayout<'tcx, Ty<'tcx>>,
+    src_align: Align,
+) -> &'ll Value {
+    if layout.layout.align.abi > src_align {
+        let tmp = bx.alloca(layout.layout.size(), layout.layout.align().abi);
+        bx.memcpy(
+            tmp,
+            layout.layout.align.abi,
+            reg_addr,
+            src_align,
+            bx.const_u32(layout.layout.size().bytes() as u32),
+            MemFlags::empty(),
+        );
+        tmp
+    } else {
+        reg_addr
+    }
+}
+
+fn x86_64_sysv64_va_arg_from_memory<'ll, 'tcx>(
+    bx: &mut Builder<'_, 'll, 'tcx>,
+    va_list_addr: &'ll Value,
+    layout: TyAndLayout<'tcx, Ty<'tcx>>,
+) -> &'ll Value {
+    let dl = bx.cx.data_layout();
+
+    let overflow_arg_area_ptr = bx.inbounds_ptradd(va_list_addr, bx.const_usize(8));
+
+    let overflow_arg_area_v = bx.load(bx.type_ptr(), overflow_arg_area_ptr, dl.pointer_align.abi);
+    // AMD64-ABI 3.5.7p5: Step 7. Align l->overflow_arg_area upwards to a 16
+    // byte boundary if alignment needed by type exceeds 8 byte boundary.
+    // It isn't stated explicitly in the standard, but in practice we use
+    // alignment greater than 16 where necessary.
+    if layout.layout.align.abi.bytes() > 8 {
+        unreachable!("all instances of VaArgSafe have an alignment <= 8");
+    }
+
+    // AMD64-ABI 3.5.7p5: Step 8. Fetch type from l->overflow_arg_area.
+    let mem_addr = overflow_arg_area_v;
+
+    // AMD64-ABI 3.5.7p5: Step 9. Set l->overflow_arg_area to:
+    // l->overflow_arg_area + sizeof(type).
+    // AMD64-ABI 3.5.7p5: Step 10. Align l->overflow_arg_area upwards to
+    // an 8 byte boundary.
+    let size_in_bytes = layout.layout.size().bytes();
+    let offset = bx.const_i32(size_in_bytes.next_multiple_of(8) as i32);
+    let overflow_arg_area = bx.inbounds_ptradd(overflow_arg_area_v, offset);
+    bx.store(overflow_arg_area, overflow_arg_area_ptr, dl.pointer_align.abi);
+
+    mem_addr
+}
+
 fn emit_xtensa_va_arg<'ll, 'tcx>(
     bx: &mut Builder<'_, 'll, 'tcx>,
     list: OperandRef<'tcx, &'ll Value>,
@@ -309,8 +807,7 @@ fn emit_xtensa_va_arg<'ll, 'tcx>(
     // (*va).va_ndx
     let va_reg_offset = 4;
     let va_ndx_offset = va_reg_offset + 4;
-    let offset_ptr =
-        bx.inbounds_gep(bx.type_i8(), va_list_addr, &[bx.cx.const_usize(va_ndx_offset)]);
+    let offset_ptr = bx.inbounds_ptradd(va_list_addr, bx.cx.const_usize(va_ndx_offset));
 
     let offset = bx.load(bx.type_i32(), offset_ptr, bx.tcx().data_layout.i32_align.abi);
     let offset = round_up_to_alignment(bx, offset, layout.align.abi);
@@ -331,11 +828,10 @@ fn emit_xtensa_va_arg<'ll, 'tcx>(
     bx.store(offset_next, offset_ptr, bx.tcx().data_layout.pointer_align.abi);
 
     // (*va).va_reg
-    let regsave_area_ptr =
-        bx.inbounds_gep(bx.type_i8(), va_list_addr, &[bx.cx.const_usize(va_reg_offset)]);
+    let regsave_area_ptr = bx.inbounds_ptradd(va_list_addr, bx.cx.const_usize(va_reg_offset));
     let regsave_area =
         bx.load(bx.type_ptr(), regsave_area_ptr, bx.tcx().data_layout.pointer_align.abi);
-    let regsave_value_ptr = bx.inbounds_gep(bx.type_i8(), regsave_area, &[offset]);
+    let regsave_value_ptr = bx.inbounds_ptradd(regsave_area, offset);
     bx.br(end);
 
     bx.switch_to_block(from_stack);
@@ -356,9 +852,9 @@ fn emit_xtensa_va_arg<'ll, 'tcx>(
     bx.store(offset_next_corrected, offset_ptr, bx.tcx().data_layout.pointer_align.abi);
 
     // let stack_value_ptr = unsafe { (*va).va_stk.byte_add(offset_corrected) };
-    let stack_area_ptr = bx.inbounds_gep(bx.type_i8(), va_list_addr, &[bx.cx.const_usize(0)]);
+    let stack_area_ptr = bx.inbounds_ptradd(va_list_addr, bx.cx.const_usize(0));
     let stack_area = bx.load(bx.type_ptr(), stack_area_ptr, bx.tcx().data_layout.pointer_align.abi);
-    let stack_value_ptr = bx.inbounds_gep(bx.type_i8(), stack_area, &[offset_corrected]);
+    let stack_value_ptr = bx.inbounds_ptradd(stack_area, offset_corrected);
     bx.br(end);
 
     bx.switch_to_block(end);
@@ -386,30 +882,62 @@ pub(super) fn emit_va_arg<'ll, 'tcx>(
     // Determine the va_arg implementation to use. The LLVM va_arg instruction
     // is lacking in some instances, so we should only use it as a fallback.
     let target = &bx.cx.tcx.sess.target;
-    let arch = &bx.cx.tcx.sess.target.arch;
-    match &**arch {
-        // Windows x86
-        "x86" if target.is_like_windows => {
-            emit_ptr_va_arg(bx, addr, target_ty, false, Align::from_bytes(4).unwrap(), false)
-        }
-        // Generic x86
-        "x86" => emit_ptr_va_arg(bx, addr, target_ty, false, Align::from_bytes(4).unwrap(), true),
-        // Windows AArch64
-        "aarch64" | "arm64ec" if target.is_like_windows => {
-            emit_ptr_va_arg(bx, addr, target_ty, false, Align::from_bytes(8).unwrap(), false)
-        }
-        // macOS / iOS AArch64
-        "aarch64" if target.is_like_darwin => {
-            emit_ptr_va_arg(bx, addr, target_ty, false, Align::from_bytes(8).unwrap(), true)
+
+    match &*target.arch {
+        "x86" => emit_ptr_va_arg(
+            bx,
+            addr,
+            target_ty,
+            PassMode::Direct,
+            SlotSize::Bytes4,
+            if target.is_like_windows { AllowHigherAlign::No } else { AllowHigherAlign::Yes },
+            ForceRightAdjust::No,
+        ),
+        "aarch64" | "arm64ec" if target.is_like_windows || target.is_like_darwin => {
+            emit_ptr_va_arg(
+                bx,
+                addr,
+                target_ty,
+                PassMode::Direct,
+                SlotSize::Bytes8,
+                if target.is_like_windows { AllowHigherAlign::No } else { AllowHigherAlign::Yes },
+                ForceRightAdjust::No,
+            )
         }
         "aarch64" => emit_aapcs_va_arg(bx, addr, target_ty),
         "s390x" => emit_s390x_va_arg(bx, addr, target_ty),
+        "powerpc" => emit_powerpc_va_arg(bx, addr, target_ty),
+        "powerpc64" | "powerpc64le" => emit_ptr_va_arg(
+            bx,
+            addr,
+            target_ty,
+            PassMode::Direct,
+            SlotSize::Bytes8,
+            AllowHigherAlign::Yes,
+            match &*target.arch {
+                "powerpc64" => ForceRightAdjust::Yes,
+                _ => ForceRightAdjust::No,
+            },
+        ),
         // Windows x86_64
         "x86_64" if target.is_like_windows => {
             let target_ty_size = bx.cx.size_of(target_ty).bytes();
-            let indirect: bool = target_ty_size > 8 || !target_ty_size.is_power_of_two();
-            emit_ptr_va_arg(bx, addr, target_ty, indirect, Align::from_bytes(8).unwrap(), false)
+            emit_ptr_va_arg(
+                bx,
+                addr,
+                target_ty,
+                if target_ty_size > 8 || !target_ty_size.is_power_of_two() {
+                    PassMode::Indirect
+                } else {
+                    PassMode::Direct
+                },
+                SlotSize::Bytes8,
+                AllowHigherAlign::No,
+                ForceRightAdjust::No,
+            )
         }
+        // This includes `target.is_like_darwin`; on x86_64, Darwin targets follow the SysV ABI.
+        "x86_64" => emit_x86_64_sysv64_va_arg(bx, addr, target_ty),
         "xtensa" => emit_xtensa_va_arg(bx, addr, target_ty),
         // For all other architecture/OS combinations fall back to using
         // the LLVM va_arg instruction.