about summary refs log tree commit diff
path: root/compiler/rustc_codegen_llvm
diff options
context:
space:
mode:
Diffstat (limited to 'compiler/rustc_codegen_llvm')
-rw-r--r--compiler/rustc_codegen_llvm/src/asm.rs2
-rw-r--r--compiler/rustc_codegen_llvm/src/attributes.rs17
-rw-r--r--compiler/rustc_codegen_llvm/src/back/lto.rs64
-rw-r--r--compiler/rustc_codegen_llvm/src/back/write.rs80
-rw-r--r--compiler/rustc_codegen_llvm/src/base.rs6
-rw-r--r--compiler/rustc_codegen_llvm/src/builder.rs4
-rw-r--r--compiler/rustc_codegen_llvm/src/builder/autodiff.rs45
-rw-r--r--compiler/rustc_codegen_llvm/src/common.rs16
-rw-r--r--compiler/rustc_codegen_llvm/src/context.rs10
-rw-r--r--compiler/rustc_codegen_llvm/src/debuginfo/create_scope_map.rs4
-rw-r--r--compiler/rustc_codegen_llvm/src/debuginfo/metadata.rs39
-rw-r--r--compiler/rustc_codegen_llvm/src/debuginfo/metadata/enums/cpp_like.rs3
-rw-r--r--compiler/rustc_codegen_llvm/src/debuginfo/metadata/enums/mod.rs1
-rw-r--r--compiler/rustc_codegen_llvm/src/debuginfo/metadata/enums/native.rs6
-rw-r--r--compiler/rustc_codegen_llvm/src/debuginfo/metadata/type_map.rs48
-rw-r--r--compiler/rustc_codegen_llvm/src/debuginfo/mod.rs36
-rw-r--r--compiler/rustc_codegen_llvm/src/intrinsic.rs125
-rw-r--r--compiler/rustc_codegen_llvm/src/lib.rs10
-rw-r--r--compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs13
-rw-r--r--compiler/rustc_codegen_llvm/src/llvm/ffi.rs3
-rw-r--r--compiler/rustc_codegen_llvm/src/llvm/mod.rs26
-rw-r--r--compiler/rustc_codegen_llvm/src/llvm_util.rs91
-rw-r--r--compiler/rustc_codegen_llvm/src/type_.rs4
23 files changed, 479 insertions, 174 deletions
diff --git a/compiler/rustc_codegen_llvm/src/asm.rs b/compiler/rustc_codegen_llvm/src/asm.rs
index 88daa025740..e481b99afcc 100644
--- a/compiler/rustc_codegen_llvm/src/asm.rs
+++ b/compiler/rustc_codegen_llvm/src/asm.rs
@@ -376,7 +376,7 @@ impl<'ll, 'tcx> AsmBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
 
 impl<'tcx> AsmCodegenMethods<'tcx> for CodegenCx<'_, 'tcx> {
     fn codegen_global_asm(
-        &self,
+        &mut self,
         template: &[InlineAsmTemplatePiece],
         operands: &[GlobalAsmOperandRef<'tcx>],
         options: InlineAsmOptions,
diff --git a/compiler/rustc_codegen_llvm/src/attributes.rs b/compiler/rustc_codegen_llvm/src/attributes.rs
index e8c42d16733..176fb72dfdc 100644
--- a/compiler/rustc_codegen_llvm/src/attributes.rs
+++ b/compiler/rustc_codegen_llvm/src/attributes.rs
@@ -1,5 +1,4 @@
 //! Set and unset common attributes on LLVM values.
-
 use rustc_attr_parsing::{InlineAttr, InstructionSetAttr, OptimizeAttr};
 use rustc_codegen_ssa::traits::*;
 use rustc_hir::def_id::DefId;
@@ -28,6 +27,22 @@ pub(crate) fn apply_to_callsite(callsite: &Value, idx: AttributePlace, attrs: &[
     }
 }
 
+pub(crate) fn has_attr(llfn: &Value, idx: AttributePlace, attr: AttributeKind) -> bool {
+    llvm::HasAttributeAtIndex(llfn, idx, attr)
+}
+
+pub(crate) fn has_string_attr(llfn: &Value, name: &str) -> bool {
+    llvm::HasStringAttribute(llfn, name)
+}
+
+pub(crate) fn remove_from_llfn(llfn: &Value, place: AttributePlace, kind: AttributeKind) {
+    llvm::RemoveRustEnumAttributeAtIndex(llfn, place, kind);
+}
+
+pub(crate) fn remove_string_attr_from_llfn(llfn: &Value, name: &str) {
+    llvm::RemoveStringAttrFromFn(llfn, name);
+}
+
 /// Get LLVM attribute for the provided inline heuristic.
 #[inline]
 fn inline_attr<'ll>(cx: &CodegenCx<'ll, '_>, inline: InlineAttr) -> Option<&'ll Attribute> {
diff --git a/compiler/rustc_codegen_llvm/src/back/lto.rs b/compiler/rustc_codegen_llvm/src/back/lto.rs
index a8b49e9552c..39b3a23e0b1 100644
--- a/compiler/rustc_codegen_llvm/src/back/lto.rs
+++ b/compiler/rustc_codegen_llvm/src/back/lto.rs
@@ -28,8 +28,9 @@ use crate::back::write::{
 use crate::errors::{
     DynamicLinkingWithLTO, LlvmError, LtoBitcodeFromRlib, LtoDisallowed, LtoDylib, LtoProcMacro,
 };
+use crate::llvm::AttributePlace::Function;
 use crate::llvm::{self, build_string};
-use crate::{LlvmCodegenBackend, ModuleLlvm};
+use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx, attributes};
 
 /// We keep track of the computed LTO cache keys from the previous
 /// session to determine which CGUs we can reuse.
@@ -584,12 +585,10 @@ fn thin_lto(
     }
 }
 
-fn enable_autodiff_settings(ad: &[config::AutoDiff], module: &mut ModuleCodegen<ModuleLlvm>) {
+fn enable_autodiff_settings(ad: &[config::AutoDiff]) {
     for &val in ad {
+        // We intentionally don't use a wildcard, to not forget handling anything new.
         match val {
-            config::AutoDiff::PrintModBefore => {
-                unsafe { llvm::LLVMDumpModule(module.module_llvm.llmod()) };
-            }
             config::AutoDiff::PrintPerf => {
                 llvm::set_print_perf(true);
             }
@@ -603,17 +602,23 @@ fn enable_autodiff_settings(ad: &[config::AutoDiff], module: &mut ModuleCodegen<
                 llvm::set_inline(true);
             }
             config::AutoDiff::LooseTypes => {
-                llvm::set_loose_types(false);
+                llvm::set_loose_types(true);
             }
             config::AutoDiff::PrintSteps => {
                 llvm::set_print(true);
             }
-            // We handle this below
+            // We handle this in the PassWrapper.cpp
+            config::AutoDiff::PrintPasses => {}
+            // We handle this in the PassWrapper.cpp
+            config::AutoDiff::PrintModBefore => {}
+            // We handle this in the PassWrapper.cpp
             config::AutoDiff::PrintModAfter => {}
-            // We handle this below
+            // We handle this in the PassWrapper.cpp
             config::AutoDiff::PrintModFinal => {}
             // This is required and already checked
             config::AutoDiff::Enable => {}
+            // We handle this below
+            config::AutoDiff::NoPostopt => {}
         }
     }
     // This helps with handling enums for now.
@@ -647,27 +652,52 @@ pub(crate) fn run_pass_manager(
     // We then run the llvm_optimize function a second time, to optimize the code which we generated
     // in the enzyme differentiation pass.
     let enable_ad = config.autodiff.contains(&config::AutoDiff::Enable);
-    let stage =
-        if enable_ad { write::AutodiffStage::DuringAD } else { write::AutodiffStage::PostAD };
+    let stage = if thin {
+        write::AutodiffStage::PreAD
+    } else {
+        if enable_ad { write::AutodiffStage::DuringAD } else { write::AutodiffStage::PostAD }
+    };
 
     if enable_ad {
-        enable_autodiff_settings(&config.autodiff, module);
+        enable_autodiff_settings(&config.autodiff);
     }
 
     unsafe {
         write::llvm_optimize(cgcx, dcx, module, None, config, opt_level, opt_stage, stage)?;
     }
 
-    if cfg!(llvm_enzyme) && enable_ad {
-        // This is the post-autodiff IR, mainly used for testing and educational purposes.
-        if config.autodiff.contains(&config::AutoDiff::PrintModAfter) {
-            unsafe { llvm::LLVMDumpModule(module.module_llvm.llmod()) };
+    if cfg!(llvm_enzyme) && enable_ad && !thin {
+        let cx =
+            SimpleCx::new(module.module_llvm.llmod(), &module.module_llvm.llcx, cgcx.pointer_size);
+
+        for function in cx.get_functions() {
+            let enzyme_marker = "enzyme_marker";
+            if attributes::has_string_attr(function, enzyme_marker) {
+                // Sanity check: Ensure 'noinline' is present before replacing it.
+                assert!(
+                    !attributes::has_attr(function, Function, llvm::AttributeKind::NoInline),
+                    "Expected __enzyme function to have 'noinline' before adding 'alwaysinline'"
+                );
+
+                attributes::remove_from_llfn(function, Function, llvm::AttributeKind::NoInline);
+                attributes::remove_string_attr_from_llfn(function, enzyme_marker);
+
+                assert!(
+                    !attributes::has_string_attr(function, enzyme_marker),
+                    "Expected function to not have 'enzyme_marker'"
+                );
+
+                let always_inline = llvm::AttributeKind::AlwaysInline.create_attr(cx.llcx);
+                attributes::apply_to_llfn(function, Function, &[always_inline]);
+            }
         }
 
         let opt_stage = llvm::OptStage::FatLTO;
         let stage = write::AutodiffStage::PostAD;
-        unsafe {
-            write::llvm_optimize(cgcx, dcx, module, None, config, opt_level, opt_stage, stage)?;
+        if !config.autodiff.contains(&config::AutoDiff::NoPostopt) {
+            unsafe {
+                write::llvm_optimize(cgcx, dcx, module, None, config, opt_level, opt_stage, stage)?;
+            }
         }
 
         // This is the final IR, so people should be able to inspect the optimized autodiff output,
diff --git a/compiler/rustc_codegen_llvm/src/back/write.rs b/compiler/rustc_codegen_llvm/src/back/write.rs
index 76d431a4975..4ac77c8f7f1 100644
--- a/compiler/rustc_codegen_llvm/src/back/write.rs
+++ b/compiler/rustc_codegen_llvm/src/back/write.rs
@@ -119,14 +119,18 @@ pub(crate) fn create_target_machine(tcx: TyCtxt<'_>, mod_name: &str) -> OwnedTar
         tcx.output_filenames(()).split_dwarf_path(
             tcx.sess.split_debuginfo(),
             tcx.sess.opts.unstable_opts.split_dwarf_kind,
-            Some(mod_name),
+            mod_name,
+            tcx.sess.invocation_temp.as_deref(),
         )
     } else {
         None
     };
 
-    let output_obj_file =
-        Some(tcx.output_filenames(()).temp_path(OutputType::Object, Some(mod_name)));
+    let output_obj_file = Some(tcx.output_filenames(()).temp_path_for_cgu(
+        OutputType::Object,
+        mod_name,
+        tcx.sess.invocation_temp.as_deref(),
+    ));
     let config = TargetMachineFactoryConfig { split_dwarf_file, output_obj_file };
 
     target_machine_factory(
@@ -330,8 +334,11 @@ pub(crate) fn save_temp_bitcode(
         return;
     }
     let ext = format!("{name}.bc");
-    let cgu = Some(&module.name[..]);
-    let path = cgcx.output_filenames.temp_path_ext(&ext, cgu);
+    let path = cgcx.output_filenames.temp_path_ext_for_cgu(
+        &ext,
+        &module.name,
+        cgcx.invocation_temp.as_deref(),
+    );
     write_bitcode_to_file(module, &path)
 }
 
@@ -565,6 +572,10 @@ pub(crate) unsafe fn llvm_optimize(
 
     let consider_ad = cfg!(llvm_enzyme) && config.autodiff.contains(&config::AutoDiff::Enable);
     let run_enzyme = autodiff_stage == AutodiffStage::DuringAD;
+    let print_before_enzyme = config.autodiff.contains(&config::AutoDiff::PrintModBefore);
+    let print_after_enzyme = config.autodiff.contains(&config::AutoDiff::PrintModAfter);
+    let print_passes = config.autodiff.contains(&config::AutoDiff::PrintPasses);
+    let merge_functions;
     let unroll_loops;
     let vectorize_slp;
     let vectorize_loop;
@@ -572,13 +583,20 @@ pub(crate) unsafe fn llvm_optimize(
     // When we build rustc with enzyme/autodiff support, we want to postpone size-increasing
     // optimizations until after differentiation. Our pipeline is thus: (opt + enzyme), (full opt).
     // We therefore have two calls to llvm_optimize, if autodiff is used.
+    //
+    // We also must disable merge_functions, since autodiff placeholder/dummy bodies tend to be
+    // identical. We run opts before AD, so there is a chance that LLVM will merge our dummies.
+    // In that case, we lack some dummy bodies and can't replace them with the real AD code anymore.
+    // We then would need to abort compilation. This was especially common in test cases.
     if consider_ad && autodiff_stage != AutodiffStage::PostAD {
+        merge_functions = false;
         unroll_loops = false;
         vectorize_slp = false;
         vectorize_loop = false;
     } else {
         unroll_loops =
             opt_level != config::OptLevel::Size && opt_level != config::OptLevel::SizeMin;
+        merge_functions = config.merge_functions;
         vectorize_slp = config.vectorize_slp;
         vectorize_loop = config.vectorize_loop;
     }
@@ -656,13 +674,16 @@ pub(crate) unsafe fn llvm_optimize(
             thin_lto_buffer,
             config.emit_thin_lto,
             config.emit_thin_lto_summary,
-            config.merge_functions,
+            merge_functions,
             unroll_loops,
             vectorize_slp,
             vectorize_loop,
             config.no_builtins,
             config.emit_lifetime_markers,
             run_enzyme,
+            print_before_enzyme,
+            print_after_enzyme,
+            print_passes,
             sanitizer_options.as_ref(),
             pgo_gen_path.as_ref().map_or(std::ptr::null(), |s| s.as_ptr()),
             pgo_use_path.as_ref().map_or(std::ptr::null(), |s| s.as_ptr()),
@@ -694,11 +715,12 @@ pub(crate) unsafe fn optimize(
     let llcx = &*module.module_llvm.llcx;
     let _handlers = DiagnosticHandlers::new(cgcx, dcx, llcx, module, CodegenDiagnosticsStage::Opt);
 
-    let module_name = module.name.clone();
-    let module_name = Some(&module_name[..]);
-
     if config.emit_no_opt_bc {
-        let out = cgcx.output_filenames.temp_path_ext("no-opt.bc", module_name);
+        let out = cgcx.output_filenames.temp_path_ext_for_cgu(
+            "no-opt.bc",
+            &module.name,
+            cgcx.invocation_temp.as_deref(),
+        );
         write_bitcode_to_file(module, &out)
     }
 
@@ -743,8 +765,11 @@ pub(crate) unsafe fn optimize(
         if let Some(thin_lto_buffer) = thin_lto_buffer {
             let thin_lto_buffer = unsafe { ThinBuffer::from_raw_ptr(thin_lto_buffer) };
             module.thin_lto_buffer = Some(thin_lto_buffer.data().to_vec());
-            let bc_summary_out =
-                cgcx.output_filenames.temp_path(OutputType::ThinLinkBitcode, module_name);
+            let bc_summary_out = cgcx.output_filenames.temp_path_for_cgu(
+                OutputType::ThinLinkBitcode,
+                &module.name,
+                cgcx.invocation_temp.as_deref(),
+            );
             if config.emit_thin_lto_summary
                 && let Some(thin_link_bitcode_filename) = bc_summary_out.file_name()
             {
@@ -801,8 +826,6 @@ pub(crate) unsafe fn codegen(
         let llmod = module.module_llvm.llmod();
         let llcx = &*module.module_llvm.llcx;
         let tm = &*module.module_llvm.tm;
-        let module_name = module.name.clone();
-        let module_name = Some(&module_name[..]);
         let _handlers =
             DiagnosticHandlers::new(cgcx, dcx, llcx, &module, CodegenDiagnosticsStage::Codegen);
 
@@ -814,8 +837,16 @@ pub(crate) unsafe fn codegen(
         // copy it to the .o file, and delete the bitcode if it wasn't
         // otherwise requested.
 
-        let bc_out = cgcx.output_filenames.temp_path(OutputType::Bitcode, module_name);
-        let obj_out = cgcx.output_filenames.temp_path(OutputType::Object, module_name);
+        let bc_out = cgcx.output_filenames.temp_path_for_cgu(
+            OutputType::Bitcode,
+            &module.name,
+            cgcx.invocation_temp.as_deref(),
+        );
+        let obj_out = cgcx.output_filenames.temp_path_for_cgu(
+            OutputType::Object,
+            &module.name,
+            cgcx.invocation_temp.as_deref(),
+        );
 
         if config.bitcode_needed() {
             if config.emit_bc || config.emit_obj == EmitObj::Bitcode {
@@ -857,7 +888,11 @@ pub(crate) unsafe fn codegen(
         if config.emit_ir {
             let _timer =
                 cgcx.prof.generic_activity_with_arg("LLVM_module_codegen_emit_ir", &*module.name);
-            let out = cgcx.output_filenames.temp_path(OutputType::LlvmAssembly, module_name);
+            let out = cgcx.output_filenames.temp_path_for_cgu(
+                OutputType::LlvmAssembly,
+                &module.name,
+                cgcx.invocation_temp.as_deref(),
+            );
             let out_c = path_to_c_string(&out);
 
             extern "C" fn demangle_callback(
@@ -899,7 +934,11 @@ pub(crate) unsafe fn codegen(
         if config.emit_asm {
             let _timer =
                 cgcx.prof.generic_activity_with_arg("LLVM_module_codegen_emit_asm", &*module.name);
-            let path = cgcx.output_filenames.temp_path(OutputType::Assembly, module_name);
+            let path = cgcx.output_filenames.temp_path_for_cgu(
+                OutputType::Assembly,
+                &module.name,
+                cgcx.invocation_temp.as_deref(),
+            );
 
             // We can't use the same module for asm and object code output,
             // because that triggers various errors like invalid IR or broken
@@ -929,7 +968,9 @@ pub(crate) unsafe fn codegen(
                     .prof
                     .generic_activity_with_arg("LLVM_module_codegen_emit_obj", &*module.name);
 
-                let dwo_out = cgcx.output_filenames.temp_path_dwo(module_name);
+                let dwo_out = cgcx
+                    .output_filenames
+                    .temp_path_dwo_for_cgu(&module.name, cgcx.invocation_temp.as_deref());
                 let dwo_out = match (cgcx.split_debuginfo, cgcx.split_dwarf_kind) {
                     // Don't change how DWARF is emitted when disabled.
                     (SplitDebuginfo::Off, _) => None,
@@ -994,6 +1035,7 @@ pub(crate) unsafe fn codegen(
         config.emit_asm,
         config.emit_ir,
         &cgcx.output_filenames,
+        cgcx.invocation_temp.as_deref(),
     ))
 }
 
diff --git a/compiler/rustc_codegen_llvm/src/base.rs b/compiler/rustc_codegen_llvm/src/base.rs
index 6bd27914dbd..e4fac35aa44 100644
--- a/compiler/rustc_codegen_llvm/src/base.rs
+++ b/compiler/rustc_codegen_llvm/src/base.rs
@@ -83,15 +83,15 @@ pub(crate) fn compile_codegen_unit(
         // Instantiate monomorphizations without filling out definitions yet...
         let llvm_module = ModuleLlvm::new(tcx, cgu_name.as_str());
         {
-            let cx = CodegenCx::new(tcx, cgu, &llvm_module);
+            let mut cx = CodegenCx::new(tcx, cgu, &llvm_module);
             let mono_items = cx.codegen_unit.items_in_deterministic_order(cx.tcx);
             for &(mono_item, data) in &mono_items {
                 mono_item.predefine::<Builder<'_, '_, '_>>(&cx, data.linkage, data.visibility);
             }
 
             // ... and now that we have everything pre-defined, fill out those definitions.
-            for &(mono_item, _) in &mono_items {
-                mono_item.define::<Builder<'_, '_, '_>>(&cx);
+            for &(mono_item, item_data) in &mono_items {
+                mono_item.define::<Builder<'_, '_, '_>>(&mut cx, item_data);
             }
 
             // If this codegen unit contains the main function, also create the
diff --git a/compiler/rustc_codegen_llvm/src/builder.rs b/compiler/rustc_codegen_llvm/src/builder.rs
index 35134e9f5a0..04c8118b616 100644
--- a/compiler/rustc_codegen_llvm/src/builder.rs
+++ b/compiler/rustc_codegen_llvm/src/builder.rs
@@ -123,7 +123,7 @@ impl<'a, 'll, CX: Borrow<SCx<'ll>>> GenericBuilder<'a, 'll, CX> {
 /// Empty string, to be used where LLVM expects an instruction name, indicating
 /// that the instruction is to be left unnamed (i.e. numbered, in textual IR).
 // FIXME(eddyb) pass `&CStr` directly to FFI once it's a thin pointer.
-const UNNAMED: *const c_char = c"".as_ptr();
+pub(crate) const UNNAMED: *const c_char = c"".as_ptr();
 
 impl<'ll, CX: Borrow<SCx<'ll>>> BackendTypes for GenericBuilder<'_, 'll, CX> {
     type Value = <GenericCx<'ll, CX> as BackendTypes>::Value;
@@ -594,6 +594,7 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
     fn load(&mut self, ty: &'ll Type, ptr: &'ll Value, align: Align) -> &'ll Value {
         unsafe {
             let load = llvm::LLVMBuildLoad2(self.llbuilder, ty, ptr, UNNAMED);
+            let align = align.min(self.cx().tcx.sess.target.max_reliable_alignment());
             llvm::LLVMSetAlignment(load, align.bytes() as c_uint);
             load
         }
@@ -807,6 +808,7 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
         assert_eq!(self.cx.type_kind(self.cx.val_ty(ptr)), TypeKind::Pointer);
         unsafe {
             let store = llvm::LLVMBuildStore(self.llbuilder, val, ptr);
+            let align = align.min(self.cx().tcx.sess.target.max_reliable_alignment());
             let align =
                 if flags.contains(MemFlags::UNALIGNED) { 1 } else { align.bytes() as c_uint };
             llvm::LLVMSetAlignment(store, align);
diff --git a/compiler/rustc_codegen_llvm/src/builder/autodiff.rs b/compiler/rustc_codegen_llvm/src/builder/autodiff.rs
index 5e7ef27143b..c5c13ac097a 100644
--- a/compiler/rustc_codegen_llvm/src/builder/autodiff.rs
+++ b/compiler/rustc_codegen_llvm/src/builder/autodiff.rs
@@ -10,7 +10,7 @@ use rustc_middle::bug;
 use tracing::{debug, trace};
 
 use crate::back::write::llvm_err;
-use crate::builder::SBuilder;
+use crate::builder::{SBuilder, UNNAMED};
 use crate::context::SimpleCx;
 use crate::declare::declare_simple_fn;
 use crate::errors::{AutoDiffWithoutEnable, LlvmError};
@@ -51,6 +51,7 @@ fn has_sret(fnc: &Value) -> bool {
 // using iterators and peek()?
 fn match_args_from_caller_to_enzyme<'ll>(
     cx: &SimpleCx<'ll>,
+    builder: &SBuilder<'ll, 'll>,
     width: u32,
     args: &mut Vec<&'ll llvm::Value>,
     inputs: &[DiffActivity],
@@ -78,7 +79,9 @@ fn match_args_from_caller_to_enzyme<'ll>(
     let enzyme_const = cx.create_metadata("enzyme_const".to_string()).unwrap();
     let enzyme_out = cx.create_metadata("enzyme_out".to_string()).unwrap();
     let enzyme_dup = cx.create_metadata("enzyme_dup".to_string()).unwrap();
+    let enzyme_dupv = cx.create_metadata("enzyme_dupv".to_string()).unwrap();
     let enzyme_dupnoneed = cx.create_metadata("enzyme_dupnoneed".to_string()).unwrap();
+    let enzyme_dupnoneedv = cx.create_metadata("enzyme_dupnoneedv".to_string()).unwrap();
 
     while activity_pos < inputs.len() {
         let diff_activity = inputs[activity_pos as usize];
@@ -90,13 +93,34 @@ fn match_args_from_caller_to_enzyme<'ll>(
             DiffActivity::Active => (enzyme_out, false),
             DiffActivity::ActiveOnly => (enzyme_out, false),
             DiffActivity::Dual => (enzyme_dup, true),
+            DiffActivity::Dualv => (enzyme_dupv, true),
             DiffActivity::DualOnly => (enzyme_dupnoneed, true),
+            DiffActivity::DualvOnly => (enzyme_dupnoneedv, true),
             DiffActivity::Duplicated => (enzyme_dup, true),
             DiffActivity::DuplicatedOnly => (enzyme_dupnoneed, true),
-            DiffActivity::FakeActivitySize => (enzyme_const, false),
+            DiffActivity::FakeActivitySize(_) => (enzyme_const, false),
         };
         let outer_arg = outer_args[outer_pos];
         args.push(cx.get_metadata_value(activity));
+        if matches!(diff_activity, DiffActivity::Dualv) {
+            let next_outer_arg = outer_args[outer_pos + 1];
+            let elem_bytes_size: u64 = match inputs[activity_pos + 1] {
+                DiffActivity::FakeActivitySize(Some(s)) => s.into(),
+                _ => bug!("incorrect Dualv handling recognized."),
+            };
+            // stride: sizeof(T) * n_elems.
+            // n_elems is the next integer.
+            // Now we multiply `4 * next_outer_arg` to get the stride.
+            let mul = unsafe {
+                llvm::LLVMBuildMul(
+                    builder.llbuilder,
+                    cx.get_const_i64(elem_bytes_size),
+                    next_outer_arg,
+                    UNNAMED,
+                )
+            };
+            args.push(mul);
+        }
         args.push(outer_arg);
         if duplicated {
             // We know that duplicated args by construction have a following argument,
@@ -114,7 +138,7 @@ fn match_args_from_caller_to_enzyme<'ll>(
                 } else {
                     let next_activity = inputs[activity_pos + 1];
                     // We analyze the MIR types and add this dummy activity if we visit a slice.
-                    next_activity == DiffActivity::FakeActivitySize
+                    matches!(next_activity, DiffActivity::FakeActivitySize(_))
                 }
             };
             if slice {
@@ -125,7 +149,10 @@ fn match_args_from_caller_to_enzyme<'ll>(
                 // int2 >= int1, which means the shadow vector is large enough to store the gradient.
                 assert_eq!(cx.type_kind(next_outer_ty), TypeKind::Integer);
 
-                for i in 0..(width as usize) {
+                let iterations =
+                    if matches!(diff_activity, DiffActivity::Dualv) { 1 } else { width as usize };
+
+                for i in 0..iterations {
                     let next_outer_arg2 = outer_args[outer_pos + 2 * (i + 1)];
                     let next_outer_ty2 = cx.val_ty(next_outer_arg2);
                     assert_eq!(cx.type_kind(next_outer_ty2), TypeKind::Pointer);
@@ -136,7 +163,7 @@ fn match_args_from_caller_to_enzyme<'ll>(
                 }
                 args.push(cx.get_metadata_value(enzyme_const));
                 args.push(next_outer_arg);
-                outer_pos += 2 + 2 * width as usize;
+                outer_pos += 2 + 2 * iterations;
                 activity_pos += 2;
             } else {
                 // A duplicated pointer will have the following two outer_fn arguments:
@@ -334,6 +361,11 @@ fn generate_enzyme_call<'ll>(
         let attr = llvm::AttributeKind::NoInline.create_attr(cx.llcx);
         attributes::apply_to_llfn(ad_fn, Function, &[attr]);
 
+        // We add a made-up attribute just such that we can recognize it after AD to update
+        // (no)-inline attributes. We'll then also remove this attribute.
+        let enzyme_marker_attr = llvm::CreateAttrString(cx.llcx, "enzyme_marker");
+        attributes::apply_to_llfn(outer_fn, Function, &[enzyme_marker_attr]);
+
         // first, remove all calls from fnc
         let entry = llvm::LLVMGetFirstBasicBlock(outer_fn);
         let br = llvm::LLVMRustGetTerminator(entry);
@@ -360,6 +392,7 @@ fn generate_enzyme_call<'ll>(
         let outer_args: Vec<&llvm::Value> = get_params(outer_fn);
         match_args_from_caller_to_enzyme(
             &cx,
+            &builder,
             attrs.width,
             &mut args,
             &attrs.input_activity,
@@ -445,7 +478,7 @@ pub(crate) fn differentiate<'ll>(
         return Err(diag_handler.handle().emit_almost_fatal(AutoDiffWithoutEnable));
     }
 
-    // Before dumping the module, we want all the TypeTrees to become part of the module.
+    // Here we replace the placeholder code with the actual autodiff code, which calls Enzyme.
     for item in diff_items.iter() {
         let name = item.source.clone();
         let fn_def: Option<&llvm::Value> = cx.get_function(&name);
diff --git a/compiler/rustc_codegen_llvm/src/common.rs b/compiler/rustc_codegen_llvm/src/common.rs
index 457e5452ce9..a6f277e4455 100644
--- a/compiler/rustc_codegen_llvm/src/common.rs
+++ b/compiler/rustc_codegen_llvm/src/common.rs
@@ -4,8 +4,8 @@ use std::borrow::Borrow;
 
 use libc::{c_char, c_uint};
 use rustc_abi as abi;
+use rustc_abi::HasDataLayout;
 use rustc_abi::Primitive::Pointer;
-use rustc_abi::{AddressSpace, HasDataLayout};
 use rustc_ast::Mutability;
 use rustc_codegen_ssa::common::TypeKind;
 use rustc_codegen_ssa::traits::*;
@@ -269,7 +269,8 @@ impl<'ll, 'tcx> ConstCodegenMethods for CodegenCx<'ll, 'tcx> {
             }
             Scalar::Ptr(ptr, _size) => {
                 let (prov, offset) = ptr.into_parts();
-                let (base_addr, base_addr_space) = match self.tcx.global_alloc(prov.alloc_id()) {
+                let global_alloc = self.tcx.global_alloc(prov.alloc_id());
+                let base_addr = match global_alloc {
                     GlobalAlloc::Memory(alloc) => {
                         // For ZSTs directly codegen an aligned pointer.
                         // This avoids generating a zero-sized constant value and actually needing a
@@ -301,12 +302,10 @@ impl<'ll, 'tcx> ConstCodegenMethods for CodegenCx<'ll, 'tcx> {
                                     format!("alloc_{hash:032x}").as_bytes(),
                                 );
                             }
-                            (value, AddressSpace::DATA)
+                            value
                         }
                     }
-                    GlobalAlloc::Function { instance, .. } => {
-                        (self.get_fn_addr(instance), self.data_layout().instruction_address_space)
-                    }
+                    GlobalAlloc::Function { instance, .. } => self.get_fn_addr(instance),
                     GlobalAlloc::VTable(ty, dyn_ty) => {
                         let alloc = self
                             .tcx
@@ -319,14 +318,15 @@ impl<'ll, 'tcx> ConstCodegenMethods for CodegenCx<'ll, 'tcx> {
                             .unwrap_memory();
                         let init = const_alloc_to_llvm(self, alloc, /*static*/ false);
                         let value = self.static_addr_of_impl(init, alloc.inner().align, None);
-                        (value, AddressSpace::DATA)
+                        value
                     }
                     GlobalAlloc::Static(def_id) => {
                         assert!(self.tcx.is_static(def_id));
                         assert!(!self.tcx.is_thread_local_static(def_id));
-                        (self.get_static(def_id), AddressSpace::DATA)
+                        self.get_static(def_id)
                     }
                 };
+                let base_addr_space = global_alloc.address_space(self);
                 let llval = unsafe {
                     llvm::LLVMConstInBoundsGEP2(
                         self.type_i8(),
diff --git a/compiler/rustc_codegen_llvm/src/context.rs b/compiler/rustc_codegen_llvm/src/context.rs
index 4ec69995518..ed50515b707 100644
--- a/compiler/rustc_codegen_llvm/src/context.rs
+++ b/compiler/rustc_codegen_llvm/src/context.rs
@@ -698,6 +698,16 @@ impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> {
             llvm::LLVMMDStringInContext2(self.llcx(), name.as_ptr() as *const c_char, name.len())
         })
     }
+
+    pub(crate) fn get_functions(&self) -> Vec<&'ll Value> {
+        let mut functions = vec![];
+        let mut func = unsafe { llvm::LLVMGetFirstFunction(self.llmod()) };
+        while let Some(f) = func {
+            functions.push(f);
+            func = unsafe { llvm::LLVMGetNextFunction(f) }
+        }
+        functions
+    }
 }
 
 impl<'ll, 'tcx> MiscCodegenMethods<'tcx> for CodegenCx<'ll, 'tcx> {
diff --git a/compiler/rustc_codegen_llvm/src/debuginfo/create_scope_map.rs b/compiler/rustc_codegen_llvm/src/debuginfo/create_scope_map.rs
index f52991b3697..d2591139d6e 100644
--- a/compiler/rustc_codegen_llvm/src/debuginfo/create_scope_map.rs
+++ b/compiler/rustc_codegen_llvm/src/debuginfo/create_scope_map.rs
@@ -3,7 +3,6 @@ use std::collections::hash_map::Entry;
 use rustc_codegen_ssa::mir::debuginfo::{DebugScope, FunctionDebugContext};
 use rustc_codegen_ssa::traits::*;
 use rustc_data_structures::fx::FxHashMap;
-use rustc_index::Idx;
 use rustc_index::bit_set::DenseBitSet;
 use rustc_middle::mir::{Body, SourceScope};
 use rustc_middle::ty::layout::{FnAbiOf, HasTypingEnv};
@@ -43,8 +42,7 @@ pub(crate) fn compute_mir_scopes<'ll, 'tcx>(
     let mut instantiated = DenseBitSet::new_empty(mir.source_scopes.len());
     let mut discriminators = FxHashMap::default();
     // Instantiate all scopes.
-    for idx in 0..mir.source_scopes.len() {
-        let scope = SourceScope::new(idx);
+    for scope in mir.source_scopes.indices() {
         make_mir_scope(
             cx,
             instance,
diff --git a/compiler/rustc_codegen_llvm/src/debuginfo/metadata.rs b/compiler/rustc_codegen_llvm/src/debuginfo/metadata.rs
index 2eaaf127e41..7f3e486ca31 100644
--- a/compiler/rustc_codegen_llvm/src/debuginfo/metadata.rs
+++ b/compiler/rustc_codegen_llvm/src/debuginfo/metadata.rs
@@ -910,7 +910,8 @@ pub(crate) fn build_compile_unit_di_node<'ll, 'tcx>(
         && let Some(f) = output_filenames.split_dwarf_path(
             tcx.sess.split_debuginfo(),
             tcx.sess.opts.unstable_opts.split_dwarf_kind,
-            Some(codegen_unit_name),
+            codegen_unit_name,
+            tcx.sess.invocation_temp.as_deref(),
         ) {
         // We get a path relative to the working directory from split_dwarf_path
         Some(tcx.sess.source_map().path_mapping().to_real_filename(f))
@@ -1314,31 +1315,21 @@ fn build_generic_type_param_di_nodes<'ll, 'tcx>(
     ty: Ty<'tcx>,
 ) -> SmallVec<Option<&'ll DIType>> {
     if let ty::Adt(def, args) = *ty.kind() {
-        let generics = cx.tcx.generics_of(def.did());
-        return get_template_parameters(cx, generics, args);
-    }
-
-    return smallvec![];
-}
-
-pub(super) fn get_template_parameters<'ll, 'tcx>(
-    cx: &CodegenCx<'ll, 'tcx>,
-    generics: &ty::Generics,
-    args: ty::GenericArgsRef<'tcx>,
-) -> SmallVec<Option<&'ll DIType>> {
-    if args.types().next().is_some() {
-        let names = get_parameter_names(cx, generics);
-        let template_params: SmallVec<_> = iter::zip(args, names)
-            .filter_map(|(kind, name)| {
-                kind.as_type().map(|ty| {
-                    let actual_type = cx.tcx.normalize_erasing_regions(cx.typing_env(), ty);
-                    let actual_type_di_node = type_di_node(cx, actual_type);
-                    Some(cx.create_template_type_parameter(name.as_str(), actual_type_di_node))
+        if args.types().next().is_some() {
+            let generics = cx.tcx.generics_of(def.did());
+            let names = get_parameter_names(cx, generics);
+            let template_params: SmallVec<_> = iter::zip(args, names)
+                .filter_map(|(kind, name)| {
+                    kind.as_type().map(|ty| {
+                        let actual_type = cx.tcx.normalize_erasing_regions(cx.typing_env(), ty);
+                        let actual_type_di_node = type_di_node(cx, actual_type);
+                        Some(cx.create_template_type_parameter(name.as_str(), actual_type_di_node))
+                    })
                 })
-            })
-            .collect();
+                .collect();
 
-        return template_params;
+            return template_params;
+        }
     }
 
     return smallvec![];
diff --git a/compiler/rustc_codegen_llvm/src/debuginfo/metadata/enums/cpp_like.rs b/compiler/rustc_codegen_llvm/src/debuginfo/metadata/enums/cpp_like.rs
index 07075be55fa..e9574108696 100644
--- a/compiler/rustc_codegen_llvm/src/debuginfo/metadata/enums/cpp_like.rs
+++ b/compiler/rustc_codegen_llvm/src/debuginfo/metadata/enums/cpp_like.rs
@@ -721,8 +721,7 @@ fn build_union_fields_for_direct_tag_coroutine<'ll, 'tcx>(
         _ => unreachable!(),
     };
 
-    let coroutine_layout =
-        cx.tcx.coroutine_layout(coroutine_def_id, coroutine_args.kind_ty()).unwrap();
+    let coroutine_layout = cx.tcx.coroutine_layout(coroutine_def_id, coroutine_args.args).unwrap();
 
     let common_upvar_names = cx.tcx.closure_saved_names_of_captured_variables(coroutine_def_id);
     let variant_range = coroutine_args.variant_range(coroutine_def_id, cx.tcx);
diff --git a/compiler/rustc_codegen_llvm/src/debuginfo/metadata/enums/mod.rs b/compiler/rustc_codegen_llvm/src/debuginfo/metadata/enums/mod.rs
index 6792c307fdc..7c701926d2c 100644
--- a/compiler/rustc_codegen_llvm/src/debuginfo/metadata/enums/mod.rs
+++ b/compiler/rustc_codegen_llvm/src/debuginfo/metadata/enums/mod.rs
@@ -363,7 +363,6 @@ fn build_coroutine_variant_struct_type_di_node<'ll, 'tcx>(
 
             state_specific_fields.into_iter().chain(common_fields).collect()
         },
-        // FIXME: this is a no-op. `build_generic_type_param_di_nodes` only works for Adts.
         |cx| build_generic_type_param_di_nodes(cx, coroutine_type_and_layout.ty),
     )
     .di_node
diff --git a/compiler/rustc_codegen_llvm/src/debuginfo/metadata/enums/native.rs b/compiler/rustc_codegen_llvm/src/debuginfo/metadata/enums/native.rs
index bfd131cfd3d..20a841f2287 100644
--- a/compiler/rustc_codegen_llvm/src/debuginfo/metadata/enums/native.rs
+++ b/compiler/rustc_codegen_llvm/src/debuginfo/metadata/enums/native.rs
@@ -174,10 +174,8 @@ pub(super) fn build_coroutine_di_node<'ll, 'tcx>(
             DIFlags::FlagZero,
         ),
         |cx, coroutine_type_di_node| {
-            let coroutine_layout = cx
-                .tcx
-                .coroutine_layout(coroutine_def_id, coroutine_args.as_coroutine().kind_ty())
-                .unwrap();
+            let coroutine_layout =
+                cx.tcx.coroutine_layout(coroutine_def_id, coroutine_args).unwrap();
 
             let Variants::Multiple { tag_encoding: TagEncoding::Direct, ref variants, .. } =
                 coroutine_type_and_layout.variants
diff --git a/compiler/rustc_codegen_llvm/src/debuginfo/metadata/type_map.rs b/compiler/rustc_codegen_llvm/src/debuginfo/metadata/type_map.rs
index ae2ab32ef53..56fb12d3c22 100644
--- a/compiler/rustc_codegen_llvm/src/debuginfo/metadata/type_map.rs
+++ b/compiler/rustc_codegen_llvm/src/debuginfo/metadata/type_map.rs
@@ -247,6 +247,16 @@ pub(super) fn stub<'ll, 'tcx>(
     StubInfo { metadata, unique_type_id }
 }
 
+struct AdtStackPopGuard<'ll, 'tcx, 'a> {
+    cx: &'a CodegenCx<'ll, 'tcx>,
+}
+
+impl<'ll, 'tcx, 'a> Drop for AdtStackPopGuard<'ll, 'tcx, 'a> {
+    fn drop(&mut self) {
+        debug_context(self.cx).adt_stack.borrow_mut().pop();
+    }
+}
+
 /// This function enables creating debuginfo nodes that can recursively refer to themselves.
 /// It will first insert the given stub into the type map and only then execute the `members`
 /// and `generics` closures passed in. These closures have access to the stub so they can
@@ -261,6 +271,44 @@ pub(super) fn build_type_with_children<'ll, 'tcx>(
 ) -> DINodeCreationResult<'ll> {
     assert_eq!(debug_context(cx).type_map.di_node_for_unique_id(stub_info.unique_type_id), None);
 
+    let mut _adt_stack_pop_guard = None;
+    if let UniqueTypeId::Ty(ty, ..) = stub_info.unique_type_id
+        && let ty::Adt(adt_def, args) = ty.kind()
+    {
+        let def_id = adt_def.did();
+        // If any sub type reference the original type definition and the sub type has a type
+        // parameter that strictly contains the original parameter, the original type is a recursive
+        // type that can expanding indefinitely. Example,
+        // ```
+        // enum Recursive<T> {
+        //     Recurse(*const Recursive<Wrap<T>>),
+        //     Item(T),
+        // }
+        // ```
+        let is_expanding_recursive =
+            debug_context(cx).adt_stack.borrow().iter().any(|(parent_def_id, parent_args)| {
+                if def_id == *parent_def_id {
+                    args.iter().zip(parent_args.iter()).any(|(arg, parent_arg)| {
+                        if let (Some(arg), Some(parent_arg)) = (arg.as_type(), parent_arg.as_type())
+                        {
+                            arg != parent_arg && arg.contains(parent_arg)
+                        } else {
+                            false
+                        }
+                    })
+                } else {
+                    false
+                }
+            });
+        if is_expanding_recursive {
+            // FIXME: indicate that this is an expanding recursive type in stub metadata?
+            return DINodeCreationResult::new(stub_info.metadata, false);
+        } else {
+            debug_context(cx).adt_stack.borrow_mut().push((def_id, args));
+            _adt_stack_pop_guard = Some(AdtStackPopGuard { cx });
+        }
+    }
+
     debug_context(cx).type_map.insert(stub_info.unique_type_id, stub_info.metadata);
 
     let members: SmallVec<_> =
diff --git a/compiler/rustc_codegen_llvm/src/debuginfo/mod.rs b/compiler/rustc_codegen_llvm/src/debuginfo/mod.rs
index ae7d080db66..c5085927923 100644
--- a/compiler/rustc_codegen_llvm/src/debuginfo/mod.rs
+++ b/compiler/rustc_codegen_llvm/src/debuginfo/mod.rs
@@ -2,8 +2,8 @@
 
 use std::cell::{OnceCell, RefCell};
 use std::ops::Range;
-use std::ptr;
 use std::sync::Arc;
+use std::{iter, ptr};
 
 use libc::c_uint;
 use metadata::create_subroutine_type;
@@ -66,6 +66,7 @@ pub(crate) struct CodegenUnitDebugContext<'ll, 'tcx> {
     created_files: RefCell<UnordMap<Option<(StableSourceFileId, SourceFileHash)>, &'ll DIFile>>,
 
     type_map: metadata::TypeMap<'ll, 'tcx>,
+    adt_stack: RefCell<Vec<(DefId, GenericArgsRef<'tcx>)>>,
     namespace_map: RefCell<DefIdMap<&'ll DIScope>>,
     recursion_marker_type: OnceCell<&'ll DIType>,
 }
@@ -80,6 +81,7 @@ impl<'ll, 'tcx> CodegenUnitDebugContext<'ll, 'tcx> {
             builder,
             created_files: Default::default(),
             type_map: Default::default(),
+            adt_stack: Default::default(),
             namespace_map: RefCell::new(Default::default()),
             recursion_marker_type: OnceCell::new(),
         }
@@ -486,10 +488,40 @@ impl<'ll, 'tcx> DebugInfoCodegenMethods<'tcx> for CodegenCx<'ll, 'tcx> {
             generics: &ty::Generics,
             args: GenericArgsRef<'tcx>,
         ) -> &'ll DIArray {
-            let template_params = metadata::get_template_parameters(cx, generics, args);
+            if args.types().next().is_none() {
+                return create_DIArray(DIB(cx), &[]);
+            }
+
+            // Again, only create type information if full debuginfo is enabled
+            let template_params: Vec<_> = if cx.sess().opts.debuginfo == DebugInfo::Full {
+                let names = get_parameter_names(cx, generics);
+                iter::zip(args, names)
+                    .filter_map(|(kind, name)| {
+                        kind.as_type().map(|ty| {
+                            let actual_type = cx.tcx.normalize_erasing_regions(cx.typing_env(), ty);
+                            let actual_type_metadata = type_di_node(cx, actual_type);
+                            Some(cx.create_template_type_parameter(
+                                name.as_str(),
+                                actual_type_metadata,
+                            ))
+                        })
+                    })
+                    .collect()
+            } else {
+                vec![]
+            };
+
             create_DIArray(DIB(cx), &template_params)
         }
 
+        fn get_parameter_names(cx: &CodegenCx<'_, '_>, generics: &ty::Generics) -> Vec<Symbol> {
+            let mut names = generics.parent.map_or_else(Vec::new, |def_id| {
+                get_parameter_names(cx, cx.tcx.generics_of(def_id))
+            });
+            names.extend(generics.own_params.iter().map(|param| param.name));
+            names
+        }
+
         /// Returns a scope, plus `true` if that's a type scope for "class" methods,
         /// otherwise `false` for plain namespace scopes.
         fn get_containing_scope<'ll, 'tcx>(
diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs
index 67135fcc308..ffeab59b05c 100644
--- a/compiler/rustc_codegen_llvm/src/intrinsic.rs
+++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs
@@ -1184,18 +1184,6 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
         }};
     }
 
-    /// Returns the bitwidth of the `$ty` argument if it is an `Int` type.
-    macro_rules! require_int_ty {
-        ($ty: expr, $diag: expr) => {
-            match $ty {
-                ty::Int(i) => i.bit_width().unwrap_or_else(|| bx.data_layout().pointer_size.bits()),
-                _ => {
-                    return_error!($diag);
-                }
-            }
-        };
-    }
-
     /// Returns the bitwidth of the `$ty` argument if it is an `Int` or `Uint` type.
     macro_rules! require_int_or_uint_ty {
         ($ty: expr, $diag: expr) => {
@@ -1421,7 +1409,7 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
         return Ok(bx.shuffle_vector(args[0].immediate(), args[1].immediate(), indices));
     }
 
-    if name == sym::simd_insert {
+    if name == sym::simd_insert || name == sym::simd_insert_dyn {
         require!(
             in_elem == arg_tys[2],
             InvalidMonomorphization::InsertedType {
@@ -1432,40 +1420,49 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
                 out_ty: arg_tys[2]
             }
         );
-        let idx = bx
-            .const_to_opt_u128(args[1].immediate(), false)
-            .expect("typeck should have ensure that this is a const");
-        if idx >= in_len.into() {
-            return_error!(InvalidMonomorphization::SimdIndexOutOfBounds {
-                span,
-                name,
-                arg_idx: 1,
-                total_len: in_len.into(),
-            });
-        }
-        return Ok(bx.insert_element(
-            args[0].immediate(),
-            args[2].immediate(),
-            bx.const_i32(idx as i32),
-        ));
+
+        let index_imm = if name == sym::simd_insert {
+            let idx = bx
+                .const_to_opt_u128(args[1].immediate(), false)
+                .expect("typeck should have ensure that this is a const");
+            if idx >= in_len.into() {
+                return_error!(InvalidMonomorphization::SimdIndexOutOfBounds {
+                    span,
+                    name,
+                    arg_idx: 1,
+                    total_len: in_len.into(),
+                });
+            }
+            bx.const_i32(idx as i32)
+        } else {
+            args[1].immediate()
+        };
+
+        return Ok(bx.insert_element(args[0].immediate(), args[2].immediate(), index_imm));
     }
-    if name == sym::simd_extract {
+    if name == sym::simd_extract || name == sym::simd_extract_dyn {
         require!(
             ret_ty == in_elem,
             InvalidMonomorphization::ReturnType { span, name, in_elem, in_ty, ret_ty }
         );
-        let idx = bx
-            .const_to_opt_u128(args[1].immediate(), false)
-            .expect("typeck should have ensure that this is a const");
-        if idx >= in_len.into() {
-            return_error!(InvalidMonomorphization::SimdIndexOutOfBounds {
-                span,
-                name,
-                arg_idx: 1,
-                total_len: in_len.into(),
-            });
-        }
-        return Ok(bx.extract_element(args[0].immediate(), bx.const_i32(idx as i32)));
+        let index_imm = if name == sym::simd_extract {
+            let idx = bx
+                .const_to_opt_u128(args[1].immediate(), false)
+                .expect("typeck should have ensure that this is a const");
+            if idx >= in_len.into() {
+                return_error!(InvalidMonomorphization::SimdIndexOutOfBounds {
+                    span,
+                    name,
+                    arg_idx: 1,
+                    total_len: in_len.into(),
+                });
+            }
+            bx.const_i32(idx as i32)
+        } else {
+            args[1].immediate()
+        };
+
+        return Ok(bx.extract_element(args[0].immediate(), index_imm));
     }
 
     if name == sym::simd_select {
@@ -1476,9 +1473,9 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
             m_len == v_len,
             InvalidMonomorphization::MismatchedLengths { span, name, m_len, v_len }
         );
-        let in_elem_bitwidth = require_int_ty!(
+        let in_elem_bitwidth = require_int_or_uint_ty!(
             m_elem_ty.kind(),
-            InvalidMonomorphization::MaskType { span, name, ty: m_elem_ty }
+            InvalidMonomorphization::MaskWrongElementType { span, name, ty: m_elem_ty }
         );
         let m_i1s = vector_mask_to_bitmask(bx, args[0].immediate(), in_elem_bitwidth, m_len);
         return Ok(bx.select(m_i1s, args[1].immediate(), args[2].immediate()));
@@ -1499,7 +1496,7 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
         // Integer vector <i{in_bitwidth} x in_len>:
         let in_elem_bitwidth = require_int_or_uint_ty!(
             in_elem.kind(),
-            InvalidMonomorphization::VectorArgument { span, name, in_ty, in_elem }
+            InvalidMonomorphization::MaskWrongElementType { span, name, ty: in_elem }
         );
 
         let i1xn = vector_mask_to_bitmask(bx, args[0].immediate(), in_elem_bitwidth, in_len);
@@ -1723,14 +1720,9 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
             }
         );
 
-        let mask_elem_bitwidth = require_int_ty!(
+        let mask_elem_bitwidth = require_int_or_uint_ty!(
             element_ty2.kind(),
-            InvalidMonomorphization::ThirdArgElementType {
-                span,
-                name,
-                expected_element: element_ty2,
-                third_arg: arg_tys[2]
-            }
+            InvalidMonomorphization::MaskWrongElementType { span, name, ty: element_ty2 }
         );
 
         // Alignment of T, must be a constant integer value:
@@ -1825,14 +1817,9 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
             }
         );
 
-        let m_elem_bitwidth = require_int_ty!(
+        let m_elem_bitwidth = require_int_or_uint_ty!(
             mask_elem.kind(),
-            InvalidMonomorphization::ThirdArgElementType {
-                span,
-                name,
-                expected_element: values_elem,
-                third_arg: mask_ty,
-            }
+            InvalidMonomorphization::MaskWrongElementType { span, name, ty: mask_elem }
         );
 
         let mask = vector_mask_to_bitmask(bx, args[0].immediate(), m_elem_bitwidth, mask_len);
@@ -1915,14 +1902,9 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
             }
         );
 
-        let m_elem_bitwidth = require_int_ty!(
+        let m_elem_bitwidth = require_int_or_uint_ty!(
             mask_elem.kind(),
-            InvalidMonomorphization::ThirdArgElementType {
-                span,
-                name,
-                expected_element: values_elem,
-                third_arg: mask_ty,
-            }
+            InvalidMonomorphization::MaskWrongElementType { span, name, ty: mask_elem }
         );
 
         let mask = vector_mask_to_bitmask(bx, args[0].immediate(), m_elem_bitwidth, mask_len);
@@ -2010,15 +1992,10 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
             }
         );
 
-        // The element type of the third argument must be a signed integer type of any width:
-        let mask_elem_bitwidth = require_int_ty!(
+        // The element type of the third argument must be an integer type of any width:
+        let mask_elem_bitwidth = require_int_or_uint_ty!(
             element_ty2.kind(),
-            InvalidMonomorphization::ThirdArgElementType {
-                span,
-                name,
-                expected_element: element_ty2,
-                third_arg: arg_tys[2]
-            }
+            InvalidMonomorphization::MaskWrongElementType { span, name, ty: element_ty2 }
         );
 
         // Alignment of T, must be a constant integer value:
diff --git a/compiler/rustc_codegen_llvm/src/lib.rs b/compiler/rustc_codegen_llvm/src/lib.rs
index 425381b0ffa..e8010ec9fc4 100644
--- a/compiler/rustc_codegen_llvm/src/lib.rs
+++ b/compiler/rustc_codegen_llvm/src/lib.rs
@@ -6,6 +6,7 @@
 
 // tidy-alphabetical-start
 #![allow(internal_features)]
+#![cfg_attr(bootstrap, feature(let_chains))]
 #![doc(html_root_url = "https://doc.rust-lang.org/nightly/nightly-rustc/")]
 #![doc(rust_logo)]
 #![feature(assert_matches)]
@@ -15,7 +16,6 @@
 #![feature(if_let_guard)]
 #![feature(impl_trait_in_assoc_type)]
 #![feature(iter_intersperse)]
-#![feature(let_chains)]
 #![feature(rustdoc_internals)]
 #![feature(slice_as_array)]
 #![feature(try_blocks)]
@@ -29,7 +29,7 @@ use back::owned_target_machine::OwnedTargetMachine;
 use back::write::{create_informational_target_machine, create_target_machine};
 use context::SimpleCx;
 use errors::{AutoDiffWithoutLTO, ParseTargetMachineConfig};
-use llvm_util::target_features_cfg;
+use llvm_util::target_config;
 use rustc_ast::expand::allocator::AllocatorKind;
 use rustc_ast::expand::autodiff_attrs::AutoDiffItem;
 use rustc_codegen_ssa::back::lto::{LtoModuleCodegen, SerializedModule, ThinModule};
@@ -37,7 +37,7 @@ use rustc_codegen_ssa::back::write::{
     CodegenContext, FatLtoInput, ModuleConfig, TargetMachineFactoryConfig, TargetMachineFactoryFn,
 };
 use rustc_codegen_ssa::traits::*;
-use rustc_codegen_ssa::{CodegenResults, CompiledModule, ModuleCodegen};
+use rustc_codegen_ssa::{CodegenResults, CompiledModule, ModuleCodegen, TargetConfig};
 use rustc_data_structures::fx::FxIndexMap;
 use rustc_errors::{DiagCtxtHandle, FatalError};
 use rustc_metadata::EncodedMetadata;
@@ -338,8 +338,8 @@ impl CodegenBackend for LlvmCodegenBackend {
         llvm_util::print_version();
     }
 
-    fn target_features_cfg(&self, sess: &Session) -> (Vec<Symbol>, Vec<Symbol>) {
-        target_features_cfg(sess)
+    fn target_config(&self, sess: &Session) -> TargetConfig {
+        target_config(sess)
     }
 
     fn codegen_crate<'tcx>(
diff --git a/compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs
index a9b3bdf7344..2ad39fc8538 100644
--- a/compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs
+++ b/compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs
@@ -19,6 +19,19 @@ unsafe extern "C" {
     pub(crate) fn LLVMRustVerifyFunction(V: &Value, action: LLVMRustVerifierFailureAction) -> Bool;
     pub(crate) fn LLVMRustHasAttributeAtIndex(V: &Value, i: c_uint, Kind: AttributeKind) -> bool;
     pub(crate) fn LLVMRustGetArrayNumElements(Ty: &Type) -> u64;
+    pub(crate) fn LLVMRustHasFnAttribute(
+        F: &Value,
+        Name: *const c_char,
+        NameLen: libc::size_t,
+    ) -> bool;
+    pub(crate) fn LLVMRustRemoveFnAttribute(F: &Value, Name: *const c_char, NameLen: libc::size_t);
+    pub(crate) fn LLVMGetFirstFunction(M: &Module) -> Option<&Value>;
+    pub(crate) fn LLVMGetNextFunction(Fn: &Value) -> Option<&Value>;
+    pub(crate) fn LLVMRustRemoveEnumAttributeAtIndex(
+        Fn: &Value,
+        index: c_uint,
+        kind: AttributeKind,
+    );
 }
 
 unsafe extern "C" {
diff --git a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs
index 9ff04f72903..ffb490dcdc2 100644
--- a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs
+++ b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs
@@ -2454,6 +2454,9 @@ unsafe extern "C" {
         DisableSimplifyLibCalls: bool,
         EmitLifetimeMarkers: bool,
         RunEnzyme: bool,
+        PrintBeforeEnzyme: bool,
+        PrintAfterEnzyme: bool,
+        PrintPasses: bool,
         SanitizerOptions: Option<&SanitizerOptions>,
         PGOGenPath: *const c_char,
         PGOUsePath: *const c_char,
diff --git a/compiler/rustc_codegen_llvm/src/llvm/mod.rs b/compiler/rustc_codegen_llvm/src/llvm/mod.rs
index 6ca81c651ed..d14aab06073 100644
--- a/compiler/rustc_codegen_llvm/src/llvm/mod.rs
+++ b/compiler/rustc_codegen_llvm/src/llvm/mod.rs
@@ -41,6 +41,32 @@ pub(crate) fn AddFunctionAttributes<'ll>(
     }
 }
 
+pub(crate) fn HasAttributeAtIndex<'ll>(
+    llfn: &'ll Value,
+    idx: AttributePlace,
+    kind: AttributeKind,
+) -> bool {
+    unsafe { LLVMRustHasAttributeAtIndex(llfn, idx.as_uint(), kind) }
+}
+
+pub(crate) fn HasStringAttribute<'ll>(llfn: &'ll Value, name: &str) -> bool {
+    unsafe { LLVMRustHasFnAttribute(llfn, name.as_c_char_ptr(), name.len()) }
+}
+
+pub(crate) fn RemoveStringAttrFromFn<'ll>(llfn: &'ll Value, name: &str) {
+    unsafe { LLVMRustRemoveFnAttribute(llfn, name.as_c_char_ptr(), name.len()) }
+}
+
+pub(crate) fn RemoveRustEnumAttributeAtIndex(
+    llfn: &Value,
+    place: AttributePlace,
+    kind: AttributeKind,
+) {
+    unsafe {
+        LLVMRustRemoveEnumAttributeAtIndex(llfn, place.as_uint(), kind);
+    }
+}
+
 pub(crate) fn AddCallSiteAttributes<'ll>(
     callsite: &'ll Value,
     idx: AttributePlace,
diff --git a/compiler/rustc_codegen_llvm/src/llvm_util.rs b/compiler/rustc_codegen_llvm/src/llvm_util.rs
index 36e35f81392..507cbf20d89 100644
--- a/compiler/rustc_codegen_llvm/src/llvm_util.rs
+++ b/compiler/rustc_codegen_llvm/src/llvm_util.rs
@@ -6,6 +6,7 @@ use std::sync::Once;
 use std::{ptr, slice, str};
 
 use libc::c_int;
+use rustc_codegen_ssa::TargetConfig;
 use rustc_codegen_ssa::base::wants_wasm_eh;
 use rustc_codegen_ssa::codegen_attrs::check_tied_features;
 use rustc_data_structures::fx::{FxHashMap, FxHashSet};
@@ -272,6 +273,12 @@ pub(crate) fn to_llvm_features<'a>(sess: &Session, s: &'a str) -> Option<LLVMFea
         ("aarch64", "fpmr") => None, // only existed in 18
         ("arm", "fp16") => Some(LLVMFeature::new("fullfp16")),
         // Filter out features that are not supported by the current LLVM version
+        ("loongarch64", "div32" | "lam-bh" | "lamcas" | "ld-seq-sa" | "scq")
+            if get_version().0 < 20 =>
+        {
+            None
+        }
+        // Filter out features that are not supported by the current LLVM version
         ("riscv32" | "riscv64", "zacas") if get_version().0 < 20 => None,
         // Enable the evex512 target feature if an avx512 target feature is enabled.
         ("x86", s) if s.starts_with("avx512") => {
@@ -302,7 +309,7 @@ pub(crate) fn to_llvm_features<'a>(sess: &Session, s: &'a str) -> Option<LLVMFea
 /// Must express features in the way Rust understands them.
 ///
 /// We do not have to worry about RUSTC_SPECIFIC_FEATURES here, those are handled outside codegen.
-pub(crate) fn target_features_cfg(sess: &Session) -> (Vec<Symbol>, Vec<Symbol>) {
+pub(crate) fn target_config(sess: &Session) -> TargetConfig {
     // Add base features for the target.
     // We do *not* add the -Ctarget-features there, and instead duplicate the logic for that below.
     // The reason is that if LLVM considers a feature implied but we do not, we don't want that to
@@ -402,7 +409,85 @@ pub(crate) fn target_features_cfg(sess: &Session) -> (Vec<Symbol>, Vec<Symbol>)
 
     let target_features = f(false);
     let unstable_target_features = f(true);
-    (target_features, unstable_target_features)
+    let mut cfg = TargetConfig {
+        target_features,
+        unstable_target_features,
+        has_reliable_f16: true,
+        has_reliable_f16_math: true,
+        has_reliable_f128: true,
+        has_reliable_f128_math: true,
+    };
+
+    update_target_reliable_float_cfg(sess, &mut cfg);
+    cfg
+}
+
+/// Determine whether or not experimental float types are reliable based on known bugs.
+fn update_target_reliable_float_cfg(sess: &Session, cfg: &mut TargetConfig) {
+    let target_arch = sess.target.arch.as_ref();
+    let target_os = sess.target.options.os.as_ref();
+    let target_env = sess.target.options.env.as_ref();
+    let target_abi = sess.target.options.abi.as_ref();
+    let target_pointer_width = sess.target.pointer_width;
+
+    cfg.has_reliable_f16 = match (target_arch, target_os) {
+        // Selection failure <https://github.com/llvm/llvm-project/issues/50374>
+        ("s390x", _) => false,
+        // Unsupported <https://github.com/llvm/llvm-project/issues/94434>
+        ("arm64ec", _) => false,
+        // MinGW ABI bugs <https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115054>
+        ("x86_64", "windows") if target_env == "gnu" && target_abi != "llvm" => false,
+        // Infinite recursion <https://github.com/llvm/llvm-project/issues/97981>
+        ("csky", _) => false,
+        ("hexagon", _) => false,
+        ("powerpc" | "powerpc64", _) => false,
+        ("sparc" | "sparc64", _) => false,
+        ("wasm32" | "wasm64", _) => false,
+        // `f16` support only requires that symbols converting to and from `f32` are available. We
+        // provide these in `compiler-builtins`, so `f16` should be available on all platforms that
+        // do not have other ABI issues or LLVM crashes.
+        _ => true,
+    };
+
+    cfg.has_reliable_f128 = match (target_arch, target_os) {
+        // Unsupported <https://github.com/llvm/llvm-project/issues/94434>
+        ("arm64ec", _) => false,
+        // Selection bug <https://github.com/llvm/llvm-project/issues/96432>
+        ("mips64" | "mips64r6", _) => false,
+        // Selection bug <https://github.com/llvm/llvm-project/issues/95471>
+        ("nvptx64", _) => false,
+        // ABI bugs <https://github.com/rust-lang/rust/issues/125109> et al. (full
+        // list at <https://github.com/rust-lang/rust/issues/116909>)
+        ("powerpc" | "powerpc64", _) => false,
+        // ABI unsupported  <https://github.com/llvm/llvm-project/issues/41838>
+        ("sparc", _) => false,
+        // Stack alignment bug <https://github.com/llvm/llvm-project/issues/77401>. NB: tests may
+        // not fail if our compiler-builtins is linked.
+        ("x86", _) => false,
+        // MinGW ABI bugs <https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115054>
+        ("x86_64", "windows") if target_env == "gnu" && target_abi != "llvm" => false,
+        // There are no known problems on other platforms, so the only requirement is that symbols
+        // are available. `compiler-builtins` provides all symbols required for core `f128`
+        // support, so this should work for everything else.
+        _ => true,
+    };
+
+    // Assume that working `f16` means working `f16` math for most platforms, since
+    // operations just go through `f32`.
+    cfg.has_reliable_f16_math = cfg.has_reliable_f16;
+
+    cfg.has_reliable_f128_math = match (target_arch, target_os) {
+        // LLVM lowers `fp128` math to `long double` symbols even on platforms where
+        // `long double` is not IEEE binary128. See
+        // <https://github.com/llvm/llvm-project/issues/44744>.
+        //
+        // This rules out anything that doesn't have `long double` = `binary128`; <= 32 bits
+        // (ld is `f64`), anything other than Linux (Windows and MacOS use `f64`), and `x86`
+        // (ld is 80-bit extended precision).
+        ("x86_64", _) => false,
+        (_, "linux") if target_pointer_width == 64 => true,
+        _ => false,
+    } && cfg.has_reliable_f128;
 }
 
 pub(crate) fn print_version() {
@@ -686,7 +771,7 @@ pub(crate) fn global_llvm_features(
                 )
             } else if let Some(feature) = feature.strip_prefix('-') {
                 // FIXME: Why do we not remove implied features on "-" here?
-                // We do the equivalent above in `target_features_cfg`.
+                // We do the equivalent above in `target_config`.
                 // See <https://github.com/rust-lang/rust/issues/134792>.
                 all_rust_features.push((false, feature));
             } else if !feature.is_empty() {
diff --git a/compiler/rustc_codegen_llvm/src/type_.rs b/compiler/rustc_codegen_llvm/src/type_.rs
index b89ce90d1a1..169036f5152 100644
--- a/compiler/rustc_codegen_llvm/src/type_.rs
+++ b/compiler/rustc_codegen_llvm/src/type_.rs
@@ -128,6 +128,10 @@ impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> {
         (**self).borrow().llcx
     }
 
+    pub(crate) fn llmod(&self) -> &'ll llvm::Module {
+        (**self).borrow().llmod
+    }
+
     pub(crate) fn isize_ty(&self) -> &'ll Type {
         (**self).borrow().isize_ty
     }