Diffstat (limited to 'compiler/rustc_codegen_llvm/src')
 compiler/rustc_codegen_llvm/src/back/lto.rs                  | 121
 compiler/rustc_codegen_llvm/src/back/owned_target_machine.rs |   2
 compiler/rustc_codegen_llvm/src/back/write.rs                |   4
 compiler/rustc_codegen_llvm/src/builder.rs                   |  69
 compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs       | 439
 compiler/rustc_codegen_llvm/src/common.rs                    |   9
 compiler/rustc_codegen_llvm/src/context.rs                   |  23
 compiler/rustc_codegen_llvm/src/coverageinfo/mapgen/spans.rs |  28
 compiler/rustc_codegen_llvm/src/declare.rs                   |   6
 compiler/rustc_codegen_llvm/src/errors.rs                    |  17
 compiler/rustc_codegen_llvm/src/lib.rs                       |  39
 compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs           |  10
 compiler/rustc_codegen_llvm/src/llvm/ffi.rs                  |   9
 13 files changed, 630 insertions(+), 146 deletions(-)
diff --git a/compiler/rustc_codegen_llvm/src/back/lto.rs b/compiler/rustc_codegen_llvm/src/back/lto.rs
index 655e1c95373..767835c34f0 100644
--- a/compiler/rustc_codegen_llvm/src/back/lto.rs
+++ b/compiler/rustc_codegen_llvm/src/back/lto.rs
@@ -1,33 +1,28 @@
 use std::collections::BTreeMap;
 use std::ffi::{CStr, CString};
 use std::fs::File;
-use std::path::Path;
+use std::path::{Path, PathBuf};
 use std::ptr::NonNull;
 use std::sync::Arc;
 use std::{io, iter, slice};
 
 use object::read::archive::ArchiveFile;
 use rustc_codegen_ssa::back::lto::{SerializedModule, ThinModule, ThinShared};
-use rustc_codegen_ssa::back::symbol_export;
 use rustc_codegen_ssa::back::write::{CodegenContext, FatLtoInput};
 use rustc_codegen_ssa::traits::*;
 use rustc_codegen_ssa::{ModuleCodegen, ModuleKind, looks_like_rust_object_file};
 use rustc_data_structures::fx::FxHashMap;
 use rustc_data_structures::memmap::Mmap;
 use rustc_errors::{DiagCtxtHandle, FatalError};
-use rustc_hir::def_id::LOCAL_CRATE;
 use rustc_middle::bug;
 use rustc_middle::dep_graph::WorkProduct;
-use rustc_middle::middle::exported_symbols::{SymbolExportInfo, SymbolExportLevel};
-use rustc_session::config::{self, CrateType, Lto};
+use rustc_session::config::{self, Lto};
 use tracing::{debug, info};
 
 use crate::back::write::{
     self, CodegenDiagnosticsStage, DiagnosticHandlers, bitcode_section_name, save_temp_bitcode,
 };
-use crate::errors::{
-    DynamicLinkingWithLTO, LlvmError, LtoBitcodeFromRlib, LtoDisallowed, LtoDylib, LtoProcMacro,
-};
+use crate::errors::{LlvmError, LtoBitcodeFromRlib};
 use crate::llvm::AttributePlace::Function;
 use crate::llvm::{self, build_string};
 use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx, attributes};
@@ -36,45 +31,21 @@ use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx, attributes};
 /// session to determine which CGUs we can reuse.
 const THIN_LTO_KEYS_INCR_COMP_FILE_NAME: &str = "thin-lto-past-keys.bin";
 
-fn crate_type_allows_lto(crate_type: CrateType) -> bool {
-    match crate_type {
-        CrateType::Executable
-        | CrateType::Dylib
-        | CrateType::Staticlib
-        | CrateType::Cdylib
-        | CrateType::ProcMacro
-        | CrateType::Sdylib => true,
-        CrateType::Rlib => false,
-    }
-}
-
 fn prepare_lto(
     cgcx: &CodegenContext<LlvmCodegenBackend>,
+    exported_symbols_for_lto: &[String],
+    each_linked_rlib_for_lto: &[PathBuf],
     dcx: DiagCtxtHandle<'_>,
 ) -> Result<(Vec<CString>, Vec<(SerializedModule<ModuleBuffer>, CString)>), FatalError> {
-    let export_threshold = match cgcx.lto {
-        // We're just doing LTO for our one crate
-        Lto::ThinLocal => SymbolExportLevel::Rust,
-
-        // We're doing LTO for the entire crate graph
-        Lto::Fat | Lto::Thin => symbol_export::crates_export_threshold(&cgcx.crate_types),
-
-        Lto::No => panic!("didn't request LTO but we're doing LTO"),
-    };
+    let mut symbols_below_threshold = exported_symbols_for_lto
+        .iter()
+        .map(|symbol| CString::new(symbol.to_owned()).unwrap())
+        .collect::<Vec<CString>>();
 
-    let symbol_filter = &|&(ref name, info): &(String, SymbolExportInfo)| {
-        if info.level.is_below_threshold(export_threshold) || info.used {
-            Some(CString::new(name.as_str()).unwrap())
-        } else {
-            None
-        }
-    };
-    let exported_symbols = cgcx.exported_symbols.as_ref().expect("needs exported symbols for LTO");
-    let mut symbols_below_threshold = {
-        let _timer = cgcx.prof.generic_activity("LLVM_lto_generate_symbols_below_threshold");
-        exported_symbols[&LOCAL_CRATE].iter().filter_map(symbol_filter).collect::<Vec<CString>>()
-    };
-    info!("{} symbols to preserve in this crate", symbols_below_threshold.len());
+    // __llvm_profile_counter_bias is pulled in at link time by an undefined reference to
+    // __llvm_profile_runtime, therefore we won't know until link time if this symbol
+    // should have default visibility.
+    symbols_below_threshold.push(c"__llvm_profile_counter_bias".to_owned());
 
     // If we're performing LTO for the entire crate graph, then for each of our
     // upstream dependencies, find the corresponding rlib and load the bitcode
@@ -84,37 +55,7 @@ fn prepare_lto(
     // with either fat or thin LTO
     let mut upstream_modules = Vec::new();
     if cgcx.lto != Lto::ThinLocal {
-        // Make sure we actually can run LTO
-        for crate_type in cgcx.crate_types.iter() {
-            if !crate_type_allows_lto(*crate_type) {
-                dcx.emit_err(LtoDisallowed);
-                return Err(FatalError);
-            } else if *crate_type == CrateType::Dylib {
-                if !cgcx.opts.unstable_opts.dylib_lto {
-                    dcx.emit_err(LtoDylib);
-                    return Err(FatalError);
-                }
-            } else if *crate_type == CrateType::ProcMacro && !cgcx.opts.unstable_opts.dylib_lto {
-                dcx.emit_err(LtoProcMacro);
-                return Err(FatalError);
-            }
-        }
-
-        if cgcx.opts.cg.prefer_dynamic && !cgcx.opts.unstable_opts.dylib_lto {
-            dcx.emit_err(DynamicLinkingWithLTO);
-            return Err(FatalError);
-        }
-
-        for &(cnum, ref path) in cgcx.each_linked_rlib_for_lto.iter() {
-            let exported_symbols =
-                cgcx.exported_symbols.as_ref().expect("needs exported symbols for LTO");
-            {
-                let _timer =
-                    cgcx.prof.generic_activity("LLVM_lto_generate_symbols_below_threshold");
-                symbols_below_threshold
-                    .extend(exported_symbols[&cnum].iter().filter_map(symbol_filter));
-            }
-
+        for path in each_linked_rlib_for_lto {
             let archive_data = unsafe {
                 Mmap::map(std::fs::File::open(&path).expect("couldn't open rlib"))
                     .expect("couldn't map rlib")
@@ -147,10 +88,6 @@ fn prepare_lto(
         }
     }
 
-    // __llvm_profile_counter_bias is pulled in at link time by an undefined reference to
-    // __llvm_profile_runtime, therefore we won't know until link time if this symbol
-    // should have default visibility.
-    symbols_below_threshold.push(c"__llvm_profile_counter_bias".to_owned());
     Ok((symbols_below_threshold, upstream_modules))
 }
 
@@ -199,15 +136,17 @@ fn get_bitcode_slice_from_object_data<'a>(
 /// for further optimization.
 pub(crate) fn run_fat(
     cgcx: &CodegenContext<LlvmCodegenBackend>,
+    exported_symbols_for_lto: &[String],
+    each_linked_rlib_for_lto: &[PathBuf],
     modules: Vec<FatLtoInput<LlvmCodegenBackend>>,
-    cached_modules: Vec<(SerializedModule<ModuleBuffer>, WorkProduct)>,
 ) -> Result<ModuleCodegen<ModuleLlvm>, FatalError> {
     let dcx = cgcx.create_dcx();
     let dcx = dcx.handle();
-    let (symbols_below_threshold, upstream_modules) = prepare_lto(cgcx, dcx)?;
+    let (symbols_below_threshold, upstream_modules) =
+        prepare_lto(cgcx, exported_symbols_for_lto, each_linked_rlib_for_lto, dcx)?;
     let symbols_below_threshold =
         symbols_below_threshold.iter().map(|c| c.as_ptr()).collect::<Vec<_>>();
-    fat_lto(cgcx, dcx, modules, cached_modules, upstream_modules, &symbols_below_threshold)
+    fat_lto(cgcx, dcx, modules, upstream_modules, &symbols_below_threshold)
 }
 
 /// Performs thin LTO by performing necessary global analysis and returning two
@@ -215,12 +154,15 @@ pub(crate) fn run_fat(
 /// can simply be copied over from the incr. comp. cache.
 pub(crate) fn run_thin(
     cgcx: &CodegenContext<LlvmCodegenBackend>,
+    exported_symbols_for_lto: &[String],
+    each_linked_rlib_for_lto: &[PathBuf],
     modules: Vec<(String, ThinBuffer)>,
     cached_modules: Vec<(SerializedModule<ModuleBuffer>, WorkProduct)>,
 ) -> Result<(Vec<ThinModule<LlvmCodegenBackend>>, Vec<WorkProduct>), FatalError> {
     let dcx = cgcx.create_dcx();
     let dcx = dcx.handle();
-    let (symbols_below_threshold, upstream_modules) = prepare_lto(cgcx, dcx)?;
+    let (symbols_below_threshold, upstream_modules) =
+        prepare_lto(cgcx, exported_symbols_for_lto, each_linked_rlib_for_lto, dcx)?;
     let symbols_below_threshold =
         symbols_below_threshold.iter().map(|c| c.as_ptr()).collect::<Vec<_>>();
     if cgcx.opts.cg.linker_plugin_lto.enabled() {
@@ -245,7 +187,6 @@ fn fat_lto(
     cgcx: &CodegenContext<LlvmCodegenBackend>,
     dcx: DiagCtxtHandle<'_>,
     modules: Vec<FatLtoInput<LlvmCodegenBackend>>,
-    cached_modules: Vec<(SerializedModule<ModuleBuffer>, WorkProduct)>,
     mut serialized_modules: Vec<(SerializedModule<ModuleBuffer>, CString)>,
     symbols_below_threshold: &[*const libc::c_char],
 ) -> Result<ModuleCodegen<ModuleLlvm>, FatalError> {
@@ -258,21 +199,12 @@ fn fat_lto(
     //    modules that are serialized in-memory.
     //  * `in_memory` contains modules which are already parsed and in-memory,
     //    such as from multi-CGU builds.
-    //
-    // All of `cached_modules` (cached from previous incremental builds) can
-    // immediately go onto the `serialized_modules` modules list and then we can
-    // split the `modules` array into these two lists.
     let mut in_memory = Vec::new();
-    serialized_modules.extend(cached_modules.into_iter().map(|(buffer, wp)| {
-        info!("pushing cached module {:?}", wp.cgu_name);
-        (buffer, CString::new(wp.cgu_name).unwrap())
-    }));
     for module in modules {
         match module {
             FatLtoInput::InMemory(m) => in_memory.push(m),
             FatLtoInput::Serialized { name, buffer } => {
                 info!("pushing serialized module {:?}", name);
-                let buffer = SerializedModule::Local(buffer);
                 serialized_modules.push((buffer, CString::new(name).unwrap()));
             }
         }
@@ -654,6 +586,7 @@ pub(crate) fn run_pass_manager(
     // We then run the llvm_optimize function a second time, to optimize the code which we generated
     // in the enzyme differentiation pass.
     let enable_ad = config.autodiff.contains(&config::AutoDiff::Enable);
+    let enable_gpu = config.offload.contains(&config::Offload::Enable);
     let stage = if thin {
         write::AutodiffStage::PreAD
     } else {
@@ -668,6 +601,12 @@ pub(crate) fn run_pass_manager(
         write::llvm_optimize(cgcx, dcx, module, None, config, opt_level, opt_stage, stage)?;
     }
 
+    if enable_gpu && !thin {
+        let cx =
+            SimpleCx::new(module.module_llvm.llmod(), &module.module_llvm.llcx, cgcx.pointer_size);
+        crate::builder::gpu_offload::handle_gpu_code(cgcx, &cx);
+    }
+
     if cfg!(llvm_enzyme) && enable_ad && !thin {
         let cx =
             SimpleCx::new(module.module_llvm.llmod(), &module.module_llvm.llcx, cgcx.pointer_size);
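Taken as a whole, the lto.rs changes move symbol-list computation and crate-type validation out of the LLVM backend: rustc_codegen_ssa now decides which symbols to preserve and which rlibs to pull bitcode from, and hands both in as plain slices. A minimal sketch of the new call shape (the variable contents here are illustrative, not from this patch):

    // Computed upstream in rustc_codegen_ssa:
    let exported_symbols_for_lto: Vec<String> = vec![/* symbols to preserve */];
    let each_linked_rlib_for_lto: Vec<PathBuf> = vec![/* rlibs to load bitcode from */];
    // The backend entry points now just receive them:
    let module = run_fat(cgcx, &exported_symbols_for_lto, &each_linked_rlib_for_lto, modules)?;
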
diff --git a/compiler/rustc_codegen_llvm/src/back/owned_target_machine.rs b/compiler/rustc_codegen_llvm/src/back/owned_target_machine.rs
index dfde4595590..8e82013e94a 100644
--- a/compiler/rustc_codegen_llvm/src/back/owned_target_machine.rs
+++ b/compiler/rustc_codegen_llvm/src/back/owned_target_machine.rs
@@ -39,6 +39,7 @@ impl OwnedTargetMachine {
         debug_info_compression: &CStr,
         use_emulated_tls: bool,
         args_cstr_buff: &[u8],
+        use_wasm_eh: bool,
     ) -> Result<Self, LlvmError<'static>> {
         assert!(args_cstr_buff.len() > 0);
         assert!(
@@ -72,6 +73,7 @@ impl OwnedTargetMachine {
                 use_emulated_tls,
                 args_cstr_buff.as_ptr() as *const c_char,
                 args_cstr_buff.len(),
+                use_wasm_eh,
             )
         };
diff --git a/compiler/rustc_codegen_llvm/src/back/write.rs b/compiler/rustc_codegen_llvm/src/back/write.rs
index 68279008c03..6f8fba2a30d 100644
--- a/compiler/rustc_codegen_llvm/src/back/write.rs
+++ b/compiler/rustc_codegen_llvm/src/back/write.rs
@@ -15,6 +15,7 @@ use rustc_codegen_ssa::back::write::{
     BitcodeSection, CodegenContext, EmitObj, ModuleConfig, TargetMachineFactoryConfig,
     TargetMachineFactoryFn,
 };
+use rustc_codegen_ssa::base::wants_wasm_eh;
 use rustc_codegen_ssa::traits::*;
 use rustc_codegen_ssa::{CompiledModule, ModuleCodegen, ModuleKind};
 use rustc_data_structures::profiling::SelfProfilerRef;
@@ -285,6 +286,8 @@ pub(crate) fn target_machine_factory(
     let file_name_display_preference =
         sess.filename_display_preference(RemapPathScopeComponents::DEBUGINFO);
 
+    let use_wasm_eh = wants_wasm_eh(sess);
+
     Arc::new(move |config: TargetMachineFactoryConfig| {
         let path_to_cstring_helper = |path: Option<PathBuf>| -> CString {
             let path = path.unwrap_or_default();
@@ -321,6 +324,7 @@ pub(crate) fn target_machine_factory(
             &debuginfo_compression,
             use_emulated_tls,
             &args_cstr_buff,
+            use_wasm_eh,
         )
     })
}
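The owned_target_machine.rs and write.rs hunks together thread one new flag through to LLVM: whether wasm exception handling is wanted. A minimal sketch of the flow, assuming the items from this patch:

    // Queried once per session, outside the factory closure:
    let use_wasm_eh = wants_wasm_eh(sess);
    // The closure captures it and forwards it to OwnedTargetMachine::new(), which in
    // turn passes it to LLVMRustCreateTargetMachine as the new trailing `UseWasmEH`
    // bool (see the ffi.rs hunk at the end of this diff).
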
diff --git a/compiler/rustc_codegen_llvm/src/builder.rs b/compiler/rustc_codegen_llvm/src/builder.rs
index 514923ad6f3..0ade9edb0d2 100644
--- a/compiler/rustc_codegen_llvm/src/builder.rs
+++ b/compiler/rustc_codegen_llvm/src/builder.rs
@@ -3,6 +3,7 @@ use std::ops::Deref;
 use std::{iter, ptr};
 
 pub(crate) mod autodiff;
+pub(crate) mod gpu_offload;
 
 use libc::{c_char, c_uint, size_t};
 use rustc_abi as abi;
@@ -117,6 +118,74 @@ impl<'a, 'll, CX: Borrow<SCx<'ll>>> GenericBuilder<'a, 'll, CX> {
         }
         bx
     }
+
+    // The generic builder has less functionality and thus (unlike the other alloca) we can not
+    // easily jump to the beginning of the function to place our allocas there. We trust the user
+    // to manually do that. FIXME(offload): improve the genericCx and add more llvm wrappers to
+    // handle this.
+    pub(crate) fn direct_alloca(&mut self, ty: &'ll Type, align: Align, name: &str) -> &'ll Value {
+        let val = unsafe {
+            let alloca = llvm::LLVMBuildAlloca(self.llbuilder, ty, UNNAMED);
+            llvm::LLVMSetAlignment(alloca, align.bytes() as c_uint);
+            // Cast to default addrspace if necessary
+            llvm::LLVMBuildPointerCast(self.llbuilder, alloca, self.cx.type_ptr(), UNNAMED)
+        };
+        if name != "" {
+            let name = std::ffi::CString::new(name).unwrap();
+            llvm::set_value_name(val, &name.as_bytes());
+        }
+        val
+    }
+
+    pub(crate) fn inbounds_gep(
+        &mut self,
+        ty: &'ll Type,
+        ptr: &'ll Value,
+        indices: &[&'ll Value],
+    ) -> &'ll Value {
+        unsafe {
+            llvm::LLVMBuildGEPWithNoWrapFlags(
+                self.llbuilder,
+                ty,
+                ptr,
+                indices.as_ptr(),
+                indices.len() as c_uint,
+                UNNAMED,
+                GEPNoWrapFlags::InBounds,
+            )
+        }
+    }
+
+    pub(crate) fn store(&mut self, val: &'ll Value, ptr: &'ll Value, align: Align) -> &'ll Value {
+        debug!("Store {:?} -> {:?}", val, ptr);
+        assert_eq!(self.cx.type_kind(self.cx.val_ty(ptr)), TypeKind::Pointer);
+        unsafe {
+            let store = llvm::LLVMBuildStore(self.llbuilder, val, ptr);
+            llvm::LLVMSetAlignment(store, align.bytes() as c_uint);
+            store
+        }
+    }
+
+    pub(crate) fn load(&mut self, ty: &'ll Type, ptr: &'ll Value, align: Align) -> &'ll Value {
+        unsafe {
+            let load = llvm::LLVMBuildLoad2(self.llbuilder, ty, ptr, UNNAMED);
+            llvm::LLVMSetAlignment(load, align.bytes() as c_uint);
+            load
+        }
+    }
+
+    fn memset(&mut self, ptr: &'ll Value, fill_byte: &'ll Value, size: &'ll Value, align: Align) {
+        unsafe {
+            llvm::LLVMRustBuildMemSet(
+                self.llbuilder,
+                ptr,
+                align.bytes() as c_uint,
+                fill_byte,
+                size,
+                false,
+            );
+        }
+    }
 }
 
 /// Empty string, to be used where LLVM expects an instruction name, indicating
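These helpers give the generic builder just enough surface for the offload codegen below. A minimal usage sketch, assuming a SimpleCx `cx` and an SBuilder `builder` set up as in gpu_offload.rs:

    let ty = cx.type_i64();
    // Note the caveat above: direct_alloca does not hoist the alloca to the function
    // entry; gpu_offload.rs does that itself via LLVMRustPositionBuilderPastAllocas.
    let slot = builder.direct_alloca(ty, Align::EIGHT, "tmp");
    builder.store(cx.get_const_i64(42), slot, Align::EIGHT);
    let val = builder.load(ty, slot, Align::EIGHT);
    let gep = builder.inbounds_gep(ty, slot, &[cx.get_const_i32(0)]);
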
diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
new file mode 100644
index 00000000000..1280ab1442a
--- /dev/null
+++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
@@ -0,0 +1,439 @@
+use std::ffi::CString;
+
+use llvm::Linkage::*;
+use rustc_abi::Align;
+use rustc_codegen_ssa::back::write::CodegenContext;
+use rustc_codegen_ssa::traits::BaseTypeCodegenMethods;
+
+use crate::builder::SBuilder;
+use crate::common::AsCCharPtr;
+use crate::llvm::AttributePlace::Function;
+use crate::llvm::{self, Linkage, Type, Value};
+use crate::{LlvmCodegenBackend, SimpleCx, attributes};
+
+pub(crate) fn handle_gpu_code<'ll>(
+    _cgcx: &CodegenContext<LlvmCodegenBackend>,
+    cx: &'ll SimpleCx<'_>,
+) {
+    // The offload memory transfer type for each kernel
+    let mut o_types = vec![];
+    let mut kernels = vec![];
+    let offload_entry_ty = add_tgt_offload_entry(&cx);
+    for num in 0..9 {
+        let kernel = cx.get_function(&format!("kernel_{num}"));
+        if let Some(kernel) = kernel {
+            o_types.push(gen_define_handling(&cx, kernel, offload_entry_ty, num));
+            kernels.push(kernel);
+        }
+    }
+
+    gen_call_handling(&cx, &kernels, &o_types);
+}
+
+// What is our @1 here? A magic global, used in our data_{begin/update/end}_mapper:
+// @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+// @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8
+fn generate_at_one<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll llvm::Value {
+    // @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+    let unknown_txt = ";unknown;unknown;0;0;;";
+    let c_entry_name = CString::new(unknown_txt).unwrap();
+    let c_val = c_entry_name.as_bytes_with_nul();
+    let initializer = crate::common::bytes_in_context(cx.llcx, c_val);
+    let at_zero = add_unnamed_global(&cx, &"", initializer, PrivateLinkage);
+    llvm::set_alignment(at_zero, Align::ONE);
+
+    // @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8
+    let struct_ident_ty = cx.type_named_struct("struct.ident_t");
+    let struct_elems = vec![
+        cx.get_const_i32(0),
+        cx.get_const_i32(2),
+        cx.get_const_i32(0),
+        cx.get_const_i32(22),
+        at_zero,
+    ];
+    let struct_elems_ty: Vec<_> = struct_elems.iter().map(|&x| cx.val_ty(x)).collect();
+    let initializer = crate::common::named_struct(struct_ident_ty, &struct_elems);
+    cx.set_struct_body(struct_ident_ty, &struct_elems_ty, false);
+    let at_one = add_unnamed_global(&cx, &"", initializer, PrivateLinkage);
+    llvm::set_alignment(at_one, Align::EIGHT);
+    at_one
+}
+
+pub(crate) fn add_tgt_offload_entry<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll llvm::Type {
+    let offload_entry_ty = cx.type_named_struct("struct.__tgt_offload_entry");
+    let tptr = cx.type_ptr();
+    let ti64 = cx.type_i64();
+    let ti32 = cx.type_i32();
+    let ti16 = cx.type_i16();
+    // For each kernel to run on the gpu, we will later generate one entry of this type.
+    // copied from LLVM
+    // typedef struct {
+    //   uint64_t Reserved;
+    //   uint16_t Version;
+    //   uint16_t Kind;
+    //   uint32_t Flags;    Flags associated with the entry (see Target Region Entry Flags)
+    //   void *Address;     Address of global symbol within device image (function or global)
+    //   char *SymbolName;
+    //   uint64_t Size;     Size of the entry info (0 if it is a function)
+    //   uint64_t Data;
+    //   void *AuxAddr;
+    // } __tgt_offload_entry;
+    let entry_elements = vec![ti64, ti16, ti16, ti32, tptr, tptr, ti64, ti64, tptr];
+    cx.set_struct_body(offload_entry_ty, &entry_elements, false);
+    offload_entry_ty
+}
+
+fn gen_tgt_kernel_global<'ll>(cx: &'ll SimpleCx<'_>) {
+    let kernel_arguments_ty = cx.type_named_struct("struct.__tgt_kernel_arguments");
+    let tptr = cx.type_ptr();
+    let ti64 = cx.type_i64();
+    let ti32 = cx.type_i32();
+    let tarr = cx.type_array(ti32, 3);
+
+    // Taken from the LLVM APITypes.h declaration:
+    //struct KernelArgsTy {
+    //  uint32_t Version = 0; // Version of this struct for ABI compatibility.
+    //  uint32_t NumArgs = 0; // Number of arguments in each input pointer.
+    //  void **ArgBasePtrs =
+    //      nullptr;                 // Base pointer of each argument (e.g. a struct).
+    //  void **ArgPtrs = nullptr;    // Pointer to the argument data.
+    //  int64_t *ArgSizes = nullptr; // Size of the argument data in bytes.
+    //  int64_t *ArgTypes = nullptr; // Type of the data (e.g. to / from).
+    //  void **ArgNames = nullptr;   // Name of the data for debugging, possibly null.
+    //  void **ArgMappers = nullptr; // User-defined mappers, possibly null.
+    //  uint64_t Tripcount =
+    //      0; // Tripcount for the teams / distribute loop, 0 otherwise.
+    //  struct {
+    //    uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
+    //    uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
+    //    uint64_t Unused : 62;
+    //  } Flags = {0, 0, 0};
+    //  // The number of teams (for x,y,z dimension).
+    //  uint32_t NumTeams[3] = {0, 0, 0};
+    //  // The number of threads (for x,y,z dimension).
+    //  uint32_t ThreadLimit[3] = {0, 0, 0};
+    //  uint32_t DynCGroupMem = 0; // Amount of dynamic cgroup memory requested.
+    //};
+    let kernel_elements =
+        vec![ti32, ti32, tptr, tptr, tptr, tptr, tptr, tptr, ti64, ti64, tarr, tarr, ti32];
+
+    cx.set_struct_body(kernel_arguments_ty, &kernel_elements, false);
+    // For now we don't handle kernels, so for now we just add a global dummy
+    // to make sure that the __tgt_offload_entry is defined and handled correctly.
+    cx.declare_global("my_struct_global2", kernel_arguments_ty);
+}
+
+fn gen_tgt_data_mappers<'ll>(
+    cx: &'ll SimpleCx<'_>,
+) -> (&'ll llvm::Value, &'ll llvm::Value, &'ll llvm::Value, &'ll llvm::Type) {
+    let tptr = cx.type_ptr();
+    let ti64 = cx.type_i64();
+    let ti32 = cx.type_i32();
+
+    let args = vec![tptr, ti64, ti32, tptr, tptr, tptr, tptr, tptr, tptr];
+    let mapper_fn_ty = cx.type_func(&args, cx.type_void());
+    let mapper_begin = "__tgt_target_data_begin_mapper";
+    let mapper_update = "__tgt_target_data_update_mapper";
+    let mapper_end = "__tgt_target_data_end_mapper";
+    let begin_mapper_decl = declare_offload_fn(&cx, mapper_begin, mapper_fn_ty);
+    let update_mapper_decl = declare_offload_fn(&cx, mapper_update, mapper_fn_ty);
+    let end_mapper_decl = declare_offload_fn(&cx, mapper_end, mapper_fn_ty);
+
+    let nounwind = llvm::AttributeKind::NoUnwind.create_attr(cx.llcx);
+    attributes::apply_to_llfn(begin_mapper_decl, Function, &[nounwind]);
+    attributes::apply_to_llfn(update_mapper_decl, Function, &[nounwind]);
+    attributes::apply_to_llfn(end_mapper_decl, Function, &[nounwind]);
+
+    (begin_mapper_decl, update_mapper_decl, end_mapper_decl, mapper_fn_ty)
+}
+
+fn add_priv_unnamed_arr<'ll>(cx: &SimpleCx<'ll>, name: &str, vals: &[u64]) -> &'ll llvm::Value {
+    let ti64 = cx.type_i64();
+    let mut size_val = Vec::with_capacity(vals.len());
+    for &val in vals {
+        size_val.push(cx.get_const_i64(val));
+    }
+    let initializer = cx.const_array(ti64, &size_val);
+    add_unnamed_global(cx, name, initializer, PrivateLinkage)
+}
+
+pub(crate) fn add_unnamed_global<'ll>(
+    cx: &SimpleCx<'ll>,
+    name: &str,
+    initializer: &'ll llvm::Value,
+    l: Linkage,
+) -> &'ll llvm::Value {
+    let llglobal = add_global(cx, name, initializer, l);
+    llvm::LLVMSetUnnamedAddress(llglobal, llvm::UnnamedAddr::Global);
+    llglobal
+}
+
+pub(crate) fn add_global<'ll>(
+    cx: &SimpleCx<'ll>,
+    name: &str,
+    initializer: &'ll llvm::Value,
+    l: Linkage,
+) -> &'ll llvm::Value {
+    let c_name = CString::new(name).unwrap();
+    let llglobal: &'ll llvm::Value = llvm::add_global(cx.llmod, cx.val_ty(initializer), &c_name);
+    llvm::set_global_constant(llglobal, true);
+    llvm::set_linkage(llglobal, l);
+    llvm::set_initializer(llglobal, initializer);
+    llglobal
+}
+
+fn gen_define_handling<'ll>(
+    cx: &'ll SimpleCx<'_>,
+    kernel: &'ll llvm::Value,
+    offload_entry_ty: &'ll llvm::Type,
+    num: i64,
+) -> &'ll llvm::Value {
+    let types = cx.func_params_types(cx.get_type_of_global(kernel));
+    // It seems like non-pointer values are automatically mapped. So here, we focus on pointer (or
+    // reference) types.
+    let num_ptr_types = types
+        .iter()
+        .map(|&x| matches!(cx.type_kind(x), rustc_codegen_ssa::common::TypeKind::Pointer))
+        .count();
+
+    // We do not know their size anymore at this level, so hardcode a placeholder.
+    // A follow-up pr will track these from the frontend, where we still have Rust types.
+    // Then, we will be able to figure out that e.g. `&[f32;256]` will result in 4*256 bytes.
+    // I decided that 1024 bytes is a great placeholder value for now.
+    add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{num}"), &vec![1024; num_ptr_types]);
+    // Here we figure out whether something needs to be copied to the gpu (=1), from the gpu (=2),
+    // or both to and from the gpu (=3). Other values shouldn't affect us for now.
+    // A non-mutable reference or pointer will be 1, an array that's not read, but fully overwritten
+    // will be 2. For now, everything is 3, until we have our frontend set up.
+    let o_types =
+        add_priv_unnamed_arr(&cx, &format!(".offload_maptypes.{num}"), &vec![3; num_ptr_types]);
+    // Next: For each function, generate these three entries. A weak constant,
+    // the llvm.rodata entry name, and the omp_offloading_entries value
+
+    let name = format!(".kernel_{num}.region_id");
+    let initializer = cx.get_const_i8(0);
+    let region_id = add_unnamed_global(&cx, &name, initializer, WeakAnyLinkage);
+
+    let c_entry_name = CString::new(format!("kernel_{num}")).unwrap();
+    let c_val = c_entry_name.as_bytes_with_nul();
+    let offload_entry_name = format!(".offloading.entry_name.{num}");
+
+    let initializer = crate::common::bytes_in_context(cx.llcx, c_val);
+    let llglobal = add_unnamed_global(&cx, &offload_entry_name, initializer, InternalLinkage);
+    llvm::set_alignment(llglobal, Align::ONE);
+    llvm::set_section(llglobal, c".llvm.rodata.offloading");
+
+    // Not actively used yet, for calling real kernels
+    let name = format!(".offloading.entry.kernel_{num}");
+
+    // See the __tgt_offload_entry documentation above.
+    let reserved = cx.get_const_i64(0);
+    let version = cx.get_const_i16(1);
+    let kind = cx.get_const_i16(1);
+    let flags = cx.get_const_i32(0);
+    let size = cx.get_const_i64(0);
+    let data = cx.get_const_i64(0);
+    let aux_addr = cx.const_null(cx.type_ptr());
+    let elems = vec![reserved, version, kind, flags, region_id, llglobal, size, data, aux_addr];
+
+    let initializer = crate::common::named_struct(offload_entry_ty, &elems);
+    let c_name = CString::new(name).unwrap();
+    let llglobal = llvm::add_global(cx.llmod, offload_entry_ty, &c_name);
+    llvm::set_global_constant(llglobal, true);
+    llvm::set_linkage(llglobal, WeakAnyLinkage);
+    llvm::set_initializer(llglobal, initializer);
+    llvm::set_alignment(llglobal, Align::ONE);
+    let c_section_name = CString::new(".omp_offloading_entries").unwrap();
+    llvm::set_section(llglobal, &c_section_name);
+    o_types
+}
+
+fn declare_offload_fn<'ll>(
+    cx: &'ll SimpleCx<'_>,
+    name: &str,
+    ty: &'ll llvm::Type,
+) -> &'ll llvm::Value {
+    crate::declare::declare_simple_fn(
+        cx,
+        name,
+        llvm::CallConv::CCallConv,
+        llvm::UnnamedAddr::No,
+        llvm::Visibility::Default,
+        ty,
+    )
+}
+
+// For each kernel *call*, we now use some of our previous declared globals to move data to and from
+// the gpu. We don't have a proper frontend yet, so we assume that every call to a kernel function
+// from main is intended to run on the GPU. For now, we only handle the data transfer part of it.
+// If two consecutive kernels use the same memory, we still move it to the host and back to the gpu.
+// Since in our frontend users (by default) don't have to specify data transfer, this is something
+// we should optimize in the future! We also assume that everything should be copied back and forth,
+// but sometimes we can directly zero-allocate on the device and only move back, or if something is
+// immutable, we might only copy it to the device, but not back.
+//
+// Current steps:
+// 0. Alloca some variables for the following steps
+// 1. set insert point before kernel call.
+// 2. generate all the GEPS and stores, to be used in 3)
+// 3. generate __tgt_target_data_begin calls to move data to the GPU
+//
+// unchanged: keep kernel call. Later move the kernel to the GPU
+//
+// 4. set insert point after kernel call.
+// 5. generate all the GEPS and stores, to be used in 6)
+// 6. generate __tgt_target_data_end calls to move data from the GPU
+fn gen_call_handling<'ll>(
+    cx: &'ll SimpleCx<'_>,
+    _kernels: &[&'ll llvm::Value],
+    o_types: &[&'ll llvm::Value],
+) {
+    // %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr }
+    let tptr = cx.type_ptr();
+    let ti32 = cx.type_i32();
+    let tgt_bin_desc_ty = vec![ti32, tptr, tptr, tptr];
+    let tgt_bin_desc = cx.type_named_struct("struct.__tgt_bin_desc");
+    cx.set_struct_body(tgt_bin_desc, &tgt_bin_desc_ty, false);
+
+    gen_tgt_kernel_global(&cx);
+    let (begin_mapper_decl, _, end_mapper_decl, fn_ty) = gen_tgt_data_mappers(&cx);
+
+    let main_fn = cx.get_function("main");
+    let Some(main_fn) = main_fn else { return };
+    let kernel_name = "kernel_1";
+    let call = unsafe {
+        llvm::LLVMRustGetFunctionCall(main_fn, kernel_name.as_c_char_ptr(), kernel_name.len())
+    };
+    let Some(kernel_call) = call else {
+        return;
+    };
+    let kernel_call_bb = unsafe { llvm::LLVMGetInstructionParent(kernel_call) };
+    let called = unsafe { llvm::LLVMGetCalledValue(kernel_call).unwrap() };
+    let mut builder = SBuilder::build(cx, kernel_call_bb);
+
+    let types = cx.func_params_types(cx.get_type_of_global(called));
+    let num_args = types.len() as u64;
+
+    // Step 0)
+    // %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr }
+    // %6 = alloca %struct.__tgt_bin_desc, align 8
+    unsafe { llvm::LLVMRustPositionBuilderPastAllocas(builder.llbuilder, main_fn) };
+
+    let tgt_bin_desc_alloca = builder.direct_alloca(tgt_bin_desc, Align::EIGHT, "EmptyDesc");
+
+    let ty = cx.type_array(cx.type_ptr(), num_args);
+    // Baseptr are just the input pointer to the kernel, stored in a local alloca
+    let a1 = builder.direct_alloca(ty, Align::EIGHT, ".offload_baseptrs");
+    // Ptrs are the result of a gep into the baseptr, at least for our trivial types.
+    let a2 = builder.direct_alloca(ty, Align::EIGHT, ".offload_ptrs");
+    // These represent the sizes in bytes, e.g. the entry for `&[f64; 16]` will be 8*16.
+    let ty2 = cx.type_array(cx.type_i64(), num_args);
+    let a4 = builder.direct_alloca(ty2, Align::EIGHT, ".offload_sizes");
+    // Now we allocate once per function param, a copy to be passed to one of our maps.
+    let mut vals = vec![];
+    let mut geps = vec![];
+    let i32_0 = cx.get_const_i32(0);
+    for (index, in_ty) in types.iter().enumerate() {
+        // get function arg, store it into the alloca, and read it.
+        let p = llvm::get_param(called, index as u32);
+        let name = llvm::get_value_name(p);
+        let name = str::from_utf8(&name).unwrap();
+        let arg_name = format!("{name}.addr");
+        let alloca = builder.direct_alloca(in_ty, Align::EIGHT, &arg_name);
+
+        builder.store(p, alloca, Align::EIGHT);
+        let val = builder.load(in_ty, alloca, Align::EIGHT);
+        let gep = builder.inbounds_gep(cx.type_f32(), val, &[i32_0]);
+        vals.push(val);
+        geps.push(gep);
+    }
+
+    // Step 1)
+    unsafe { llvm::LLVMRustPositionBefore(builder.llbuilder, kernel_call) };
+    builder.memset(tgt_bin_desc_alloca, cx.get_const_i8(0), cx.get_const_i64(32), Align::EIGHT);
+
+    let mapper_fn_ty = cx.type_func(&[cx.type_ptr()], cx.type_void());
+    let register_lib_decl = declare_offload_fn(&cx, "__tgt_register_lib", mapper_fn_ty);
+    let unregister_lib_decl = declare_offload_fn(&cx, "__tgt_unregister_lib", mapper_fn_ty);
+    let init_ty = cx.type_func(&[], cx.type_void());
+    let init_rtls_decl = declare_offload_fn(cx, "__tgt_init_all_rtls", init_ty);
+
+    // call void @__tgt_register_lib(ptr noundef %6)
+    builder.call(mapper_fn_ty, register_lib_decl, &[tgt_bin_desc_alloca], None);
+    // call void @__tgt_init_all_rtls()
+    builder.call(init_ty, init_rtls_decl, &[], None);
+
+    for i in 0..num_args {
+        let idx = cx.get_const_i32(i);
+        let gep1 = builder.inbounds_gep(ty, a1, &[i32_0, idx]);
+        builder.store(vals[i as usize], gep1, Align::EIGHT);
+        let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, idx]);
+        builder.store(geps[i as usize], gep2, Align::EIGHT);
+        let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]);
+        // As mentioned above, we don't use Rust type information yet. So for now we will just
+        // assume that we have 1024 bytes, 256 f32 values.
+        // FIXME(offload): write an offload frontend and handle arbitrary types.
+        builder.store(cx.get_const_i64(1024), gep3, Align::EIGHT);
+    }
+
+    // For now we have a very simplistic indexing scheme into our
+    // offload_{baseptrs,ptrs,sizes}. We will probably improve this along with our gpu frontend pr.
+    fn get_geps<'a, 'll>(
+        builder: &mut SBuilder<'a, 'll>,
+        cx: &'ll SimpleCx<'ll>,
+        ty: &'ll Type,
+        ty2: &'ll Type,
+        a1: &'ll Value,
+        a2: &'ll Value,
+        a4: &'ll Value,
+    ) -> (&'ll Value, &'ll Value, &'ll Value) {
+        let i32_0 = cx.get_const_i32(0);
+
+        let gep1 = builder.inbounds_gep(ty, a1, &[i32_0, i32_0]);
+        let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, i32_0]);
+        let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, i32_0]);
+        (gep1, gep2, gep3)
+    }
+
+    fn generate_mapper_call<'a, 'll>(
+        builder: &mut SBuilder<'a, 'll>,
+        cx: &'ll SimpleCx<'ll>,
+        geps: (&'ll Value, &'ll Value, &'ll Value),
+        o_type: &'ll Value,
+        fn_to_call: &'ll Value,
+        fn_ty: &'ll Type,
+        num_args: u64,
+        s_ident_t: &'ll Value,
+    ) {
+        let nullptr = cx.const_null(cx.type_ptr());
+        let i64_max = cx.get_const_i64(u64::MAX);
+        let num_args = cx.get_const_i32(num_args);
+        let args =
+            vec![s_ident_t, i64_max, num_args, geps.0, geps.1, geps.2, o_type, nullptr, nullptr];
+        builder.call(fn_ty, fn_to_call, &args, None);
+    }
+
+    // Step 2)
+    let s_ident_t = generate_at_one(&cx);
+    let o = o_types[0];
+    let geps = get_geps(&mut builder, &cx, ty, ty2, a1, a2, a4);
+    generate_mapper_call(&mut builder, &cx, geps, o, begin_mapper_decl, fn_ty, num_args, s_ident_t);
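(The remainder of gen_call_handling, steps 3 and 4, follows below. Putting it all together: for a main() that calls kernel_1, the pass brackets the unchanged call with libomptarget runtime calls. A sketch of the emitted sequence, written as comments; the %N value names are illustrative, while the runtime entry points are the ones declared above:)

    // call void @__tgt_register_lib(ptr %EmptyDesc)
    // call void @__tgt_init_all_rtls()
    // ...fill the .offload_baseptrs/.offload_ptrs/.offload_sizes slots...
    // call void @__tgt_target_data_begin_mapper(ptr @1, i64 -1, i32 <num_args>, ...)
    // call void @kernel_1(...)        ; the kernel call itself is left untouched for now
    // call void @__tgt_target_data_end_mapper(ptr @1, i64 -1, i32 <num_args>, ...)
    // call void @__tgt_unregister_lib(ptr %EmptyDesc)
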
+
+    // Step 3)
+    // Here we will add code for the actual kernel launches in a follow-up PR.
+    // FIXME(offload): launch kernels
+
+    // Step 4)
+    unsafe { llvm::LLVMRustPositionAfter(builder.llbuilder, kernel_call) };
+
+    let geps = get_geps(&mut builder, &cx, ty, ty2, a1, a2, a4);
+    generate_mapper_call(&mut builder, &cx, geps, o, end_mapper_decl, fn_ty, num_args, s_ident_t);
+
+    builder.call(mapper_fn_ty, unregister_lib_decl, &[tgt_bin_desc_alloca], None);
+
+    // With this we generated the following begin and end mappers. We could easily generate the
+    // update mapper in an update.
+    // call void @__tgt_target_data_begin_mapper(ptr @1, i64 -1, i32 3, ptr %27, ptr %28, ptr %29, ptr @.offload_maptypes, ptr null, ptr null)
+    // call void @__tgt_target_data_update_mapper(ptr @1, i64 -1, i32 2, ptr %46, ptr %47, ptr %48, ptr @.offload_maptypes.1, ptr null, ptr null)
+    // call void @__tgt_target_data_end_mapper(ptr @1, i64 -1, i32 3, ptr %49, ptr %50, ptr %51, ptr @.offload_maptypes, ptr null, ptr null)
+}
diff --git a/compiler/rustc_codegen_llvm/src/common.rs b/compiler/rustc_codegen_llvm/src/common.rs
index f9ab96b5789..f29fefb66f0 100644
--- a/compiler/rustc_codegen_llvm/src/common.rs
+++ b/compiler/rustc_codegen_llvm/src/common.rs
@@ -118,6 +118,10 @@ impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> {
             r
         }
     }
+
+    pub(crate) fn const_null(&self, t: &'ll Type) -> &'ll Value {
+        unsafe { llvm::LLVMConstNull(t) }
+    }
 }
 
 impl<'ll, 'tcx> ConstCodegenMethods for CodegenCx<'ll, 'tcx> {
@@ -377,6 +381,11 @@ pub(crate) fn bytes_in_context<'ll>(llcx: &'ll llvm::Context, bytes: &[u8]) -> &
     }
 }
 
+pub(crate) fn named_struct<'ll>(ty: &'ll Type, elts: &[&'ll Value]) -> &'ll Value {
+    let len = c_uint::try_from(elts.len()).expect("LLVMConstStructInContext elements len overflow");
+    unsafe { llvm::LLVMConstNamedStruct(ty, elts.as_ptr(), len) }
+}
+
 fn struct_in_context<'ll>(
     llcx: &'ll llvm::Context,
     elts: &[&'ll Value],
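Both helpers exist for gpu_offload.rs: const_null builds the null AuxAddr field and the mapper's null arguments, and named_struct builds initializers for the named struct types (struct.ident_t, struct.__tgt_offload_entry). A small sketch, assuming a SimpleCx `cx` as above ("struct.example" is a made-up name):

    let struct_ty = cx.type_named_struct("struct.example");
    cx.set_struct_body(struct_ty, &[cx.type_i32(), cx.type_ptr()], false);
    let elems = [cx.get_const_i32(7), cx.const_null(cx.type_ptr())];
    let init = crate::common::named_struct(struct_ty, &elems);
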
diff --git a/compiler/rustc_codegen_llvm/src/context.rs b/compiler/rustc_codegen_llvm/src/context.rs
index 6a23becaa96..ee77774c688 100644
--- a/compiler/rustc_codegen_llvm/src/context.rs
+++ b/compiler/rustc_codegen_llvm/src/context.rs
@@ -207,11 +207,16 @@ pub(crate) unsafe fn create_module<'ll>(
             // LLVM 21 updated the default layout on nvptx: https://github.com/llvm/llvm-project/pull/124961
             target_data_layout = target_data_layout.replace("e-p6:32:32-i64", "e-i64");
         }
+        if sess.target.arch == "amdgpu" {
+            // LLVM 21 adds the address width for address space 8.
+            // See https://github.com/llvm/llvm-project/pull/139419
+            target_data_layout = target_data_layout.replace("p8:128:128:128:48", "p8:128:128")
+        }
     }
 
     // Ensure the data-layout values hardcoded remain the defaults.
     {
-        let tm = crate::back::write::create_informational_target_machine(tcx.sess, false);
+        let tm = crate::back::write::create_informational_target_machine(sess, false);
         unsafe {
             llvm::LLVMRustSetDataLayoutFromTargetMachine(llmod, tm.raw());
         }
@@ -680,6 +685,22 @@ impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> {
         unsafe { llvm::LLVMConstInt(ty, val, llvm::False) }
     }
 
+    pub(crate) fn get_const_i64(&self, n: u64) -> &'ll Value {
+        self.get_const_int(self.type_i64(), n)
+    }
+
+    pub(crate) fn get_const_i32(&self, n: u64) -> &'ll Value {
+        self.get_const_int(self.type_i32(), n)
+    }
+
+    pub(crate) fn get_const_i16(&self, n: u64) -> &'ll Value {
+        self.get_const_int(self.type_i16(), n)
+    }
+
+    pub(crate) fn get_const_i8(&self, n: u64) -> &'ll Value {
+        self.get_const_int(self.type_i8(), n)
+    }
+
     pub(crate) fn get_function(&self, name: &str) -> Option<&'ll Value> {
         let name = SmallCStr::new(name);
         unsafe { llvm::LLVMGetNamedFunction((**self).borrow().llmod, name.as_ptr()) }
diff --git a/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen/spans.rs b/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen/spans.rs
index 39a59560c9d..574463be7ff 100644
--- a/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen/spans.rs
+++ b/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen/spans.rs
@@ -39,7 +39,10 @@ impl Coords {
 /// or other expansions), and if it does happen then skipping a span or function is
 /// better than an ICE or `llvm-cov` failure that the user might have no way to avoid.
 pub(crate) fn make_coords(source_map: &SourceMap, file: &SourceFile, span: Span) -> Option<Coords> {
-    let span = ensure_non_empty_span(source_map, span)?;
+    if span.is_empty() {
+        debug_assert!(false, "can't make coords from empty span: {span:?}");
+        return None;
+    }
 
     let lo = span.lo();
     let hi = span.hi();
@@ -70,29 +73,6 @@ pub(crate) fn make_coords(source_map: &SourceMap, file: &SourceFile, span: Span)
     })
 }
 
-fn ensure_non_empty_span(source_map: &SourceMap, span: Span) -> Option<Span> {
-    if !span.is_empty() {
-        return Some(span);
-    }
-
-    // The span is empty, so try to enlarge it to cover an adjacent '{' or '}'.
-    source_map
-        .span_to_source(span, |src, start, end| try {
-            // Adjusting span endpoints by `BytePos(1)` is normally a bug,
-            // but in this case we have specifically checked that the character
-            // we're skipping over is one of two specific ASCII characters, so
-            // adjusting by exactly 1 byte is correct.
-            if src.as_bytes().get(end).copied() == Some(b'{') {
-                Some(span.with_hi(span.hi() + BytePos(1)))
-            } else if start > 0 && src.as_bytes()[start - 1] == b'}' {
-                Some(span.with_lo(span.lo() - BytePos(1)))
-            } else {
-                None
-            }
-        })
-        .ok()?
-}
-
 /// If `llvm-cov` sees a source region that is improperly ordered (end < start),
 /// it will immediately exit with a fatal error. To prevent that from happening,
 /// discard regions that are improperly ordered, or might be interpreted in a
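The new get_const_i64/i32/i16/i8 helpers are thin width-specific wrappers over get_const_int, used heavily by gpu_offload.rs above. For example (sketch):

    let version = cx.get_const_i16(1);   // i16 1: the offload entry version
    let flags = cx.get_const_i32(0);     // i32 0
    let size = cx.get_const_i64(1024);   // i64 1024: the placeholder byte size
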
diff --git a/compiler/rustc_codegen_llvm/src/declare.rs b/compiler/rustc_codegen_llvm/src/declare.rs
index eb75716d768..960a895a203 100644
--- a/compiler/rustc_codegen_llvm/src/declare.rs
+++ b/compiler/rustc_codegen_llvm/src/declare.rs
@@ -215,7 +215,9 @@ impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> {
 
         llfn
     }
+}
 
+impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> {
     /// Declare a global with an intention to define it.
     ///
     /// Use this function when you intend to define a global. This function will
@@ -234,13 +236,13 @@ impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> {
     ///
     /// Use this function when you intend to define a global without a name.
     pub(crate) fn define_private_global(&self, ty: &'ll Type) -> &'ll Value {
-        unsafe { llvm::LLVMRustInsertPrivateGlobal(self.llmod, ty) }
+        unsafe { llvm::LLVMRustInsertPrivateGlobal(self.llmod(), ty) }
     }
 
     /// Gets declared value by name.
     pub(crate) fn get_declared_value(&self, name: &str) -> Option<&'ll Value> {
         debug!("get_declared_value(name={:?})", name);
-        unsafe { llvm::LLVMRustGetNamedValue(self.llmod, name.as_c_char_ptr(), name.len()) }
+        unsafe { llvm::LLVMRustGetNamedValue(self.llmod(), name.as_c_char_ptr(), name.len()) }
     }
 
     /// Gets defined or externally defined (AvailableExternally linkage) value by
diff --git a/compiler/rustc_codegen_llvm/src/errors.rs b/compiler/rustc_codegen_llvm/src/errors.rs
index 31d49e86319..2a889888a39 100644
--- a/compiler/rustc_codegen_llvm/src/errors.rs
+++ b/compiler/rustc_codegen_llvm/src/errors.rs
@@ -20,11 +20,6 @@ pub(crate) struct SymbolAlreadyDefined<'a> {
 #[diag(codegen_llvm_sanitizer_memtag_requires_mte)]
 pub(crate) struct SanitizerMemtagRequiresMte;
 
-#[derive(Diagnostic)]
-#[diag(codegen_llvm_dynamic_linking_with_lto)]
-#[note]
-pub(crate) struct DynamicLinkingWithLTO;
-
 pub(crate) struct ParseTargetMachineConfig<'a>(pub LlvmError<'a>);
 
 impl<G: EmissionGuarantee> Diagnostic<'_, G> for ParseTargetMachineConfig<'_> {
@@ -42,18 +37,6 @@ impl<G: EmissionGuarantee> Diagnostic<'_, G> for ParseTargetMachineConfig<'_> {
 pub(crate) struct AutoDiffWithoutEnable;
 
 #[derive(Diagnostic)]
-#[diag(codegen_llvm_lto_disallowed)]
-pub(crate) struct LtoDisallowed;
-
-#[derive(Diagnostic)]
-#[diag(codegen_llvm_lto_dylib)]
-pub(crate) struct LtoDylib;
-
-#[derive(Diagnostic)]
-#[diag(codegen_llvm_lto_proc_macro)]
-pub(crate) struct LtoProcMacro;
-
-#[derive(Diagnostic)]
 #[diag(codegen_llvm_lto_bitcode_from_rlib)]
 pub(crate) struct LtoBitcodeFromRlib {
     pub llvm_err: String,
diff --git a/compiler/rustc_codegen_llvm/src/lib.rs b/compiler/rustc_codegen_llvm/src/lib.rs
index 6db4e122ad6..8b1913cfa75 100644
--- a/compiler/rustc_codegen_llvm/src/lib.rs
+++ b/compiler/rustc_codegen_llvm/src/lib.rs
@@ -22,6 +22,7 @@
 use std::any::Any;
 use std::ffi::CStr;
 use std::mem::ManuallyDrop;
+use std::path::PathBuf;
 
 use back::owned_target_machine::OwnedTargetMachine;
 use back::write::{create_informational_target_machine, create_target_machine};
@@ -176,11 +177,13 @@ impl WriteBackendMethods for LlvmCodegenBackend {
     }
     fn run_and_optimize_fat_lto(
         cgcx: &CodegenContext<Self>,
+        exported_symbols_for_lto: &[String],
+        each_linked_rlib_for_lto: &[PathBuf],
         modules: Vec<FatLtoInput<Self>>,
-        cached_modules: Vec<(SerializedModule<Self::ModuleBuffer>, WorkProduct)>,
         diff_fncs: Vec<AutoDiffItem>,
     ) -> Result<ModuleCodegen<Self::Module>, FatalError> {
-        let mut module = back::lto::run_fat(cgcx, modules, cached_modules)?;
+        let mut module =
+            back::lto::run_fat(cgcx, exported_symbols_for_lto, each_linked_rlib_for_lto, modules)?;
 
         if !diff_fncs.is_empty() {
             builder::autodiff::differentiate(&module, cgcx, diff_fncs)?;
@@ -194,10 +197,18 @@ impl WriteBackendMethods for LlvmCodegenBackend {
     }
     fn run_thin_lto(
         cgcx: &CodegenContext<Self>,
+        exported_symbols_for_lto: &[String],
+        each_linked_rlib_for_lto: &[PathBuf],
         modules: Vec<(String, Self::ThinBuffer)>,
         cached_modules: Vec<(SerializedModule<Self::ModuleBuffer>, WorkProduct)>,
     ) -> Result<(Vec<ThinModule<Self>>, Vec<WorkProduct>), FatalError> {
-        back::lto::run_thin(cgcx, modules, cached_modules)
+        back::lto::run_thin(
+            cgcx,
+            exported_symbols_for_lto,
+            each_linked_rlib_for_lto,
+            modules,
+            cached_modules,
+        )
     }
     fn optimize(
         cgcx: &CodegenContext<Self>,
@@ -412,6 +423,20 @@ impl ModuleLlvm {
         }
     }
 
+    fn tm_from_cgcx(
+        cgcx: &CodegenContext<LlvmCodegenBackend>,
+        name: &str,
+        dcx: DiagCtxtHandle<'_>,
+    ) -> Result<OwnedTargetMachine, FatalError> {
+        let tm_factory_config = TargetMachineFactoryConfig::new(cgcx, name);
+        match (cgcx.tm_factory)(tm_factory_config) {
+            Ok(m) => Ok(m),
+            Err(e) => {
+                return Err(dcx.emit_almost_fatal(ParseTargetMachineConfig(e)));
+            }
+        }
+    }
+
     fn parse(
         cgcx: &CodegenContext<LlvmCodegenBackend>,
         name: &CStr,
@@ -421,13 +446,7 @@ impl ModuleLlvm {
         unsafe {
             let llcx = llvm::LLVMRustContextCreate(cgcx.fewer_names);
             let llmod_raw = back::lto::parse_module(llcx, name, buffer, dcx)?;
-            let tm_factory_config = TargetMachineFactoryConfig::new(cgcx, name.to_str().unwrap());
-            let tm = match (cgcx.tm_factory)(tm_factory_config) {
-                Ok(m) => m,
-                Err(e) => {
-                    return Err(dcx.emit_almost_fatal(ParseTargetMachineConfig(e)));
-                }
-            };
+            let tm = ModuleLlvm::tm_from_cgcx(cgcx, name.to_str().unwrap(), dcx)?;
 
             Ok(ModuleLlvm { llmod_raw, llcx, tm: ManuallyDrop::new(tm) })
         }
diff --git a/compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs
index c696b8d8ff2..56d756e52cc 100644
--- a/compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs
+++ b/compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs
@@ -4,7 +4,7 @@ use libc::{c_char, c_uint};
 
 use super::MetadataKindId;
 use super::ffi::{AttributeKind, BasicBlock, Metadata, Module, Type, Value};
-use crate::llvm::Bool;
+use crate::llvm::{Bool, Builder};
 
 #[link(name = "llvm-wrapper", kind = "static")]
 unsafe extern "C" {
@@ -31,6 +31,14 @@ unsafe extern "C" {
         index: c_uint,
         kind: AttributeKind,
     );
+    pub(crate) fn LLVMRustPositionBefore<'a>(B: &'a Builder<'_>, I: &'a Value);
+    pub(crate) fn LLVMRustPositionAfter<'a>(B: &'a Builder<'_>, I: &'a Value);
+    pub(crate) fn LLVMRustGetFunctionCall(
+        F: &Value,
+        name: *const c_char,
+        NameLen: libc::size_t,
+    ) -> Option<&Value>;
+
 }
 
 unsafe extern "C" {
diff --git a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs
index 0b1e632cbc4..edfb29dd1be 100644
--- a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs
+++ b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs
@@ -1138,6 +1138,11 @@ unsafe extern "C" {
         Count: c_uint,
         Packed: Bool,
     ) -> &'a Value;
+    pub(crate) fn LLVMConstNamedStruct<'a>(
+        StructTy: &'a Type,
+        ConstantVals: *const &'a Value,
+        Count: c_uint,
+    ) -> &'a Value;
     pub(crate) fn LLVMConstVector(ScalarConstantVals: *const &Value, Size: c_uint) -> &Value;
 
     // Constant expressions
@@ -1217,6 +1222,8 @@ unsafe extern "C" {
     ) -> &'a BasicBlock;
 
     // Operations on instructions
+    pub(crate) fn LLVMGetInstructionParent(Inst: &Value) -> &BasicBlock;
+    pub(crate) fn LLVMGetCalledValue(CallInst: &Value) -> Option<&Value>;
     pub(crate) fn LLVMIsAInstruction(Val: &Value) -> Option<&Value>;
     pub(crate) fn LLVMGetFirstBasicBlock(Fn: &Value) -> &BasicBlock;
     pub(crate) fn LLVMGetOperand(Val: &Value, Index: c_uint) -> Option<&Value>;
@@ -2425,6 +2432,7 @@ unsafe extern "C" {
         UseEmulatedTls: bool,
         ArgsCstrBuff: *const c_char,
         ArgsCstrBuffLen: usize,
+        UseWasmEH: bool,
     ) -> *mut TargetMachine;
 
     pub(crate) fn LLVMRustDisposeTargetMachine(T: *mut TargetMachine);
@@ -2556,6 +2564,7 @@ unsafe extern "C" {
 
     pub(crate) fn LLVMRustSetDataLayoutFromTargetMachine<'a>(M: &'a Module, TM: &'a TargetMachine);
 
+    pub(crate) fn LLVMRustPositionBuilderPastAllocas<'a>(B: &Builder<'a>, Fn: &'a Value);
    pub(crate) fn LLVMRustPositionBuilderAtStart<'a>(B: &Builder<'a>, BB: &'a BasicBlock);
 
     pub(crate) fn LLVMRustSetModulePICLevel(M: &Module);
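Taken together with the enzyme_ffi.rs hunk, these bindings are exactly what gen_call_handling needs to splice code around an existing call. A condensed sketch of how they compose, mirroring the gpu_offload.rs code above:

    // Locate the first call to `kernel_1` inside `main`, then position the builder
    // around it (main_fn/name/builder as set up in gen_call_handling):
    let Some(call) = (unsafe {
        llvm::LLVMRustGetFunctionCall(main_fn, name.as_c_char_ptr(), name.len())
    }) else { return };
    unsafe { llvm::LLVMRustPositionBuilderPastAllocas(builder.llbuilder, main_fn) }; // step 0: allocas
    unsafe { llvm::LLVMRustPositionBefore(builder.llbuilder, call) }; // steps 1-3: begin_mapper
    unsafe { llvm::LLVMRustPositionAfter(builder.llbuilder, call) };  // steps 4-6: end_mapper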
