Diffstat (limited to 'compiler/rustc_codegen_llvm')
-rw-r--r--  compiler/rustc_codegen_llvm/Cargo.toml                         |   6
-rw-r--r--  compiler/rustc_codegen_llvm/messages.ftl                       |  12
-rw-r--r--  compiler/rustc_codegen_llvm/src/attributes.rs                  |   2
-rw-r--r--  compiler/rustc_codegen_llvm/src/back/lto.rs                    | 160
-rw-r--r--  compiler/rustc_codegen_llvm/src/back/owned_target_machine.rs   |   2
-rw-r--r--  compiler/rustc_codegen_llvm/src/back/write.rs                  |  27
-rw-r--r--  compiler/rustc_codegen_llvm/src/builder.rs                     | 102
-rw-r--r--  compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs         | 439
-rw-r--r--  compiler/rustc_codegen_llvm/src/callee.rs                      |   2
-rw-r--r--  compiler/rustc_codegen_llvm/src/common.rs                      |   9
-rw-r--r--  compiler/rustc_codegen_llvm/src/context.rs                     |  23
-rw-r--r--  compiler/rustc_codegen_llvm/src/coverageinfo/mapgen.rs         |  12
-rw-r--r--  compiler/rustc_codegen_llvm/src/coverageinfo/mapgen/spans.rs   |  28
-rw-r--r--  compiler/rustc_codegen_llvm/src/coverageinfo/mod.rs            |  38
-rw-r--r--  compiler/rustc_codegen_llvm/src/debuginfo/gdb.rs               |   3
-rw-r--r--  compiler/rustc_codegen_llvm/src/debuginfo/metadata/type_map.rs |   4
-rw-r--r--  compiler/rustc_codegen_llvm/src/debuginfo/mod.rs               |   2
-rw-r--r--  compiler/rustc_codegen_llvm/src/declare.rs                     |   6
-rw-r--r--  compiler/rustc_codegen_llvm/src/errors.rs                      |  19
-rw-r--r--  compiler/rustc_codegen_llvm/src/intrinsic.rs                   |  28
-rw-r--r--  compiler/rustc_codegen_llvm/src/lib.rs                         |  46
-rw-r--r--  compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs             |  10
-rw-r--r--  compiler/rustc_codegen_llvm/src/llvm/ffi.rs                    |  27
-rw-r--r--  compiler/rustc_codegen_llvm/src/llvm_util.rs                   |   5
24 files changed, 736 insertions(+), 276 deletions(-)
diff --git a/compiler/rustc_codegen_llvm/Cargo.toml b/compiler/rustc_codegen_llvm/Cargo.toml
index 5ab22f8fc4d..2d11628250c 100644
--- a/compiler/rustc_codegen_llvm/Cargo.toml
+++ b/compiler/rustc_codegen_llvm/Cargo.toml
@@ -19,7 +19,6 @@ object = { version = "0.37.0", default-features = false, features = ["std", "rea
 rustc-demangle = "0.1.21"
 rustc_abi = { path = "../rustc_abi" }
 rustc_ast = { path = "../rustc_ast" }
-rustc_attr_data_structures = { path = "../rustc_attr_data_structures" }
 rustc_codegen_ssa = { path = "../rustc_codegen_ssa" }
 rustc_data_structures = { path = "../rustc_data_structures" }
 rustc_errors = { path = "../rustc_errors" }
@@ -38,11 +37,14 @@ rustc_session = { path = "../rustc_session" }
 rustc_span = { path = "../rustc_span" }
 rustc_symbol_mangling = { path = "../rustc_symbol_mangling" }
 rustc_target = { path = "../rustc_target" }
-serde = { version = "1", features = [ "derive" ]}
+serde = { version = "1", features = ["derive"] }
 serde_json = "1"
 smallvec = { version = "1.8.1", features = ["union", "may_dangle"] }
 tracing = "0.1"
 # tidy-alphabetical-end
 
 [features]
+# tidy-alphabetical-start
 check_only = ["rustc_llvm/check_only"]
+# tidy-alphabetical-end
+
diff --git a/compiler/rustc_codegen_llvm/messages.ftl b/compiler/rustc_codegen_llvm/messages.ftl
index f197ea74473..ce9a51b539d 100644
--- a/compiler/rustc_codegen_llvm/messages.ftl
+++ b/compiler/rustc_codegen_llvm/messages.ftl
@@ -2,10 +2,6 @@ codegen_llvm_autodiff_without_enable = using the autodiff feature requires -Z au
 
 codegen_llvm_copy_bitcode = failed to copy bitcode to object file: {$err}
 
-codegen_llvm_dynamic_linking_with_lto =
-    cannot prefer dynamic linking when performing LTO
-    .note = only 'staticlib', 'bin', and 'cdylib' outputs are supported with LTO
-
 
 codegen_llvm_fixed_x18_invalid_arch = the `-Zfixed-x18` flag is not supported on the `{$arch}` architecture
 
@@ -16,13 +12,7 @@ codegen_llvm_from_llvm_optimization_diag = {$filename}:{$line}:{$column} {$pass_
 codegen_llvm_load_bitcode = failed to load bitcode of module "{$name}"
 codegen_llvm_load_bitcode_with_llvm_err = failed to load bitcode of module "{$name}": {$llvm_err}
 
-codegen_llvm_lto_bitcode_from_rlib = failed to get bitcode from object file for LTO ({$llvm_err})
-
-codegen_llvm_lto_disallowed = lto can only be run for executables, cdylibs and static library outputs
-
-codegen_llvm_lto_dylib = lto cannot be used for `dylib` crate type without `-Zdylib-lto`
-
-codegen_llvm_lto_proc_macro = lto cannot be used for `proc-macro` crate type without `-Zdylib-lto`
+codegen_llvm_lto_bitcode_from_rlib = failed to get bitcode from object file for LTO ({$err})
 
 codegen_llvm_mismatch_data_layout =
     data-layout for target `{$rustc_target}`, `{$rustc_layout}`, differs from LLVM target's `{$llvm_target}` default layout, `{$llvm_layout}`
diff --git a/compiler/rustc_codegen_llvm/src/attributes.rs b/compiler/rustc_codegen_llvm/src/attributes.rs
index c32f11b27f3..c548f467583 100644
--- a/compiler/rustc_codegen_llvm/src/attributes.rs
+++ b/compiler/rustc_codegen_llvm/src/attributes.rs
@@ -1,6 +1,6 @@
 //! Set and unset common attributes on LLVM values.
-use rustc_attr_data_structures::{InlineAttr, InstructionSetAttr, OptimizeAttr};
 use rustc_codegen_ssa::traits::*;
+use rustc_hir::attrs::{InlineAttr, InstructionSetAttr, OptimizeAttr};
 use rustc_hir::def_id::DefId;
 use rustc_middle::middle::codegen_fn_attrs::{CodegenFnAttrFlags, PatchableFunctionEntry};
 use rustc_middle::ty::{self, TyCtxt};
diff --git a/compiler/rustc_codegen_llvm/src/back/lto.rs b/compiler/rustc_codegen_llvm/src/back/lto.rs
index 655e1c95373..c269f11e931 100644
--- a/compiler/rustc_codegen_llvm/src/back/lto.rs
+++ b/compiler/rustc_codegen_llvm/src/back/lto.rs
@@ -1,33 +1,29 @@
 use std::collections::BTreeMap;
 use std::ffi::{CStr, CString};
 use std::fs::File;
-use std::path::Path;
+use std::path::{Path, PathBuf};
 use std::ptr::NonNull;
 use std::sync::Arc;
 use std::{io, iter, slice};
 
 use object::read::archive::ArchiveFile;
+use object::{Object, ObjectSection};
 use rustc_codegen_ssa::back::lto::{SerializedModule, ThinModule, ThinShared};
-use rustc_codegen_ssa::back::symbol_export;
 use rustc_codegen_ssa::back::write::{CodegenContext, FatLtoInput};
 use rustc_codegen_ssa::traits::*;
 use rustc_codegen_ssa::{ModuleCodegen, ModuleKind, looks_like_rust_object_file};
 use rustc_data_structures::fx::FxHashMap;
 use rustc_data_structures::memmap::Mmap;
 use rustc_errors::{DiagCtxtHandle, FatalError};
-use rustc_hir::def_id::LOCAL_CRATE;
 use rustc_middle::bug;
 use rustc_middle::dep_graph::WorkProduct;
-use rustc_middle::middle::exported_symbols::{SymbolExportInfo, SymbolExportLevel};
-use rustc_session::config::{self, CrateType, Lto};
+use rustc_session::config::{self, Lto};
 use tracing::{debug, info};
 
 use crate::back::write::{
     self, CodegenDiagnosticsStage, DiagnosticHandlers, bitcode_section_name, save_temp_bitcode,
 };
-use crate::errors::{
-    DynamicLinkingWithLTO, LlvmError, LtoBitcodeFromRlib, LtoDisallowed, LtoDylib, LtoProcMacro,
-};
+use crate::errors::{LlvmError, LtoBitcodeFromRlib};
 use crate::llvm::AttributePlace::Function;
 use crate::llvm::{self, build_string};
 use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx, attributes};
@@ -36,45 +32,21 @@ use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx, attributes};
 /// session to determine which CGUs we can reuse.
 const THIN_LTO_KEYS_INCR_COMP_FILE_NAME: &str = "thin-lto-past-keys.bin";
 
-fn crate_type_allows_lto(crate_type: CrateType) -> bool {
-    match crate_type {
-        CrateType::Executable
-        | CrateType::Dylib
-        | CrateType::Staticlib
-        | CrateType::Cdylib
-        | CrateType::ProcMacro
-        | CrateType::Sdylib => true,
-        CrateType::Rlib => false,
-    }
-}
-
 fn prepare_lto(
     cgcx: &CodegenContext<LlvmCodegenBackend>,
+    exported_symbols_for_lto: &[String],
+    each_linked_rlib_for_lto: &[PathBuf],
     dcx: DiagCtxtHandle<'_>,
 ) -> Result<(Vec<CString>, Vec<(SerializedModule<ModuleBuffer>, CString)>), FatalError> {
-    let export_threshold = match cgcx.lto {
-        // We're just doing LTO for our one crate
-        Lto::ThinLocal => SymbolExportLevel::Rust,
-
-        // We're doing LTO for the entire crate graph
-        Lto::Fat | Lto::Thin => symbol_export::crates_export_threshold(&cgcx.crate_types),
-
-        Lto::No => panic!("didn't request LTO but we're doing LTO"),
-    };
+    let mut symbols_below_threshold = exported_symbols_for_lto
+        .iter()
+        .map(|symbol| CString::new(symbol.to_owned()).unwrap())
+        .collect::<Vec<CString>>();
 
-    let symbol_filter = &|&(ref name, info): &(String, SymbolExportInfo)| {
-        if info.level.is_below_threshold(export_threshold) || info.used {
-            Some(CString::new(name.as_str()).unwrap())
-        } else {
-            None
-        }
-    };
-    let exported_symbols = cgcx.exported_symbols.as_ref().expect("needs exported symbols for LTO");
-    let mut symbols_below_threshold = {
-        let _timer = cgcx.prof.generic_activity("LLVM_lto_generate_symbols_below_threshold");
-        exported_symbols[&LOCAL_CRATE].iter().filter_map(symbol_filter).collect::<Vec<CString>>()
-    };
-    info!("{} symbols to preserve in this crate", symbols_below_threshold.len());
+    // __llvm_profile_counter_bias is pulled in at link time by an undefined reference to
+    // __llvm_profile_runtime; therefore, we won't know until link time if this symbol
+    // should have default visibility.
+    symbols_below_threshold.push(c"__llvm_profile_counter_bias".to_owned());
 
     // If we're performing LTO for the entire crate graph, then for each of our
     // upstream dependencies, find the corresponding rlib and load the bitcode
@@ -84,37 +56,7 @@ fn prepare_lto(
     // with either fat or thin LTO
     let mut upstream_modules = Vec::new();
     if cgcx.lto != Lto::ThinLocal {
-        // Make sure we actually can run LTO
-        for crate_type in cgcx.crate_types.iter() {
-            if !crate_type_allows_lto(*crate_type) {
-                dcx.emit_err(LtoDisallowed);
-                return Err(FatalError);
-            } else if *crate_type == CrateType::Dylib {
-                if !cgcx.opts.unstable_opts.dylib_lto {
-                    dcx.emit_err(LtoDylib);
-                    return Err(FatalError);
-                }
-            } else if *crate_type == CrateType::ProcMacro && !cgcx.opts.unstable_opts.dylib_lto {
-                dcx.emit_err(LtoProcMacro);
-                return Err(FatalError);
-            }
-        }
-
-        if cgcx.opts.cg.prefer_dynamic && !cgcx.opts.unstable_opts.dylib_lto {
-            dcx.emit_err(DynamicLinkingWithLTO);
-            return Err(FatalError);
-        }
-
-        for &(cnum, ref path) in cgcx.each_linked_rlib_for_lto.iter() {
-            let exported_symbols =
-                cgcx.exported_symbols.as_ref().expect("needs exported symbols for LTO");
-            {
-                let _timer =
-                    cgcx.prof.generic_activity("LLVM_lto_generate_symbols_below_threshold");
-                symbols_below_threshold
-                    .extend(exported_symbols[&cnum].iter().filter_map(symbol_filter));
-            }
-
+        for path in each_linked_rlib_for_lto {
             let archive_data = unsafe {
                 Mmap::map(std::fs::File::open(&path).expect("couldn't open rlib"))
                     .expect("couldn't map rlib")
@@ -147,10 +89,6 @@ fn prepare_lto(
         }
     }
 
-    // __llvm_profile_counter_bias is pulled in at link time by an undefined reference to
-    // __llvm_profile_runtime, therefore we won't know until link time if this symbol
-    // should have default visibility.
-    symbols_below_threshold.push(c"__llvm_profile_counter_bias".to_owned());
     Ok((symbols_below_threshold, upstream_modules))
 }
 
@@ -168,46 +106,32 @@ fn get_bitcode_slice_from_object_data<'a>(
     // name" which in the public API for sections gets treated as part of the section name, but
     // internally in MachOObjectFile.cpp gets treated separately.
     let section_name = bitcode_section_name(cgcx).to_str().unwrap().trim_start_matches("__LLVM,");
-    let mut len = 0;
-    let data = unsafe {
-        llvm::LLVMRustGetSliceFromObjectDataByName(
-            obj.as_ptr(),
-            obj.len(),
-            section_name.as_ptr(),
-            section_name.len(),
-            &mut len,
-        )
-    };
-    if !data.is_null() {
-        assert!(len != 0);
-        let bc = unsafe { slice::from_raw_parts(data, len) };
 
-        // `bc` must be a sub-slice of `obj`.
-        assert!(obj.as_ptr() <= bc.as_ptr());
-        assert!(bc[bc.len()..bc.len()].as_ptr() <= obj[obj.len()..obj.len()].as_ptr());
+    let obj =
+        object::File::parse(obj).map_err(|err| LtoBitcodeFromRlib { err: err.to_string() })?;
 
-        Ok(bc)
-    } else {
-        assert!(len == 0);
-        Err(LtoBitcodeFromRlib {
-            llvm_err: llvm::last_error().unwrap_or_else(|| "unknown LLVM error".to_string()),
-        })
-    }
+    let section = obj
+        .section_by_name(section_name)
+        .ok_or_else(|| LtoBitcodeFromRlib { err: format!("Can't find section {section_name}") })?;
+
+    section.data().map_err(|err| LtoBitcodeFromRlib { err: err.to_string() })
 }
 
 /// Performs fat LTO by merging all modules into a single one and returning it
 /// for further optimization.
 pub(crate) fn run_fat(
     cgcx: &CodegenContext<LlvmCodegenBackend>,
+    exported_symbols_for_lto: &[String],
+    each_linked_rlib_for_lto: &[PathBuf],
     modules: Vec<FatLtoInput<LlvmCodegenBackend>>,
-    cached_modules: Vec<(SerializedModule<ModuleBuffer>, WorkProduct)>,
 ) -> Result<ModuleCodegen<ModuleLlvm>, FatalError> {
     let dcx = cgcx.create_dcx();
     let dcx = dcx.handle();
-    let (symbols_below_threshold, upstream_modules) = prepare_lto(cgcx, dcx)?;
+    let (symbols_below_threshold, upstream_modules) =
+        prepare_lto(cgcx, exported_symbols_for_lto, each_linked_rlib_for_lto, dcx)?;
     let symbols_below_threshold =
         symbols_below_threshold.iter().map(|c| c.as_ptr()).collect::<Vec<_>>();
-    fat_lto(cgcx, dcx, modules, cached_modules, upstream_modules, &symbols_below_threshold)
+    fat_lto(cgcx, dcx, modules, upstream_modules, &symbols_below_threshold)
 }
 
 /// Performs thin LTO by performing necessary global analysis and returning two
@@ -215,12 +139,15 @@ pub(crate) fn run_fat(
 /// can simply be copied over from the incr. comp. cache.
 pub(crate) fn run_thin(
     cgcx: &CodegenContext<LlvmCodegenBackend>,
+    exported_symbols_for_lto: &[String],
+    each_linked_rlib_for_lto: &[PathBuf],
     modules: Vec<(String, ThinBuffer)>,
     cached_modules: Vec<(SerializedModule<ModuleBuffer>, WorkProduct)>,
 ) -> Result<(Vec<ThinModule<LlvmCodegenBackend>>, Vec<WorkProduct>), FatalError> {
     let dcx = cgcx.create_dcx();
     let dcx = dcx.handle();
-    let (symbols_below_threshold, upstream_modules) = prepare_lto(cgcx, dcx)?;
+    let (symbols_below_threshold, upstream_modules) =
+        prepare_lto(cgcx, exported_symbols_for_lto, each_linked_rlib_for_lto, dcx)?;
     let symbols_below_threshold =
         symbols_below_threshold.iter().map(|c| c.as_ptr()).collect::<Vec<_>>();
     if cgcx.opts.cg.linker_plugin_lto.enabled() {
@@ -245,7 +172,6 @@ fn fat_lto(
     cgcx: &CodegenContext<LlvmCodegenBackend>,
     dcx: DiagCtxtHandle<'_>,
     modules: Vec<FatLtoInput<LlvmCodegenBackend>>,
-    cached_modules: Vec<(SerializedModule<ModuleBuffer>, WorkProduct)>,
     mut serialized_modules: Vec<(SerializedModule<ModuleBuffer>, CString)>,
     symbols_below_threshold: &[*const libc::c_char],
 ) -> Result<ModuleCodegen<ModuleLlvm>, FatalError> {
@@ -258,21 +184,12 @@ fn fat_lto(
     //   modules that are serialized in-memory.
     // * `in_memory` contains modules which are already parsed and in-memory,
     //   such as from multi-CGU builds.
-    //
-    // All of `cached_modules` (cached from previous incremental builds) can
-    // immediately go onto the `serialized_modules` modules list and then we can
-    // split the `modules` array into these two lists.
     let mut in_memory = Vec::new();
-    serialized_modules.extend(cached_modules.into_iter().map(|(buffer, wp)| {
-        info!("pushing cached module {:?}", wp.cgu_name);
-        (buffer, CString::new(wp.cgu_name).unwrap())
-    }));
     for module in modules {
         match module {
             FatLtoInput::InMemory(m) => in_memory.push(m),
             FatLtoInput::Serialized { name, buffer } => {
                 info!("pushing serialized module {:?}", name);
-                let buffer = SerializedModule::Local(buffer);
                 serialized_modules.push((buffer, CString::new(name).unwrap()));
             }
         }
@@ -573,10 +490,10 @@ fn thin_lto(
 
         // Save the current ThinLTO import information for the next compilation
         // session, overwriting the previous serialized data (if any).
-        if let Some(path) = key_map_path {
-            if let Err(err) = curr_key_map.save_to_file(&path) {
-                return Err(write::llvm_err(dcx, LlvmError::WriteThinLtoKey { err }));
-            }
+        if let Some(path) = key_map_path
+            && let Err(err) = curr_key_map.save_to_file(&path)
+        {
+            return Err(write::llvm_err(dcx, LlvmError::WriteThinLtoKey { err }));
         }
 
         Ok((opt_jobs, copy_jobs))
@@ -654,6 +571,7 @@ pub(crate) fn run_pass_manager(
     // We then run the llvm_optimize function a second time, to optimize the code which we generated
     // in the enzyme differentiation pass.
     let enable_ad = config.autodiff.contains(&config::AutoDiff::Enable);
+    let enable_gpu = config.offload.contains(&config::Offload::Enable);
     let stage = if thin {
         write::AutodiffStage::PreAD
     } else {
@@ -668,6 +586,12 @@ pub(crate) fn run_pass_manager(
         write::llvm_optimize(cgcx, dcx, module, None, config, opt_level, opt_stage, stage)?;
     }
 
+    if enable_gpu && !thin {
+        let cx =
+            SimpleCx::new(module.module_llvm.llmod(), &module.module_llvm.llcx, cgcx.pointer_size);
+        crate::builder::gpu_offload::handle_gpu_code(cgcx, &cx);
+    }
+
     if cfg!(llvm_enzyme) && enable_ad && !thin {
         let cx =
             SimpleCx::new(module.module_llvm.llmod(), &module.module_llvm.llcx, cgcx.pointer_size);
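For context on the get_bitcode_slice_from_object_data rewrite above: the
LLVMRustGetSliceFromObjectDataByName FFI call is replaced by a section lookup
through the `object` crate. A minimal standalone sketch of the same lookup
(the ".llvmbc" section name is an assumption here; the real code derives it
from bitcode_section_name(cgcx)):

    use object::{Object, ObjectSection};

    fn bitcode_from_object(obj: &[u8]) -> Result<&[u8], String> {
        // Parse the object file and look up the bitcode section by name.
        let file = object::File::parse(obj).map_err(|err| err.to_string())?;
        let section = file
            .section_by_name(".llvmbc")
            .ok_or_else(|| "can't find section .llvmbc".to_string())?;
        // The returned slice borrows from `obj`, mirroring the
        // sub-slice-of-`obj` assertions the old FFI version performed.
        section.data().map_err(|err| err.to_string())
    }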
diff --git a/compiler/rustc_codegen_llvm/src/back/owned_target_machine.rs b/compiler/rustc_codegen_llvm/src/back/owned_target_machine.rs
index dfde4595590..8e82013e94a 100644
--- a/compiler/rustc_codegen_llvm/src/back/owned_target_machine.rs
+++ b/compiler/rustc_codegen_llvm/src/back/owned_target_machine.rs
@@ -39,6 +39,7 @@ impl OwnedTargetMachine {
         debug_info_compression: &CStr,
         use_emulated_tls: bool,
         args_cstr_buff: &[u8],
+        use_wasm_eh: bool,
     ) -> Result<Self, LlvmError<'static>> {
         assert!(args_cstr_buff.len() > 0);
         assert!(
@@ -72,6 +73,7 @@ impl OwnedTargetMachine {
                 use_emulated_tls,
                 args_cstr_buff.as_ptr() as *const c_char,
                 args_cstr_buff.len(),
+                use_wasm_eh,
             )
         };
 
diff --git a/compiler/rustc_codegen_llvm/src/back/write.rs b/compiler/rustc_codegen_llvm/src/back/write.rs
index 68279008c03..85a06f457eb 100644
--- a/compiler/rustc_codegen_llvm/src/back/write.rs
+++ b/compiler/rustc_codegen_llvm/src/back/write.rs
@@ -15,6 +15,7 @@ use rustc_codegen_ssa::back::write::{
     BitcodeSection, CodegenContext, EmitObj, ModuleConfig, TargetMachineFactoryConfig,
     TargetMachineFactoryFn,
 };
+use rustc_codegen_ssa::base::wants_wasm_eh;
 use rustc_codegen_ssa::traits::*;
 use rustc_codegen_ssa::{CompiledModule, ModuleCodegen, ModuleKind};
 use rustc_data_structures::profiling::SelfProfilerRef;
@@ -285,6 +286,8 @@ pub(crate) fn target_machine_factory(
     let file_name_display_preference =
         sess.filename_display_preference(RemapPathScopeComponents::DEBUGINFO);
 
+    let use_wasm_eh = wants_wasm_eh(sess);
+
     Arc::new(move |config: TargetMachineFactoryConfig| {
         let path_to_cstring_helper = |path: Option<PathBuf>| -> CString {
             let path = path.unwrap_or_default();
@@ -321,6 +324,7 @@ pub(crate) fn target_machine_factory(
             &debuginfo_compression,
             use_emulated_tls,
             &args_cstr_buff,
+            use_wasm_eh,
         )
     })
 }
@@ -792,29 +796,6 @@ pub(crate) fn optimize(
     Ok(())
 }
 
-pub(crate) fn link(
-    cgcx: &CodegenContext<LlvmCodegenBackend>,
-    dcx: DiagCtxtHandle<'_>,
-    mut modules: Vec<ModuleCodegen<ModuleLlvm>>,
-) -> Result<ModuleCodegen<ModuleLlvm>, FatalError> {
-    use super::lto::{Linker, ModuleBuffer};
-    // Sort the modules by name to ensure deterministic behavior.
-    modules.sort_by(|a, b| a.name.cmp(&b.name));
-    let (first, elements) =
-        modules.split_first().expect("Bug! modules must contain at least one module.");
-
-    let mut linker = Linker::new(first.module_llvm.llmod());
-    for module in elements {
-        let _timer = cgcx.prof.generic_activity_with_arg("LLVM_link_module", &*module.name);
-        let buffer = ModuleBuffer::new(module.module_llvm.llmod());
-        linker
-            .add(buffer.data())
-            .map_err(|()| llvm_err(dcx, LlvmError::SerializeModule { name: &module.name }))?;
-    }
-    drop(linker);
-    Ok(modules.remove(0))
-}
-
 pub(crate) fn codegen(
     cgcx: &CodegenContext<LlvmCodegenBackend>,
     module: ModuleCodegen<ModuleLlvm>,
diff --git a/compiler/rustc_codegen_llvm/src/builder.rs b/compiler/rustc_codegen_llvm/src/builder.rs
index 514923ad6f3..da2a153d819 100644
--- a/compiler/rustc_codegen_llvm/src/builder.rs
+++ b/compiler/rustc_codegen_llvm/src/builder.rs
@@ -3,6 +3,7 @@ use std::ops::Deref;
 use std::{iter, ptr};
 
 pub(crate) mod autodiff;
+pub(crate) mod gpu_offload;
 
 use libc::{c_char, c_uint, size_t};
 use rustc_abi as abi;
@@ -14,6 +15,7 @@ use rustc_codegen_ssa::mir::place::PlaceRef;
 use rustc_codegen_ssa::traits::*;
 use rustc_data_structures::small_c_str::SmallCStr;
 use rustc_hir::def_id::DefId;
+use rustc_middle::bug;
 use rustc_middle::middle::codegen_fn_attrs::CodegenFnAttrs;
 use rustc_middle::ty::layout::{
     FnAbiError, FnAbiOfHelpers, FnAbiRequest, HasTypingEnv, LayoutError, LayoutOfHelpers,
@@ -23,7 +25,7 @@ use rustc_middle::ty::{self, Instance, Ty, TyCtxt};
 use rustc_sanitizers::{cfi, kcfi};
 use rustc_session::config::OptLevel;
 use rustc_span::Span;
-use rustc_target::callconv::FnAbi;
+use rustc_target::callconv::{FnAbi, PassMode};
 use rustc_target::spec::{HasTargetSpec, SanitizerSet, Target};
 use smallvec::SmallVec;
 use tracing::{debug, instrument};
@@ -117,6 +119,74 @@ impl<'a, 'll, CX: Borrow<SCx<'ll>>> GenericBuilder<'a, 'll, CX> {
         }
         bx
     }
+
+    // The generic builder has less functionality than the full builder, and thus (unlike the
+    // other alloca helper) we cannot easily jump to the beginning of the function to place our
+    // allocas there. We trust the user to do that manually. FIXME(offload): improve the
+    // GenericCx and add more LLVM wrappers to handle this.
+    pub(crate) fn direct_alloca(&mut self, ty: &'ll Type, align: Align, name: &str) -> &'ll Value {
+        let val = unsafe {
+            let alloca = llvm::LLVMBuildAlloca(self.llbuilder, ty, UNNAMED);
+            llvm::LLVMSetAlignment(alloca, align.bytes() as c_uint);
+            // Cast to default addrspace if necessary
+            llvm::LLVMBuildPointerCast(self.llbuilder, alloca, self.cx.type_ptr(), UNNAMED)
+        };
+        if !name.is_empty() {
+            let name = std::ffi::CString::new(name).unwrap();
+            llvm::set_value_name(val, name.as_bytes());
+        }
+        val
+    }
+
+    pub(crate) fn inbounds_gep(
+        &mut self,
+        ty: &'ll Type,
+        ptr: &'ll Value,
+        indices: &[&'ll Value],
+    ) -> &'ll Value {
+        unsafe {
+            llvm::LLVMBuildGEPWithNoWrapFlags(
+                self.llbuilder,
+                ty,
+                ptr,
+                indices.as_ptr(),
+                indices.len() as c_uint,
+                UNNAMED,
+                GEPNoWrapFlags::InBounds,
+            )
+        }
+    }
+
+    pub(crate) fn store(&mut self, val: &'ll Value, ptr: &'ll Value, align: Align) -> &'ll Value {
+        debug!("Store {:?} -> {:?}", val, ptr);
+        assert_eq!(self.cx.type_kind(self.cx.val_ty(ptr)), TypeKind::Pointer);
+        unsafe {
+            let store = llvm::LLVMBuildStore(self.llbuilder, val, ptr);
+            llvm::LLVMSetAlignment(store, align.bytes() as c_uint);
+            store
+        }
+    }
+
+    pub(crate) fn load(&mut self, ty: &'ll Type, ptr: &'ll Value, align: Align) -> &'ll Value {
+        unsafe {
+            let load = llvm::LLVMBuildLoad2(self.llbuilder, ty, ptr, UNNAMED);
+            llvm::LLVMSetAlignment(load, align.bytes() as c_uint);
+            load
+        }
+    }
+
+    fn memset(&mut self, ptr: &'ll Value, fill_byte: &'ll Value, size: &'ll Value, align: Align) {
+        unsafe {
+            llvm::LLVMRustBuildMemSet(
+                self.llbuilder,
+                ptr,
+                align.bytes() as c_uint,
+                fill_byte,
+                size,
+                false,
+            );
+        }
+    }
 }
 
 /// Empty string, to be used where LLVM expects an instruction name, indicating
@@ -618,10 +688,10 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
                         bx.nonnull_metadata(load);
                     }
 
-                    if let Some(pointee) = layout.pointee_info_at(bx, offset) {
-                        if let Some(_) = pointee.safe {
-                            bx.align_metadata(load, pointee.align);
-                        }
+                    if let Some(pointee) = layout.pointee_info_at(bx, offset)
+                        && let Some(_) = pointee.safe
+                    {
+                        bx.align_metadata(load, pointee.align);
                     }
                 }
                 abi::Primitive::Float(_) => {}
@@ -1362,6 +1432,28 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
         call
     }
 
+    fn tail_call(
+        &mut self,
+        llty: Self::Type,
+        fn_attrs: Option<&CodegenFnAttrs>,
+        fn_abi: &FnAbi<'tcx, Ty<'tcx>>,
+        llfn: Self::Value,
+        args: &[Self::Value],
+        funclet: Option<&Self::Funclet>,
+        instance: Option<Instance<'tcx>>,
+    ) {
+        let call = self.call(llty, fn_attrs, Some(fn_abi), llfn, args, funclet, instance);
+        llvm::LLVMRustSetTailCallKind(call, llvm::TailCallKind::MustTail);
+
+        match &fn_abi.ret.mode {
+            PassMode::Ignore | PassMode::Indirect { .. } => self.ret_void(),
+            PassMode::Direct(_) | PassMode::Pair { .. } => self.ret(call),
+            mode @ PassMode::Cast { .. } => {
+                bug!("Encountered `PassMode::{mode:?}` during codegen")
+            }
+        }
+    }
+
     fn zext(&mut self, val: &'ll Value, dest_ty: &'ll Type) -> &'ll Value {
         unsafe { llvm::LLVMBuildZExt(self.llbuilder, val, dest_ty, UNNAMED) }
     }
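A note on the tail_call addition above: LLVM requires a musttail call to be
immediately followed by a ret of its result (or ret void), which is why each
PassMode arm pairs the call with a matching return. An illustrative sketch of
the resulting IR (placeholder names and types, not taken from a real build):

    // PassMode::Direct: return the call's result directly.
    //   %r = musttail call i32 @callee(i32 %x)
    //   ret i32 %r
    //
    // PassMode::Ignore / PassMode::Indirect: the result is unused or is
    // written through an sret pointer, so the caller returns void.
    //   musttail call void @callee(ptr sret(%T) %ret_slot)
    //   ret void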
diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
new file mode 100644
index 00000000000..1280ab1442a
--- /dev/null
+++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
@@ -0,0 +1,439 @@
+use std::ffi::CString;
+
+use llvm::Linkage::*;
+use rustc_abi::Align;
+use rustc_codegen_ssa::back::write::CodegenContext;
+use rustc_codegen_ssa::traits::BaseTypeCodegenMethods;
+
+use crate::builder::SBuilder;
+use crate::common::AsCCharPtr;
+use crate::llvm::AttributePlace::Function;
+use crate::llvm::{self, Linkage, Type, Value};
+use crate::{LlvmCodegenBackend, SimpleCx, attributes};
+
+pub(crate) fn handle_gpu_code<'ll>(
+    _cgcx: &CodegenContext<LlvmCodegenBackend>,
+    cx: &'ll SimpleCx<'_>,
+) {
+    // The offload memory transfer type for each kernel
+    let mut o_types = vec![];
+    let mut kernels = vec![];
+    let offload_entry_ty = add_tgt_offload_entry(&cx);
+    for num in 0..9 {
+        let kernel = cx.get_function(&format!("kernel_{num}"));
+        if let Some(kernel) = kernel {
+            o_types.push(gen_define_handling(&cx, kernel, offload_entry_ty, num));
+            kernels.push(kernel);
+        }
+    }
+
+    gen_call_handling(&cx, &kernels, &o_types);
+}
+
+// What is our @1 here? A magic global, used in our data_{begin/update/end}_mapper:
+// @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+// @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8
+fn generate_at_one<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll llvm::Value {
+    // @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+    let unknown_txt = ";unknown;unknown;0;0;;";
+    let c_entry_name = CString::new(unknown_txt).unwrap();
+    let c_val = c_entry_name.as_bytes_with_nul();
+    let initializer = crate::common::bytes_in_context(cx.llcx, c_val);
+    let at_zero = add_unnamed_global(&cx, &"", initializer, PrivateLinkage);
+    llvm::set_alignment(at_zero, Align::ONE);
+
+    // @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8
+    let struct_ident_ty = cx.type_named_struct("struct.ident_t");
+    let struct_elems = vec![
+        cx.get_const_i32(0),
+        cx.get_const_i32(2),
+        cx.get_const_i32(0),
+        cx.get_const_i32(22),
+        at_zero,
+    ];
+    let struct_elems_ty: Vec<_> = struct_elems.iter().map(|&x| cx.val_ty(x)).collect();
+    let initializer = crate::common::named_struct(struct_ident_ty, &struct_elems);
+    cx.set_struct_body(struct_ident_ty, &struct_elems_ty, false);
+    let at_one = add_unnamed_global(&cx, &"", initializer, PrivateLinkage);
+    llvm::set_alignment(at_one, Align::EIGHT);
+    at_one
+}
+
+pub(crate) fn add_tgt_offload_entry<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll llvm::Type {
+    let offload_entry_ty = cx.type_named_struct("struct.__tgt_offload_entry");
+    let tptr = cx.type_ptr();
+    let ti64 = cx.type_i64();
+    let ti32 = cx.type_i32();
+    let ti16 = cx.type_i16();
+    // For each kernel to run on the GPU, we will later generate one entry of this type.
+    // Copied from LLVM:
+    // typedef struct {
+    //   uint64_t Reserved;
+    //   uint16_t Version;
+    //   uint16_t Kind;
+    //   uint32_t Flags; Flags associated with the entry (see Target Region Entry Flags)
+    //   void *Address; Address of global symbol within device image (function or global)
+    //   char *SymbolName;
+    //   uint64_t Size; Size of the entry info (0 if it is a function)
+    //   uint64_t Data;
+    //   void *AuxAddr;
+    // } __tgt_offload_entry;
+    let entry_elements = vec![ti64, ti16, ti16, ti32, tptr, tptr, ti64, ti64, tptr];
+    cx.set_struct_body(offload_entry_ty, &entry_elements, false);
+    offload_entry_ty
+}
+
+fn gen_tgt_kernel_global<'ll>(cx: &'ll SimpleCx<'_>) {
+    let kernel_arguments_ty = cx.type_named_struct("struct.__tgt_kernel_arguments");
+    let tptr = cx.type_ptr();
+    let ti64 = cx.type_i64();
+    let ti32 = cx.type_i32();
+    let tarr = cx.type_array(ti32, 3);
+
+    // Taken from the LLVM APITypes.h declaration:
+    //struct KernelArgsTy {
+    //  uint32_t Version = 0; // Version of this struct for ABI compatibility.
+    //  uint32_t NumArgs = 0; // Number of arguments in each input pointer.
+    //  void **ArgBasePtrs =
+    //      nullptr;                 // Base pointer of each argument (e.g. a struct).
+    //  void **ArgPtrs = nullptr;    // Pointer to the argument data.
+    //  int64_t *ArgSizes = nullptr; // Size of the argument data in bytes.
+    //  int64_t *ArgTypes = nullptr; // Type of the data (e.g. to / from).
+    //  void **ArgNames = nullptr;   // Name of the data for debugging, possibly null.
+    //  void **ArgMappers = nullptr; // User-defined mappers, possibly null.
+    //  uint64_t Tripcount =
+    //      0; // Tripcount for the teams / distribute loop, 0 otherwise.
+    //  struct {
+    //    uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
+    //    uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
+    //    uint64_t Unused : 62;
+    //  } Flags = {0, 0, 0};
+    //  // The number of teams (for x,y,z dimension).
+    //  uint32_t NumTeams[3] = {0, 0, 0};
+    //  // The number of threads (for x,y,z dimension).
+    //  uint32_t ThreadLimit[3] = {0, 0, 0};
+    //  uint32_t DynCGroupMem = 0; // Amount of dynamic cgroup memory requested.
+    //};
+    let kernel_elements =
+        vec![ti32, ti32, tptr, tptr, tptr, tptr, tptr, tptr, ti64, ti64, tarr, tarr, ti32];
+
+    cx.set_struct_body(kernel_arguments_ty, &kernel_elements, false);
+    // We don't handle kernels yet, so for now we just add a dummy global
+    // to make sure that the __tgt_offload_entry is defined and handled correctly.
+    cx.declare_global("my_struct_global2", kernel_arguments_ty);
+}
+
+fn gen_tgt_data_mappers<'ll>(
+    cx: &'ll SimpleCx<'_>,
+) -> (&'ll llvm::Value, &'ll llvm::Value, &'ll llvm::Value, &'ll llvm::Type) {
+    let tptr = cx.type_ptr();
+    let ti64 = cx.type_i64();
+    let ti32 = cx.type_i32();
+
+    let args = vec![tptr, ti64, ti32, tptr, tptr, tptr, tptr, tptr, tptr];
+    let mapper_fn_ty = cx.type_func(&args, cx.type_void());
+    let mapper_begin = "__tgt_target_data_begin_mapper";
+    let mapper_update = "__tgt_target_data_update_mapper";
+    let mapper_end = "__tgt_target_data_end_mapper";
+    let begin_mapper_decl = declare_offload_fn(&cx, mapper_begin, mapper_fn_ty);
+    let update_mapper_decl = declare_offload_fn(&cx, mapper_update, mapper_fn_ty);
+    let end_mapper_decl = declare_offload_fn(&cx, mapper_end, mapper_fn_ty);
+
+    let nounwind = llvm::AttributeKind::NoUnwind.create_attr(cx.llcx);
+    attributes::apply_to_llfn(begin_mapper_decl, Function, &[nounwind]);
+    attributes::apply_to_llfn(update_mapper_decl, Function, &[nounwind]);
+    attributes::apply_to_llfn(end_mapper_decl, Function, &[nounwind]);
+
+    (begin_mapper_decl, update_mapper_decl, end_mapper_decl, mapper_fn_ty)
+}
+
+fn add_priv_unnamed_arr<'ll>(cx: &SimpleCx<'ll>, name: &str, vals: &[u64]) -> &'ll llvm::Value {
+    let ti64 = cx.type_i64();
+    let mut size_val = Vec::with_capacity(vals.len());
+    for &val in vals {
+        size_val.push(cx.get_const_i64(val));
+    }
+    let initializer = cx.const_array(ti64, &size_val);
+    add_unnamed_global(cx, name, initializer, PrivateLinkage)
+}
+
+pub(crate) fn add_unnamed_global<'ll>(
+    cx: &SimpleCx<'ll>,
+    name: &str,
+    initializer: &'ll llvm::Value,
+    l: Linkage,
+) -> &'ll llvm::Value {
+    let llglobal = add_global(cx, name, initializer, l);
+    llvm::LLVMSetUnnamedAddress(llglobal, llvm::UnnamedAddr::Global);
+    llglobal
+}
+
+pub(crate) fn add_global<'ll>(
+    cx: &SimpleCx<'ll>,
+    name: &str,
+    initializer: &'ll llvm::Value,
+    l: Linkage,
+) -> &'ll llvm::Value {
+    let c_name = CString::new(name).unwrap();
+    let llglobal: &'ll llvm::Value = llvm::add_global(cx.llmod, cx.val_ty(initializer), &c_name);
+    llvm::set_global_constant(llglobal, true);
+    llvm::set_linkage(llglobal, l);
+    llvm::set_initializer(llglobal, initializer);
+    llglobal
+}
+
+fn gen_define_handling<'ll>(
+    cx: &'ll SimpleCx<'_>,
+    kernel: &'ll llvm::Value,
+    offload_entry_ty: &'ll llvm::Type,
+    num: i64,
+) -> &'ll llvm::Value {
+    let types = cx.func_params_types(cx.get_type_of_global(kernel));
+    // It seems like non-pointer values are automatically mapped. So here, we focus on pointer (or
+    // reference) types.
+    let num_ptr_types = types
+        .iter()
+        .filter(|&&x| matches!(cx.type_kind(x), rustc_codegen_ssa::common::TypeKind::Pointer))
+        .count();
+
+    // We no longer know their size at this level, so hardcode a placeholder.
+    // A follow-up PR will track these from the frontend, where we still have Rust types.
+    // Then, we will be able to figure out that e.g. `&[f32;256]` will result in 4*256 bytes.
+    // I decided that 1024 bytes is a great placeholder value for now.
+    add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{num}"), &vec![1024; num_ptr_types]);
+    // Here we figure out whether something needs to be copied to the GPU (=1), from the GPU (=2),
+    // or both to and from the GPU (=3). Other values shouldn't affect us for now.
+    // An immutable reference or pointer will be 1; an array that's not read but fully overwritten
+    // will be 2. For now, everything is 3, until we have our frontend set up.
+    let o_types =
+        add_priv_unnamed_arr(&cx, &format!(".offload_maptypes.{num}"), &vec![3; num_ptr_types]);
+    // Next: for each function, generate these three entries: a weak constant,
+    // the llvm.rodata entry name, and the omp_offloading_entries value.
+
+    let name = format!(".kernel_{num}.region_id");
+    let initializer = cx.get_const_i8(0);
+    let region_id = add_unnamed_global(&cx, &name, initializer, WeakAnyLinkage);
+
+    let c_entry_name = CString::new(format!("kernel_{num}")).unwrap();
+    let c_val = c_entry_name.as_bytes_with_nul();
+    let offload_entry_name = format!(".offloading.entry_name.{num}");
+
+    let initializer = crate::common::bytes_in_context(cx.llcx, c_val);
+    let llglobal = add_unnamed_global(&cx, &offload_entry_name, initializer, InternalLinkage);
+    llvm::set_alignment(llglobal, Align::ONE);
+    llvm::set_section(llglobal, c".llvm.rodata.offloading");
+
+    // Not actively used yet; will be needed for calling real kernels.
+    let name = format!(".offloading.entry.kernel_{num}");
+
+    // See the __tgt_offload_entry documentation above.
+    let reserved = cx.get_const_i64(0);
+    let version = cx.get_const_i16(1);
+    let kind = cx.get_const_i16(1);
+    let flags = cx.get_const_i32(0);
+    let size = cx.get_const_i64(0);
+    let data = cx.get_const_i64(0);
+    let aux_addr = cx.const_null(cx.type_ptr());
+    let elems = vec![reserved, version, kind, flags, region_id, llglobal, size, data, aux_addr];
+
+    let initializer = crate::common::named_struct(offload_entry_ty, &elems);
+    let c_name = CString::new(name).unwrap();
+    let llglobal = llvm::add_global(cx.llmod, offload_entry_ty, &c_name);
+    llvm::set_global_constant(llglobal, true);
+    llvm::set_linkage(llglobal, WeakAnyLinkage);
+    llvm::set_initializer(llglobal, initializer);
+    llvm::set_alignment(llglobal, Align::ONE);
+    let c_section_name = CString::new(".omp_offloading_entries").unwrap();
+    llvm::set_section(llglobal, &c_section_name);
+    o_types
+}
+
+fn declare_offload_fn<'ll>(
+    cx: &'ll SimpleCx<'_>,
+    name: &str,
+    ty: &'ll llvm::Type,
+) -> &'ll llvm::Value {
+    crate::declare::declare_simple_fn(
+        cx,
+        name,
+        llvm::CallConv::CCallConv,
+        llvm::UnnamedAddr::No,
+        llvm::Visibility::Default,
+        ty,
+    )
+}
+
+// For each kernel *call*, we now use some of our previously declared globals to move data to and
+// from the GPU. We don't have a proper frontend yet, so we assume that every call to a kernel
+// function from main is intended to run on the GPU. For now, we only handle the data transfer part.
+// If two consecutive kernels use the same memory, we still move it to the host and back to the GPU.
+// Since in our frontend users (by default) don't have to specify data transfers, this is something
+// we should optimize in the future! We also assume that everything should be copied back and forth,
+// but sometimes we could directly zero-allocate on the device and only move data back, or, if
+// something is immutable, only copy it to the device and not back.
+//
+// Current steps:
+// 0. Alloca some variables for the following steps.
+// 1. Set the insert point before the kernel call.
+// 2. Generate all the GEPs and stores, to be used in 3).
+// 3. Generate __tgt_target_data_begin calls to move data to the GPU.
+//
+// Unchanged: keep the kernel call. Later we will move the kernel itself to the GPU.
+//
+// 4. Set the insert point after the kernel call.
+// 5. Generate all the GEPs and stores, to be used in 6).
+// 6. Generate __tgt_target_data_end calls to move data from the GPU.
+fn gen_call_handling<'ll>(
+    cx: &'ll SimpleCx<'_>,
+    _kernels: &[&'ll llvm::Value],
+    o_types: &[&'ll llvm::Value],
+) {
+    // %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr }
+    let tptr = cx.type_ptr();
+    let ti32 = cx.type_i32();
+    let tgt_bin_desc_ty = vec![ti32, tptr, tptr, tptr];
+    let tgt_bin_desc = cx.type_named_struct("struct.__tgt_bin_desc");
+    cx.set_struct_body(tgt_bin_desc, &tgt_bin_desc_ty, false);
+
+    gen_tgt_kernel_global(&cx);
+    let (begin_mapper_decl, _, end_mapper_decl, fn_ty) = gen_tgt_data_mappers(&cx);
+
+    let main_fn = cx.get_function("main");
+    let Some(main_fn) = main_fn else { return };
+    let kernel_name = "kernel_1";
+    let call = unsafe {
+        llvm::LLVMRustGetFunctionCall(main_fn, kernel_name.as_c_char_ptr(), kernel_name.len())
+    };
+    let Some(kernel_call) = call else {
+        return;
+    };
+    let kernel_call_bb = unsafe { llvm::LLVMGetInstructionParent(kernel_call) };
+    let called = unsafe { llvm::LLVMGetCalledValue(kernel_call).unwrap() };
+    let mut builder = SBuilder::build(cx, kernel_call_bb);
+
+    let types = cx.func_params_types(cx.get_type_of_global(called));
+    let num_args = types.len() as u64;
+
+    // Step 0)
+    // %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr }
+    // %6 = alloca %struct.__tgt_bin_desc, align 8
+    unsafe { llvm::LLVMRustPositionBuilderPastAllocas(builder.llbuilder, main_fn) };
+
+    let tgt_bin_desc_alloca = builder.direct_alloca(tgt_bin_desc, Align::EIGHT, "EmptyDesc");
+
+    let ty = cx.type_array(cx.type_ptr(), num_args);
+    // Baseptrs are just the input pointers to the kernel, stored in a local alloca.
+    let a1 = builder.direct_alloca(ty, Align::EIGHT, ".offload_baseptrs");
+    // Ptrs are the result of a gep into the baseptr, at least for our trivial types.
+    let a2 = builder.direct_alloca(ty, Align::EIGHT, ".offload_ptrs");
+    // These represent the sizes in bytes, e.g. the entry for `&[f64; 16]` will be 8*16.
+    let ty2 = cx.type_array(cx.type_i64(), num_args);
+    let a4 = builder.direct_alloca(ty2, Align::EIGHT, ".offload_sizes");
+    // Now we allocate, once per function param, a copy to be passed to one of our mappers.
+    let mut vals = vec![];
+    let mut geps = vec![];
+    let i32_0 = cx.get_const_i32(0);
+    for (index, in_ty) in types.iter().enumerate() {
+        // get function arg, store it into the alloca, and read it.
+        let p = llvm::get_param(called, index as u32);
+        let name = llvm::get_value_name(p);
+        let name = str::from_utf8(&name).unwrap();
+        let arg_name = format!("{name}.addr");
+        let alloca = builder.direct_alloca(in_ty, Align::EIGHT, &arg_name);
+
+        builder.store(p, alloca, Align::EIGHT);
+        let val = builder.load(in_ty, alloca, Align::EIGHT);
+        let gep = builder.inbounds_gep(cx.type_f32(), val, &[i32_0]);
+        vals.push(val);
+        geps.push(gep);
+    }
+
+    // Step 1)
+    unsafe { llvm::LLVMRustPositionBefore(builder.llbuilder, kernel_call) };
+    builder.memset(tgt_bin_desc_alloca, cx.get_const_i8(0), cx.get_const_i64(32), Align::EIGHT);
+
+    let mapper_fn_ty = cx.type_func(&[cx.type_ptr()], cx.type_void());
+    let register_lib_decl = declare_offload_fn(&cx, "__tgt_register_lib", mapper_fn_ty);
+    let unregister_lib_decl = declare_offload_fn(&cx, "__tgt_unregister_lib", mapper_fn_ty);
+    let init_ty = cx.type_func(&[], cx.type_void());
+    let init_rtls_decl = declare_offload_fn(cx, "__tgt_init_all_rtls", init_ty);
+
+    // call void @__tgt_register_lib(ptr noundef %6)
+    builder.call(mapper_fn_ty, register_lib_decl, &[tgt_bin_desc_alloca], None);
+    // call void @__tgt_init_all_rtls()
+    builder.call(init_ty, init_rtls_decl, &[], None);
+
+    for i in 0..num_args {
+        let idx = cx.get_const_i32(i);
+        let gep1 = builder.inbounds_gep(ty, a1, &[i32_0, idx]);
+        builder.store(vals[i as usize], gep1, Align::EIGHT);
+        let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, idx]);
+        builder.store(geps[i as usize], gep2, Align::EIGHT);
+        let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]);
+        // As mentioned above, we don't use Rust type information yet. So for now we will just
+        // assume that we have 1024 bytes (256 f32 values).
+        // FIXME(offload): write an offload frontend and handle arbitrary types.
+        builder.store(cx.get_const_i64(1024), gep3, Align::EIGHT);
+    }
+
+    // For now we have a very simplistic indexing scheme into our
+    // offload_{baseptrs,ptrs,sizes}. We will probably improve this along with our GPU frontend PR.
+    fn get_geps<'a, 'll>(
+        builder: &mut SBuilder<'a, 'll>,
+        cx: &'ll SimpleCx<'ll>,
+        ty: &'ll Type,
+        ty2: &'ll Type,
+        a1: &'ll Value,
+        a2: &'ll Value,
+        a4: &'ll Value,
+    ) -> (&'ll Value, &'ll Value, &'ll Value) {
+        let i32_0 = cx.get_const_i32(0);
+
+        let gep1 = builder.inbounds_gep(ty, a1, &[i32_0, i32_0]);
+        let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, i32_0]);
+        let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, i32_0]);
+        (gep1, gep2, gep3)
+    }
+
+    fn generate_mapper_call<'a, 'll>(
+        builder: &mut SBuilder<'a, 'll>,
+        cx: &'ll SimpleCx<'ll>,
+        geps: (&'ll Value, &'ll Value, &'ll Value),
+        o_type: &'ll Value,
+        fn_to_call: &'ll Value,
+        fn_ty: &'ll Type,
+        num_args: u64,
+        s_ident_t: &'ll Value,
+    ) {
+        let nullptr = cx.const_null(cx.type_ptr());
+        let i64_max = cx.get_const_i64(u64::MAX);
+        let num_args = cx.get_const_i32(num_args);
+        let args =
+            vec![s_ident_t, i64_max, num_args, geps.0, geps.1, geps.2, o_type, nullptr, nullptr];
+        builder.call(fn_ty, fn_to_call, &args, None);
+    }
+
+    // Step 2)
+    let s_ident_t = generate_at_one(&cx);
+    let o = o_types[0];
+    let geps = get_geps(&mut builder, &cx, ty, ty2, a1, a2, a4);
+    generate_mapper_call(&mut builder, &cx, geps, o, begin_mapper_decl, fn_ty, num_args, s_ident_t);
+
+    // Step 3)
+    // Here we will add code for the actual kernel launches in a follow-up PR.
+    // FIXME(offload): launch kernels
+
+    // Step 4)
+    unsafe { llvm::LLVMRustPositionAfter(builder.llbuilder, kernel_call) };
+
+    let geps = get_geps(&mut builder, &cx, ty, ty2, a1, a2, a4);
+    generate_mapper_call(&mut builder, &cx, geps, o, end_mapper_decl, fn_ty, num_args, s_ident_t);
+
+    builder.call(mapper_fn_ty, unregister_lib_decl, &[tgt_bin_desc_alloca], None);
+
+    // With this we generated the following begin and end mappers. We could generate the
+    // update mapper analogously in a follow-up.
+    // call void @__tgt_target_data_begin_mapper(ptr @1, i64 -1, i32 3, ptr %27, ptr %28, ptr %29, ptr @.offload_maptypes, ptr null, ptr null)
+    // call void @__tgt_target_data_update_mapper(ptr @1, i64 -1, i32 2, ptr %46, ptr %47, ptr %48, ptr @.offload_maptypes.1, ptr null, ptr null)
+    // call void @__tgt_target_data_end_mapper(ptr @1, i64 -1, i32 3, ptr %49, ptr %50, ptr %51, ptr @.offload_maptypes, ptr null, ptr null)
+}
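To make the scanning in handle_gpu_code concrete: it only picks up functions
literally named kernel_0 through kernel_8, and the hardcoded 1024-byte sizes
above assume each pointer argument refers to 1024 bytes of data. A
hypothetical kernel this pass would match (the name and signature are
illustrative only; the real offload frontend is left to follow-up PRs):

    // Hypothetical: discovered via cx.get_function("kernel_1"), and its call
    // from main() is what gen_call_handling wraps with the mapper calls.
    // The 1024-byte placeholder corresponds to e.g. a &mut [f32; 256] argument.
    #[unsafe(no_mangle)]
    pub extern "C" fn kernel_1(x: &mut [f32; 256]) {
        for v in x.iter_mut() {
            *v += 1.0;
        }
    }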
diff --git a/compiler/rustc_codegen_llvm/src/callee.rs b/compiler/rustc_codegen_llvm/src/callee.rs
index 5a3dd90ab24..791a71d73ae 100644
--- a/compiler/rustc_codegen_llvm/src/callee.rs
+++ b/compiler/rustc_codegen_llvm/src/callee.rs
@@ -103,7 +103,7 @@ pub(crate) fn get_fn<'ll, 'tcx>(cx: &CodegenCx<'ll, 'tcx>, instance: Instance<'t
             // This is a monomorphization of a generic function.
             if !(cx.tcx.sess.opts.share_generics()
                 || tcx.codegen_instance_attrs(instance.def).inline
-                    == rustc_attr_data_structures::InlineAttr::Never)
+                    == rustc_hir::attrs::InlineAttr::Never)
             {
                 // When not sharing generics, all instances are in the same
                 // crate and have hidden visibility.
diff --git a/compiler/rustc_codegen_llvm/src/common.rs b/compiler/rustc_codegen_llvm/src/common.rs
index f9ab96b5789..f29fefb66f0 100644
--- a/compiler/rustc_codegen_llvm/src/common.rs
+++ b/compiler/rustc_codegen_llvm/src/common.rs
@@ -118,6 +118,10 @@ impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> {
             r
         }
     }
+
+    pub(crate) fn const_null(&self, t: &'ll Type) -> &'ll Value {
+        unsafe { llvm::LLVMConstNull(t) }
+    }
 }
 
 impl<'ll, 'tcx> ConstCodegenMethods for CodegenCx<'ll, 'tcx> {
@@ -377,6 +381,11 @@ pub(crate) fn bytes_in_context<'ll>(llcx: &'ll llvm::Context, bytes: &[u8]) -> &
     }
 }
 
+pub(crate) fn named_struct<'ll>(ty: &'ll Type, elts: &[&'ll Value]) -> &'ll Value {
+    let len = c_uint::try_from(elts.len()).expect("LLVMConstNamedStruct elements len overflow");
+    unsafe { llvm::LLVMConstNamedStruct(ty, elts.as_ptr(), len) }
+}
+
 fn struct_in_context<'ll>(
     llcx: &'ll llvm::Context,
     elts: &[&'ll Value],
diff --git a/compiler/rustc_codegen_llvm/src/context.rs b/compiler/rustc_codegen_llvm/src/context.rs
index 6a23becaa96..ee77774c688 100644
--- a/compiler/rustc_codegen_llvm/src/context.rs
+++ b/compiler/rustc_codegen_llvm/src/context.rs
@@ -207,11 +207,16 @@ pub(crate) unsafe fn create_module<'ll>(
             // LLVM 21 updated the default layout on nvptx: https://github.com/llvm/llvm-project/pull/124961
             target_data_layout = target_data_layout.replace("e-p6:32:32-i64", "e-i64");
         }
+        if sess.target.arch == "amdgpu" {
+            // LLVM 21 adds the address width for address space 8.
+            // See https://github.com/llvm/llvm-project/pull/139419
+            target_data_layout = target_data_layout.replace("p8:128:128:128:48", "p8:128:128")
+        }
     }
 
     // Ensure the data-layout values hardcoded remain the defaults.
     {
-        let tm = crate::back::write::create_informational_target_machine(tcx.sess, false);
+        let tm = crate::back::write::create_informational_target_machine(sess, false);
         unsafe {
             llvm::LLVMRustSetDataLayoutFromTargetMachine(llmod, tm.raw());
         }
@@ -680,6 +685,22 @@ impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> {
         unsafe { llvm::LLVMConstInt(ty, val, llvm::False) }
     }
 
+    pub(crate) fn get_const_i64(&self, n: u64) -> &'ll Value {
+        self.get_const_int(self.type_i64(), n)
+    }
+
+    pub(crate) fn get_const_i32(&self, n: u64) -> &'ll Value {
+        self.get_const_int(self.type_i32(), n)
+    }
+
+    pub(crate) fn get_const_i16(&self, n: u64) -> &'ll Value {
+        self.get_const_int(self.type_i16(), n)
+    }
+
+    pub(crate) fn get_const_i8(&self, n: u64) -> &'ll Value {
+        self.get_const_int(self.type_i8(), n)
+    }
+
     pub(crate) fn get_function(&self, name: &str) -> Option<&'ll Value> {
         let name = SmallCStr::new(name);
         unsafe { llvm::LLVMGetNamedFunction((**self).borrow().llmod, name.as_ptr()) }
diff --git a/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen.rs b/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen.rs
index a9be833a643..8c9dfcfd18c 100644
--- a/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen.rs
+++ b/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen.rs
@@ -46,21 +46,17 @@ pub(crate) fn finalize(cx: &mut CodegenCx<'_, '_>) {
     debug!("Generating coverage map for CodegenUnit: `{}`", cx.codegen_unit.name());
 
     // FIXME(#132395): Can this be none even when coverage is enabled?
-    let instances_used = match cx.coverage_cx {
-        Some(ref cx) => cx.instances_used.borrow(),
-        None => return,
-    };
+    let Some(ref coverage_cx) = cx.coverage_cx else { return };
 
-    let mut covfun_records = instances_used
-        .iter()
-        .copied()
+    let mut covfun_records = coverage_cx
+        .instances_used()
+        .into_iter()
         // Sort by symbol name, so that the global file table is built in an
         // order that doesn't depend on the stable-hash-based order in which
         // instances were visited during codegen.
         .sorted_by_cached_key(|&instance| tcx.symbol_name(instance).name)
         .filter_map(|instance| prepare_covfun_record(tcx, instance, true))
         .collect::<Vec<_>>();
-    drop(instances_used);
 
     // In a single designated CGU, also prepare covfun records for functions
     // in this crate that were instrumented for coverage, but are unused.
diff --git a/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen/spans.rs b/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen/spans.rs
index 39a59560c9d..574463be7ff 100644
--- a/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen/spans.rs
+++ b/compiler/rustc_codegen_llvm/src/coverageinfo/mapgen/spans.rs
@@ -39,7 +39,10 @@ impl Coords {
 /// or other expansions), and if it does happen then skipping a span or function is
 /// better than an ICE or `llvm-cov` failure that the user might have no way to avoid.
 pub(crate) fn make_coords(source_map: &SourceMap, file: &SourceFile, span: Span) -> Option<Coords> {
-    let span = ensure_non_empty_span(source_map, span)?;
+    if span.is_empty() {
+        debug_assert!(false, "can't make coords from empty span: {span:?}");
+        return None;
+    }
 
     let lo = span.lo();
     let hi = span.hi();
@@ -70,29 +73,6 @@ pub(crate) fn make_coords(source_map: &SourceMap, file: &SourceFile, span: Span)
     })
 }
 
-fn ensure_non_empty_span(source_map: &SourceMap, span: Span) -> Option<Span> {
-    if !span.is_empty() {
-        return Some(span);
-    }
-
-    // The span is empty, so try to enlarge it to cover an adjacent '{' or '}'.
-    source_map
-        .span_to_source(span, |src, start, end| try {
-            // Adjusting span endpoints by `BytePos(1)` is normally a bug,
-            // but in this case we have specifically checked that the character
-            // we're skipping over is one of two specific ASCII characters, so
-            // adjusting by exactly 1 byte is correct.
-            if src.as_bytes().get(end).copied() == Some(b'{') {
-                Some(span.with_hi(span.hi() + BytePos(1)))
-            } else if start > 0 && src.as_bytes()[start - 1] == b'}' {
-                Some(span.with_lo(span.lo() - BytePos(1)))
-            } else {
-                None
-            }
-        })
-        .ok()?
-}
-
 /// If `llvm-cov` sees a source region that is improperly ordered (end < start),
 /// it will immediately exit with a fatal error. To prevent that from happening,
 /// discard regions that are improperly ordered, or might be interpreted in a
diff --git a/compiler/rustc_codegen_llvm/src/coverageinfo/mod.rs b/compiler/rustc_codegen_llvm/src/coverageinfo/mod.rs
index eefbd7cf6c4..119237abd6b 100644
--- a/compiler/rustc_codegen_llvm/src/coverageinfo/mod.rs
+++ b/compiler/rustc_codegen_llvm/src/coverageinfo/mod.rs
@@ -5,7 +5,7 @@ use rustc_abi::Size;
 use rustc_codegen_ssa::traits::{
     BuilderMethods, ConstCodegenMethods, CoverageInfoBuilderMethods, MiscCodegenMethods,
 };
-use rustc_data_structures::fx::{FxHashMap, FxIndexSet};
+use rustc_data_structures::fx::{FxHashMap, FxIndexMap};
 use rustc_middle::mir::coverage::CoverageKind;
 use rustc_middle::ty::Instance;
 use tracing::{debug, instrument};
@@ -20,9 +20,14 @@ mod mapgen;
 
 /// Extra per-CGU context/state needed for coverage instrumentation.
 pub(crate) struct CguCoverageContext<'ll, 'tcx> {
-    /// Coverage data for each instrumented function identified by DefId.
-    pub(crate) instances_used: RefCell<FxIndexSet<Instance<'tcx>>>,
-    pub(crate) pgo_func_name_var_map: RefCell<FxHashMap<Instance<'tcx>, &'ll llvm::Value>>,
+    /// Associates function instances with an LLVM global that holds the
+    /// function's symbol name, as needed by LLVM coverage intrinsics.
+    ///
+    /// Instances in this map are also considered "used" for the purposes of
+    /// emitting covfun records. Every covfun record holds a hash of its
+    /// symbol name, and `llvm-cov` will exit fatally if it can't resolve that
+    /// hash back to an entry in the binary's `__llvm_prf_names` linker section.
+    pub(crate) pgo_func_name_var_map: RefCell<FxIndexMap<Instance<'tcx>, &'ll llvm::Value>>,
     pub(crate) mcdc_condition_bitmap_map: RefCell<FxHashMap<Instance<'tcx>, Vec<&'ll llvm::Value>>>,
 
     covfun_section_name: OnceCell<CString>,
@@ -31,7 +36,6 @@ pub(crate) struct CguCoverageContext<'ll, 'tcx> {
 impl<'ll, 'tcx> CguCoverageContext<'ll, 'tcx> {
     pub(crate) fn new() -> Self {
         Self {
-            instances_used: RefCell::<FxIndexSet<_>>::default(),
             pgo_func_name_var_map: Default::default(),
             mcdc_condition_bitmap_map: Default::default(),
             covfun_section_name: Default::default(),
@@ -53,6 +57,14 @@ impl<'ll, 'tcx> CguCoverageContext<'ll, 'tcx> {
             .and_then(|bitmap_map| bitmap_map.get(decision_depth as usize))
             .copied() // Dereference Option<&&Value> to Option<&Value>
     }
+
+    /// Returns the list of instances considered "used" in this CGU, as
+    /// inferred from the keys of `pgo_func_name_var_map`.
+    pub(crate) fn instances_used(&self) -> Vec<Instance<'tcx>> {
+        // Collecting into a Vec is way easier than trying to juggle RefCell
+        // projections, and this should only run once per CGU anyway.
+        self.pgo_func_name_var_map.borrow().keys().copied().collect::<Vec<_>>()
+    }
 }
 
 impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> {
@@ -78,7 +90,10 @@ impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> {
     /// string, to hold the function name passed to LLVM intrinsic
     /// `instrprof.increment()`. The `Value` is only created once per instance.
     /// Multiple invocations with the same instance return the same `Value`.
-    fn get_pgo_func_name_var(&self, instance: Instance<'tcx>) -> &'ll llvm::Value {
+    ///
+    /// This has the side-effect of causing coverage codegen to consider this
+    /// function "used", making it eligible to emit an associated covfun record.
+    fn ensure_pgo_func_name_var(&self, instance: Instance<'tcx>) -> &'ll llvm::Value {
         debug!("getting pgo_func_name_var for instance={:?}", instance);
         let mut pgo_func_name_var_map = self.coverage_cx().pgo_func_name_var_map.borrow_mut();
         pgo_func_name_var_map.entry(instance).or_insert_with(|| {
@@ -102,7 +117,7 @@ impl<'tcx> CoverageInfoBuilderMethods<'tcx> for Builder<'_, '_, 'tcx> {
             return;
         }
 
-        let fn_name = self.get_pgo_func_name_var(instance);
+        let fn_name = self.ensure_pgo_func_name_var(instance);
         let hash = self.const_u64(function_coverage_info.function_source_hash);
         let bitmap_bits = self.const_u32(function_coverage_info.mcdc_bitmap_bits as u32);
         self.mcdc_parameters(fn_name, hash, bitmap_bits);
@@ -151,11 +166,6 @@ impl<'tcx> CoverageInfoBuilderMethods<'tcx> for Builder<'_, '_, 'tcx> {
             return;
         };
 
-        // Mark the instance as used in this CGU, for coverage purposes.
-        // This includes functions that were not partitioned into this CGU,
-        // but were MIR-inlined into one of this CGU's functions.
-        coverage_cx.instances_used.borrow_mut().insert(instance);
-
         match *kind {
             CoverageKind::SpanMarker | CoverageKind::BlockMarker { .. } => unreachable!(
                 "marker statement {kind:?} should have been removed by CleanupPostBorrowck"
@@ -163,7 +173,7 @@ impl<'tcx> CoverageInfoBuilderMethods<'tcx> for Builder<'_, '_, 'tcx> {
             CoverageKind::VirtualCounter { bcb }
                 if let Some(&id) = ids_info.phys_counter_for_node.get(&bcb) =>
             {
-                let fn_name = bx.get_pgo_func_name_var(instance);
+                let fn_name = bx.ensure_pgo_func_name_var(instance);
                 let hash = bx.const_u64(function_coverage_info.function_source_hash);
                 let num_counters = bx.const_u32(ids_info.num_counters);
                 let index = bx.const_u32(id.as_u32());
@@ -193,7 +203,7 @@ impl<'tcx> CoverageInfoBuilderMethods<'tcx> for Builder<'_, '_, 'tcx> {
                     "bitmap index of the decision out of range"
                 );
 
-                let fn_name = bx.get_pgo_func_name_var(instance);
+                let fn_name = bx.ensure_pgo_func_name_var(instance);
                 let hash = bx.const_u64(function_coverage_info.function_source_hash);
                 let bitmap_index = bx.const_u32(bitmap_idx);
                 bx.mcdc_tvbitmap_update(fn_name, hash, bitmap_index, cond_bitmap);
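The data-structure change above works because an index map's keys already form an insertion-ordered set: replacing the separate `FxIndexSet` with the keys of an `FxIndexMap` keeps iteration deterministic across compilations, which covfun emission relies on. A minimal sketch of that property, using the indexmap crate that `FxIndexMap` is an alias over (the `FxHasher` parameter is omitted here):

    use indexmap::IndexMap;

    fn main() {
        let mut m: IndexMap<&str, u32> = IndexMap::new();
        m.insert("b", 1);
        m.insert("a", 2);
        m.insert("c", 3);
        // Keys iterate in insertion order, not hash order, so a "used
        // instances" list derived from them is stable run to run.
        let used: Vec<_> = m.keys().collect();
        assert_eq!(used, [&"b", &"a", &"c"]);
    }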
diff --git a/compiler/rustc_codegen_llvm/src/debuginfo/gdb.rs b/compiler/rustc_codegen_llvm/src/debuginfo/gdb.rs
index 61555ac2f6f..bcfa0381cc1 100644
--- a/compiler/rustc_codegen_llvm/src/debuginfo/gdb.rs
+++ b/compiler/rustc_codegen_llvm/src/debuginfo/gdb.rs
@@ -1,9 +1,10 @@
 // .debug_gdb_scripts binary section.
 
-use rustc_attr_data_structures::{AttributeKind, find_attr};
 use rustc_codegen_ssa::base::collect_debugger_visualizers_transitive;
 use rustc_codegen_ssa::traits::*;
+use rustc_hir::attrs::AttributeKind;
 use rustc_hir::def_id::LOCAL_CRATE;
+use rustc_hir::find_attr;
 use rustc_middle::bug;
 use rustc_middle::middle::debugger_visualizer::DebuggerVisualizerType;
 use rustc_session::config::{CrateType, DebugInfo};
diff --git a/compiler/rustc_codegen_llvm/src/debuginfo/metadata/type_map.rs b/compiler/rustc_codegen_llvm/src/debuginfo/metadata/type_map.rs
index 56fb12d3c22..d1502d2b1e6 100644
--- a/compiler/rustc_codegen_llvm/src/debuginfo/metadata/type_map.rs
+++ b/compiler/rustc_codegen_llvm/src/debuginfo/metadata/type_map.rs
@@ -285,8 +285,8 @@ pub(super) fn build_type_with_children<'ll, 'tcx>(
         //     Item(T),
         // }
         // ```
-        let is_expanding_recursive =
-            debug_context(cx).adt_stack.borrow().iter().any(|(parent_def_id, parent_args)| {
+        let is_expanding_recursive = adt_def.is_enum()
+            && debug_context(cx).adt_stack.borrow().iter().any(|(parent_def_id, parent_args)| {
                 if def_id == *parent_def_id {
                     args.iter().zip(parent_args.iter()).any(|(arg, parent_arg)| {
                         if let (Some(arg), Some(parent_arg)) = (arg.as_type(), parent_arg.as_type())
diff --git a/compiler/rustc_codegen_llvm/src/debuginfo/mod.rs b/compiler/rustc_codegen_llvm/src/debuginfo/mod.rs
index 5ca2505cec4..6cbf2dbf7d3 100644
--- a/compiler/rustc_codegen_llvm/src/debuginfo/mod.rs
+++ b/compiler/rustc_codegen_llvm/src/debuginfo/mod.rs
@@ -533,7 +533,7 @@ impl<'ll, 'tcx> DebugInfoCodegenMethods<'tcx> for CodegenCx<'ll, 'tcx> {
             // First, let's see if this is a method within an inherent impl. Because
             // if yes, we want to make the result subroutine DIE a child of the
             // subroutine's self-type.
-            if let Some(impl_def_id) = cx.tcx.impl_of_method(instance.def_id()) {
+            if let Some(impl_def_id) = cx.tcx.impl_of_assoc(instance.def_id()) {
                 // If the method does *not* belong to a trait, proceed
                 if cx.tcx.trait_id_of_impl(impl_def_id).is_none() {
                     let impl_self_ty = cx.tcx.instantiate_and_normalize_erasing_regions(
diff --git a/compiler/rustc_codegen_llvm/src/declare.rs b/compiler/rustc_codegen_llvm/src/declare.rs
index eb75716d768..960a895a203 100644
--- a/compiler/rustc_codegen_llvm/src/declare.rs
+++ b/compiler/rustc_codegen_llvm/src/declare.rs
@@ -215,7 +215,9 @@ impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> {
 
         llfn
     }
+}
 
+impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> {
     /// Declare a global with an intention to define it.
     ///
     /// Use this function when you intend to define a global. This function will
@@ -234,13 +236,13 @@ impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> {
     ///
     /// Use this function when you intend to define a global without a name.
     pub(crate) fn define_private_global(&self, ty: &'ll Type) -> &'ll Value {
-        unsafe { llvm::LLVMRustInsertPrivateGlobal(self.llmod, ty) }
+        unsafe { llvm::LLVMRustInsertPrivateGlobal(self.llmod(), ty) }
     }
 
     /// Gets declared value by name.
     pub(crate) fn get_declared_value(&self, name: &str) -> Option<&'ll Value> {
         debug!("get_declared_value(name={:?})", name);
-        unsafe { llvm::LLVMRustGetNamedValue(self.llmod, name.as_c_char_ptr(), name.len()) }
+        unsafe { llvm::LLVMRustGetNamedValue(self.llmod(), name.as_c_char_ptr(), name.len()) }
     }
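Hoisting these methods out of the `CodegenCx` impl into `GenericCx<'ll, CX: Borrow<SCx<'ll>>>` (note the matching `self.llmod` to `self.llmod()` changes) lets one impl serve any wrapper that can lend out the simple context. A self-contained sketch of the pattern, with toy types rather than the real ones:

    use std::borrow::Borrow;

    struct SCx {
        llmod: &'static str,
    }

    struct GenericCx<CX: Borrow<SCx>>(CX);

    impl<CX: Borrow<SCx>> GenericCx<CX> {
        // Accessor in the style of `llmod()`: works whether CX is SCx
        // itself or `&SCx`, since both implement Borrow<SCx>.
        fn llmod(&self) -> &'static str {
            self.0.borrow().llmod
        }
    }

    fn main() {
        let owned = GenericCx(SCx { llmod: "m" });
        let scx = SCx { llmod: "m" };
        let borrowed = GenericCx(&scx);
        assert_eq!(owned.llmod(), borrowed.llmod());
    }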
 
     /// Gets defined or externally defined (AvailableExternally linkage) value by
diff --git a/compiler/rustc_codegen_llvm/src/errors.rs b/compiler/rustc_codegen_llvm/src/errors.rs
index 31d49e86319..627b0c9ff3b 100644
--- a/compiler/rustc_codegen_llvm/src/errors.rs
+++ b/compiler/rustc_codegen_llvm/src/errors.rs
@@ -20,11 +20,6 @@ pub(crate) struct SymbolAlreadyDefined<'a> {
 #[diag(codegen_llvm_sanitizer_memtag_requires_mte)]
 pub(crate) struct SanitizerMemtagRequiresMte;
 
-#[derive(Diagnostic)]
-#[diag(codegen_llvm_dynamic_linking_with_lto)]
-#[note]
-pub(crate) struct DynamicLinkingWithLTO;
-
 pub(crate) struct ParseTargetMachineConfig<'a>(pub LlvmError<'a>);
 
 impl<G: EmissionGuarantee> Diagnostic<'_, G> for ParseTargetMachineConfig<'_> {
@@ -42,21 +37,9 @@ impl<G: EmissionGuarantee> Diagnostic<'_, G> for ParseTargetMachineConfig<'_> {
 pub(crate) struct AutoDiffWithoutEnable;
 
 #[derive(Diagnostic)]
-#[diag(codegen_llvm_lto_disallowed)]
-pub(crate) struct LtoDisallowed;
-
-#[derive(Diagnostic)]
-#[diag(codegen_llvm_lto_dylib)]
-pub(crate) struct LtoDylib;
-
-#[derive(Diagnostic)]
-#[diag(codegen_llvm_lto_proc_macro)]
-pub(crate) struct LtoProcMacro;
-
-#[derive(Diagnostic)]
 #[diag(codegen_llvm_lto_bitcode_from_rlib)]
 pub(crate) struct LtoBitcodeFromRlib {
-    pub llvm_err: String,
+    pub err: String,
 }
 
 #[derive(Diagnostic)]
diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs
index fcc0d378f06..7b27e496986 100644
--- a/compiler/rustc_codegen_llvm/src/intrinsic.rs
+++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs
@@ -382,26 +382,16 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
                 let width = size.bits();
                 let llty = self.type_ix(width);
                 match name {
-                    sym::ctlz | sym::cttz => {
-                        let y = self.const_bool(false);
-                        let ret = self.call_intrinsic(
-                            format!("llvm.{name}"),
-                            &[llty],
-                            &[args[0].immediate(), y],
-                        );
-
-                        self.intcast(ret, result.layout.llvm_type(self), false)
-                    }
-                    sym::ctlz_nonzero => {
-                        let y = self.const_bool(true);
-                        let ret =
-                            self.call_intrinsic("llvm.ctlz", &[llty], &[args[0].immediate(), y]);
-                        self.intcast(ret, result.layout.llvm_type(self), false)
-                    }
-                    sym::cttz_nonzero => {
-                        let y = self.const_bool(true);
+                    sym::ctlz | sym::ctlz_nonzero | sym::cttz | sym::cttz_nonzero => {
+                        let y =
+                            self.const_bool(name == sym::ctlz_nonzero || name == sym::cttz_nonzero);
+                        let llvm_name = if name == sym::ctlz || name == sym::ctlz_nonzero {
+                            "llvm.ctlz"
+                        } else {
+                            "llvm.cttz"
+                        };
                         let ret =
-                            self.call_intrinsic("llvm.cttz", &[llty], &[args[0].immediate(), y]);
+                            self.call_intrinsic(llvm_name, &[llty], &[args[0].immediate(), y]);
                         self.intcast(ret, result.layout.llvm_type(self), false)
                     }
                     sym::ctpop => {
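The four count-zeros intrinsics differ only along two axes: leading vs. trailing, and the `i1` second argument to `llvm.ctlz`/`llvm.cttz` that declares whether a zero input is poison. The merged arm makes that symmetry explicit instead of spelling out three near-identical bodies. A surface-level illustration via the stable methods these intrinsics ultimately back:

    use std::num::NonZeroU32;

    fn main() {
        let x: u32 = 0b0001_0000;
        // Flag = false: a zero input is well-defined (returns bit width).
        assert_eq!(x.leading_zeros(), 27); // llvm.ctlz(x, false)
        assert_eq!(x.trailing_zeros(), 4); // llvm.cttz(x, false)
        assert_eq!(0u32.leading_zeros(), 32);
        // Flag = true: the caller guarantees nonzero (as NonZeroU32 does),
        // so LLVM may treat a zero input as poison. // llvm.ctlz(x, true)
        let nz = NonZeroU32::new(x).unwrap();
        assert_eq!(nz.leading_zeros(), 27);
    }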
diff --git a/compiler/rustc_codegen_llvm/src/lib.rs b/compiler/rustc_codegen_llvm/src/lib.rs
index 6db4e122ad6..ca84b6de8b1 100644
--- a/compiler/rustc_codegen_llvm/src/lib.rs
+++ b/compiler/rustc_codegen_llvm/src/lib.rs
@@ -22,6 +22,7 @@
 use std::any::Any;
 use std::ffi::CStr;
 use std::mem::ManuallyDrop;
+use std::path::PathBuf;
 
 use back::owned_target_machine::OwnedTargetMachine;
 use back::write::{create_informational_target_machine, create_target_machine};
@@ -167,20 +168,15 @@ impl WriteBackendMethods for LlvmCodegenBackend {
         let stats = llvm::build_string(|s| unsafe { llvm::LLVMRustPrintStatistics(s) }).unwrap();
         print!("{stats}");
     }
-    fn run_link(
-        cgcx: &CodegenContext<Self>,
-        dcx: DiagCtxtHandle<'_>,
-        modules: Vec<ModuleCodegen<Self::Module>>,
-    ) -> Result<ModuleCodegen<Self::Module>, FatalError> {
-        back::write::link(cgcx, dcx, modules)
-    }
     fn run_and_optimize_fat_lto(
         cgcx: &CodegenContext<Self>,
+        exported_symbols_for_lto: &[String],
+        each_linked_rlib_for_lto: &[PathBuf],
         modules: Vec<FatLtoInput<Self>>,
-        cached_modules: Vec<(SerializedModule<Self::ModuleBuffer>, WorkProduct)>,
         diff_fncs: Vec<AutoDiffItem>,
     ) -> Result<ModuleCodegen<Self::Module>, FatalError> {
-        let mut module = back::lto::run_fat(cgcx, modules, cached_modules)?;
+        let mut module =
+            back::lto::run_fat(cgcx, exported_symbols_for_lto, each_linked_rlib_for_lto, modules)?;
 
         if !diff_fncs.is_empty() {
             builder::autodiff::differentiate(&module, cgcx, diff_fncs)?;
@@ -194,10 +190,18 @@ impl WriteBackendMethods for LlvmCodegenBackend {
     }
     fn run_thin_lto(
         cgcx: &CodegenContext<Self>,
+        exported_symbols_for_lto: &[String],
+        each_linked_rlib_for_lto: &[PathBuf],
         modules: Vec<(String, Self::ThinBuffer)>,
         cached_modules: Vec<(SerializedModule<Self::ModuleBuffer>, WorkProduct)>,
     ) -> Result<(Vec<ThinModule<Self>>, Vec<WorkProduct>), FatalError> {
-        back::lto::run_thin(cgcx, modules, cached_modules)
+        back::lto::run_thin(
+            cgcx,
+            exported_symbols_for_lto,
+            each_linked_rlib_for_lto,
+            modules,
+            cached_modules,
+        )
     }
     fn optimize(
         cgcx: &CodegenContext<Self>,
@@ -412,6 +416,20 @@ impl ModuleLlvm {
         }
     }
 
+    fn tm_from_cgcx(
+        cgcx: &CodegenContext<LlvmCodegenBackend>,
+        name: &str,
+        dcx: DiagCtxtHandle<'_>,
+    ) -> Result<OwnedTargetMachine, FatalError> {
+        let tm_factory_config = TargetMachineFactoryConfig::new(cgcx, name);
+        match (cgcx.tm_factory)(tm_factory_config) {
+            Ok(m) => Ok(m),
+            Err(e) => Err(dcx.emit_almost_fatal(ParseTargetMachineConfig(e))),
+        }
+    }
+
     fn parse(
         cgcx: &CodegenContext<LlvmCodegenBackend>,
         name: &CStr,
@@ -421,13 +439,7 @@ impl ModuleLlvm {
         unsafe {
             let llcx = llvm::LLVMRustContextCreate(cgcx.fewer_names);
             let llmod_raw = back::lto::parse_module(llcx, name, buffer, dcx)?;
-            let tm_factory_config = TargetMachineFactoryConfig::new(cgcx, name.to_str().unwrap());
-            let tm = match (cgcx.tm_factory)(tm_factory_config) {
-                Ok(m) => m,
-                Err(e) => {
-                    return Err(dcx.emit_almost_fatal(ParseTargetMachineConfig(e)));
-                }
-            };
+            let tm = ModuleLlvm::tm_from_cgcx(cgcx, name.to_str().unwrap(), dcx)?;
 
             Ok(ModuleLlvm { llmod_raw, llcx, tm: ManuallyDrop::new(tm) })
         }
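Extracting `tm_from_cgcx` means target-machine construction and its almost-fatal error path are written once, and call sites shrink to a single `?`-propagated line. A hedged usage sketch (hypothetical caller, not part of this patch):

    // In a fn with `cgcx: &CodegenContext<LlvmCodegenBackend>` and
    // `dcx: DiagCtxtHandle<'_>` in scope, returning Result<_, FatalError>:
    let tm = ModuleLlvm::tm_from_cgcx(cgcx, "example.cgu", dcx)?;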
diff --git a/compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs
index c696b8d8ff2..56d756e52cc 100644
--- a/compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs
+++ b/compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs
@@ -4,7 +4,7 @@ use libc::{c_char, c_uint};
 
 use super::MetadataKindId;
 use super::ffi::{AttributeKind, BasicBlock, Metadata, Module, Type, Value};
-use crate::llvm::Bool;
+use crate::llvm::{Bool, Builder};
 
 #[link(name = "llvm-wrapper", kind = "static")]
 unsafe extern "C" {
@@ -31,6 +31,14 @@ unsafe extern "C" {
         index: c_uint,
         kind: AttributeKind,
     );
+    pub(crate) fn LLVMRustPositionBefore<'a>(B: &'a Builder<'_>, I: &'a Value);
+    pub(crate) fn LLVMRustPositionAfter<'a>(B: &'a Builder<'_>, I: &'a Value);
+    pub(crate) fn LLVMRustGetFunctionCall(
+        F: &Value,
+        name: *const c_char,
+        NameLen: libc::size_t,
+    ) -> Option<&Value>;
 }
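These additions support repositioning the IR builder around an existing call: `LLVMRustGetFunctionCall` appears to scan a function for a call whose callee has the given name, and the two positioning helpers move the builder relative to an instruction. A hypothetical usage sketch (`llfn`, `builder`, and the callee name are all assumptions, not taken from this patch):

    // Find a call to a hypothetical `__kernel_launch` inside `llfn`, then
    // place the builder's insertion point immediately after it.
    let name = c"__kernel_launch";
    if let Some(call) =
        unsafe { LLVMRustGetFunctionCall(llfn, name.as_ptr(), name.count_bytes()) }
    {
        unsafe { LLVMRustPositionAfter(builder, call) };
    }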
 
 unsafe extern "C" {
diff --git a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs
index 0b1e632cbc4..2443194ff48 100644
--- a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs
+++ b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs
@@ -97,6 +97,16 @@ pub(crate) enum ModuleFlagMergeBehavior {
 
 // Consts for the LLVM CallConv type, pre-cast to usize.
 
+#[derive(Copy, Clone, PartialEq, Debug)]
+#[repr(C)]
+#[allow(dead_code)]
+pub(crate) enum TailCallKind {
+    None = 0,
+    Tail = 1,
+    MustTail = 2,
+    NoTail = 3,
+}
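These discriminants mirror LLVM's `CallInst::TailCallKind` (`TCK_None`, `TCK_Tail`, `TCK_MustTail`, `TCK_NoTail`), which is why the enum is `#[repr(C)]` with explicit values. Combined with the `LLVMRustSetTailCallKind` binding added further down, requesting a guaranteed tail call is one line (sketch; `call_inst` is an assumed `&Value` naming a call instruction):

    // Require LLVM to emit this as a tail call or fail IR verification.
    LLVMRustSetTailCallKind(call_inst, TailCallKind::MustTail);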
+
 /// LLVM CallingConv::ID. Should we wrap this?
 ///
 /// See <https://github.com/llvm/llvm-project/blob/main/llvm/include/llvm/IR/CallingConv.h>
@@ -1138,6 +1148,11 @@ unsafe extern "C" {
         Count: c_uint,
         Packed: Bool,
     ) -> &'a Value;
+    pub(crate) fn LLVMConstNamedStruct<'a>(
+        StructTy: &'a Type,
+        ConstantVals: *const &'a Value,
+        Count: c_uint,
+    ) -> &'a Value;
     pub(crate) fn LLVMConstVector(ScalarConstantVals: *const &Value, Size: c_uint) -> &Value;
 
     // Constant expressions
@@ -1181,6 +1196,7 @@ unsafe extern "C" {
     pub(crate) safe fn LLVMIsGlobalConstant(GlobalVar: &Value) -> Bool;
     pub(crate) safe fn LLVMSetGlobalConstant(GlobalVar: &Value, IsConstant: Bool);
     pub(crate) safe fn LLVMSetTailCall(CallInst: &Value, IsTailCall: Bool);
+    pub(crate) safe fn LLVMRustSetTailCallKind(CallInst: &Value, Kind: TailCallKind);
 
     // Operations on attributes
     pub(crate) fn LLVMCreateStringAttribute(
@@ -1217,6 +1233,8 @@ unsafe extern "C" {
     ) -> &'a BasicBlock;
 
     // Operations on instructions
+    pub(crate) fn LLVMGetInstructionParent(Inst: &Value) -> &BasicBlock;
+    pub(crate) fn LLVMGetCalledValue(CallInst: &Value) -> Option<&Value>;
     pub(crate) fn LLVMIsAInstruction(Val: &Value) -> Option<&Value>;
     pub(crate) fn LLVMGetFirstBasicBlock(Fn: &Value) -> &BasicBlock;
     pub(crate) fn LLVMGetOperand(Val: &Value, Index: c_uint) -> Option<&Value>;
@@ -2425,6 +2443,7 @@ unsafe extern "C" {
         UseEmulatedTls: bool,
         ArgsCstrBuff: *const c_char,
         ArgsCstrBuffLen: usize,
+        UseWasmEH: bool,
     ) -> *mut TargetMachine;
 
     pub(crate) fn LLVMRustDisposeTargetMachine(T: *mut TargetMachine);
@@ -2556,6 +2575,7 @@ unsafe extern "C" {
 
     pub(crate) fn LLVMRustSetDataLayoutFromTargetMachine<'a>(M: &'a Module, TM: &'a TargetMachine);
 
+    pub(crate) fn LLVMRustPositionBuilderPastAllocas<'a>(B: &Builder<'a>, Fn: &'a Value);
     pub(crate) fn LLVMRustPositionBuilderAtStart<'a>(B: &Builder<'a>, BB: &'a BasicBlock);
 
     pub(crate) fn LLVMRustSetModulePICLevel(M: &Module);
@@ -2603,13 +2623,6 @@ unsafe extern "C" {
         len: usize,
         Identifier: *const c_char,
     ) -> Option<&Module>;
-    pub(crate) fn LLVMRustGetSliceFromObjectDataByName(
-        data: *const u8,
-        len: usize,
-        name: *const u8,
-        name_len: usize,
-        out_len: &mut usize,
-    ) -> *const u8;
 
     pub(crate) fn LLVMRustLinkerNew(M: &Module) -> &mut Linker<'_>;
     pub(crate) fn LLVMRustLinkerAdd(
diff --git a/compiler/rustc_codegen_llvm/src/llvm_util.rs b/compiler/rustc_codegen_llvm/src/llvm_util.rs
index 0fb987bdf82..53899da183a 100644
--- a/compiler/rustc_codegen_llvm/src/llvm_util.rs
+++ b/compiler/rustc_codegen_llvm/src/llvm_util.rs
@@ -405,6 +405,8 @@ fn update_target_reliable_float_cfg(sess: &Session, cfg: &mut TargetConfig) {
         ("mips64" | "mips64r6", _) => false,
         // Selection bug <https://github.com/llvm/llvm-project/issues/95471>
         ("nvptx64", _) => false,
+        // Unsupported <https://github.com/llvm/llvm-project/issues/121122>
+        ("amdgpu", _) => false,
         // ABI bugs <https://github.com/rust-lang/rust/issues/125109> et al. (full
         // list at <https://github.com/rust-lang/rust/issues/116909>)
         ("powerpc" | "powerpc64", _) => false,
@@ -433,6 +435,9 @@ fn update_target_reliable_float_cfg(sess: &Session, cfg: &mut TargetConfig) {
         // This rules out anything that doesn't have `long double` = `binary128`; <= 32 bits
         // (ld is `f64`), anything other than Linux (Windows and MacOS use `f64`), and `x86`
         // (ld is 80-bit extended precision).
+        //
+        // musl does not implement the symbols required for f128 math at all.
+        _ if target_env == "musl" => false,
         ("x86_64", _) => false,
         (_, "linux") if target_pointer_width == 64 => true,
         _ => false,
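The booleans computed in this function ultimately surface as internal, perma-unstable cfgs (names like `target_has_reliable_f128`, used mainly by std's own float tests), so the new musl arm simply switches that cfg off for every musl target regardless of architecture. A hedged sketch of how such a cfg gates code:

    // Compiled only where update_target_reliable_float_cfg reported
    // reliable f128 support; the f128 type itself is nightly-only
    // (#![feature(f128)]).
    #[cfg(target_has_reliable_f128)]
    #[test]
    fn f128_identity() {
        let x: f128 = 1.0;
        assert!(x == x);
    }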