14 files changed, 221 insertions, 196 deletions
diff --git a/compiler/rustc_codegen_ssa/src/back/archive.rs b/compiler/rustc_codegen_ssa/src/back/archive.rs
index 84a56f6b0b5..cfd8ceac3a6 100644
--- a/compiler/rustc_codegen_ssa/src/back/archive.rs
+++ b/compiler/rustc_codegen_ssa/src/back/archive.rs
@@ -40,16 +40,18 @@ pub struct ImportLibraryItem {
     pub is_data: bool,
 }
 
-impl From<ImportLibraryItem> for COFFShortExport {
-    fn from(item: ImportLibraryItem) -> Self {
+impl ImportLibraryItem {
+    fn into_coff_short_export(self, sess: &Session) -> COFFShortExport {
+        let import_name = (sess.target.arch == "arm64ec").then(|| self.name.clone());
         COFFShortExport {
-            name: item.name,
+            name: self.name,
             ext_name: None,
-            symbol_name: item.symbol_name,
-            alias_target: None,
-            ordinal: item.ordinal.unwrap_or(0),
-            noname: item.ordinal.is_some(),
-            data: item.is_data,
+            symbol_name: self.symbol_name,
+            import_name,
+            export_as: None,
+            ordinal: self.ordinal.unwrap_or(0),
+            noname: self.ordinal.is_some(),
+            data: self.is_data,
             private: false,
             constant: false,
         }
@@ -113,7 +115,8 @@ pub trait ArchiveBuilderBuilder {
                     .emit_fatal(ErrorCreatingImportLibrary { lib_name, error: error.to_string() }),
             };
 
-            let exports = items.into_iter().map(Into::into).collect::<Vec<_>>();
+            let exports =
+                items.into_iter().map(|item| item.into_coff_short_export(sess)).collect::<Vec<_>>();
             let machine = match &*sess.target.arch {
                 "x86_64" => MachineTypes::AMD64,
                 "x86" => MachineTypes::I386,
@@ -134,6 +137,7 @@ pub trait ArchiveBuilderBuilder {
                 // when linking a rust staticlib using `/WHOLEARCHIVE`.
                 // See #129020
                 true,
+                &[],
             ) {
                 sess.dcx()
                     .emit_fatal(ErrorCreatingImportLibrary { lib_name, error: error.to_string() });
@@ -527,7 +531,7 @@ impl<'a> ArArchiveBuilder<'a> {
             &entries,
             archive_kind,
             false,
-            /* is_ec = */ self.sess.target.arch == "arm64ec",
+            /* is_ec = */ Some(self.sess.target.arch == "arm64ec"),
         )?;
         archive_tmpfile.flush()?;
         drop(archive_tmpfile);
diff --git a/compiler/rustc_codegen_ssa/src/back/command.rs b/compiler/rustc_codegen_ssa/src/back/command.rs
index 05351bd6ca3..7420f18aacb 100644
--- a/compiler/rustc_codegen_ssa/src/back/command.rs
+++ b/compiler/rustc_codegen_ssa/src/back/command.rs
@@ -109,7 +109,7 @@ impl Command {
             }
             Program::Lld(ref p, flavor) => {
                 let mut c = process::Command::new(p);
-                c.arg("-flavor").arg(flavor.as_str());
+                c.arg("-flavor").arg(flavor.desc());
                 c
             }
         };
diff --git a/compiler/rustc_codegen_ssa/src/back/link.rs b/compiler/rustc_codegen_ssa/src/back/link.rs
index 19c919c0e4e..48b01ea2df1 100644
--- a/compiler/rustc_codegen_ssa/src/back/link.rs
+++ b/compiler/rustc_codegen_ssa/src/back/link.rs
@@ -58,6 +58,7 @@ use super::linker::{self, Linker};
 use super::metadata::{MetadataPosition, create_wrapper_file};
 use super::rpath::{self, RPathConfig};
 use super::{apple, versioned_llvm_target};
+use crate::base::needs_allocator_shim_for_linking;
 use crate::{
     CodegenResults, CompiledModule, CrateInfo, NativeLib, errors, looks_like_rust_object_file,
 };
@@ -2080,9 +2081,17 @@ fn add_local_crate_regular_objects(cmd: &mut dyn Linker, codegen_results: &Codeg
 }
 
 /// Add object files for allocator code linked once for the whole crate tree.
-fn add_local_crate_allocator_objects(cmd: &mut dyn Linker, codegen_results: &CodegenResults) {
-    if let Some(obj) = codegen_results.allocator_module.as_ref().and_then(|m| m.object.as_ref()) {
-        cmd.add_object(obj);
+fn add_local_crate_allocator_objects(
+    cmd: &mut dyn Linker,
+    codegen_results: &CodegenResults,
+    crate_type: CrateType,
+) {
+    if needs_allocator_shim_for_linking(&codegen_results.crate_info.dependency_formats, crate_type)
+    {
+        if let Some(obj) = codegen_results.allocator_module.as_ref().and_then(|m| m.object.as_ref())
+        {
+            cmd.add_object(obj);
+        }
     }
 }
 
@@ -2281,7 +2290,7 @@ fn linker_with_args(
         codegen_results,
         metadata,
     );
-    add_local_crate_allocator_objects(cmd, codegen_results);
+    add_local_crate_allocator_objects(cmd, codegen_results, crate_type);
 
     // Avoid linking to dynamic libraries unless they satisfy some undefined symbols
     // at the point at which they are specified on the command line.
diff --git a/compiler/rustc_codegen_ssa/src/back/linker.rs b/compiler/rustc_codegen_ssa/src/back/linker.rs
index df1e91b12f9..a2efd420a32 100644
--- a/compiler/rustc_codegen_ssa/src/back/linker.rs
+++ b/compiler/rustc_codegen_ssa/src/back/linker.rs
@@ -11,8 +11,9 @@ use rustc_metadata::{
 };
 use rustc_middle::bug;
 use rustc_middle::middle::dependency_format::Linkage;
-use rustc_middle::middle::exported_symbols;
-use rustc_middle::middle::exported_symbols::{ExportedSymbol, SymbolExportInfo, SymbolExportKind};
+use rustc_middle::middle::exported_symbols::{
+    self, ExportedSymbol, SymbolExportInfo, SymbolExportKind, SymbolExportLevel,
+};
 use rustc_middle::ty::TyCtxt;
 use rustc_session::Session;
 use rustc_session::config::{self, CrateType, DebugInfo, LinkerPluginLto, Lto, OptLevel, Strip};
@@ -22,6 +23,8 @@ use tracing::{debug, warn};
 
 use super::command::Command;
 use super::symbol_export;
+use crate::back::symbol_export::allocator_shim_symbols;
+use crate::base::needs_allocator_shim_for_linking;
 use crate::errors;
 
 #[cfg(test)]
@@ -1827,7 +1830,7 @@ fn exported_symbols_for_non_proc_macro(
     let export_threshold = symbol_export::crates_export_threshold(&[crate_type]);
     for_each_exported_symbols_include_dep(tcx, crate_type, |symbol, info, cnum| {
         // Do not export mangled symbols from cdylibs and don't attempt to export compiler-builtins
-        // from any cdylib. The latter doesn't work anyway as we use hidden visibility for
+        // from any dylib. The latter doesn't work anyway as we use hidden visibility for
         // compiler-builtins. Most linkers silently ignore it, but ld64 gives a warning.
         if info.level.is_below_threshold(export_threshold) && !tcx.is_compiler_builtins(cnum) {
             symbols.push((
@@ -1838,6 +1841,14 @@ fn exported_symbols_for_non_proc_macro(
         }
     });
 
+    // Mark allocator shim symbols as exported only if they were generated.
+    if export_threshold == SymbolExportLevel::Rust
+        && needs_allocator_shim_for_linking(tcx.dependency_formats(()), crate_type)
+        && tcx.allocator_kind(()).is_some()
+    {
+        symbols.extend(allocator_shim_symbols(tcx));
+    }
+
     symbols
 }
 
diff --git a/compiler/rustc_codegen_ssa/src/back/lto.rs b/compiler/rustc_codegen_ssa/src/back/lto.rs
index c95038375a1..e6df6a2469f 100644
--- a/compiler/rustc_codegen_ssa/src/back/lto.rs
+++ b/compiler/rustc_codegen_ssa/src/back/lto.rs
@@ -8,8 +8,9 @@ use rustc_middle::ty::TyCtxt;
 use rustc_session::config::{CrateType, Lto};
 use tracing::info;
 
-use crate::back::symbol_export::{self, symbol_name_for_instance_in_crate};
+use crate::back::symbol_export::{self, allocator_shim_symbols, symbol_name_for_instance_in_crate};
 use crate::back::write::CodegenContext;
+use crate::base::allocator_kind_for_codegen;
 use crate::errors::{DynamicLinkingWithLTO, LtoDisallowed, LtoDylib, LtoProcMacro};
 use crate::traits::*;
 
@@ -115,6 +116,11 @@ pub(super) fn exported_symbols_for_lto(
         }
     }
 
+    // Mark allocator shim symbols as exported only if they were generated.
+    if export_threshold == SymbolExportLevel::Rust && allocator_kind_for_codegen(tcx).is_some() {
+        symbols_below_threshold.extend(allocator_shim_symbols(tcx).map(|(name, _kind)| name));
+    }
+
     symbols_below_threshold
 }
 
diff --git a/compiler/rustc_codegen_ssa/src/back/symbol_export.rs b/compiler/rustc_codegen_ssa/src/back/symbol_export.rs
index d8a1480e911..b49e67217fb 100644
--- a/compiler/rustc_codegen_ssa/src/back/symbol_export.rs
+++ b/compiler/rustc_codegen_ssa/src/back/symbol_export.rs
@@ -18,7 +18,7 @@ use rustc_symbol_mangling::mangle_internal_symbol;
 use rustc_target::spec::TlsModel;
 use tracing::debug;
 
-use crate::base::allocator_kind_for_codegen;
+use crate::back::symbol_export;
 
 fn threshold(tcx: TyCtxt<'_>) -> SymbolExportLevel {
     crates_export_threshold(tcx.crate_types())
@@ -217,31 +217,6 @@ fn exported_non_generic_symbols_provider_local<'tcx>(
         ));
     }
 
-    // Mark allocator shim symbols as exported only if they were generated.
-    if allocator_kind_for_codegen(tcx).is_some() {
-        for symbol_name in ALLOCATOR_METHODS
-            .iter()
-            .map(|method| mangle_internal_symbol(tcx, global_fn_name(method.name).as_str()))
-            .chain([
-                mangle_internal_symbol(tcx, "__rust_alloc_error_handler"),
-                mangle_internal_symbol(tcx, OomStrategy::SYMBOL),
-                mangle_internal_symbol(tcx, NO_ALLOC_SHIM_IS_UNSTABLE),
-            ])
-        {
-            let exported_symbol = ExportedSymbol::NoDefId(SymbolName::new(tcx, &symbol_name));
-
-            symbols.push((
-                exported_symbol,
-                SymbolExportInfo {
-                    level: SymbolExportLevel::Rust,
-                    kind: SymbolExportKind::Text,
-                    used: false,
-                    rustc_std_internal_symbol: true,
-                },
-            ));
-        }
-    }
-
     // Sort so we get a stable incr. comp. hash.
     symbols.sort_by_cached_key(|s| s.0.symbol_name_for_local_instance(tcx));
 
@@ -516,6 +491,31 @@ pub(crate) fn provide(providers: &mut Providers) {
         upstream_monomorphizations_for_provider;
 }
 
+pub(crate) fn allocator_shim_symbols(
+    tcx: TyCtxt<'_>,
+) -> impl Iterator<Item = (String, SymbolExportKind)> {
+    ALLOCATOR_METHODS
+        .iter()
+        .map(move |method| mangle_internal_symbol(tcx, global_fn_name(method.name).as_str()))
+        .chain([
+            mangle_internal_symbol(tcx, "__rust_alloc_error_handler"),
+            mangle_internal_symbol(tcx, OomStrategy::SYMBOL),
+            mangle_internal_symbol(tcx, NO_ALLOC_SHIM_IS_UNSTABLE),
+        ])
+        .map(move |symbol_name| {
+            let exported_symbol = ExportedSymbol::NoDefId(SymbolName::new(tcx, &symbol_name));
+            // Pair the symbol's exporting name for `LOCAL_CRATE` with `SymbolExportKind::Text`.
+            (
+                symbol_export::exporting_symbol_name_for_instance_in_crate(
+                    tcx,
+                    exported_symbol,
+                    LOCAL_CRATE,
+                ),
+                SymbolExportKind::Text,
+            )
+        })
+}
+
 fn symbol_export_level(tcx: TyCtxt<'_>, sym_def_id: DefId) -> SymbolExportLevel {
     // We export anything that's not mangled at the "C" layer as it probably has
     // to do with ABI concerns. We do not, however, apply such treatment to
diff --git a/compiler/rustc_codegen_ssa/src/back/write.rs b/compiler/rustc_codegen_ssa/src/back/write.rs
index 92582dcc399..cbaf67d7345 100644
--- a/compiler/rustc_codegen_ssa/src/back/write.rs
+++ b/compiler/rustc_codegen_ssa/src/back/write.rs
@@ -138,12 +138,23 @@ impl ModuleConfig {
 
         let emit_obj = if !should_emit_obj {
             EmitObj::None
-        } else if sess.target.obj_is_bitcode || sess.opts.cg.linker_plugin_lto.enabled() {
+        } else if sess.target.obj_is_bitcode
+            || (sess.opts.cg.linker_plugin_lto.enabled() && !no_builtins)
+        {
             // This case is selected if the target uses objects as bitcode, or
             // if linker plugin LTO is enabled. In the linker plugin LTO case
             // the assumption is that the final link-step will read the bitcode
             // and convert it to object code. This may be done by either the
             // native linker or rustc itself.
+            //
+            // Note, however, that the linker-plugin-lto requested here is
+            // explicitly ignored for `#![no_builtins]` crates. These crates are
+            // specifically ignored by rustc's LTO passes and wouldn't work if
+            // loaded into the linker. These crates define symbols that LLVM
+            // lowers intrinsics to, and these symbol dependencies aren't known
+            // until after codegen. As a result any crate marked
+            // `#![no_builtins]` is assumed to not participate in LTO and
+            // instead goes on to generate object code.
             EmitObj::Bitcode
         } else if need_bitcode_in_object(tcx) {
             EmitObj::ObjectCode(BitcodeSection::Full)
@@ -322,8 +333,8 @@ pub struct CodegenContext<B: WriteBackendMethods> {
     pub crate_types: Vec<CrateType>,
     pub output_filenames: Arc<OutputFilenames>,
     pub invocation_temp: Option<String>,
-    pub regular_module_config: Arc<ModuleConfig>,
-    pub allocator_module_config: Arc<ModuleConfig>,
+    pub module_config: Arc<ModuleConfig>,
+    pub allocator_config: Arc<ModuleConfig>,
     pub tm_factory: TargetMachineFactoryFn<B>,
     pub msvc_imps_needed: bool,
     pub is_pe_coff: bool,
@@ -361,13 +372,6 @@ impl<B: WriteBackendMethods> CodegenContext<B> {
     pub fn create_dcx(&self) -> DiagCtxt {
         DiagCtxt::new(Box::new(self.diag_emitter.clone()))
     }
-
-    pub fn config(&self, kind: ModuleKind) -> &ModuleConfig {
-        match kind {
-            ModuleKind::Regular => &self.regular_module_config,
-            ModuleKind::Allocator => &self.allocator_module_config,
-        }
-    }
 }
 
 fn generate_thin_lto_work<B: ExtraBackendMethods>(
@@ -431,6 +435,7 @@ pub(crate) fn start_async_codegen<B: ExtraBackendMethods>(
     backend: B,
     tcx: TyCtxt<'_>,
     target_cpu: String,
+    allocator_module: Option<ModuleCodegen<B::Module>>,
 ) -> OngoingCodegen<B> {
     let (coordinator_send, coordinator_receive) = channel();
 
@@ -454,6 +459,7 @@ pub(crate) fn start_async_codegen<B: ExtraBackendMethods>(
         coordinator_receive,
         Arc::new(regular_config),
         Arc::new(allocator_config),
+        allocator_module,
         coordinator_send.clone(),
     );
 
@@ -709,15 +715,6 @@ pub(crate) enum WorkItem<B: WriteBackendMethods> {
 }
 
 impl<B: WriteBackendMethods> WorkItem<B> {
-    fn module_kind(&self) -> ModuleKind {
-        match *self {
-            WorkItem::Optimize(ref m) => m.kind,
-            WorkItem::CopyPostLtoArtifacts(_) | WorkItem::FatLto { .. } | WorkItem::ThinLto(_) => {
-                ModuleKind::Regular
-            }
-        }
-    }
-
     /// Generate a short description of this work item suitable for use as a thread name.
     fn short_description(&self) -> String {
         // `pthread_setname()` on *nix ignores anything beyond the first 15
@@ -806,7 +803,7 @@ pub(crate) fn compute_per_cgu_lto_type(
     let linker_does_lto = opts.cg.linker_plugin_lto.enabled();
 
     // When we're automatically doing ThinLTO for multi-codegen-unit
-    // builds we don't actually want to LTO the allocator modules if
+    // builds we don't actually want to LTO the allocator module if
     // it shows up. This is due to various linker shenanigans that
     // we'll encounter later.
     let is_allocator = module_kind == ModuleKind::Allocator;
@@ -832,11 +829,17 @@ pub(crate) fn compute_per_cgu_lto_type(
 fn execute_optimize_work_item<B: ExtraBackendMethods>(
     cgcx: &CodegenContext<B>,
     mut module: ModuleCodegen<B::Module>,
-    module_config: &ModuleConfig,
 ) -> WorkItemResult<B> {
+    let _timer = cgcx.prof.generic_activity_with_arg("codegen_module_optimize", &*module.name);
+
     let dcx = cgcx.create_dcx();
     let dcx = dcx.handle();
 
+    let module_config = match module.kind {
+        ModuleKind::Regular => &cgcx.module_config,
+        ModuleKind::Allocator => &cgcx.allocator_config,
+    };
+
     B::optimize(cgcx, dcx, &mut module, module_config);
 
     // After we've done the initial round of optimizations we need to
@@ -848,7 +851,7 @@ fn execute_optimize_work_item<B: ExtraBackendMethods>(
 
     // If we're doing some form of incremental LTO then we need to be sure to
     // save our module to disk first.
-    let bitcode = if cgcx.config(module.kind).emit_pre_lto_bc {
+    let bitcode = if module_config.emit_pre_lto_bc {
         let filename = pre_lto_bitcode_filename(&module.name);
         cgcx.incr_comp_session_dir.as_ref().map(|path| path.join(&filename))
     } else {
@@ -861,7 +864,7 @@ fn execute_optimize_work_item<B: ExtraBackendMethods>(
             WorkItemResult::Finished(module)
         }
         ComputedLtoType::Thin => {
-            let (name, thin_buffer) = B::prepare_thin(module, false);
+            let (name, thin_buffer) = B::prepare_thin(module);
             if let Some(path) = bitcode {
                 fs::write(&path, thin_buffer.data()).unwrap_or_else(|e| {
                     panic!("Error writing pre-lto-bitcode file `{}`: {}", path.display(), e);
@@ -888,8 +891,11 @@ fn execute_optimize_work_item<B: ExtraBackendMethods>(
 fn execute_copy_from_cache_work_item<B: ExtraBackendMethods>(
     cgcx: &CodegenContext<B>,
     module: CachedModuleCodegen,
-    module_config: &ModuleConfig,
 ) -> WorkItemResult<B> {
+    let _timer = cgcx
+        .prof
+        .generic_activity_with_arg("codegen_copy_artifacts_from_incr_cache", &*module.name);
+
     let incr_comp_session_dir = cgcx.incr_comp_session_dir.as_ref().unwrap();
 
     let mut links_from_incr_cache = Vec::new();
@@ -948,6 +954,7 @@ fn execute_copy_from_cache_work_item<B: ExtraBackendMethods>(
         }
     };
 
+    let module_config = &cgcx.module_config;
     let should_emit_obj = module_config.emit_obj != EmitObj::None;
     let assembly = load_from_incr_cache(module_config.emit_asm, OutputType::Assembly);
     let llvm_ir = load_from_incr_cache(module_config.emit_ir, OutputType::LlvmAssembly);
@@ -959,8 +966,8 @@ fn execute_copy_from_cache_work_item<B: ExtraBackendMethods>(
 
     WorkItemResult::Finished(CompiledModule {
         links_from_incr_cache,
-        name: module.name,
         kind: ModuleKind::Regular,
+        name: module.name,
         object,
         dwarf_object,
         bytecode,
@@ -975,8 +982,9 @@ fn execute_fat_lto_work_item<B: ExtraBackendMethods>(
     each_linked_rlib_for_lto: &[PathBuf],
     mut needs_fat_lto: Vec<FatLtoInput<B>>,
     import_only_modules: Vec<(SerializedModule<B::ModuleBuffer>, WorkProduct)>,
-    module_config: &ModuleConfig,
 ) -> WorkItemResult<B> {
+    let _timer = cgcx.prof.generic_activity_with_arg("codegen_module_perform_lto", "everything");
+
     for (module, wp) in import_only_modules {
         needs_fat_lto.push(FatLtoInput::Serialized { name: wp.cgu_name, buffer: module })
     }
@@ -987,17 +995,18 @@ fn execute_fat_lto_work_item<B: ExtraBackendMethods>(
         each_linked_rlib_for_lto,
         needs_fat_lto,
     );
-    let module = B::codegen(cgcx, module, module_config);
+    let module = B::codegen(cgcx, module, &cgcx.module_config);
     WorkItemResult::Finished(module)
 }
 
 fn execute_thin_lto_work_item<B: ExtraBackendMethods>(
     cgcx: &CodegenContext<B>,
     module: lto::ThinModule<B>,
-    module_config: &ModuleConfig,
 ) -> WorkItemResult<B> {
+    let _timer = cgcx.prof.generic_activity_with_arg("codegen_module_perform_lto", module.name());
+
     let module = B::optimize_thin(cgcx, module);
-    let module = B::codegen(cgcx, module, module_config);
+    let module = B::codegen(cgcx, module, &cgcx.module_config);
     WorkItemResult::Finished(module)
 }
 
@@ -1082,6 +1091,7 @@ fn start_executing_work<B: ExtraBackendMethods>(
     coordinator_receive: Receiver<Message<B>>,
     regular_config: Arc<ModuleConfig>,
     allocator_config: Arc<ModuleConfig>,
+    allocator_module: Option<ModuleCodegen<B::Module>>,
     tx_to_llvm_workers: Sender<Message<B>>,
 ) -> thread::JoinHandle<Result<CompiledModules, ()>> {
     let coordinator_send = tx_to_llvm_workers;
@@ -1146,8 +1156,8 @@ fn start_executing_work<B: ExtraBackendMethods>(
         expanded_args: tcx.sess.expanded_args.clone(),
         diag_emitter: shared_emitter.clone(),
         output_filenames: Arc::clone(tcx.output_filenames(())),
-        regular_module_config: regular_config,
-        allocator_module_config: allocator_config,
+        module_config: regular_config,
+        allocator_config,
         tm_factory: backend.target_machine_factory(tcx.sess, ol, backend_features),
         msvc_imps_needed: msvc_imps_needed(tcx),
         is_pe_coff: tcx.sess.target.is_like_windows,
@@ -1301,7 +1311,6 @@ fn start_executing_work<B: ExtraBackendMethods>(
         // This is where we collect codegen units that have gone all the way
         // through codegen and LLVM.
         let mut compiled_modules = vec![];
-        let mut compiled_allocator_module = None;
         let mut needs_fat_lto = Vec::new();
         let mut needs_thin_lto = Vec::new();
         let mut lto_import_only_modules = Vec::new();
@@ -1342,6 +1351,17 @@ fn start_executing_work<B: ExtraBackendMethods>(
 
         let mut llvm_start_time: Option<VerboseTimingGuard<'_>> = None;
 
+        let compiled_allocator_module = allocator_module.and_then(|allocator_module| {
+            match execute_optimize_work_item(&cgcx, allocator_module) {
+                WorkItemResult::Finished(compiled_module) => return Some(compiled_module),
+                WorkItemResult::NeedsFatLto(fat_lto_input) => needs_fat_lto.push(fat_lto_input),
+                WorkItemResult::NeedsThinLto(name, thin_buffer) => {
+                    needs_thin_lto.push((name, thin_buffer))
+                }
+            }
+            None
+        });
+
         // Run the message loop while there's still anything that needs message
         // processing. Note that as soon as codegen is aborted we simply want to
         // wait for all existing work to finish, so many of the conditions here
@@ -1575,15 +1595,7 @@ fn start_executing_work<B: ExtraBackendMethods>(
 
                     match result {
                         Ok(WorkItemResult::Finished(compiled_module)) => {
-                            match compiled_module.kind {
-                                ModuleKind::Regular => {
-                                    compiled_modules.push(compiled_module);
-                                }
-                                ModuleKind::Allocator => {
-                                    assert!(compiled_allocator_module.is_none());
-                                    compiled_allocator_module = Some(compiled_module);
-                                }
-                            }
+                            compiled_modules.push(compiled_module);
                         }
                         Ok(WorkItemResult::NeedsFatLto(fat_lto_input)) => {
                             assert!(!started_lto);
@@ -1711,46 +1723,22 @@ fn spawn_work<'a, B: ExtraBackendMethods>(
     let cgcx = cgcx.clone();
 
     B::spawn_named_thread(cgcx.time_trace, work.short_description(), move || {
-        let result = std::panic::catch_unwind(AssertUnwindSafe(|| {
-            let module_config = cgcx.config(work.module_kind());
-
-            match work {
-                WorkItem::Optimize(m) => {
-                    let _timer =
-                        cgcx.prof.generic_activity_with_arg("codegen_module_optimize", &*m.name);
-                    execute_optimize_work_item(&cgcx, m, module_config)
-                }
-                WorkItem::CopyPostLtoArtifacts(m) => {
-                    let _timer = cgcx.prof.generic_activity_with_arg(
-                        "codegen_copy_artifacts_from_incr_cache",
-                        &*m.name,
-                    );
-                    execute_copy_from_cache_work_item(&cgcx, m, module_config)
-                }
-                WorkItem::FatLto {
-                    exported_symbols_for_lto,
-                    each_linked_rlib_for_lto,
-                    needs_fat_lto,
-                    import_only_modules,
-                } => {
-                    let _timer = cgcx
-                        .prof
-                        .generic_activity_with_arg("codegen_module_perform_lto", "everything");
-                    execute_fat_lto_work_item(
-                        &cgcx,
-                        &exported_symbols_for_lto,
-                        &each_linked_rlib_for_lto,
-                        needs_fat_lto,
-                        import_only_modules,
-                        module_config,
-                    )
-                }
-                WorkItem::ThinLto(m) => {
-                    let _timer =
-                        cgcx.prof.generic_activity_with_arg("codegen_module_perform_lto", m.name());
-                    execute_thin_lto_work_item(&cgcx, m, module_config)
-                }
-            }
+        let result = std::panic::catch_unwind(AssertUnwindSafe(|| match work {
+            WorkItem::Optimize(m) => execute_optimize_work_item(&cgcx, m),
+            WorkItem::CopyPostLtoArtifacts(m) => execute_copy_from_cache_work_item(&cgcx, m),
+            WorkItem::FatLto {
+                exported_symbols_for_lto,
+                each_linked_rlib_for_lto,
+                needs_fat_lto,
+                import_only_modules,
+            } => execute_fat_lto_work_item(
+                &cgcx,
+                &exported_symbols_for_lto,
+                &each_linked_rlib_for_lto,
+                needs_fat_lto,
+                import_only_modules,
+            ),
+            WorkItem::ThinLto(m) => execute_thin_lto_work_item(&cgcx, m),
         }));
 
         let msg = match result {
diff --git a/compiler/rustc_codegen_ssa/src/base.rs b/compiler/rustc_codegen_ssa/src/base.rs
index 8abaf201aba..45b028aa8ef 100644
--- a/compiler/rustc_codegen_ssa/src/base.rs
+++ b/compiler/rustc_codegen_ssa/src/base.rs
@@ -17,6 +17,7 @@ use rustc_hir::lang_items::LangItem;
 use rustc_hir::{ItemId, Target};
 use rustc_middle::middle::codegen_fn_attrs::CodegenFnAttrs;
 use rustc_middle::middle::debugger_visualizer::{DebuggerVisualizerFile, DebuggerVisualizerType};
+use rustc_middle::middle::dependency_format::Dependencies;
 use rustc_middle::middle::exported_symbols::{self, SymbolExportKind};
 use rustc_middle::middle::lang_items;
 use rustc_middle::mir::BinOp;
@@ -630,14 +631,30 @@ pub fn allocator_kind_for_codegen(tcx: TyCtxt<'_>) -> Option<AllocatorKind> {
     // If the crate doesn't have an `allocator_kind` set then there's definitely
     // no shim to generate. Otherwise we also check our dependency graph for all
     // our output crate types. If anything there looks like its a `Dynamic`
-    // linkage, then it's already got an allocator shim and we'll be using that
-    // one instead. If nothing exists then it's our job to generate the
-    // allocator!
-    let any_dynamic_crate = tcx.dependency_formats(()).iter().any(|(_, list)| {
+    // linkage for all crate types we may link as, then it's already got an
+    // allocator shim and we'll be using that one instead. If nothing exists
+    // then it's our job to generate the allocator! If crate types disagree
+    // about whether an allocator shim is necessary or not, we generate one
+    // and let needs_allocator_shim_for_linking decide at link time whether or
+    // not to use it for any particular linker invocation.
+    let all_crate_types_any_dynamic_crate = tcx.dependency_formats(()).iter().all(|(_, list)| {
         use rustc_middle::middle::dependency_format::Linkage;
         list.iter().any(|&linkage| linkage == Linkage::Dynamic)
     });
-    if any_dynamic_crate { None } else { tcx.allocator_kind(()) }
+    if all_crate_types_any_dynamic_crate { None } else { tcx.allocator_kind(()) }
+}
+
+/// Decide if this particular crate type needs an allocator shim linked in, i.e. whether none
+/// of the linkages in `dependency_formats[&crate_type]` is `Linkage::Dynamic`. This may return
+/// true even when `allocator_kind_for_codegen` returns `None`; then no shim shall be linked.
+pub(crate) fn needs_allocator_shim_for_linking(
+    dependency_formats: &Dependencies,
+    crate_type: CrateType,
+) -> bool {
+    use rustc_middle::middle::dependency_format::Linkage;
+    let any_dynamic_crate =
+        dependency_formats[&crate_type].iter().any(|&linkage| linkage == Linkage::Dynamic);
+    !any_dynamic_crate
 }
 
 pub fn codegen_crate<B: ExtraBackendMethods>(
@@ -647,7 +664,7 @@ pub fn codegen_crate<B: ExtraBackendMethods>(
 ) -> OngoingCodegen<B> {
     // Skip crate items and just output metadata in -Z no-codegen mode.
     if tcx.sess.opts.unstable_opts.no_codegen || !tcx.sess.opts.output_types.should_codegen() {
-        let ongoing_codegen = start_async_codegen(backend, tcx, target_cpu);
+        let ongoing_codegen = start_async_codegen(backend, tcx, target_cpu, None);
 
         ongoing_codegen.codegen_finished(tcx);
 
@@ -678,7 +695,27 @@ pub fn codegen_crate<B: ExtraBackendMethods>(
         }
     }
 
-    let ongoing_codegen = start_async_codegen(backend.clone(), tcx, target_cpu);
+    // Codegen an allocator shim, if necessary.
+    let allocator_module = if let Some(kind) = allocator_kind_for_codegen(tcx) {
+        let llmod_id =
+            cgu_name_builder.build_cgu_name(LOCAL_CRATE, &["crate"], Some("allocator")).to_string();
+
+        tcx.sess.time("write_allocator_module", || {
+            let module = backend.codegen_allocator(
+                tcx,
+                &llmod_id,
+                kind,
+                // If allocator_kind is Some then alloc_error_handler_kind must
+                // also be Some.
+                tcx.alloc_error_handler_kind(()).unwrap(),
+            );
+            Some(ModuleCodegen::new_allocator(llmod_id, module))
+        })
+    } else {
+        None
+    };
+
+    let ongoing_codegen = start_async_codegen(backend.clone(), tcx, target_cpu, allocator_module);
 
     // For better throughput during parallel processing by LLVM, we used to sort
     // CGUs largest to smallest. This would lead to better thread utilization
@@ -795,35 +832,6 @@ pub fn codegen_crate<B: ExtraBackendMethods>(
         }
     }
 
-    // Codegen an allocator shim, if necessary.
-    // Do this last to ensure the LLVM_passes timer doesn't start while no CGUs have been codegened
-    // yet for the backend to optimize.
-    if let Some(kind) = allocator_kind_for_codegen(tcx) {
-        let llmod_id =
-            cgu_name_builder.build_cgu_name(LOCAL_CRATE, &["crate"], Some("allocator")).to_string();
-        let module_llvm = tcx.sess.time("write_allocator_module", || {
-            backend.codegen_allocator(
-                tcx,
-                &llmod_id,
-                kind,
-                // If allocator_kind is Some then alloc_error_handler_kind must
-                // also be Some.
-                tcx.alloc_error_handler_kind(()).unwrap(),
-            )
-        });
-
-        ongoing_codegen.wait_for_signal_to_codegen_item();
-        ongoing_codegen.check_for_errors(tcx.sess);
-
-        // These modules are generally cheap and won't throw off scheduling.
-        let cost = 0;
-        submit_codegened_module_to_llvm(
-            &ongoing_codegen.coordinator,
-            ModuleCodegen::new_allocator(llmod_id, module_llvm),
-            cost,
-        );
-    }
-
     ongoing_codegen.codegen_finished(tcx);
 
     // Since the main thread is sometimes blocked during codegen, we keep track
diff --git a/compiler/rustc_codegen_ssa/src/codegen_attrs.rs b/compiler/rustc_codegen_ssa/src/codegen_attrs.rs
index 961bb788149..dc500c363f4 100644
--- a/compiler/rustc_codegen_ssa/src/codegen_attrs.rs
+++ b/compiler/rustc_codegen_ssa/src/codegen_attrs.rs
@@ -428,9 +428,16 @@ fn check_result(
     // llvm/llvm-project#70563).
     if !codegen_fn_attrs.target_features.is_empty()
         && matches!(codegen_fn_attrs.inline, InlineAttr::Always)
+        && !tcx.features().target_feature_inline_always()
         && let Some(span) = interesting_spans.inline
     {
-        tcx.dcx().span_err(span, "cannot use `#[inline(always)]` with `#[target_feature]`");
+        feature_err(
+            tcx.sess,
+            sym::target_feature_inline_always,
+            span,
+            "cannot use `#[inline(always)]` with `#[target_feature]`",
+        )
+        .emit();
     }
 
     // warn that inline has no effect when no_sanitize is present
@@ -555,15 +562,6 @@ fn codegen_fn_attrs(tcx: TyCtxt<'_>, did: LocalDefId) -> CodegenFnAttrs {
     codegen_fn_attrs
 }
 
-/// If the provided DefId is a method in a trait impl, return the DefId of the method prototype.
-fn opt_trait_item(tcx: TyCtxt<'_>, def_id: DefId) -> Option<DefId> {
-    let impl_item = tcx.opt_associated_item(def_id)?;
-    match impl_item.container {
-        ty::AssocItemContainer::Impl => impl_item.trait_item_def_id,
-        _ => None,
-    }
-}
-
 fn disabled_sanitizers_for(tcx: TyCtxt<'_>, did: LocalDefId) -> SanitizerSet {
     // Backtrack to the crate root.
     let mut disabled = match tcx.opt_local_parent(did) {
@@ -593,14 +591,15 @@ fn disabled_sanitizers_for(tcx: TyCtxt<'_>, did: LocalDefId) -> SanitizerSet {
 /// Checks if the provided DefId is a method in a trait impl for a trait which has track_caller
 /// applied to the method prototype.
 fn should_inherit_track_caller(tcx: TyCtxt<'_>, def_id: DefId) -> bool {
-    let Some(trait_item) = opt_trait_item(tcx, def_id) else { return false };
-    tcx.codegen_fn_attrs(trait_item).flags.intersects(CodegenFnAttrFlags::TRACK_CALLER)
+    tcx.trait_item_of(def_id).is_some_and(|id| {
+        tcx.codegen_fn_attrs(id).flags.intersects(CodegenFnAttrFlags::TRACK_CALLER)
+    })
 }
 
 /// If the provided DefId is a method in a trait impl, return the value of the `#[align]`
 /// attribute on the method prototype (if any).
 fn inherited_align<'tcx>(tcx: TyCtxt<'tcx>, def_id: DefId) -> Option<Align> {
-    tcx.codegen_fn_attrs(opt_trait_item(tcx, def_id)?).alignment
+    tcx.codegen_fn_attrs(tcx.trait_item_of(def_id)?).alignment
 }
 
 /// We now check the #\[rustc_autodiff\] attributes which we generated from the #[autodiff(...)]
diff --git a/compiler/rustc_codegen_ssa/src/lib.rs b/compiler/rustc_codegen_ssa/src/lib.rs
index fe0500a5d4c..baba8f9ca3e 100644
--- a/compiler/rustc_codegen_ssa/src/lib.rs
+++ b/compiler/rustc_codegen_ssa/src/lib.rs
@@ -119,7 +119,7 @@ impl<M> ModuleCodegen<M> {
         });
 
         CompiledModule {
-            name: self.name.clone(),
+            name: self.name,
             kind: self.kind,
             object,
             dwarf_object,
diff --git a/compiler/rustc_codegen_ssa/src/mir/block.rs b/compiler/rustc_codegen_ssa/src/mir/block.rs
index c3dc3e42b83..6492ef73956 100644
--- a/compiler/rustc_codegen_ssa/src/mir/block.rs
+++ b/compiler/rustc_codegen_ssa/src/mir/block.rs
@@ -519,6 +519,9 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
             match self.locals[mir::Local::from_usize(1 + va_list_arg_idx)] {
                 LocalRef::Place(va_list) => {
                     bx.va_end(va_list.val.llval);
+
+                    // Explicitly end the lifetime of the `va_list`; this improves LLVM codegen.
+                    bx.lifetime_end(va_list.val.llval, va_list.layout.size);
                 }
                 _ => bug!("C-variadic function must have a `VaList` place"),
             }
diff --git a/compiler/rustc_codegen_ssa/src/mir/mod.rs b/compiler/rustc_codegen_ssa/src/mir/mod.rs
index 06873313e2e..6b109e8b8e2 100644
--- a/compiler/rustc_codegen_ssa/src/mir/mod.rs
+++ b/compiler/rustc_codegen_ssa/src/mir/mod.rs
@@ -438,6 +438,10 @@ fn arg_local_refs<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>(
 
             if fx.fn_abi.c_variadic && arg_index == fx.fn_abi.args.len() {
                 let va_list = PlaceRef::alloca(bx, bx.layout_of(arg_ty));
+
+                // Explicitly start the lifetime of the `va_list`; this improves LLVM codegen.
+                bx.lifetime_start(va_list.val.llval, va_list.layout.size);
+
                 bx.va_start(va_list.val.llval);
 
                 return LocalRef::Place(va_list);
diff --git a/compiler/rustc_codegen_ssa/src/mir/rvalue.rs b/compiler/rustc_codegen_ssa/src/mir/rvalue.rs
index 8a67b8d6e5f..f6f2e3f2a3a 100644
--- a/compiler/rustc_codegen_ssa/src/mir/rvalue.rs
+++ b/compiler/rustc_codegen_ssa/src/mir/rvalue.rs
@@ -1,3 +1,4 @@
+use itertools::Itertools as _;
 use rustc_abi::{self as abi, FIRST_VARIANT};
 use rustc_middle::ty::adjustment::PointerCoercion;
 use rustc_middle::ty::layout::{HasTyCtxt, HasTypingEnv, LayoutOf, TyAndLayout};
@@ -111,14 +112,13 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
                     let size = bx.const_usize(dest.layout.size.bytes());
 
                     // Use llvm.memset.p0i8.* to initialize all same byte arrays
-                    if let Some(int) = bx.cx().const_to_opt_u128(v, false) {
-                        let bytes = &int.to_le_bytes()[..cg_elem.layout.size.bytes_usize()];
-                        let first = bytes[0];
-                        if bytes[1..].iter().all(|&b| b == first) {
-                            let fill = bx.cx().const_u8(first);
-                            bx.memset(start, fill, size, dest.val.align, MemFlags::empty());
-                            return true;
-                        }
+                    if let Some(int) = bx.cx().const_to_opt_u128(v, false)
+                        && let bytes = &int.to_le_bytes()[..cg_elem.layout.size.bytes_usize()]
+                        && let Ok(&byte) = bytes.iter().all_equal_value()
+                    {
+                        let fill = bx.cx().const_u8(byte);
+                        bx.memset(start, fill, size, dest.val.align, MemFlags::empty());
+                        return true;
                     }
 
                     // Use llvm.memset.p0i8.* to initialize byte arrays
@@ -130,13 +130,10 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
                     false
                 };
 
-                match cg_elem.val {
-                    OperandValue::Immediate(v) => {
-                        if try_init_all_same(bx, v) {
-                            return;
-                        }
-                    }
-                    _ => (),
+                if let OperandValue::Immediate(v) = cg_elem.val
+                    && try_init_all_same(bx, v)
+                {
+                    return;
                 }
 
                 let count = self
diff --git a/compiler/rustc_codegen_ssa/src/traits/write.rs b/compiler/rustc_codegen_ssa/src/traits/write.rs
index cc7c4e46d7b..1ac1d7ef2e2 100644
--- a/compiler/rustc_codegen_ssa/src/traits/write.rs
+++ b/compiler/rustc_codegen_ssa/src/traits/write.rs
@@ -50,16 +50,12 @@ pub trait WriteBackendMethods: Clone + 'static {
         module: ModuleCodegen<Self::Module>,
         config: &ModuleConfig,
     ) -> CompiledModule;
-    fn prepare_thin(
-        module: ModuleCodegen<Self::Module>,
-        want_summary: bool,
-    ) -> (String, Self::ThinBuffer);
+    fn prepare_thin(module: ModuleCodegen<Self::Module>) -> (String, Self::ThinBuffer);
     fn serialize_module(module: ModuleCodegen<Self::Module>) -> (String, Self::ModuleBuffer);
 }
 
 pub trait ThinBufferMethods: Send + Sync {
     fn data(&self) -> &[u8];
-    fn thin_link_data(&self) -> &[u8];
 }
 
 pub trait ModuleBufferMethods: Send + Sync {