about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--compiler/rustc_codegen_gcc/src/back/lto.rs9
-rw-r--r--compiler/rustc_codegen_gcc/src/base.rs11
-rw-r--r--compiler/rustc_codegen_gcc/src/lib.rs2
-rw-r--r--compiler/rustc_codegen_llvm/src/back/lto.rs27
-rw-r--r--compiler/rustc_codegen_llvm/src/back/write.rs126
-rw-r--r--compiler/rustc_codegen_llvm/src/base.rs8
-rw-r--r--compiler/rustc_codegen_llvm/src/lib.rs2
-rw-r--r--compiler/rustc_codegen_llvm/src/llvm/ffi.rs4
-rw-r--r--compiler/rustc_codegen_ssa/src/back/write.rs8
-rw-r--r--compiler/rustc_codegen_ssa/src/base.rs2
-rw-r--r--compiler/rustc_codegen_ssa/src/lib.rs20
-rw-r--r--compiler/rustc_codegen_ssa/src/traits/write.rs2
-rw-r--r--compiler/rustc_llvm/llvm-wrapper/PassWrapper.cpp66
-rw-r--r--compiler/rustc_session/src/config.rs3
-rw-r--r--tests/codegen/overaligned-constant.rs4
-rw-r--r--tests/codegen/uninit-consts.rs8
-rw-r--r--tests/run-make/pgo-embed-bc-lto/interesting.rs16
-rw-r--r--tests/run-make/pgo-embed-bc-lto/main.rs5
-rw-r--r--tests/run-make/pgo-embed-bc-lto/opaque.rs5
-rw-r--r--tests/run-make/pgo-embed-bc-lto/rmake.rs67
20 files changed, 294 insertions, 101 deletions
diff --git a/compiler/rustc_codegen_gcc/src/back/lto.rs b/compiler/rustc_codegen_gcc/src/back/lto.rs
index cb4caec8c32..e5221c7da31 100644
--- a/compiler/rustc_codegen_gcc/src/back/lto.rs
+++ b/compiler/rustc_codegen_gcc/src/back/lto.rs
@@ -632,17 +632,16 @@ pub unsafe fn optimize_thin_module(
             Arc::new(SyncContext::new(context))
         }
     };
-    let module = ModuleCodegen {
-        module_llvm: GccContext {
+    let module = ModuleCodegen::new_regular(
+        thin_module.name().to_string(),
+        GccContext {
             context,
             should_combine_object_files,
             // TODO(antoyo): use the correct relocation model here.
             relocation_model: RelocModel::Pic,
             temp_dir: None,
         },
-        name: thin_module.name().to_string(),
-        kind: ModuleKind::Regular,
-    };
+    );
     /*{
         let target = &*module.module_llvm.tm;
         let llmod = module.module_llvm.llmod();
diff --git a/compiler/rustc_codegen_gcc/src/base.rs b/compiler/rustc_codegen_gcc/src/base.rs
index 962f4b161d7..9b495174a3f 100644
--- a/compiler/rustc_codegen_gcc/src/base.rs
+++ b/compiler/rustc_codegen_gcc/src/base.rs
@@ -4,10 +4,10 @@ use std::sync::Arc;
 use std::time::Instant;
 
 use gccjit::{CType, Context, FunctionType, GlobalKind};
+use rustc_codegen_ssa::ModuleCodegen;
 use rustc_codegen_ssa::base::maybe_create_entry_wrapper;
 use rustc_codegen_ssa::mono_item::MonoItemExt;
 use rustc_codegen_ssa::traits::DebugInfoCodegenMethods;
-use rustc_codegen_ssa::{ModuleCodegen, ModuleKind};
 use rustc_middle::dep_graph;
 use rustc_middle::mir::mono::Linkage;
 #[cfg(feature = "master")]
@@ -237,16 +237,15 @@ pub fn compile_codegen_unit(
             }
         }
 
-        ModuleCodegen {
-            name: cgu_name.to_string(),
-            module_llvm: GccContext {
+        ModuleCodegen::new_regular(
+            cgu_name.to_string(),
+            GccContext {
                 context: Arc::new(SyncContext::new(context)),
                 relocation_model: tcx.sess.relocation_model(),
                 should_combine_object_files: false,
                 temp_dir: None,
             },
-            kind: ModuleKind::Regular,
-        }
+        )
     }
 
     (module, cost)
diff --git a/compiler/rustc_codegen_gcc/src/lib.rs b/compiler/rustc_codegen_gcc/src/lib.rs
index 6455bcec685..9d91aab72ab 100644
--- a/compiler/rustc_codegen_gcc/src/lib.rs
+++ b/compiler/rustc_codegen_gcc/src/lib.rs
@@ -393,7 +393,7 @@ impl WriteBackendMethods for GccCodegenBackend {
     unsafe fn optimize(
         _cgcx: &CodegenContext<Self>,
         _dcx: DiagCtxtHandle<'_>,
-        module: &ModuleCodegen<Self::Module>,
+        module: &mut ModuleCodegen<Self::Module>,
         config: &ModuleConfig,
     ) -> Result<(), FatalError> {
         module.module_llvm.context.set_optimization_level(to_gcc_opt_level(config.opt_level));
diff --git a/compiler/rustc_codegen_llvm/src/back/lto.rs b/compiler/rustc_codegen_llvm/src/back/lto.rs
index 05a2cd1c5a3..668795191a2 100644
--- a/compiler/rustc_codegen_llvm/src/back/lto.rs
+++ b/compiler/rustc_codegen_llvm/src/back/lto.rs
@@ -2,6 +2,7 @@ use std::collections::BTreeMap;
 use std::ffi::{CStr, CString};
 use std::fs::File;
 use std::path::Path;
+use std::ptr::NonNull;
 use std::sync::Arc;
 use std::{io, iter, slice};
 
@@ -305,11 +306,8 @@ fn fat_lto(
             assert!(!serialized_modules.is_empty(), "must have at least one serialized module");
             let (buffer, name) = serialized_modules.remove(0);
             info!("no in-memory regular modules to choose from, parsing {:?}", name);
-            ModuleCodegen {
-                module_llvm: ModuleLlvm::parse(cgcx, &name, buffer.data(), dcx)?,
-                name: name.into_string().unwrap(),
-                kind: ModuleKind::Regular,
-            }
+            let llvm_module = ModuleLlvm::parse(cgcx, &name, buffer.data(), dcx)?;
+            ModuleCodegen::new_regular(name.into_string().unwrap(), llvm_module)
         }
     };
     {
@@ -655,14 +653,14 @@ pub(crate) fn run_pass_manager(
     }
 
     unsafe {
-        write::llvm_optimize(cgcx, dcx, module, config, opt_level, opt_stage, stage)?;
+        write::llvm_optimize(cgcx, dcx, module, None, config, opt_level, opt_stage, stage)?;
     }
 
     if cfg!(llvm_enzyme) && enable_ad {
         let opt_stage = llvm::OptStage::FatLTO;
         let stage = write::AutodiffStage::PostAD;
         unsafe {
-            write::llvm_optimize(cgcx, dcx, module, config, opt_level, opt_stage, stage)?;
+            write::llvm_optimize(cgcx, dcx, module, None, config, opt_level, opt_stage, stage)?;
         }
 
         // This is the final IR, so people should be able to inspect the optimized autodiff output.
@@ -729,6 +727,11 @@ impl ThinBuffer {
             ThinBuffer(buffer)
         }
     }
+
+    pub unsafe fn from_raw_ptr(ptr: *mut llvm::ThinLTOBuffer) -> ThinBuffer {
+        let mut ptr = NonNull::new(ptr).unwrap();
+        ThinBuffer(unsafe { ptr.as_mut() })
+    }
 }
 
 impl ThinBufferMethods for ThinBuffer {
@@ -772,11 +775,11 @@ pub(crate) unsafe fn optimize_thin_module(
     // crates but for locally codegened modules we may be able to reuse
     // that LLVM Context and Module.
     let module_llvm = ModuleLlvm::parse(cgcx, module_name, thin_module.data(), dcx)?;
-    let mut module = ModuleCodegen {
-        module_llvm,
-        name: thin_module.name().to_string(),
-        kind: ModuleKind::Regular,
-    };
+    let mut module = ModuleCodegen::new_regular(thin_module.name(), module_llvm);
+    // Given that the newly created module lacks a thinlto buffer for embedding, we need to re-add it here.
+    if cgcx.config(ModuleKind::Regular).embed_bitcode() {
+        module.thin_lto_buffer = Some(thin_module.data().to_vec());
+    }
     {
         let target = &*module.module_llvm.tm;
         let llmod = module.module_llvm.llmod();
diff --git a/compiler/rustc_codegen_llvm/src/back/write.rs b/compiler/rustc_codegen_llvm/src/back/write.rs
index 29d6121844f..bead4c82a81 100644
--- a/compiler/rustc_codegen_llvm/src/back/write.rs
+++ b/compiler/rustc_codegen_llvm/src/back/write.rs
@@ -1,6 +1,7 @@
 use std::ffi::{CStr, CString};
 use std::io::{self, Write};
 use std::path::{Path, PathBuf};
+use std::ptr::null_mut;
 use std::sync::Arc;
 use std::{fs, slice, str};
 
@@ -15,7 +16,7 @@ use rustc_codegen_ssa::back::write::{
     TargetMachineFactoryFn,
 };
 use rustc_codegen_ssa::traits::*;
-use rustc_codegen_ssa::{CompiledModule, ModuleCodegen};
+use rustc_codegen_ssa::{CompiledModule, ModuleCodegen, ModuleKind};
 use rustc_data_structures::profiling::SelfProfilerRef;
 use rustc_data_structures::small_c_str::SmallCStr;
 use rustc_errors::{DiagCtxtHandle, FatalError, Level};
@@ -551,6 +552,7 @@ pub(crate) unsafe fn llvm_optimize(
     cgcx: &CodegenContext<LlvmCodegenBackend>,
     dcx: DiagCtxtHandle<'_>,
     module: &ModuleCodegen<ModuleLlvm>,
+    thin_lto_buffer: Option<&mut *mut llvm::ThinLTOBuffer>,
     config: &ModuleConfig,
     opt_level: config::OptLevel,
     opt_stage: llvm::OptStage,
@@ -584,7 +586,17 @@ pub(crate) unsafe fn llvm_optimize(
         vectorize_loop = config.vectorize_loop;
     }
     trace!(?unroll_loops, ?vectorize_slp, ?vectorize_loop, ?run_enzyme);
-    let using_thin_buffers = opt_stage == llvm::OptStage::PreLinkThinLTO || config.bitcode_needed();
+    if thin_lto_buffer.is_some() {
+        assert!(
+            matches!(
+                opt_stage,
+                llvm::OptStage::PreLinkNoLTO
+                    | llvm::OptStage::PreLinkFatLTO
+                    | llvm::OptStage::PreLinkThinLTO
+            ),
+            "the bitcode for LTO can only be obtained at the pre-link stage"
+        );
+    }
     let pgo_gen_path = get_pgo_gen_path(config);
     let pgo_use_path = get_pgo_use_path(config);
     let pgo_sample_use_path = get_pgo_sample_use_path(config);
@@ -644,7 +656,9 @@ pub(crate) unsafe fn llvm_optimize(
             config.no_prepopulate_passes,
             config.verify_llvm_ir,
             config.lint_llvm_ir,
-            using_thin_buffers,
+            thin_lto_buffer,
+            config.emit_thin_lto,
+            config.emit_thin_lto_summary,
             config.merge_functions,
             unroll_loops,
             vectorize_slp,
@@ -675,7 +689,7 @@ pub(crate) unsafe fn llvm_optimize(
 pub(crate) unsafe fn optimize(
     cgcx: &CodegenContext<LlvmCodegenBackend>,
     dcx: DiagCtxtHandle<'_>,
-    module: &ModuleCodegen<ModuleLlvm>,
+    module: &mut ModuleCodegen<ModuleLlvm>,
     config: &ModuleConfig,
 ) -> Result<(), FatalError> {
     let _timer = cgcx.prof.generic_activity_with_arg("LLVM_module_optimize", &*module.name);
@@ -705,9 +719,53 @@ pub(crate) unsafe fn optimize(
         // Otherwise we pretend AD is already done and run the normal opt pipeline (=PostAD).
         let consider_ad = cfg!(llvm_enzyme) && config.autodiff.contains(&config::AutoDiff::Enable);
         let autodiff_stage = if consider_ad { AutodiffStage::PreAD } else { AutodiffStage::PostAD };
-        return unsafe {
-            llvm_optimize(cgcx, dcx, module, config, opt_level, opt_stage, autodiff_stage)
+        // The embedded bitcode is used to run LTO/ThinLTO.
+        // The bitcode obtained during the `codegen` phase is no longer suitable for performing LTO.
+        // It may have undergone LTO due to ThinLocal, so we need to obtain the embedded bitcode at
+        // this point.
+        let mut thin_lto_buffer = if (module.kind == ModuleKind::Regular
+            && config.emit_obj == EmitObj::ObjectCode(BitcodeSection::Full))
+            || config.emit_thin_lto_summary
+        {
+            Some(null_mut())
+        } else {
+            None
         };
+        unsafe {
+            llvm_optimize(
+                cgcx,
+                dcx,
+                module,
+                thin_lto_buffer.as_mut(),
+                config,
+                opt_level,
+                opt_stage,
+                autodiff_stage,
+            )
+        }?;
+        if let Some(thin_lto_buffer) = thin_lto_buffer {
+            let thin_lto_buffer = unsafe { ThinBuffer::from_raw_ptr(thin_lto_buffer) };
+            module.thin_lto_buffer = Some(thin_lto_buffer.data().to_vec());
+            let bc_summary_out =
+                cgcx.output_filenames.temp_path(OutputType::ThinLinkBitcode, module_name);
+            if config.emit_thin_lto_summary
+                && let Some(thin_link_bitcode_filename) = bc_summary_out.file_name()
+            {
+                let summary_data = thin_lto_buffer.thin_link_data();
+                cgcx.prof.artifact_size(
+                    "llvm_bitcode_summary",
+                    thin_link_bitcode_filename.to_string_lossy(),
+                    summary_data.len() as u64,
+                );
+                let _timer = cgcx.prof.generic_activity_with_arg(
+                    "LLVM_module_codegen_emit_bitcode_summary",
+                    &*module.name,
+                );
+                if let Err(err) = fs::write(&bc_summary_out, summary_data) {
+                    dcx.emit_err(WriteBytecode { path: &bc_summary_out, err });
+                }
+            }
+        }
     }
     Ok(())
 }
@@ -760,59 +818,41 @@ pub(crate) unsafe fn codegen(
         // otherwise requested.
 
         let bc_out = cgcx.output_filenames.temp_path(OutputType::Bitcode, module_name);
-        let bc_summary_out =
-            cgcx.output_filenames.temp_path(OutputType::ThinLinkBitcode, module_name);
         let obj_out = cgcx.output_filenames.temp_path(OutputType::Object, module_name);
 
         if config.bitcode_needed() {
-            let _timer = cgcx
-                .prof
-                .generic_activity_with_arg("LLVM_module_codegen_make_bitcode", &*module.name);
-            let thin = ThinBuffer::new(llmod, config.emit_thin_lto, config.emit_thin_lto_summary);
-            let data = thin.data();
-
-            if let Some(bitcode_filename) = bc_out.file_name() {
-                cgcx.prof.artifact_size(
-                    "llvm_bitcode",
-                    bitcode_filename.to_string_lossy(),
-                    data.len() as u64,
-                );
-            }
-
-            if config.emit_thin_lto_summary
-                && let Some(thin_link_bitcode_filename) = bc_summary_out.file_name()
-            {
-                let summary_data = thin.thin_link_data();
-                cgcx.prof.artifact_size(
-                    "llvm_bitcode_summary",
-                    thin_link_bitcode_filename.to_string_lossy(),
-                    summary_data.len() as u64,
-                );
-
-                let _timer = cgcx.prof.generic_activity_with_arg(
-                    "LLVM_module_codegen_emit_bitcode_summary",
-                    &*module.name,
-                );
-                if let Err(err) = fs::write(&bc_summary_out, summary_data) {
-                    dcx.emit_err(WriteBytecode { path: &bc_summary_out, err });
-                }
-            }
-
             if config.emit_bc || config.emit_obj == EmitObj::Bitcode {
+                let thin = {
+                    let _timer = cgcx.prof.generic_activity_with_arg(
+                        "LLVM_module_codegen_make_bitcode",
+                        &*module.name,
+                    );
+                    ThinBuffer::new(llmod, config.emit_thin_lto, false)
+                };
+                let data = thin.data();
                 let _timer = cgcx
                     .prof
                     .generic_activity_with_arg("LLVM_module_codegen_emit_bitcode", &*module.name);
+                if let Some(bitcode_filename) = bc_out.file_name() {
+                    cgcx.prof.artifact_size(
+                        "llvm_bitcode",
+                        bitcode_filename.to_string_lossy(),
+                        data.len() as u64,
+                    );
+                }
                 if let Err(err) = fs::write(&bc_out, data) {
                     dcx.emit_err(WriteBytecode { path: &bc_out, err });
                 }
             }
 
-            if config.emit_obj == EmitObj::ObjectCode(BitcodeSection::Full) {
+            if config.embed_bitcode() && module.kind == ModuleKind::Regular {
                 let _timer = cgcx
                     .prof
                     .generic_activity_with_arg("LLVM_module_codegen_embed_bitcode", &*module.name);
+                let thin_bc =
+                    module.thin_lto_buffer.as_deref().expect("cannot find embedded bitcode");
                 unsafe {
-                    embed_bitcode(cgcx, llcx, llmod, &config.bc_cmdline, data);
+                    embed_bitcode(cgcx, llcx, llmod, &config.bc_cmdline, &thin_bc);
                 }
             }
         }
diff --git a/compiler/rustc_codegen_llvm/src/base.rs b/compiler/rustc_codegen_llvm/src/base.rs
index d35c7945bae..6bd27914dbd 100644
--- a/compiler/rustc_codegen_llvm/src/base.rs
+++ b/compiler/rustc_codegen_llvm/src/base.rs
@@ -13,10 +13,10 @@
 
 use std::time::Instant;
 
+use rustc_codegen_ssa::ModuleCodegen;
 use rustc_codegen_ssa::base::maybe_create_entry_wrapper;
 use rustc_codegen_ssa::mono_item::MonoItemExt;
 use rustc_codegen_ssa::traits::*;
-use rustc_codegen_ssa::{ModuleCodegen, ModuleKind};
 use rustc_data_structures::small_c_str::SmallCStr;
 use rustc_middle::dep_graph;
 use rustc_middle::middle::codegen_fn_attrs::CodegenFnAttrs;
@@ -133,11 +133,7 @@ pub(crate) fn compile_codegen_unit(
             }
         }
 
-        ModuleCodegen {
-            name: cgu_name.to_string(),
-            module_llvm: llvm_module,
-            kind: ModuleKind::Regular,
-        }
+        ModuleCodegen::new_regular(cgu_name.to_string(), llvm_module)
     }
 
     (module, cost)
diff --git a/compiler/rustc_codegen_llvm/src/lib.rs b/compiler/rustc_codegen_llvm/src/lib.rs
index e9e1b644f18..c88372db491 100644
--- a/compiler/rustc_codegen_llvm/src/lib.rs
+++ b/compiler/rustc_codegen_llvm/src/lib.rs
@@ -194,7 +194,7 @@ impl WriteBackendMethods for LlvmCodegenBackend {
     unsafe fn optimize(
         cgcx: &CodegenContext<Self>,
         dcx: DiagCtxtHandle<'_>,
-        module: &ModuleCodegen<Self::Module>,
+        module: &mut ModuleCodegen<Self::Module>,
         config: &ModuleConfig,
     ) -> Result<(), FatalError> {
         unsafe { back::write::optimize(cgcx, dcx, module, config) }
diff --git a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs
index da91e6edbcf..2dc14e4613d 100644
--- a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs
+++ b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs
@@ -2425,7 +2425,9 @@ unsafe extern "C" {
         NoPrepopulatePasses: bool,
         VerifyIR: bool,
         LintIR: bool,
-        UseThinLTOBuffers: bool,
+        ThinLTOBuffer: Option<&mut *mut ThinLTOBuffer>,
+        EmitThinLTO: bool,
+        EmitThinLTOSummary: bool,
         MergeFunctions: bool,
         UnrollLoops: bool,
         SLPVectorize: bool,
diff --git a/compiler/rustc_codegen_ssa/src/back/write.rs b/compiler/rustc_codegen_ssa/src/back/write.rs
index f008bd12ed8..c8bb229998e 100644
--- a/compiler/rustc_codegen_ssa/src/back/write.rs
+++ b/compiler/rustc_codegen_ssa/src/back/write.rs
@@ -278,6 +278,10 @@ impl ModuleConfig {
             || self.emit_obj == EmitObj::Bitcode
             || self.emit_obj == EmitObj::ObjectCode(BitcodeSection::Full)
     }
+
+    pub fn embed_bitcode(&self) -> bool {
+        self.emit_obj == EmitObj::ObjectCode(BitcodeSection::Full)
+    }
 }
 
 /// Configuration passed to the function returned by the `target_machine_factory`.
@@ -877,14 +881,14 @@ pub(crate) fn compute_per_cgu_lto_type(
 
 fn execute_optimize_work_item<B: ExtraBackendMethods>(
     cgcx: &CodegenContext<B>,
-    module: ModuleCodegen<B::Module>,
+    mut module: ModuleCodegen<B::Module>,
     module_config: &ModuleConfig,
 ) -> Result<WorkItemResult<B>, FatalError> {
     let dcx = cgcx.create_dcx();
     let dcx = dcx.handle();
 
     unsafe {
-        B::optimize(cgcx, dcx, &module, module_config)?;
+        B::optimize(cgcx, dcx, &mut module, module_config)?;
     }
 
     // After we've done the initial round of optimizations we need to
diff --git a/compiler/rustc_codegen_ssa/src/base.rs b/compiler/rustc_codegen_ssa/src/base.rs
index 73a97d32c2d..63f2f8fa3d1 100644
--- a/compiler/rustc_codegen_ssa/src/base.rs
+++ b/compiler/rustc_codegen_ssa/src/base.rs
@@ -687,7 +687,7 @@ pub fn codegen_crate<B: ExtraBackendMethods>(
         submit_codegened_module_to_llvm(
             &backend,
             &ongoing_codegen.coordinator.sender,
-            ModuleCodegen { name: llmod_id, module_llvm, kind: ModuleKind::Allocator },
+            ModuleCodegen::new_allocator(llmod_id, module_llvm),
             cost,
         );
     }
diff --git a/compiler/rustc_codegen_ssa/src/lib.rs b/compiler/rustc_codegen_ssa/src/lib.rs
index 9d2ac219d59..4e758bfdec3 100644
--- a/compiler/rustc_codegen_ssa/src/lib.rs
+++ b/compiler/rustc_codegen_ssa/src/lib.rs
@@ -75,9 +75,29 @@ pub struct ModuleCodegen<M> {
     pub name: String,
     pub module_llvm: M,
     pub kind: ModuleKind,
+    /// Saving the ThinLTO buffer for embedding in the object file.
+    pub thin_lto_buffer: Option<Vec<u8>>,
 }
 
 impl<M> ModuleCodegen<M> {
+    pub fn new_regular(name: impl Into<String>, module: M) -> Self {
+        Self {
+            name: name.into(),
+            module_llvm: module,
+            kind: ModuleKind::Regular,
+            thin_lto_buffer: None,
+        }
+    }
+
+    pub fn new_allocator(name: impl Into<String>, module: M) -> Self {
+        Self {
+            name: name.into(),
+            module_llvm: module,
+            kind: ModuleKind::Allocator,
+            thin_lto_buffer: None,
+        }
+    }
+
     pub fn into_compiled_module(
         self,
         emit_obj: bool,
diff --git a/compiler/rustc_codegen_ssa/src/traits/write.rs b/compiler/rustc_codegen_ssa/src/traits/write.rs
index 97fe614aa10..c77efdd1728 100644
--- a/compiler/rustc_codegen_ssa/src/traits/write.rs
+++ b/compiler/rustc_codegen_ssa/src/traits/write.rs
@@ -40,7 +40,7 @@ pub trait WriteBackendMethods: 'static + Sized + Clone {
     unsafe fn optimize(
         cgcx: &CodegenContext<Self>,
         dcx: DiagCtxtHandle<'_>,
-        module: &ModuleCodegen<Self::Module>,
+        module: &mut ModuleCodegen<Self::Module>,
         config: &ModuleConfig,
     ) -> Result<(), FatalError>;
     fn optimize_fat(
diff --git a/compiler/rustc_llvm/llvm-wrapper/PassWrapper.cpp b/compiler/rustc_llvm/llvm-wrapper/PassWrapper.cpp
index 9ce4abdb432..6e607baebb5 100644
--- a/compiler/rustc_llvm/llvm-wrapper/PassWrapper.cpp
+++ b/compiler/rustc_llvm/llvm-wrapper/PassWrapper.cpp
@@ -7,6 +7,7 @@
 #include "llvm/Analysis/Lint.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/Bitcode/BitcodeWriterPass.h"
 #include "llvm/CodeGen/CommandFlags.h"
 #include "llvm/IR/AssemblyAnnotationWriter.h"
 #include "llvm/IR/AutoUpgrade.h"
@@ -37,6 +38,7 @@
 #include "llvm/Transforms/Instrumentation/InstrProfiling.h"
 #include "llvm/Transforms/Instrumentation/MemorySanitizer.h"
 #include "llvm/Transforms/Instrumentation/ThreadSanitizer.h"
+#include "llvm/Transforms/Scalar/AnnotationRemarks.h"
 #include "llvm/Transforms/Utils/CanonicalizeAliases.h"
 #include "llvm/Transforms/Utils/FunctionImportUtils.h"
 #include "llvm/Transforms/Utils/NameAnonGlobals.h"
@@ -195,6 +197,19 @@ extern "C" void LLVMRustTimeTraceProfilerFinish(const char *FileName) {
 GEN_SUBTARGETS
 #undef SUBTARGET
 
+// This struct and various functions are sort of a hack right now, but the
+// problem is that we've got in-memory LLVM modules after we generate and
+// optimize all codegen-units for one compilation in rustc. To be compatible
+// with the LTO support above we need to serialize the modules plus their
+// ThinLTO summary into memory.
+//
+// This structure is basically an owned version of a serialize module, with
+// a ThinLTO summary attached.
+struct LLVMRustThinLTOBuffer {
+  std::string data;
+  std::string thin_link_data;
+};
+
 extern "C" bool LLVMRustHasFeature(LLVMTargetMachineRef TM,
                                    const char *Feature) {
   TargetMachine *Target = unwrap(TM);
@@ -704,7 +719,8 @@ extern "C" LLVMRustResult LLVMRustOptimize(
     LLVMModuleRef ModuleRef, LLVMTargetMachineRef TMRef,
     LLVMRustPassBuilderOptLevel OptLevelRust, LLVMRustOptStage OptStage,
     bool IsLinkerPluginLTO, bool NoPrepopulatePasses, bool VerifyIR,
-    bool LintIR, bool UseThinLTOBuffers, bool MergeFunctions, bool UnrollLoops,
+    bool LintIR, LLVMRustThinLTOBuffer **ThinLTOBufferRef, bool EmitThinLTO,
+    bool EmitThinLTOSummary, bool MergeFunctions, bool UnrollLoops,
     bool SLPVectorize, bool LoopVectorize, bool DisableSimplifyLibCalls,
     bool EmitLifetimeMarkers, bool RunEnzyme,
     LLVMRustSanitizerOptions *SanitizerOptions, const char *PGOGenPath,
@@ -952,7 +968,10 @@ extern "C" LLVMRustResult LLVMRustOptimize(
   }
 
   ModulePassManager MPM;
-  bool NeedThinLTOBufferPasses = UseThinLTOBuffers;
+  bool NeedThinLTOBufferPasses = EmitThinLTO;
+  auto ThinLTOBuffer = std::make_unique<LLVMRustThinLTOBuffer>();
+  raw_string_ostream ThinLTODataOS(ThinLTOBuffer->data);
+  raw_string_ostream ThinLinkDataOS(ThinLTOBuffer->thin_link_data);
   if (!NoPrepopulatePasses) {
     // The pre-link pipelines don't support O0 and require using
     // buildO0DefaultPipeline() instead. At the same time, the LTO pipelines do
@@ -976,7 +995,25 @@ extern "C" LLVMRustResult LLVMRustOptimize(
 
       switch (OptStage) {
       case LLVMRustOptStage::PreLinkNoLTO:
-        MPM = PB.buildPerModuleDefaultPipeline(OptLevel);
+        if (ThinLTOBufferRef) {
+          // This is similar to LLVM's `buildFatLTODefaultPipeline`, where the
+          // bitcode for embedding is obtained after performing
+          // `ThinLTOPreLinkDefaultPipeline`.
+          MPM.addPass(PB.buildThinLTOPreLinkDefaultPipeline(OptLevel));
+          if (EmitThinLTO) {
+            MPM.addPass(ThinLTOBitcodeWriterPass(
+                ThinLTODataOS, EmitThinLTOSummary ? &ThinLinkDataOS : nullptr));
+          } else {
+            MPM.addPass(BitcodeWriterPass(ThinLTODataOS));
+          }
+          *ThinLTOBufferRef = ThinLTOBuffer.release();
+          MPM.addPass(PB.buildModuleOptimizationPipeline(
+              OptLevel, ThinOrFullLTOPhase::None));
+          MPM.addPass(
+              createModuleToFunctionPassAdaptor(AnnotationRemarksPass()));
+        } else {
+          MPM = PB.buildPerModuleDefaultPipeline(OptLevel);
+        }
         break;
       case LLVMRustOptStage::PreLinkThinLTO:
         MPM = PB.buildThinLTOPreLinkDefaultPipeline(OptLevel);
@@ -1022,6 +1059,16 @@ extern "C" LLVMRustResult LLVMRustOptimize(
     MPM.addPass(CanonicalizeAliasesPass());
     MPM.addPass(NameAnonGlobalPass());
   }
+  // For `-Copt-level=0`, ThinLTO, or LTO.
+  if (ThinLTOBufferRef && *ThinLTOBufferRef == nullptr) {
+    if (EmitThinLTO) {
+      MPM.addPass(ThinLTOBitcodeWriterPass(
+          ThinLTODataOS, EmitThinLTOSummary ? &ThinLinkDataOS : nullptr));
+    } else {
+      MPM.addPass(BitcodeWriterPass(ThinLTODataOS));
+    }
+    *ThinLTOBufferRef = ThinLTOBuffer.release();
+  }
 
   // now load "-enzyme" pass:
 #ifdef ENZYME
@@ -1500,19 +1547,6 @@ extern "C" bool LLVMRustPrepareThinLTOImport(const LLVMRustThinLTOData *Data,
   return true;
 }
 
-// This struct and various functions are sort of a hack right now, but the
-// problem is that we've got in-memory LLVM modules after we generate and
-// optimize all codegen-units for one compilation in rustc. To be compatible
-// with the LTO support above we need to serialize the modules plus their
-// ThinLTO summary into memory.
-//
-// This structure is basically an owned version of a serialize module, with
-// a ThinLTO summary attached.
-struct LLVMRustThinLTOBuffer {
-  std::string data;
-  std::string thin_link_data;
-};
-
 extern "C" LLVMRustThinLTOBuffer *
 LLVMRustThinLTOBufferCreate(LLVMModuleRef M, bool is_thin, bool emit_summary) {
   auto Ret = std::make_unique<LLVMRustThinLTOBuffer>();
diff --git a/compiler/rustc_session/src/config.rs b/compiler/rustc_session/src/config.rs
index ba7ccd6a02d..7586c5766b5 100644
--- a/compiler/rustc_session/src/config.rs
+++ b/compiler/rustc_session/src/config.rs
@@ -539,7 +539,10 @@ impl FromStr for SplitDwarfKind {
 #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, PartialOrd, Ord, HashStable_Generic)]
 #[derive(Encodable, Decodable)]
 pub enum OutputType {
+    /// This is the optimized bitcode, which could be either pre-LTO or non-LTO bitcode,
+    /// depending on the specific request type.
     Bitcode,
+    /// This is the summary or index data part of the ThinLTO bitcode.
     ThinLinkBitcode,
     Assembly,
     LlvmAssembly,
diff --git a/tests/codegen/overaligned-constant.rs b/tests/codegen/overaligned-constant.rs
index e5540aca387..0f5977880f2 100644
--- a/tests/codegen/overaligned-constant.rs
+++ b/tests/codegen/overaligned-constant.rs
@@ -9,14 +9,14 @@ struct S(i32);
 
 struct SmallStruct(f32, Option<S>, &'static [f32]);
 
-// CHECK: @0 = private unnamed_addr constant
+// CHECK: [[const:@.*]] = private unnamed_addr constant
 // CHECK-SAME: , align 8
 
 #[no_mangle]
 pub fn overaligned_constant() {
     // CHECK-LABEL: @overaligned_constant
     // CHECK: [[full:%_.*]] = alloca [32 x i8], align 8
-    // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[full]], ptr align 8 @0, i64 32, i1 false)
+    // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[full]], ptr align 8 [[const]], i64 32, i1 false)
     let mut s = S(1);
 
     s.0 = 3;
diff --git a/tests/codegen/uninit-consts.rs b/tests/codegen/uninit-consts.rs
index 649927b87b4..a58008e171e 100644
--- a/tests/codegen/uninit-consts.rs
+++ b/tests/codegen/uninit-consts.rs
@@ -11,15 +11,15 @@ pub struct PartiallyUninit {
     y: MaybeUninit<[u8; 10]>,
 }
 
-// CHECK: [[FULLY_UNINIT:@[0-9]+]] = private unnamed_addr constant <{ [10 x i8] }> undef
+// CHECK: [[FULLY_UNINIT:@.*]] = private unnamed_addr constant <{ [10 x i8] }> undef
 
-// CHECK: [[PARTIALLY_UNINIT:@[0-9]+]] = private unnamed_addr constant <{ [4 x i8], [12 x i8] }> <{ [4 x i8] c"{{\\EF\\BE\\AD\\DE|\\DE\\AD\\BE\\EF}}", [12 x i8] undef }>, align 4
+// CHECK: [[PARTIALLY_UNINIT:@.*]] = private unnamed_addr constant <{ [4 x i8], [12 x i8] }> <{ [4 x i8] c"{{\\EF\\BE\\AD\\DE|\\DE\\AD\\BE\\EF}}", [12 x i8] undef }>, align 4
 
 // This shouldn't contain undef, since it contains more chunks
 // than the default value of uninit_const_chunk_threshold.
-// CHECK: [[UNINIT_PADDING_HUGE:@[0-9]+]] = private unnamed_addr constant <{ [32768 x i8] }> <{ [32768 x i8] c"{{.+}}" }>, align 4
+// CHECK: [[UNINIT_PADDING_HUGE:@.*]] = private unnamed_addr constant <{ [32768 x i8] }> <{ [32768 x i8] c"{{.+}}" }>, align 4
 
-// CHECK: [[FULLY_UNINIT_HUGE:@[0-9]+]] = private unnamed_addr constant <{ [16384 x i8] }> undef
+// CHECK: [[FULLY_UNINIT_HUGE:@.*]] = private unnamed_addr constant <{ [16384 x i8] }> undef
 
 // CHECK-LABEL: @fully_uninit
 #[no_mangle]
diff --git a/tests/run-make/pgo-embed-bc-lto/interesting.rs b/tests/run-make/pgo-embed-bc-lto/interesting.rs
new file mode 100644
index 00000000000..13105c17e12
--- /dev/null
+++ b/tests/run-make/pgo-embed-bc-lto/interesting.rs
@@ -0,0 +1,16 @@
+#![crate_name = "interesting"]
+#![crate_type = "rlib"]
+
+extern crate opaque;
+
+#[no_mangle]
+#[inline(never)]
+pub fn function_called_once() {
+    opaque::foo();
+}
+
+// CHECK-LABEL: @function_called_once
+// CHECK-SAME: !prof [[function_called_once_id:![0-9]+]] {
+// CHECK: "CG Profile"
+// CHECK-NOT: "CG Profile"
+// CHECK-DAG: [[function_called_once_id]] = !{!"function_entry_count", i64 1}
diff --git a/tests/run-make/pgo-embed-bc-lto/main.rs b/tests/run-make/pgo-embed-bc-lto/main.rs
new file mode 100644
index 00000000000..ce8747bef3c
--- /dev/null
+++ b/tests/run-make/pgo-embed-bc-lto/main.rs
@@ -0,0 +1,5 @@
+extern crate interesting;
+
+fn main() {
+    interesting::function_called_once();
+}
diff --git a/tests/run-make/pgo-embed-bc-lto/opaque.rs b/tests/run-make/pgo-embed-bc-lto/opaque.rs
new file mode 100644
index 00000000000..b4467dc7796
--- /dev/null
+++ b/tests/run-make/pgo-embed-bc-lto/opaque.rs
@@ -0,0 +1,5 @@
+#![crate_name = "opaque"]
+#![crate_type = "rlib"]
+
+#[inline(never)]
+pub fn foo() {}
diff --git a/tests/run-make/pgo-embed-bc-lto/rmake.rs b/tests/run-make/pgo-embed-bc-lto/rmake.rs
new file mode 100644
index 00000000000..b7eba0c68e3
--- /dev/null
+++ b/tests/run-make/pgo-embed-bc-lto/rmake.rs
@@ -0,0 +1,67 @@
+// This test case verifies that we successfully complete an LTO build with PGO
+// using the embedded bitcode.
+// It also ensures that the generated IR correctly includes the call results.
+
+//@ needs-profiler-runtime
+//@ ignore-cross-compile
+
+use std::path::Path;
+
+use run_make_support::{
+    has_extension, has_prefix, llvm_filecheck, llvm_profdata, rfs, run, rustc, shallow_find_files,
+};
+
+fn run_test(cg_units: usize) {
+    let path_prof_data_dir = Path::new("prof_data_dir");
+    if path_prof_data_dir.exists() {
+        rfs::remove_dir_all(path_prof_data_dir);
+    }
+    rfs::create_dir_all(&path_prof_data_dir);
+    let path_merged_profdata = path_prof_data_dir.join("merged.profdata");
+    rustc().input("opaque.rs").codegen_units(1).run();
+    rustc()
+        .input("interesting.rs")
+        .profile_generate(&path_prof_data_dir)
+        .opt()
+        .crate_type("lib,cdylib")
+        .codegen_units(cg_units)
+        .run();
+    rustc()
+        .input("main.rs")
+        .arg("-Clto=thin")
+        .opt()
+        .codegen_units(cg_units)
+        .profile_generate(&path_prof_data_dir)
+        .opt()
+        .run();
+    run("main");
+    llvm_profdata().merge().output(&path_merged_profdata).input(path_prof_data_dir).run();
+    rustc()
+        .input("interesting.rs")
+        .profile_use(&path_merged_profdata)
+        .opt()
+        .crate_type("lib,cdylib")
+        .codegen_units(cg_units)
+        .emit("link")
+        .run();
+    rustc()
+        .input("main.rs")
+        .arg("-Clto=thin")
+        .opt()
+        .codegen_units(cg_units)
+        .profile_use(&path_merged_profdata)
+        .emit("llvm-ir,link")
+        .opt()
+        .run();
+    let files = shallow_find_files(".", |path| {
+        has_prefix(path, "main.interesting.interesting") && has_extension(path, "ll")
+    });
+    assert_eq!(files.len(), 1);
+    let llvm_ir = &files[0];
+    llvm_filecheck().patterns("interesting.rs").stdin_buf(rfs::read(llvm_ir)).run();
+}
+
+fn main() {
+    run_test(1);
+    run_test(16);
+}