Diffstat (limited to 'compiler')
-rw-r--r--  compiler/rustc/Cargo.toml                                             |   1
-rw-r--r--  compiler/rustc_builtin_macros/Cargo.toml                              |   5
-rw-r--r--  compiler/rustc_builtin_macros/src/autodiff.rs                         |   3
-rw-r--r--  compiler/rustc_codegen_gcc/src/intrinsic/mod.rs                       |   9
-rw-r--r--  compiler/rustc_codegen_llvm/Cargo.toml                                |   1
-rw-r--r--  compiler/rustc_codegen_llvm/src/abi.rs                                |  11
-rw-r--r--  compiler/rustc_codegen_llvm/src/back/lto.rs                           |   2
-rw-r--r--  compiler/rustc_codegen_llvm/src/back/write.rs                         |   6
-rw-r--r--  compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs                    |   8
-rw-r--r--  compiler/rustc_const_eval/src/interpret/memory.rs                     |   4
-rw-r--r--  compiler/rustc_data_structures/src/lib.rs                             |   1
-rw-r--r--  compiler/rustc_data_structures/src/sorted_map.rs                      |  35
-rw-r--r--  compiler/rustc_data_structures/src/sorted_map/tests.rs                |  10
-rw-r--r--  compiler/rustc_driver_impl/Cargo.toml                                 |   1
-rw-r--r--  compiler/rustc_interface/Cargo.toml                                   |   1
-rw-r--r--  compiler/rustc_middle/src/mir/interpret/allocation.rs                 |   9
-rw-r--r--  compiler/rustc_middle/src/mir/interpret/allocation/provenance_map.rs | 111
17 files changed, 113 insertions(+), 105 deletions(-)
diff --git a/compiler/rustc/Cargo.toml b/compiler/rustc/Cargo.toml
index e3214d1ab9c..9ef8fa75062 100644
--- a/compiler/rustc/Cargo.toml
+++ b/compiler/rustc/Cargo.toml
@@ -30,6 +30,7 @@ features = ['unprefixed_malloc_on_supported_platforms']
 check_only = ['rustc_driver_impl/check_only']
 jemalloc = ['dep:tikv-jemalloc-sys']
 llvm = ['rustc_driver_impl/llvm']
+llvm_enzyme = ['rustc_driver_impl/llvm_enzyme']
 max_level_info = ['rustc_driver_impl/max_level_info']
 rustc_randomized_layouts = ['rustc_driver_impl/rustc_randomized_layouts']
 # tidy-alphabetical-end
diff --git a/compiler/rustc_builtin_macros/Cargo.toml b/compiler/rustc_builtin_macros/Cargo.toml
index e56b9e641a1..ce9a3ce3f24 100644
--- a/compiler/rustc_builtin_macros/Cargo.toml
+++ b/compiler/rustc_builtin_macros/Cargo.toml
@@ -33,3 +33,8 @@ smallvec = { version = "1.8.1", features = ["union", "may_dangle"] }
 thin-vec = "0.2.12"
 tracing = "0.1"
 # tidy-alphabetical-end
+
+[features]
+# tidy-alphabetical-start
+llvm_enzyme = []
+# tidy-alphabetical-end
diff --git a/compiler/rustc_builtin_macros/src/autodiff.rs b/compiler/rustc_builtin_macros/src/autodiff.rs
index 48d0795af5e..f4a923797e2 100644
--- a/compiler/rustc_builtin_macros/src/autodiff.rs
+++ b/compiler/rustc_builtin_macros/src/autodiff.rs
@@ -209,7 +209,8 @@ mod llvm_enzyme {
         mut item: Annotatable,
         mode: DiffMode,
     ) -> Vec<Annotatable> {
-        if cfg!(not(llvm_enzyme)) {
+        // FIXME(bjorn3) maybe have the backend directly tell if autodiff is supported?
+        if cfg!(not(feature = "llvm_enzyme")) {
             ecx.sess.dcx().emit_err(errors::AutoDiffSupportNotBuild { span: meta_item.span });
             return vec![item];
         }
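
This hunk, together with the `back/lto.rs`, `back/write.rs`, and `enzyme_ffi.rs` hunks below, switches the Enzyme gate from the bootstrap-provided `--cfg llvm_enzyme` to an ordinary Cargo feature, forwarded crate-to-crate through the `[features]` tables touched in this diff (rustc -> rustc_driver_impl -> rustc_interface -> rustc_builtin_macros / rustc_codegen_llvm). A minimal sketch of the pattern in a hypothetical standalone crate (not the rustc code itself):

    # Cargo.toml
    [features]
    llvm_enzyme = []    # off by default; enabled via `--features llvm_enzyme`

    // src/lib.rs
    pub fn autodiff_supported() -> bool {
        // True only when compiled with the feature enabled. Unlike a custom
        // --cfg, Cargo understands and propagates this through dependency
        // feature entries such as `rustc_driver_impl/llvm_enzyme`.
        cfg!(feature = "llvm_enzyme")
    }
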
diff --git a/compiler/rustc_codegen_gcc/src/intrinsic/mod.rs b/compiler/rustc_codegen_gcc/src/intrinsic/mod.rs
index eb0a5336a1f..84fa56cf903 100644
--- a/compiler/rustc_codegen_gcc/src/intrinsic/mod.rs
+++ b/compiler/rustc_codegen_gcc/src/intrinsic/mod.rs
@@ -730,7 +730,7 @@ impl<'gcc, 'tcx> ArgAbiExt<'gcc, 'tcx> for ArgAbi<'tcx, Ty<'tcx>> {
         if self.is_sized_indirect() {
             OperandValue::Ref(PlaceValue::new_sized(val, self.layout.align.abi)).store(bx, dst)
         } else if self.is_unsized_indirect() {
-            bug!("unsized `ArgAbi` must be handled through `store_fn_arg`");
+            bug!("unsized `ArgAbi` cannot be stored");
         } else if let PassMode::Cast { ref cast, .. } = self.mode {
             // FIXME(eddyb): Figure out when the simpler Store is safe, clang
             // uses it for i16 -> {i8, i8}, but not for i24 -> {i8, i8, i8}.
@@ -797,12 +797,7 @@ impl<'gcc, 'tcx> ArgAbiExt<'gcc, 'tcx> for ArgAbi<'tcx, Ty<'tcx>> {
                 OperandValue::Pair(next(), next()).store(bx, dst);
             }
             PassMode::Indirect { meta_attrs: Some(_), .. } => {
-                let place_val = PlaceValue {
-                    llval: next(),
-                    llextra: Some(next()),
-                    align: self.layout.align.abi,
-                };
-                OperandValue::Ref(place_val).store(bx, dst);
+                bug!("unsized `ArgAbi` cannot be stored");
             }
             PassMode::Direct(_)
             | PassMode::Indirect { meta_attrs: None, .. }
diff --git a/compiler/rustc_codegen_llvm/Cargo.toml b/compiler/rustc_codegen_llvm/Cargo.toml
index 2d11628250c..67bd1e59bb0 100644
--- a/compiler/rustc_codegen_llvm/Cargo.toml
+++ b/compiler/rustc_codegen_llvm/Cargo.toml
@@ -46,5 +46,6 @@ tracing = "0.1"
 [features]
 # tidy-alphabetical-start
 check_only = ["rustc_llvm/check_only"]
+llvm_enzyme = []
 # tidy-alphabetical-end
 
diff --git a/compiler/rustc_codegen_llvm/src/abi.rs b/compiler/rustc_codegen_llvm/src/abi.rs
index ac7583f5666..11be7041167 100644
--- a/compiler/rustc_codegen_llvm/src/abi.rs
+++ b/compiler/rustc_codegen_llvm/src/abi.rs
@@ -215,9 +215,9 @@ impl<'ll, 'tcx> ArgAbiExt<'ll, 'tcx> for ArgAbi<'tcx, Ty<'tcx>> {
                 let align = attrs.pointee_align.unwrap_or(self.layout.align.abi);
                 OperandValue::Ref(PlaceValue::new_sized(val, align)).store(bx, dst);
             }
-            // Unsized indirect qrguments
+            // Unsized indirect arguments cannot be stored
             PassMode::Indirect { attrs: _, meta_attrs: Some(_), on_stack: _ } => {
-                bug!("unsized `ArgAbi` must be handled through `store_fn_arg`");
+                bug!("unsized `ArgAbi` cannot be stored");
             }
             PassMode::Cast { cast, pad_i32: _ } => {
                 // The ABI mandates that the value is passed as a different struct representation.
@@ -272,12 +272,7 @@ impl<'ll, 'tcx> ArgAbiExt<'ll, 'tcx> for ArgAbi<'tcx, Ty<'tcx>> {
                 OperandValue::Pair(next(), next()).store(bx, dst);
             }
             PassMode::Indirect { attrs: _, meta_attrs: Some(_), on_stack: _ } => {
-                let place_val = PlaceValue {
-                    llval: next(),
-                    llextra: Some(next()),
-                    align: self.layout.align.abi,
-                };
-                OperandValue::Ref(place_val).store(bx, dst);
+                bug!("unsized `ArgAbi` cannot be stored");
             }
             PassMode::Direct(_)
             | PassMode::Indirect { attrs: _, meta_attrs: None, on_stack: _ }
diff --git a/compiler/rustc_codegen_llvm/src/back/lto.rs b/compiler/rustc_codegen_llvm/src/back/lto.rs
index f571716d9dd..78107d95e5a 100644
--- a/compiler/rustc_codegen_llvm/src/back/lto.rs
+++ b/compiler/rustc_codegen_llvm/src/back/lto.rs
@@ -617,7 +617,7 @@ pub(crate) fn run_pass_manager(
         crate::builder::gpu_offload::handle_gpu_code(cgcx, &cx);
     }
 
-    if cfg!(llvm_enzyme) && enable_ad && !thin {
+    if cfg!(feature = "llvm_enzyme") && enable_ad && !thin {
         let opt_stage = llvm::OptStage::FatLTO;
         let stage = write::AutodiffStage::PostAD;
         if !config.autodiff.contains(&config::AutoDiff::NoPostopt) {
diff --git a/compiler/rustc_codegen_llvm/src/back/write.rs b/compiler/rustc_codegen_llvm/src/back/write.rs
index 423f0da4878..bda81fbd19e 100644
--- a/compiler/rustc_codegen_llvm/src/back/write.rs
+++ b/compiler/rustc_codegen_llvm/src/back/write.rs
@@ -574,7 +574,8 @@ pub(crate) unsafe fn llvm_optimize(
     // FIXME(ZuseZ4): In a future update we could figure out how to only optimize individual functions getting
     // differentiated.
 
-    let consider_ad = cfg!(llvm_enzyme) && config.autodiff.contains(&config::AutoDiff::Enable);
+    let consider_ad =
+        cfg!(feature = "llvm_enzyme") && config.autodiff.contains(&config::AutoDiff::Enable);
     let run_enzyme = autodiff_stage == AutodiffStage::DuringAD;
     let print_before_enzyme = config.autodiff.contains(&config::AutoDiff::PrintModBefore);
     let print_after_enzyme = config.autodiff.contains(&config::AutoDiff::PrintModAfter);
@@ -740,7 +741,8 @@ pub(crate) fn optimize(
 
         // If we know that we will later run AD, then we disable vectorization and loop unrolling.
         // Otherwise we pretend AD is already done and run the normal opt pipeline (=PostAD).
-        let consider_ad = cfg!(llvm_enzyme) && config.autodiff.contains(&config::AutoDiff::Enable);
+        let consider_ad =
+            cfg!(feature = "llvm_enzyme") && config.autodiff.contains(&config::AutoDiff::Enable);
         let autodiff_stage = if consider_ad { AutodiffStage::PreAD } else { AutodiffStage::PostAD };
         // The embedded bitcode is used to run LTO/ThinLTO.
         // The bitcode obtained during the `codegen` phase is no longer suitable for performing LTO.
diff --git a/compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs
index 56d756e52cc..695435eb6da 100644
--- a/compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs
+++ b/compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs
@@ -59,10 +59,10 @@ pub(crate) enum LLVMRustVerifierFailureAction {
     LLVMReturnStatusAction = 2,
 }
 
-#[cfg(llvm_enzyme)]
+#[cfg(feature = "llvm_enzyme")]
 pub(crate) use self::Enzyme_AD::*;
 
-#[cfg(llvm_enzyme)]
+#[cfg(feature = "llvm_enzyme")]
 pub(crate) mod Enzyme_AD {
     use std::ffi::{CString, c_char};
 
@@ -134,10 +134,10 @@ pub(crate) mod Enzyme_AD {
     }
 }
 
-#[cfg(not(llvm_enzyme))]
+#[cfg(not(feature = "llvm_enzyme"))]
 pub(crate) use self::Fallback_AD::*;
 
-#[cfg(not(llvm_enzyme))]
+#[cfg(not(feature = "llvm_enzyme"))]
 pub(crate) mod Fallback_AD {
     #![allow(unused_variables)]
 
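
The `enzyme_ffi.rs` hunks keep the existing cfg-swapped module pattern and only change the predicate: two modules expose the same names, exactly one is compiled, and both are re-exported under a single path. In outline (hypothetical module names):

    #[cfg(feature = "llvm_enzyme")]
    mod real;                       // actual Enzyme FFI bindings
    #[cfg(feature = "llvm_enzyme")]
    pub(crate) use real::*;

    #[cfg(not(feature = "llvm_enzyme"))]
    mod fallback;                   // stubs that panic or no-op
    #[cfg(not(feature = "llvm_enzyme"))]
    pub(crate) use fallback::*;

Callers compile against the same names either way; only the implementation behind them changes.
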
diff --git a/compiler/rustc_const_eval/src/interpret/memory.rs b/compiler/rustc_const_eval/src/interpret/memory.rs
index ebcdb9461d0..323e1cefd58 100644
--- a/compiler/rustc_const_eval/src/interpret/memory.rs
+++ b/compiler/rustc_const_eval/src/interpret/memory.rs
@@ -1504,7 +1504,7 @@ impl<'tcx, M: Machine<'tcx>> InterpCx<'tcx, M> {
         // This will also error if copying partial provenance is not supported.
         let provenance = src_alloc
             .provenance()
-            .prepare_copy(src_range, dest_offset, num_copies, self)
+            .prepare_copy(src_range, self)
             .map_err(|e| e.to_interp_error(src_alloc_id))?;
         // Prepare a copy of the initialization mask.
         let init = src_alloc.init_mask().prepare_copy(src_range);
@@ -1590,7 +1590,7 @@ impl<'tcx, M: Machine<'tcx>> InterpCx<'tcx, M> {
             num_copies,
         );
         // copy the provenance to the destination
-        dest_alloc.provenance_apply_copy(provenance);
+        dest_alloc.provenance_apply_copy(provenance, alloc_range(dest_offset, size), num_copies);
 
         interp_ok(())
     }
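
These two call sites show the new division of labor: `prepare_copy` records provenance relative to the start of the source range, while the destination offset and repetition count are only supplied later to `provenance_apply_copy`. A standalone sketch of the idea on plain data (hypothetical `prepare`/`apply` helpers, not the interpreter's API):

    // Offsets are made range-relative once at prepare time...
    fn prepare(src: &[(u64, char)], start: u64, len: u64) -> Vec<(u64, char)> {
        src.iter()
            .filter(|&&(off, _)| off >= start && off < start + len)
            .map(|&(off, p)| (off - start, p))
            .collect()
    }

    // ...and shifted per destination and per repetition only at apply time,
    // so one prepared copy serves any destination and any repeat count.
    fn apply(copy: &[(u64, char)], dest: u64, size: u64, repeat: u64) -> Vec<(u64, char)> {
        (0..repeat)
            .flat_map(|rep| copy.iter().map(move |&(off, p)| (dest + rep * size + off, p)))
            .collect()
    }
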
diff --git a/compiler/rustc_data_structures/src/lib.rs b/compiler/rustc_data_structures/src/lib.rs
index 17da3ea83c8..e4e86bcc41a 100644
--- a/compiler/rustc_data_structures/src/lib.rs
+++ b/compiler/rustc_data_structures/src/lib.rs
@@ -34,6 +34,7 @@
 #![feature(sized_hierarchy)]
 #![feature(test)]
 #![feature(thread_id_value)]
+#![feature(trusted_len)]
 #![feature(type_alias_impl_trait)]
 #![feature(unwrap_infallible)]
 // tidy-alphabetical-end
diff --git a/compiler/rustc_data_structures/src/sorted_map.rs b/compiler/rustc_data_structures/src/sorted_map.rs
index c002d47815b..15e3e6ea4c3 100644
--- a/compiler/rustc_data_structures/src/sorted_map.rs
+++ b/compiler/rustc_data_structures/src/sorted_map.rs
@@ -1,6 +1,7 @@
 use std::borrow::Borrow;
 use std::cmp::Ordering;
 use std::fmt::Debug;
+use std::iter::TrustedLen;
 use std::mem;
 use std::ops::{Bound, Index, IndexMut, RangeBounds};
 
@@ -215,36 +216,40 @@ impl<K: Ord, V> SortedMap<K, V> {
     /// It is up to the caller to make sure that the elements are sorted by key
     /// and that there are no duplicates.
     #[inline]
-    pub fn insert_presorted(&mut self, elements: Vec<(K, V)>) {
-        if elements.is_empty() {
+    pub fn insert_presorted(
+        &mut self,
+        // We require `TrustedLen` to ensure that the `splice` below is actually efficient.
+        mut elements: impl Iterator<Item = (K, V)> + DoubleEndedIterator + TrustedLen,
+    ) {
+        let Some(first) = elements.next() else {
             return;
-        }
-
-        debug_assert!(elements.array_windows().all(|[fst, snd]| fst.0 < snd.0));
+        };
 
-        let start_index = self.lookup_index_for(&elements[0].0);
+        let start_index = self.lookup_index_for(&first.0);
 
         let elements = match start_index {
             Ok(index) => {
-                let mut elements = elements.into_iter();
-                self.data[index] = elements.next().unwrap();
-                elements
+                self.data[index] = first; // overwrite first element
+                elements.chain(None) // insert the rest below
             }
             Err(index) => {
-                if index == self.data.len() || elements.last().unwrap().0 < self.data[index].0 {
+                let last = elements.next_back();
+                if index == self.data.len()
+                    || last.as_ref().is_none_or(|l| l.0 < self.data[index].0)
+                {
                     // We can copy the whole range without having to mix with
                     // existing elements.
-                    self.data.splice(index..index, elements);
+                    self.data
+                        .splice(index..index, std::iter::once(first).chain(elements).chain(last));
                     return;
                 }
 
-                let mut elements = elements.into_iter();
-                self.data.insert(index, elements.next().unwrap());
-                elements
+                self.data.insert(index, first);
+                elements.chain(last) // insert the rest below
             }
         };
 
-        // Insert the rest
+        // Insert the rest. This is super inefficient since each insertion copies the entire tail.
         for (k, v) in elements {
             self.insert(k, v);
         }
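
Two details of this rewrite are worth spelling out. The `TrustedLen` bound lets the `splice` fast path reserve exactly the needed capacity and shift the vector's tail once instead of once per element, and `DoubleEndedIterator` is what allows the `next_back()` peek at the final key. The `chain(None)`/`chain(last)` pairing is a type-unification trick: both match arms then yield the same `Chain<I, std::option::IntoIter<(K, V)>>` type, so the single loop after the match can drain whichever arm ran. A minimal illustration of the latter (hypothetical helper):

    fn leftover<I: Iterator<Item = u32>>(it: I, last: Option<u32>) -> impl Iterator<Item = u32> {
        // `chain` accepts any IntoIterator with a matching item type; an
        // Option yields zero or one items, so `None` is a no-op and
        // `Some(x)` re-appends a previously popped element.
        it.chain(last)
    }
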
diff --git a/compiler/rustc_data_structures/src/sorted_map/tests.rs b/compiler/rustc_data_structures/src/sorted_map/tests.rs
index ea4d2f1feac..17d0d3cb170 100644
--- a/compiler/rustc_data_structures/src/sorted_map/tests.rs
+++ b/compiler/rustc_data_structures/src/sorted_map/tests.rs
@@ -171,7 +171,7 @@ fn test_insert_presorted_non_overlapping() {
     map.insert(2, 0);
     map.insert(8, 0);
 
-    map.insert_presorted(vec![(3, 0), (7, 0)]);
+    map.insert_presorted(vec![(3, 0), (7, 0)].into_iter());
 
     let expected = vec![2, 3, 7, 8];
     assert_eq!(keys(map), expected);
@@ -183,7 +183,7 @@ fn test_insert_presorted_first_elem_equal() {
     map.insert(2, 2);
     map.insert(8, 8);
 
-    map.insert_presorted(vec![(2, 0), (7, 7)]);
+    map.insert_presorted(vec![(2, 0), (7, 7)].into_iter());
 
     let expected = vec![(2, 0), (7, 7), (8, 8)];
     assert_eq!(elements(map), expected);
@@ -195,7 +195,7 @@ fn test_insert_presorted_last_elem_equal() {
     map.insert(2, 2);
     map.insert(8, 8);
 
-    map.insert_presorted(vec![(3, 3), (8, 0)]);
+    map.insert_presorted(vec![(3, 3), (8, 0)].into_iter());
 
     let expected = vec![(2, 2), (3, 3), (8, 0)];
     assert_eq!(elements(map), expected);
@@ -207,7 +207,7 @@ fn test_insert_presorted_shuffle() {
     map.insert(2, 2);
     map.insert(7, 7);
 
-    map.insert_presorted(vec![(1, 1), (3, 3), (8, 8)]);
+    map.insert_presorted(vec![(1, 1), (3, 3), (8, 8)].into_iter());
 
     let expected = vec![(1, 1), (2, 2), (3, 3), (7, 7), (8, 8)];
     assert_eq!(elements(map), expected);
@@ -219,7 +219,7 @@ fn test_insert_presorted_at_end() {
     map.insert(1, 1);
     map.insert(2, 2);
 
-    map.insert_presorted(vec![(3, 3), (8, 8)]);
+    map.insert_presorted(vec![(3, 3), (8, 8)].into_iter());
 
     let expected = vec![(1, 1), (2, 2), (3, 3), (8, 8)];
     assert_eq!(elements(map), expected);
diff --git a/compiler/rustc_driver_impl/Cargo.toml b/compiler/rustc_driver_impl/Cargo.toml
index ae1dbd2cf51..46efa50cff3 100644
--- a/compiler/rustc_driver_impl/Cargo.toml
+++ b/compiler/rustc_driver_impl/Cargo.toml
@@ -74,6 +74,7 @@ ctrlc = "3.4.4"
 # tidy-alphabetical-start
 check_only = ['rustc_interface/check_only']
 llvm = ['rustc_interface/llvm']
+llvm_enzyme = ['rustc_interface/llvm_enzyme']
 max_level_info = ['rustc_log/max_level_info']
 rustc_randomized_layouts = [
     'rustc_index/rustc_randomized_layouts',
diff --git a/compiler/rustc_interface/Cargo.toml b/compiler/rustc_interface/Cargo.toml
index 473ac5e0cea..f0836c47740 100644
--- a/compiler/rustc_interface/Cargo.toml
+++ b/compiler/rustc_interface/Cargo.toml
@@ -58,4 +58,5 @@ rustc_abi = { path = "../rustc_abi" }
 # tidy-alphabetical-start
 check_only = ['rustc_codegen_llvm?/check_only']
 llvm = ['dep:rustc_codegen_llvm']
+llvm_enzyme = ['rustc_builtin_macros/llvm_enzyme', 'rustc_codegen_llvm/llvm_enzyme']
 # tidy-alphabetical-end
diff --git a/compiler/rustc_middle/src/mir/interpret/allocation.rs b/compiler/rustc_middle/src/mir/interpret/allocation.rs
index 67962813ae4..8e603ce1b91 100644
--- a/compiler/rustc_middle/src/mir/interpret/allocation.rs
+++ b/compiler/rustc_middle/src/mir/interpret/allocation.rs
@@ -849,8 +849,13 @@ impl<Prov: Provenance, Extra, Bytes: AllocBytes> Allocation<Prov, Extra, Bytes>
     ///
     /// This is dangerous to use as it can violate internal `Allocation` invariants!
     /// It only exists to support an efficient implementation of `mem_copy_repeatedly`.
-    pub fn provenance_apply_copy(&mut self, copy: ProvenanceCopy<Prov>) {
-        self.provenance.apply_copy(copy)
+    pub fn provenance_apply_copy(
+        &mut self,
+        copy: ProvenanceCopy<Prov>,
+        range: AllocRange,
+        repeat: u64,
+    ) {
+        self.provenance.apply_copy(copy, range, repeat)
     }
 
     /// Applies a previously prepared copy of the init mask.
diff --git a/compiler/rustc_middle/src/mir/interpret/allocation/provenance_map.rs b/compiler/rustc_middle/src/mir/interpret/allocation/provenance_map.rs
index 720e58d7aa0..67baf63bbfa 100644
--- a/compiler/rustc_middle/src/mir/interpret/allocation/provenance_map.rs
+++ b/compiler/rustc_middle/src/mir/interpret/allocation/provenance_map.rs
@@ -278,90 +278,78 @@ impl<Prov: Provenance> ProvenanceMap<Prov> {
 
 /// A partial, owned list of provenance to transfer into another allocation.
 ///
-/// Offsets are already adjusted to the destination allocation.
+/// Offsets are relative to the beginning of the copied range.
 pub struct ProvenanceCopy<Prov> {
-    dest_ptrs: Option<Box<[(Size, Prov)]>>,
-    dest_bytes: Option<Box<[(Size, (Prov, u8))]>>,
+    ptrs: Box<[(Size, Prov)]>,
+    bytes: Box<[(Size, (Prov, u8))]>,
 }
 
 impl<Prov: Provenance> ProvenanceMap<Prov> {
     pub fn prepare_copy(
         &self,
-        src: AllocRange,
-        dest: Size,
-        count: u64,
+        range: AllocRange,
         cx: &impl HasDataLayout,
     ) -> AllocResult<ProvenanceCopy<Prov>> {
-        let shift_offset = move |idx, offset| {
-            // compute offset for current repetition
-            let dest_offset = dest + src.size * idx; // `Size` operations
-            // shift offsets from source allocation to destination allocation
-            (offset - src.start) + dest_offset // `Size` operations
-        };
+        let shift_offset = move |offset| offset - range.start;
         let ptr_size = cx.data_layout().pointer_size();
 
         // # Pointer-sized provenances
         // Get the provenances that are entirely within this range.
         // (Different from `range_get_ptrs` which asks if they overlap the range.)
         // Only makes sense if we are copying at least one pointer worth of bytes.
-        let mut dest_ptrs_box = None;
-        if src.size >= ptr_size {
-            let adjusted_end = Size::from_bytes(src.end().bytes() - (ptr_size.bytes() - 1));
-            let ptrs = self.ptrs.range(src.start..adjusted_end);
-            // If `count` is large, this is rather wasteful -- we are allocating a big array here, which
-            // is mostly filled with redundant information since it's just N copies of the same `Prov`s
-            // at slightly adjusted offsets. The reason we do this is so that in `mark_provenance_range`
-            // we can use `insert_presorted`. That wouldn't work with an `Iterator` that just produces
-            // the right sequence of provenance for all N copies.
-            // Basically, this large array would have to be created anyway in the target allocation.
-            let mut dest_ptrs = Vec::with_capacity(ptrs.len() * (count as usize));
-            for i in 0..count {
-                dest_ptrs
-                    .extend(ptrs.iter().map(|&(offset, reloc)| (shift_offset(i, offset), reloc)));
-            }
-            debug_assert_eq!(dest_ptrs.len(), dest_ptrs.capacity());
-            dest_ptrs_box = Some(dest_ptrs.into_boxed_slice());
+        let mut ptrs_box: Box<[_]> = Box::new([]);
+        if range.size >= ptr_size {
+            let adjusted_end = Size::from_bytes(range.end().bytes() - (ptr_size.bytes() - 1));
+            let ptrs = self.ptrs.range(range.start..adjusted_end);
+            ptrs_box = ptrs.iter().map(|&(offset, reloc)| (shift_offset(offset), reloc)).collect();
         };
 
         // # Byte-sized provenances
         // This includes the existing bytewise provenance in the range, and ptr provenance
         // that overlaps with the begin/end of the range.
-        let mut dest_bytes_box = None;
-        let begin_overlap = self.range_ptrs_get(alloc_range(src.start, Size::ZERO), cx).first();
-        let end_overlap = self.range_ptrs_get(alloc_range(src.end(), Size::ZERO), cx).first();
+        let mut bytes_box: Box<[_]> = Box::new([]);
+        let begin_overlap = self.range_ptrs_get(alloc_range(range.start, Size::ZERO), cx).first();
+        let end_overlap = self.range_ptrs_get(alloc_range(range.end(), Size::ZERO), cx).first();
         // We only need to go here if there is some overlap or some bytewise provenance.
         if begin_overlap.is_some() || end_overlap.is_some() || self.bytes.is_some() {
             let mut bytes: Vec<(Size, (Prov, u8))> = Vec::new();
             // First, if there is a part of a pointer at the start, add that.
             if let Some(entry) = begin_overlap {
                 trace!("start overlapping entry: {entry:?}");
-                // For really small copies, make sure we don't run off the end of the `src` range.
-                let entry_end = cmp::min(entry.0 + ptr_size, src.end());
-                for offset in src.start..entry_end {
-                    bytes.push((offset, (entry.1, (offset - entry.0).bytes() as u8)));
+                // For really small copies, make sure we don't run off the end of the range.
+                let entry_end = cmp::min(entry.0 + ptr_size, range.end());
+                for offset in range.start..entry_end {
+                    bytes.push((shift_offset(offset), (entry.1, (offset - entry.0).bytes() as u8)));
                 }
             } else {
                 trace!("no start overlapping entry");
             }
 
             // Then the main part, bytewise provenance from `self.bytes`.
-            bytes.extend(self.range_bytes_get(src));
+            bytes.extend(
+                self.range_bytes_get(range)
+                    .iter()
+                    .map(|&(offset, reloc)| (shift_offset(offset), reloc)),
+            );
 
             // And finally possibly parts of a pointer at the end.
             if let Some(entry) = end_overlap {
                 trace!("end overlapping entry: {entry:?}");
-                // For really small copies, make sure we don't start before `src` does.
-                let entry_start = cmp::max(entry.0, src.start);
-                for offset in entry_start..src.end() {
+                // For really small copies, make sure we don't start before `range` does.
+                let entry_start = cmp::max(entry.0, range.start);
+                for offset in entry_start..range.end() {
                     if bytes.last().is_none_or(|bytes_entry| bytes_entry.0 < offset) {
                         // The last entry, if it exists, has a lower offset than us, so we
                         // can add it at the end and remain sorted.
-                        bytes.push((offset, (entry.1, (offset - entry.0).bytes() as u8)));
+                        bytes.push((
+                            shift_offset(offset),
+                            (entry.1, (offset - entry.0).bytes() as u8),
+                        ));
                     } else {
                         // There already is an entry for this offset in there! This can happen when the
                         // start and end range checks actually end up hitting the same pointer, so we
                         // already added this in the "pointer at the start" part above.
-                        assert!(entry.0 <= src.start);
+                        assert!(entry.0 <= range.start);
                     }
                 }
             } else {
@@ -372,33 +360,40 @@ impl<Prov: Provenance> ProvenanceMap<Prov> {
             if !bytes.is_empty() && !Prov::OFFSET_IS_ADDR {
                 // FIXME(#146291): We need to ensure that we don't mix different pointers with
                 // the same provenance.
-                return Err(AllocError::ReadPartialPointer(src.start));
+                return Err(AllocError::ReadPartialPointer(range.start));
             }
 
             // And again a buffer for the new list on the target side.
-            let mut dest_bytes = Vec::with_capacity(bytes.len() * (count as usize));
-            for i in 0..count {
-                dest_bytes
-                    .extend(bytes.iter().map(|&(offset, reloc)| (shift_offset(i, offset), reloc)));
-            }
-            debug_assert_eq!(dest_bytes.len(), dest_bytes.capacity());
-            dest_bytes_box = Some(dest_bytes.into_boxed_slice());
+            bytes_box = bytes.into_boxed_slice();
         }
 
-        Ok(ProvenanceCopy { dest_ptrs: dest_ptrs_box, dest_bytes: dest_bytes_box })
+        Ok(ProvenanceCopy { ptrs: ptrs_box, bytes: bytes_box })
     }
 
     /// Applies a provenance copy.
     /// The affected range, as defined in the parameters to `prepare_copy` is expected
     /// to be clear of provenance.
-    pub fn apply_copy(&mut self, copy: ProvenanceCopy<Prov>) {
-        if let Some(dest_ptrs) = copy.dest_ptrs {
-            self.ptrs.insert_presorted(dest_ptrs.into());
+    pub fn apply_copy(&mut self, copy: ProvenanceCopy<Prov>, range: AllocRange, repeat: u64) {
+        let shift_offset = |idx: u64, offset: Size| offset + range.start + idx * range.size;
+        if !copy.ptrs.is_empty() {
+            // We want to call `insert_presorted` only once so that, if possible, the entries
+            // after the range we insert are moved back only once.
+            let chunk_len = copy.ptrs.len() as u64;
+            self.ptrs.insert_presorted((0..chunk_len * repeat).map(|i| {
+                let chunk = i / chunk_len;
+                let (offset, reloc) = copy.ptrs[(i % chunk_len) as usize];
+                (shift_offset(chunk, offset), reloc)
+            }));
         }
-        if let Some(dest_bytes) = copy.dest_bytes
-            && !dest_bytes.is_empty()
-        {
-            self.bytes.get_or_insert_with(Box::default).insert_presorted(dest_bytes.into());
+        if !copy.bytes.is_empty() {
+            let chunk_len = copy.bytes.len() as u64;
+            self.bytes.get_or_insert_with(Box::default).insert_presorted(
+                (0..chunk_len * repeat).map(|i| {
+                    let chunk = i / chunk_len;
+                    let (offset, reloc) = copy.bytes[(i % chunk_len) as usize];
+                    (shift_offset(chunk, offset), reloc)
+                }),
+            );
         }
     }
 }
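
Finally, `apply_copy` regenerates the `repeat` copies lazily instead of having `prepare_copy` materialize them: a flat index over `0..chunk_len * repeat` is split by division and remainder into a (repetition, element-within-chunk) pair, producing one long, still-sorted stream for a single `insert_presorted` call. `Range<u64>` and its `map` adapter are both `TrustedLen` and `DoubleEndedIterator`, so the stream satisfies the new bound. The same scheme in isolation (hypothetical `repeated` helper):

    // Yield `repeat` shifted copies of `chunk` without allocating them.
    // Output stays sorted provided every offset in `chunk` is below `stride`.
    fn repeated(
        chunk: &[(u64, char)],
        stride: u64,
        repeat: u64,
    ) -> impl Iterator<Item = (u64, char)> + '_ {
        let chunk_len = chunk.len() as u64;
        (0..chunk_len * repeat).map(move |i| {
            let (offset, p) = chunk[(i % chunk_len) as usize];
            (i / chunk_len * stride + offset, p)
        })
    }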