about summary refs log tree commit diff
path: root/compiler/rustc_mir_transform/src
diff options
context:
space:
mode:
author bors <bors@rust-lang.org> 2025-01-10 19:04:26 +0000
committer bors <bors@rust-lang.org> 2025-01-10 19:04:26 +0000
commit b1a7dfb91106018f47ed9dc9b27aee1977682868 (patch)
tree c2ac8174b0be897938a5a801c58a97a87eee2b1d /compiler/rustc_mir_transform/src
parent 336209eef13882bd1e211b24779584cb7ef911eb (diff)
parent cc9a9ecccb2fbbb7687ed076e06411c2f4ac5fbb (diff)
download rust-b1a7dfb91106018f47ed9dc9b27aee1977682868.tar.gz
rust-b1a7dfb91106018f47ed9dc9b27aee1977682868.zip
Auto merge of #134082 - davidtwco:forced-inlining, r=saethlin
mir_transform: implement `#[rustc_force_inline]`

Adds `#[rustc_force_inline]`, which is similar to always inlining (`#[inline(always)]`) but reports an error if inlining was not possible.

- `#[rustc_force_inline]` can only be applied to free functions to guarantee that the MIR inliner will be able to resolve calls.
- `rustc_mir_transform::inline::Inline` is refactored into two passes (`Inline` and `ForceInline`), sharing the vast majority of the implementation.
  - `rustc_mir_transform::inline::ForceInline` can't be disabled so annotated items are always inlined.
  - `rustc_mir_transform::inline::ForceInline` runs regardless of optimisation level.
- `#[rustc_force_inline]` won't inline unless target features match, as with normal inlining.
- MIR validation will ICE if a `#[rustc_force_inline]` function isn't inlined, to guarantee that it will never be codegened independently. As a further guarantee, monomorphisation collection will always decide that `#[rustc_force_inline]` functions cannot be codegened locally.
- Like other intrinsics, `#[rustc_force_inline]` annotated functions cannot be cast to function pointers.
- As with other rustc attrs, this cannot be used by users, just within the compiler and standard library.
- This is only implemented within rustc, so should avoid any limitations of LLVM's inlining.

It is intended that this attribute be used with intrinsics that must be inlined for security reasons. For example, pointer authentication intrinsics would allow Rust users to make use of pointer authentication instructions, but if these intrinsic functions were in the binary then they could be used as gadgets with ROP attacks, defeating the point of introducing them. We don't have any intrinsics like this today, but I expect to upstream some once a force inlining mechanism such as this is available.

cc rust-lang/rust#131687 rust-lang/rfcs#3711 - this approach should resolve the concerns from these previous attempts

r? `@saethlin`
Diffstat (limited to 'compiler/rustc_mir_transform/src')
-rw-r--r-- compiler/rustc_mir_transform/src/cross_crate_inline.rs 7
-rw-r--r-- compiler/rustc_mir_transform/src/errors.rs 28
-rw-r--r-- compiler/rustc_mir_transform/src/inline.rs 1489
-rw-r--r-- compiler/rustc_mir_transform/src/lib.rs 8
-rw-r--r-- compiler/rustc_mir_transform/src/pass_manager.rs 10
-rw-r--r-- compiler/rustc_mir_transform/src/shim.rs 4
-rw-r--r-- compiler/rustc_mir_transform/src/validate.rs 11
7 files changed, 921 insertions, 636 deletions
diff --git a/compiler/rustc_mir_transform/src/cross_crate_inline.rs b/compiler/rustc_mir_transform/src/cross_crate_inline.rs
index e1f1dd83f0d..8fce856687c 100644
--- a/compiler/rustc_mir_transform/src/cross_crate_inline.rs
+++ b/compiler/rustc_mir_transform/src/cross_crate_inline.rs
@@ -46,7 +46,7 @@ fn cross_crate_inlinable(tcx: TyCtxt<'_>, def_id: LocalDefId) -> bool {
     // #[inline(never)] to force code generation.
     match codegen_fn_attrs.inline {
         InlineAttr::Never => return false,
-        InlineAttr::Hint | InlineAttr::Always => return true,
+        InlineAttr::Hint | InlineAttr::Always | InlineAttr::Force { .. } => return true,
         _ => {}
     }
 
@@ -69,8 +69,9 @@ fn cross_crate_inlinable(tcx: TyCtxt<'_>, def_id: LocalDefId) -> bool {
     // Don't do any inference if codegen optimizations are disabled and also MIR inlining is not
     // enabled. This ensures that we do inference even if someone only passes -Zinline-mir,
     // which is less confusing than having to also enable -Copt-level=1.
-    if matches!(tcx.sess.opts.optimize, OptLevel::No) && !pm::should_run_pass(tcx, &inline::Inline)
-    {
+    let inliner_will_run = pm::should_run_pass(tcx, &inline::Inline)
+        || inline::ForceInline::should_run_pass_for_callee(tcx, def_id.to_def_id());
+    if matches!(tcx.sess.opts.optimize, OptLevel::No) && !inliner_will_run {
         return false;
     }
 
diff --git a/compiler/rustc_mir_transform/src/errors.rs b/compiler/rustc_mir_transform/src/errors.rs
index 2d9eeddea2e..015633d145f 100644
--- a/compiler/rustc_mir_transform/src/errors.rs
+++ b/compiler/rustc_mir_transform/src/errors.rs
@@ -4,8 +4,8 @@ use rustc_macros::{Diagnostic, LintDiagnostic, Subdiagnostic};
 use rustc_middle::mir::AssertKind;
 use rustc_middle::ty::TyCtxt;
 use rustc_session::lint::{self, Lint};
-use rustc_span::Span;
 use rustc_span::def_id::DefId;
+use rustc_span::{Span, Symbol};
 
 use crate::fluent_generated as fluent;
 
@@ -142,3 +142,29 @@ pub(crate) struct MustNotSuspendReason {
 #[note(mir_transform_note2)]
 #[help]
 pub(crate) struct UndefinedTransmute;
+
+#[derive(Diagnostic)]
+#[diag(mir_transform_force_inline)]
+#[note]
+pub(crate) struct ForceInlineFailure {
+    #[label(mir_transform_caller)]
+    pub caller_span: Span,
+    #[label(mir_transform_callee)]
+    pub callee_span: Span,
+    #[label(mir_transform_attr)]
+    pub attr_span: Span,
+    #[primary_span]
+    #[label(mir_transform_call)]
+    pub call_span: Span,
+    pub callee: String,
+    pub caller: String,
+    pub reason: &'static str,
+    #[subdiagnostic]
+    pub justification: Option<ForceInlineJustification>,
+}
+
+#[derive(Subdiagnostic)]
+#[note(mir_transform_force_inline_justification)]
+pub(crate) struct ForceInlineJustification {
+    pub sym: Symbol,
+}
diff --git a/compiler/rustc_mir_transform/src/inline.rs b/compiler/rustc_mir_transform/src/inline.rs
index 339acbad6b9..e4daa2b9757 100644
--- a/compiler/rustc_mir_transform/src/inline.rs
+++ b/compiler/rustc_mir_transform/src/inline.rs
@@ -10,13 +10,12 @@ use rustc_hir::def_id::DefId;
 use rustc_index::Idx;
 use rustc_index::bit_set::BitSet;
 use rustc_middle::bug;
-use rustc_middle::middle::codegen_fn_attrs::{CodegenFnAttrFlags, CodegenFnAttrs};
+use rustc_middle::middle::codegen_fn_attrs::CodegenFnAttrs;
 use rustc_middle::mir::visit::*;
 use rustc_middle::mir::*;
 use rustc_middle::ty::{self, Instance, InstanceKind, Ty, TyCtxt, TypeFlags, TypeVisitableExt};
 use rustc_session::config::{DebugInfo, OptLevel};
 use rustc_span::source_map::Spanned;
-use rustc_span::sym;
 use tracing::{debug, instrument, trace, trace_span};
 
 use crate::cost_checker::CostChecker;
@@ -29,10 +28,6 @@ pub(crate) mod cycle;
 
 const TOP_DOWN_DEPTH_LIMIT: usize = 5;
 
-// Made public so that `mir_drops_elaborated_and_const_checked` can be overridden
-// by custom rustc drivers, running all the steps by themselves. See #114628.
-pub struct Inline;
-
 #[derive(Clone, Debug)]
 struct CallSite<'tcx> {
     callee: Instance<'tcx>,
@@ -41,14 +36,12 @@ struct CallSite<'tcx> {
     source_info: SourceInfo,
 }
 
+// Made public so that `mir_drops_elaborated_and_const_checked` can be overridden
+// by custom rustc drivers, running all the steps by themselves. See #114628.
+pub struct Inline;
+
 impl<'tcx> crate::MirPass<'tcx> for Inline {
     fn is_enabled(&self, sess: &rustc_session::Session) -> bool {
-        // FIXME(#127234): Coverage instrumentation currently doesn't handle inlined
-        // MIR correctly when Modified Condition/Decision Coverage is enabled.
-        if sess.instrument_coverage_mcdc() {
-            return false;
-        }
-
         if let Some(enabled) = sess.opts.unstable_opts.inline_mir {
             return enabled;
         }
@@ -67,7 +60,7 @@ impl<'tcx> crate::MirPass<'tcx> for Inline {
     fn run_pass(&self, tcx: TyCtxt<'tcx>, body: &mut Body<'tcx>) {
         let span = trace_span!("inline", body = %tcx.def_path_str(body.source.def_id()));
         let _guard = span.enter();
-        if inline(tcx, body) {
+        if inline::<NormalInliner<'tcx>>(tcx, body) {
             debug!("running simplify cfg on {:?}", body.source);
             simplify_cfg(body);
             deref_finder(tcx, body);
@@ -75,47 +68,83 @@ impl<'tcx> crate::MirPass<'tcx> for Inline {
     }
 }
 
-fn inline<'tcx>(tcx: TyCtxt<'tcx>, body: &mut Body<'tcx>) -> bool {
-    let def_id = body.source.def_id().expect_local();
+pub struct ForceInline;
 
-    // Only do inlining into fn bodies.
-    if !tcx.hir().body_owner_kind(def_id).is_fn_or_closure() {
-        return false;
+impl ForceInline {
+    pub fn should_run_pass_for_callee<'tcx>(tcx: TyCtxt<'tcx>, def_id: DefId) -> bool {
+        matches!(tcx.codegen_fn_attrs(def_id).inline, InlineAttr::Force { .. })
     }
-    if body.source.promoted.is_some() {
-        return false;
+}
+
+impl<'tcx> crate::MirPass<'tcx> for ForceInline {
+    fn is_enabled(&self, _: &rustc_session::Session) -> bool {
+        true
     }
-    // Avoid inlining into coroutines, since their `optimized_mir` is used for layout computation,
-    // which can create a cycle, even when no attempt is made to inline the function in the other
-    // direction.
-    if body.coroutine.is_some() {
-        return false;
+
+    fn can_be_overridden(&self) -> bool {
+        false
+    }
+
+    fn run_pass(&self, tcx: TyCtxt<'tcx>, body: &mut Body<'tcx>) {
+        let span = trace_span!("force_inline", body = %tcx.def_path_str(body.source.def_id()));
+        let _guard = span.enter();
+        if inline::<ForceInliner<'tcx>>(tcx, body) {
+            debug!("running simplify cfg on {:?}", body.source);
+            simplify_cfg(body);
+            deref_finder(tcx, body);
+        }
     }
+}
 
-    let typing_env = body.typing_env(tcx);
-    let codegen_fn_attrs = tcx.codegen_fn_attrs(def_id);
+trait Inliner<'tcx> {
+    fn new(tcx: TyCtxt<'tcx>, def_id: DefId, body: &Body<'tcx>) -> Self;
 
-    let mut this = Inliner {
-        tcx,
-        typing_env,
-        codegen_fn_attrs,
-        history: Vec::new(),
-        changed: false,
-        caller_is_inline_forwarder: matches!(
-            codegen_fn_attrs.inline,
-            InlineAttr::Hint | InlineAttr::Always
-        ) && body_is_forwarder(body),
-    };
-    let blocks = START_BLOCK..body.basic_blocks.next_index();
-    this.process_blocks(body, blocks);
-    this.changed
+    fn tcx(&self) -> TyCtxt<'tcx>;
+    fn typing_env(&self) -> ty::TypingEnv<'tcx>;
+    fn history(&self) -> &[DefId];
+    fn caller_def_id(&self) -> DefId;
+
+    /// Has the caller body been changed?
+    fn changed(self) -> bool;
+
+    /// Should inlining happen for a given callee?
+    fn should_inline_for_callee(&self, def_id: DefId) -> bool;
+
+    fn check_caller_mir_body(&self, body: &Body<'tcx>) -> bool;
+
+    /// Returns inlining decision that is based on the examination of callee MIR body.
+    /// Assumes that codegen attributes have been checked for compatibility already.
+    fn check_callee_mir_body(
+        &self,
+        callsite: &CallSite<'tcx>,
+        callee_body: &Body<'tcx>,
+        callee_attrs: &CodegenFnAttrs,
+    ) -> Result<(), &'static str>;
+
+    // How many callsites in a body are we allowed to inline? We need to limit this in order
+    // to prevent super-linear growth in MIR size.
+    fn inline_limit_for_block(&self) -> Option<usize>;
+
+    /// Called when inlining succeeds.
+    fn on_inline_success(
+        &mut self,
+        callsite: &CallSite<'tcx>,
+        caller_body: &mut Body<'tcx>,
+        new_blocks: std::ops::Range<BasicBlock>,
+    );
+
+    /// Called when inlining failed or was not performed.
+    fn on_inline_failure(&self, callsite: &CallSite<'tcx>, reason: &'static str);
+
+    /// Called when the inline limit for a body is reached.
+    fn on_inline_limit_reached(&self) -> bool;
 }
 
-struct Inliner<'tcx> {
+struct ForceInliner<'tcx> {
     tcx: TyCtxt<'tcx>,
     typing_env: ty::TypingEnv<'tcx>,
-    /// Caller codegen attributes.
-    codegen_fn_attrs: &'tcx CodegenFnAttrs,
+    /// `DefId` of caller.
+    def_id: DefId,
     /// Stack of inlined instances.
     /// We only check the `DefId` and not the args because we want to
     /// avoid inlining cases of polymorphic recursion.
@@ -124,366 +153,203 @@ struct Inliner<'tcx> {
     history: Vec<DefId>,
     /// Indicates that the caller body has been modified.
     changed: bool,
-    /// Indicates that the caller is #[inline] and just calls another function,
-    /// and thus we can inline less into it as it'll be inlined itself.
-    caller_is_inline_forwarder: bool,
 }
 
-impl<'tcx> Inliner<'tcx> {
-    fn process_blocks(&mut self, caller_body: &mut Body<'tcx>, blocks: Range<BasicBlock>) {
-        // How many callsites in this body are we allowed to inline? We need to limit this in order
-        // to prevent super-linear growth in MIR size
-        let inline_limit = match self.history.len() {
-            0 => usize::MAX,
-            1..=TOP_DOWN_DEPTH_LIMIT => 1,
-            _ => return,
-        };
-        let mut inlined_count = 0;
-        for bb in blocks {
-            let bb_data = &caller_body[bb];
-            if bb_data.is_cleanup {
-                continue;
-            }
-
-            let Some(callsite) = self.resolve_callsite(caller_body, bb, bb_data) else {
-                continue;
-            };
-
-            let span = trace_span!("process_blocks", %callsite.callee, ?bb);
-            let _guard = span.enter();
-
-            match self.try_inlining(caller_body, &callsite) {
-                Err(reason) => {
-                    debug!("not-inlined {} [{}]", callsite.callee, reason);
-                }
-                Ok(new_blocks) => {
-                    debug!("inlined {}", callsite.callee);
-                    self.changed = true;
-
-                    self.history.push(callsite.callee.def_id());
-                    self.process_blocks(caller_body, new_blocks);
-                    self.history.pop();
-
-                    inlined_count += 1;
-                    if inlined_count == inline_limit {
-                        debug!("inline count reached");
-                        return;
-                    }
-                }
-            }
-        }
+impl<'tcx> Inliner<'tcx> for ForceInliner<'tcx> {
+    fn new(tcx: TyCtxt<'tcx>, def_id: DefId, body: &Body<'tcx>) -> Self {
+        Self { tcx, typing_env: body.typing_env(tcx), def_id, history: Vec::new(), changed: false }
     }
 
-    /// Attempts to inline a callsite into the caller body. When successful returns basic blocks
-    /// containing the inlined body. Otherwise returns an error describing why inlining didn't take
-    /// place.
-    fn try_inlining(
-        &self,
-        caller_body: &mut Body<'tcx>,
-        callsite: &CallSite<'tcx>,
-    ) -> Result<std::ops::Range<BasicBlock>, &'static str> {
-        self.check_mir_is_available(caller_body, callsite.callee)?;
-
-        let callee_attrs = self.tcx.codegen_fn_attrs(callsite.callee.def_id());
-        let cross_crate_inlinable = self.tcx.cross_crate_inlinable(callsite.callee.def_id());
-        self.check_codegen_attributes(callsite, callee_attrs, cross_crate_inlinable)?;
-
-        // Intrinsic fallback bodies are automatically made cross-crate inlineable,
-        // but at this stage we don't know whether codegen knows the intrinsic,
-        // so just conservatively don't inline it. This also ensures that we do not
-        // accidentally inline the body of an intrinsic that *must* be overridden.
-        if self.tcx.has_attr(callsite.callee.def_id(), sym::rustc_intrinsic) {
-            return Err("Callee is an intrinsic, do not inline fallback bodies");
-        }
-
-        let terminator = caller_body[callsite.block].terminator.as_ref().unwrap();
-        let TerminatorKind::Call { args, destination, .. } = &terminator.kind else { bug!() };
-        let destination_ty = destination.ty(&caller_body.local_decls, self.tcx).ty;
-        for arg in args {
-            if !arg.node.ty(&caller_body.local_decls, self.tcx).is_sized(self.tcx, self.typing_env)
-            {
-                // We do not allow inlining functions with unsized params. Inlining these functions
-                // could create unsized locals, which are unsound and being phased out.
-                return Err("Call has unsized argument");
-            }
-        }
-
-        let callee_body = try_instance_mir(self.tcx, callsite.callee.def)?;
-        self.check_mir_body(callsite, callee_body, callee_attrs, cross_crate_inlinable)?;
-
-        let Ok(callee_body) = callsite.callee.try_instantiate_mir_and_normalize_erasing_regions(
-            self.tcx,
-            self.typing_env,
-            ty::EarlyBinder::bind(callee_body.clone()),
-        ) else {
-            return Err("failed to normalize callee body");
-        };
-
-        // Normally, this shouldn't be required, but trait normalization failure can create a
-        // validation ICE.
-        if !validate_types(self.tcx, self.typing_env, &callee_body, &caller_body).is_empty() {
-            return Err("failed to validate callee body");
-        }
-
-        // Check call signature compatibility.
-        // Normally, this shouldn't be required, but trait normalization failure can create a
-        // validation ICE.
-        let output_type = callee_body.return_ty();
-        if !util::sub_types(self.tcx, self.typing_env, output_type, destination_ty) {
-            trace!(?output_type, ?destination_ty);
-            return Err("failed to normalize return type");
-        }
-        if callsite.fn_sig.abi() == ExternAbi::RustCall {
-            // FIXME: Don't inline user-written `extern "rust-call"` functions,
-            // since this is generally perf-negative on rustc, and we hope that
-            // LLVM will inline these functions instead.
-            if callee_body.spread_arg.is_some() {
-                return Err("do not inline user-written rust-call functions");
-            }
+    fn tcx(&self) -> TyCtxt<'tcx> {
+        self.tcx
+    }
 
-            let (self_arg, arg_tuple) = match &args[..] {
-                [arg_tuple] => (None, arg_tuple),
-                [self_arg, arg_tuple] => (Some(self_arg), arg_tuple),
-                _ => bug!("Expected `rust-call` to have 1 or 2 args"),
-            };
+    fn typing_env(&self) -> ty::TypingEnv<'tcx> {
+        self.typing_env
+    }
 
-            let self_arg_ty =
-                self_arg.map(|self_arg| self_arg.node.ty(&caller_body.local_decls, self.tcx));
+    fn history(&self) -> &[DefId] {
+        &self.history
+    }
 
-            let arg_tuple_ty = arg_tuple.node.ty(&caller_body.local_decls, self.tcx);
-            let ty::Tuple(arg_tuple_tys) = *arg_tuple_ty.kind() else {
-                bug!("Closure arguments are not passed as a tuple");
-            };
+    fn caller_def_id(&self) -> DefId {
+        self.def_id
+    }
 
-            for (arg_ty, input) in
-                self_arg_ty.into_iter().chain(arg_tuple_tys).zip(callee_body.args_iter())
-            {
-                let input_type = callee_body.local_decls[input].ty;
-                if !util::sub_types(self.tcx, self.typing_env, input_type, arg_ty) {
-                    trace!(?arg_ty, ?input_type);
-                    return Err("failed to normalize tuple argument type");
-                }
-            }
-        } else {
-            for (arg, input) in args.iter().zip(callee_body.args_iter()) {
-                let input_type = callee_body.local_decls[input].ty;
-                let arg_ty = arg.node.ty(&caller_body.local_decls, self.tcx);
-                if !util::sub_types(self.tcx, self.typing_env, input_type, arg_ty) {
-                    trace!(?arg_ty, ?input_type);
-                    return Err("failed to normalize argument type");
-                }
-            }
-        }
+    fn changed(self) -> bool {
+        self.changed
+    }
 
-        let old_blocks = caller_body.basic_blocks.next_index();
-        self.inline_call(caller_body, callsite, callee_body);
-        let new_blocks = old_blocks..caller_body.basic_blocks.next_index();
+    fn should_inline_for_callee(&self, def_id: DefId) -> bool {
+        ForceInline::should_run_pass_for_callee(self.tcx(), def_id)
+    }
 
-        Ok(new_blocks)
+    fn check_caller_mir_body(&self, _: &Body<'tcx>) -> bool {
+        true
     }
 
-    fn check_mir_is_available(
+    #[instrument(level = "debug", skip(self, callee_body))]
+    fn check_callee_mir_body(
         &self,
-        caller_body: &Body<'tcx>,
-        callee: Instance<'tcx>,
+        _: &CallSite<'tcx>,
+        callee_body: &Body<'tcx>,
+        callee_attrs: &CodegenFnAttrs,
     ) -> Result<(), &'static str> {
-        let caller_def_id = caller_body.source.def_id();
-        let callee_def_id = callee.def_id();
-        if callee_def_id == caller_def_id {
-            return Err("self-recursion");
-        }
-
-        match callee.def {
-            InstanceKind::Item(_) => {
-                // If there is no MIR available (either because it was not in metadata or
-                // because it has no MIR because it's an extern function), then the inliner
-                // won't cause cycles on this.
-                if !self.tcx.is_mir_available(callee_def_id) {
-                    return Err("item MIR unavailable");
-                }
-            }
-            // These have no own callable MIR.
-            InstanceKind::Intrinsic(_) | InstanceKind::Virtual(..) => {
-                return Err("instance without MIR (intrinsic / virtual)");
-            }
-
-            // FIXME(#127030): `ConstParamHasTy` has bad interactions with
-            // the drop shim builder, which does not evaluate predicates in
-            // the correct param-env for types being dropped. Stall resolving
-            // the MIR for this instance until all of its const params are
-            // substituted.
-            InstanceKind::DropGlue(_, Some(ty)) if ty.has_type_flags(TypeFlags::HAS_CT_PARAM) => {
-                return Err("still needs substitution");
-            }
-
-            // This cannot result in an immediate cycle since the callee MIR is a shim, which does
-            // not get any optimizations run on it. Any subsequent inlining may cause cycles, but we
-            // do not need to catch this here, we can wait until the inliner decides to continue
-            // inlining a second time.
-            InstanceKind::VTableShim(_)
-            | InstanceKind::ReifyShim(..)
-            | InstanceKind::FnPtrShim(..)
-            | InstanceKind::ClosureOnceShim { .. }
-            | InstanceKind::ConstructCoroutineInClosureShim { .. }
-            | InstanceKind::DropGlue(..)
-            | InstanceKind::CloneShim(..)
-            | InstanceKind::ThreadLocalShim(..)
-            | InstanceKind::FnPtrAddrShim(..)
-            | InstanceKind::AsyncDropGlueCtorShim(..) => return Ok(()),
+        if callee_body.tainted_by_errors.is_some() {
+            return Err("body has errors");
         }
 
-        if self.tcx.is_constructor(callee_def_id) {
-            trace!("constructors always have MIR");
-            // Constructor functions cannot cause a query cycle.
-            return Ok(());
-        }
-
-        if callee_def_id.is_local() {
-            // If we know for sure that the function we're calling will itself try to
-            // call us, then we avoid inlining that function.
-            if self.tcx.mir_callgraph_reachable((callee, caller_def_id.expect_local())) {
-                return Err("caller might be reachable from callee (query cycle avoidance)");
-            }
-
-            Ok(())
+        let caller_attrs = self.tcx().codegen_fn_attrs(self.caller_def_id());
+        if callee_attrs.instruction_set != caller_attrs.instruction_set
+            && callee_body
+                .basic_blocks
+                .iter()
+                .any(|bb| matches!(bb.terminator().kind, TerminatorKind::InlineAsm { .. }))
+        {
+            // During the attribute checking stage we allow a callee with no
+            // instruction_set assigned to count as compatible with a function that does
+            // assign one. However, during this stage we require an exact match when any
+            // inline-asm is detected. LLVM will still possibly do an inline later on
+            // if the no-attribute function ends up with the same instruction set anyway.
+            Err("cannot move inline-asm across instruction sets")
         } else {
-            // This cannot result in an immediate cycle since the callee MIR is from another crate
-            // and is already optimized. Any subsequent inlining may cause cycles, but we do
-            // not need to catch this here, we can wait until the inliner decides to continue
-            // inlining a second time.
-            trace!("functions from other crates always have MIR");
             Ok(())
         }
     }
 
-    fn resolve_callsite(
-        &self,
-        caller_body: &Body<'tcx>,
-        bb: BasicBlock,
-        bb_data: &BasicBlockData<'tcx>,
-    ) -> Option<CallSite<'tcx>> {
-        // Only consider direct calls to functions
-        let terminator = bb_data.terminator();
-
-        // FIXME(explicit_tail_calls): figure out if we can inline tail calls
-        if let TerminatorKind::Call { ref func, fn_span, .. } = terminator.kind {
-            let func_ty = func.ty(caller_body, self.tcx);
-            if let ty::FnDef(def_id, args) = *func_ty.kind() {
-                // To resolve an instance its args have to be fully normalized.
-                let args = self.tcx.try_normalize_erasing_regions(self.typing_env, args).ok()?;
-                let callee = Instance::try_resolve(self.tcx, self.typing_env, def_id, args)
-                    .ok()
-                    .flatten()?;
-
-                if let InstanceKind::Virtual(..) | InstanceKind::Intrinsic(_) = callee.def {
-                    return None;
-                }
-
-                if self.history.contains(&callee.def_id()) {
-                    return None;
-                }
+    fn inline_limit_for_block(&self) -> Option<usize> {
+        Some(usize::MAX)
+    }
 
-                let fn_sig = self.tcx.fn_sig(def_id).instantiate(self.tcx, args);
+    fn on_inline_success(
+        &mut self,
+        callsite: &CallSite<'tcx>,
+        caller_body: &mut Body<'tcx>,
+        new_blocks: std::ops::Range<BasicBlock>,
+    ) {
+        self.changed = true;
 
-                // Additionally, check that the body that we're inlining actually agrees
-                // with the ABI of the trait that the item comes from.
-                if let InstanceKind::Item(instance_def_id) = callee.def
-                    && self.tcx.def_kind(instance_def_id) == DefKind::AssocFn
-                    && let instance_fn_sig = self.tcx.fn_sig(instance_def_id).skip_binder()
-                    && instance_fn_sig.abi() != fn_sig.abi()
-                {
-                    return None;
-                }
+        self.history.push(callsite.callee.def_id());
+        process_blocks(self, caller_body, new_blocks);
+        self.history.pop();
+    }
 
-                let source_info = SourceInfo { span: fn_span, ..terminator.source_info };
+    fn on_inline_failure(&self, callsite: &CallSite<'tcx>, reason: &'static str) {
+        let tcx = self.tcx();
+        let InlineAttr::Force { attr_span, reason: justification } =
+            tcx.codegen_fn_attrs(callsite.callee.def_id()).inline
+        else {
+            bug!("called on item without required inlining");
+        };
 
-                return Some(CallSite { callee, fn_sig, block: bb, source_info });
-            }
-        }
+        let call_span = callsite.source_info.span;
+        tcx.dcx().emit_err(crate::errors::ForceInlineFailure {
+            call_span,
+            attr_span,
+            caller_span: tcx.def_span(self.def_id),
+            caller: tcx.def_path_str(self.def_id),
+            callee_span: tcx.def_span(callsite.callee.def_id()),
+            callee: tcx.def_path_str(callsite.callee.def_id()),
+            reason,
+            justification: justification.map(|sym| crate::errors::ForceInlineJustification { sym }),
+        });
+    }
 
-        None
+    fn on_inline_limit_reached(&self) -> bool {
+        false
     }
+}
 
-    /// Returns an error if inlining is not possible based on codegen attributes alone. A success
-    /// indicates that inlining decision should be based on other criteria.
-    fn check_codegen_attributes(
-        &self,
-        callsite: &CallSite<'tcx>,
-        callee_attrs: &CodegenFnAttrs,
-        cross_crate_inlinable: bool,
-    ) -> Result<(), &'static str> {
-        if self.tcx.has_attr(callsite.callee.def_id(), sym::rustc_no_mir_inline) {
-            return Err("#[rustc_no_mir_inline]");
-        }
+struct NormalInliner<'tcx> {
+    tcx: TyCtxt<'tcx>,
+    typing_env: ty::TypingEnv<'tcx>,
+    /// `DefId` of caller.
+    def_id: DefId,
+    /// Stack of inlined instances.
+    /// We only check the `DefId` and not the args because we want to
+    /// avoid inlining cases of polymorphic recursion.
+    /// The number of `DefId`s is finite, so checking history is enough
+    /// to ensure that we do not loop endlessly while inlining.
+    history: Vec<DefId>,
+    /// Indicates that the caller body has been modified.
+    changed: bool,
+    /// Indicates that the caller is #[inline] and just calls another function,
+    /// and thus we can inline less into it as it'll be inlined itself.
+    caller_is_inline_forwarder: bool,
+}
 
-        if let InlineAttr::Never = callee_attrs.inline {
-            return Err("never inline hint");
+impl<'tcx> Inliner<'tcx> for NormalInliner<'tcx> {
+    fn new(tcx: TyCtxt<'tcx>, def_id: DefId, body: &Body<'tcx>) -> Self {
+        let typing_env = body.typing_env(tcx);
+        let codegen_fn_attrs = tcx.codegen_fn_attrs(def_id);
+
+        Self {
+            tcx,
+            typing_env,
+            def_id,
+            history: Vec::new(),
+            changed: false,
+            caller_is_inline_forwarder: matches!(
+                codegen_fn_attrs.inline,
+                InlineAttr::Hint | InlineAttr::Always | InlineAttr::Force { .. }
+            ) && body_is_forwarder(body),
         }
+    }
 
-        // Reachability pass defines which functions are eligible for inlining. Generally inlining
-        // other functions is incorrect because they could reference symbols that aren't exported.
-        let is_generic = callsite.callee.args.non_erasable_generics().next().is_some();
-        if !is_generic && !cross_crate_inlinable {
-            return Err("not exported");
-        }
+    fn tcx(&self) -> TyCtxt<'tcx> {
+        self.tcx
+    }
 
-        if callsite.fn_sig.c_variadic() {
-            return Err("C variadic");
-        }
+    fn caller_def_id(&self) -> DefId {
+        self.def_id
+    }
 
-        if callee_attrs.flags.contains(CodegenFnAttrFlags::COLD) {
-            return Err("cold");
-        }
+    fn typing_env(&self) -> ty::TypingEnv<'tcx> {
+        self.typing_env
+    }
 
-        if callee_attrs.no_sanitize != self.codegen_fn_attrs.no_sanitize {
-            return Err("incompatible sanitizer set");
-        }
+    fn history(&self) -> &[DefId] {
+        &self.history
+    }
 
-        // Two functions are compatible if the callee has no attribute (meaning
-        // that it's codegen agnostic), or sets an attribute that is identical
-        // to this function's attribute.
-        if callee_attrs.instruction_set.is_some()
-            && callee_attrs.instruction_set != self.codegen_fn_attrs.instruction_set
-        {
-            return Err("incompatible instruction set");
-        }
+    fn changed(self) -> bool {
+        self.changed
+    }
+
+    fn should_inline_for_callee(&self, _: DefId) -> bool {
+        true
+    }
 
-        let callee_feature_names = callee_attrs.target_features.iter().map(|f| f.name);
-        let this_feature_names = self.codegen_fn_attrs.target_features.iter().map(|f| f.name);
-        if callee_feature_names.ne(this_feature_names) {
-            // In general it is not correct to inline a callee with target features that are a
-            // subset of the caller. This is because the callee might contain calls, and the ABI of
-            // those calls depends on the target features of the surrounding function. By moving a
-            // `Call` terminator from one MIR body to another with more target features, we might
-            // change the ABI of that call!
-            return Err("incompatible target features");
+    fn check_caller_mir_body(&self, body: &Body<'tcx>) -> bool {
+        // Avoid inlining into coroutines, since their `optimized_mir` is used for layout computation,
+        // which can create a cycle, even when no attempt is made to inline the function in the other
+        // direction.
+        if body.coroutine.is_some() {
+            return false;
         }
 
-        Ok(())
+        true
     }
 
-    /// Returns inlining decision that is based on the examination of callee MIR body.
-    /// Assumes that codegen attributes have been checked for compatibility already.
     #[instrument(level = "debug", skip(self, callee_body))]
-    fn check_mir_body(
+    fn check_callee_mir_body(
         &self,
         callsite: &CallSite<'tcx>,
         callee_body: &Body<'tcx>,
         callee_attrs: &CodegenFnAttrs,
-        cross_crate_inlinable: bool,
     ) -> Result<(), &'static str> {
-        let tcx = self.tcx;
+        let tcx = self.tcx();
 
         if let Some(_) = callee_body.tainted_by_errors {
-            return Err("Body is tainted");
+            return Err("body has errors");
         }
 
         let mut threshold = if self.caller_is_inline_forwarder {
-            self.tcx.sess.opts.unstable_opts.inline_mir_forwarder_threshold.unwrap_or(30)
-        } else if cross_crate_inlinable {
-            self.tcx.sess.opts.unstable_opts.inline_mir_hint_threshold.unwrap_or(100)
+            tcx.sess.opts.unstable_opts.inline_mir_forwarder_threshold.unwrap_or(30)
+        } else if tcx.cross_crate_inlinable(callsite.callee.def_id()) {
+            tcx.sess.opts.unstable_opts.inline_mir_hint_threshold.unwrap_or(100)
         } else {
-            self.tcx.sess.opts.unstable_opts.inline_mir_threshold.unwrap_or(50)
+            tcx.sess.opts.unstable_opts.inline_mir_threshold.unwrap_or(50)
         };
 
         // Give a bonus functions with a small number of blocks,
@@ -497,7 +363,7 @@ impl<'tcx> Inliner<'tcx> {
         // FIXME: Give a bonus to functions with only a single caller
 
         let mut checker =
-            CostChecker::new(self.tcx, self.typing_env, Some(callsite.callee), callee_body);
+            CostChecker::new(tcx, self.typing_env(), Some(callsite.callee), callee_body);
 
         checker.add_function_level_costs();
 
@@ -513,20 +379,20 @@ impl<'tcx> Inliner<'tcx> {
             checker.visit_basic_block_data(bb, blk);
 
             let term = blk.terminator();
+            let caller_attrs = tcx.codegen_fn_attrs(self.caller_def_id());
             if let TerminatorKind::Drop { ref place, target, unwind, replace: _ } = term.kind {
                 work_list.push(target);
 
                 // If the place doesn't actually need dropping, treat it like a regular goto.
-                let ty = callsite.callee.instantiate_mir(
-                    self.tcx,
-                    ty::EarlyBinder::bind(&place.ty(callee_body, tcx).ty),
-                );
-                if ty.needs_drop(tcx, self.typing_env)
+                let ty = callsite
+                    .callee
+                    .instantiate_mir(tcx, ty::EarlyBinder::bind(&place.ty(callee_body, tcx).ty));
+                if ty.needs_drop(tcx, self.typing_env())
                     && let UnwindAction::Cleanup(unwind) = unwind
                 {
                     work_list.push(unwind);
                 }
-            } else if callee_attrs.instruction_set != self.codegen_fn_attrs.instruction_set
+            } else if callee_attrs.instruction_set != caller_attrs.instruction_set
                 && matches!(term.kind, TerminatorKind::InlineAsm { .. })
             {
                 // During the attribute checking stage we allow a callee with no
@@ -534,7 +400,7 @@ impl<'tcx> Inliner<'tcx> {
                 // assign one. However, during this stage we require an exact match when any
                 // inline-asm is detected. LLVM will still possibly do an inline later on
                 // if the no-attribute function ends up with the same instruction set anyway.
-                return Err("Cannot move inline-asm across instruction sets");
+                return Err("cannot move inline-asm across instruction sets");
             } else if let TerminatorKind::TailCall { .. } = term.kind {
                 // FIXME(explicit_tail_calls): figure out how exactly functions containing tail
                 // calls can be inlined (and if they even should)
@@ -558,321 +424,688 @@ impl<'tcx> Inliner<'tcx> {
         }
     }
 
-    fn inline_call(
-        &self,
-        caller_body: &mut Body<'tcx>,
+    fn inline_limit_for_block(&self) -> Option<usize> {
         // Upper bound on the number of call sites that one `process_blocks` invocation may
         // inline, chosen from the current length of the inlining history: effectively unlimited
         // when the history is empty, one while the history length is at most
         // `TOP_DOWN_DEPTH_LIMIT`, and `None` beyond that (`process_blocks` then returns without
         // looking at any block).
+        match self.history.len() {
+            0 => Some(usize::MAX),
+            1..=TOP_DOWN_DEPTH_LIMIT => Some(1),
+            _ => None,
+        }
+    }
+
     /// Hook called by `process_blocks` after `callsite` has been inlined into `caller_body`.
     ///
     /// Marks the inliner as having changed the body, then runs `process_blocks` over
     /// `new_blocks` (the range returned by `try_inlining`). The callee is kept on the inlining
     /// history for the duration of that nested run, so `resolve_callsite` rejects calls to it
     /// and `inline_limit_for_block` sees the increased depth.
+    fn on_inline_success(
+        &mut self,
         callsite: &CallSite<'tcx>,
-        mut callee_body: Body<'tcx>,
+        caller_body: &mut Body<'tcx>,
+        new_blocks: std::ops::Range<BasicBlock>,
     ) {
-        let terminator = caller_body[callsite.block].terminator.take().unwrap();
-        let TerminatorKind::Call { func, args, destination, unwind, target, .. } = terminator.kind
-        else {
-            bug!("unexpected terminator kind {:?}", terminator.kind);
-        };
+        self.changed = true;
 
-        let return_block = if let Some(block) = target {
-            // Prepare a new block for code that should execute when call returns. We don't use
-            // target block directly since it might have other predecessors.
-            let data = BasicBlockData::new(
-                Some(Terminator {
-                    source_info: terminator.source_info,
-                    kind: TerminatorKind::Goto { target: block },
-                }),
-                caller_body[block].is_cleanup,
-            );
-            Some(caller_body.basic_blocks_mut().push(data))
-        } else {
-            None
+        self.history.push(callsite.callee.def_id());
+        process_blocks(self, caller_body, new_blocks);
+        self.history.pop();
+    }
+
     /// Hook called by `process_blocks` once the number of successful inlinings equals the limit
     /// from `inline_limit_for_block`. Returning `true` makes `process_blocks` return without
     /// visiting the remaining blocks; this implementation always does so.
+    fn on_inline_limit_reached(&self) -> bool {
+        true
+    }
+
     /// Hook called by `process_blocks` with the call site and the reason string whenever
     /// `try_inlining` fails. This implementation ignores both arguments and does nothing.
+    fn on_inline_failure(&self, _: &CallSite<'tcx>, _: &'static str) {}
+}
+
 /// Runs the inliner implementation `T` over `body` and returns `T::changed()`.
 ///
 /// Returns `false` without inlining anything when the owner of `body` is not a function or
 /// closure, or when `T::check_caller_mir_body` rejects the body. Otherwise the block range
 /// `START_BLOCK..next_index()`, computed before any inlining happens, is handed to
 /// `process_blocks`.
+fn inline<'tcx, T: Inliner<'tcx>>(tcx: TyCtxt<'tcx>, body: &mut Body<'tcx>) -> bool {
+    let def_id = body.source.def_id();
+
+    // Only do inlining into fn bodies.
+    if !tcx.hir().body_owner_kind(def_id).is_fn_or_closure() {
+        return false;
+    }
+
+    let mut inliner = T::new(tcx, def_id, body);
+    if !inliner.check_caller_mir_body(body) {
+        return false;
+    }
+
+    let blocks = START_BLOCK..body.basic_blocks.next_index();
+    process_blocks(&mut inliner, body, blocks);
+    inliner.changed()
+}
+
 /// Tries to inline the call that terminates each non-cleanup block in `blocks`.
 ///
 /// Returns immediately when `inline_limit_for_block` yields `None`. For every block for which
 /// `resolve_callsite` produces a call site, `try_inlining` is attempted and the outcome is
 /// reported through `on_inline_failure` or `on_inline_success`. When the count of successful
 /// inlinings equals the limit, `on_inline_limit_reached` decides whether to stop early.
+fn process_blocks<'tcx, I: Inliner<'tcx>>(
+    inliner: &mut I,
+    caller_body: &mut Body<'tcx>,
+    blocks: Range<BasicBlock>,
+) {
+    let Some(inline_limit) = inliner.inline_limit_for_block() else { return };
+    let mut inlined_count = 0;
+    for bb in blocks {
+        let bb_data = &caller_body[bb];
+        if bb_data.is_cleanup {
+            continue;
+        }
+
+        let Some(callsite) = resolve_callsite(inliner, caller_body, bb, bb_data) else {
+            continue;
         };
 
-        // If the call is something like `a[*i] = f(i)`, where
-        // `i : &mut usize`, then just duplicating the `a[*i]`
-        // Place could result in two different locations if `f`
-        // writes to `i`. To prevent this we need to create a temporary
-        // borrow of the place and pass the destination as `*temp` instead.
-        fn dest_needs_borrow(place: Place<'_>) -> bool {
-            for elem in place.projection.iter() {
-                match elem {
-                    ProjectionElem::Deref | ProjectionElem::Index(_) => return true,
-                    _ => {}
+        let span = trace_span!("process_blocks", %callsite.callee, ?bb);
+        let _guard = span.enter();
+
+        match try_inlining(inliner, caller_body, &callsite) {
+            Err(reason) => {
+                debug!("not-inlined {} [{}]", callsite.callee, reason);
+                inliner.on_inline_failure(&callsite, reason);
+            }
+            Ok(new_blocks) => {
+                debug!("inlined {}", callsite.callee);
+                inliner.on_inline_success(&callsite, caller_body, new_blocks);
+
+                inlined_count += 1;
+                if inlined_count == inline_limit {
+                    if inliner.on_inline_limit_reached() {
+                        return;
+                    }
                 }
             }
+        }
+    }
+}
+
 /// Builds the `CallSite` for the terminator of block `bb`, or returns `None` when that
 /// terminator is not a candidate for inlining.
 ///
 /// A candidate is a `Call` terminator whose callee has `FnDef` type, is accepted by
 /// `should_inline_for_callee`, has args that normalize, resolves to an instance that is
 /// neither virtual nor an intrinsic, and is not already on the inlining history. For
 /// associated functions the ABI of the resolved item must also match the ABI of the called
 /// signature.
+fn resolve_callsite<'tcx, I: Inliner<'tcx>>(
+    inliner: &I,
+    caller_body: &Body<'tcx>,
+    bb: BasicBlock,
+    bb_data: &BasicBlockData<'tcx>,
+) -> Option<CallSite<'tcx>> {
+    let tcx = inliner.tcx();
+    // Only consider direct calls to functions
+    let terminator = bb_data.terminator();
+
+    // FIXME(explicit_tail_calls): figure out if we can inline tail calls
+    if let TerminatorKind::Call { ref func, fn_span, .. } = terminator.kind {
+        let func_ty = func.ty(caller_body, tcx);
+        if let ty::FnDef(def_id, args) = *func_ty.kind() {
+            if !inliner.should_inline_for_callee(def_id) {
+                debug!("not enabled");
+                return None;
+            }
+
+            // To resolve an instance its args have to be fully normalized.
+            let args = tcx.try_normalize_erasing_regions(inliner.typing_env(), args).ok()?;
+            let callee =
+                Instance::try_resolve(tcx, inliner.typing_env(), def_id, args).ok().flatten()?;
+
+            if let InstanceKind::Virtual(..) | InstanceKind::Intrinsic(_) = callee.def {
+                return None;
+            }
+
             // Skip callees that are currently being inlined further up the history.
+            if inliner.history().contains(&callee.def_id()) {
+                return None;
+            }
 
-            false
+            let fn_sig = tcx.fn_sig(def_id).instantiate(tcx, args);
+
+            // Additionally, check that the body that we're inlining actually agrees
+            // with the ABI of the trait that the item comes from.
+            if let InstanceKind::Item(instance_def_id) = callee.def
+                && tcx.def_kind(instance_def_id) == DefKind::AssocFn
+                && let instance_fn_sig = tcx.fn_sig(instance_def_id).skip_binder()
+                && instance_fn_sig.abi() != fn_sig.abi()
+            {
+                return None;
+            }
+
             // Keep the terminator's scope but use the span of the call expression itself.
+            let source_info = SourceInfo { span: fn_span, ..terminator.source_info };
+
+            return Some(CallSite { callee, fn_sig, block: bb, source_info });
         }
+    }
 
-        let dest = if dest_needs_borrow(destination) {
-            trace!("creating temp for return destination");
-            let dest = Rvalue::Ref(
-                self.tcx.lifetimes.re_erased,
-                BorrowKind::Mut { kind: MutBorrowKind::Default },
-                destination,
-            );
-            let dest_ty = dest.ty(caller_body, self.tcx);
-            let temp =
-                Place::from(self.new_call_temp(caller_body, callsite, dest_ty, return_block));
-            caller_body[callsite.block].statements.push(Statement {
-                source_info: callsite.source_info,
-                kind: StatementKind::Assign(Box::new((temp, dest))),
-            });
-            self.tcx.mk_place_deref(temp)
-        } else {
-            destination
-        };
+    None
+}
 
-        // Always create a local to hold the destination, as `RETURN_PLACE` may appear
-        // where a full `Place` is not allowed.
-        let (remap_destination, destination_local) = if let Some(d) = dest.as_local() {
-            (false, d)
-        } else {
-            (
-                true,
-                self.new_call_temp(
-                    caller_body,
-                    callsite,
-                    destination.ty(caller_body, self.tcx).ty,
-                    return_block,
-                ),
-            )
-        };
+/// Attempts to inline a callsite into the caller body. When successful returns basic blocks
+/// containing the inlined body. Otherwise returns an error describing why inlining didn't take
+/// place.
 ///
 /// Failures reported as `"implementation limitation"` are accompanied by a `debug!` message
 /// that names the specific cause.
+fn try_inlining<'tcx, I: Inliner<'tcx>>(
+    inliner: &I,
+    caller_body: &mut Body<'tcx>,
+    callsite: &CallSite<'tcx>,
+) -> Result<std::ops::Range<BasicBlock>, &'static str> {
+    let tcx = inliner.tcx();
     // These checks run before the callee body is fetched.
+    check_mir_is_available(inliner, caller_body, callsite.callee)?;
+
+    let callee_attrs = tcx.codegen_fn_attrs(callsite.callee.def_id());
+    rustc_mir_build::check_inline::is_inline_valid_on_fn(tcx, callsite.callee.def_id())?;
+    check_codegen_attributes(inliner, callsite, callee_attrs)?;
+
     // Only borrow the call terminator here; `inline_call` is what takes it out of the block.
+    let terminator = caller_body[callsite.block].terminator.as_ref().unwrap();
+    let TerminatorKind::Call { args, destination, .. } = &terminator.kind else { bug!() };
+    let destination_ty = destination.ty(&caller_body.local_decls, tcx).ty;
+    for arg in args {
+        if !arg.node.ty(&caller_body.local_decls, tcx).is_sized(tcx, inliner.typing_env()) {
+            // We do not allow inlining functions with unsized params. Inlining these functions
+            // could create unsized locals, which are unsound and being phased out.
+            return Err("call has unsized argument");
+        }
+    }
 
-        // Copy the arguments if needed.
-        let args = self.make_call_args(args, callsite, caller_body, &callee_body, return_block);
+    let callee_body = try_instance_mir(tcx, callsite.callee.def)?;
+    rustc_mir_build::check_inline::is_inline_valid_on_body(tcx, callee_body)?;
+    inliner.check_callee_mir_body(callsite, callee_body, callee_attrs)?;
 
-        let mut integrator = Integrator {
-            args: &args,
-            new_locals: Local::new(caller_body.local_decls.len())..,
-            new_scopes: SourceScope::new(caller_body.source_scopes.len())..,
-            new_blocks: BasicBlock::new(caller_body.basic_blocks.len())..,
-            destination: destination_local,
-            callsite_scope: caller_body.source_scopes[callsite.source_info.scope].clone(),
-            callsite,
-            cleanup_block: unwind,
-            in_cleanup_block: false,
-            return_block,
-            tcx: self.tcx,
-            always_live_locals: BitSet::new_filled(callee_body.local_decls.len()),
     // From here on `callee_body` is an owned, instantiated copy rather than the shared body.
+    let Ok(callee_body) = callsite.callee.try_instantiate_mir_and_normalize_erasing_regions(
+        tcx,
+        inliner.typing_env(),
+        ty::EarlyBinder::bind(callee_body.clone()),
+    ) else {
+        debug!("failed to normalize callee body");
+        return Err("implementation limitation");
+    };
+
+    // Normally, this shouldn't be required, but trait normalization failure can create a
+    // validation ICE.
+    if !validate_types(tcx, inliner.typing_env(), &callee_body, &caller_body).is_empty() {
+        debug!("failed to validate callee body");
+        return Err("implementation limitation");
+    }
+
+    // Check call signature compatibility.
+    // Normally, this shouldn't be required, but trait normalization failure can create a
+    // validation ICE.
+    let output_type = callee_body.return_ty();
+    if !util::sub_types(tcx, inliner.typing_env(), output_type, destination_ty) {
+        trace!(?output_type, ?destination_ty);
+        debug!("failed to normalize return type");
+        return Err("implementation limitation");
+    }
+    if callsite.fn_sig.abi() == ExternAbi::RustCall {
+        // FIXME: Don't inline user-written `extern "rust-call"` functions,
+        // since this is generally perf-negative on rustc, and we hope that
+        // LLVM will inline these functions instead.
+        if callee_body.spread_arg.is_some() {
+            return Err("user-written rust-call functions");
+        }
+
+        let (self_arg, arg_tuple) = match &args[..] {
+            [arg_tuple] => (None, arg_tuple),
+            [self_arg, arg_tuple] => (Some(self_arg), arg_tuple),
+            _ => bug!("Expected `rust-call` to have 1 or 2 args"),
         };
 
-        // Map all `Local`s, `SourceScope`s and `BasicBlock`s to new ones
-        // (or existing ones, in a few special cases) in the caller.
-        integrator.visit_body(&mut callee_body);
+        let self_arg_ty = self_arg.map(|self_arg| self_arg.node.ty(&caller_body.local_decls, tcx));
 
-        // If there are any locals without storage markers, give them storage only for the
-        // duration of the call.
-        for local in callee_body.vars_and_temps_iter() {
-            if integrator.always_live_locals.contains(local) {
-                let new_local = integrator.map_local(local);
-                caller_body[callsite.block].statements.push(Statement {
-                    source_info: callsite.source_info,
-                    kind: StatementKind::StorageLive(new_local),
-                });
+        let arg_tuple_ty = arg_tuple.node.ty(&caller_body.local_decls, tcx);
+        let ty::Tuple(arg_tuple_tys) = *arg_tuple_ty.kind() else {
+            bug!("Closure arguments are not passed as a tuple");
+        };
+
         // Compare the optional self argument followed by the tuple's element types against
         // the callee's arguments, pairwise.
+        for (arg_ty, input) in
+            self_arg_ty.into_iter().chain(arg_tuple_tys).zip(callee_body.args_iter())
+        {
+            let input_type = callee_body.local_decls[input].ty;
+            if !util::sub_types(tcx, inliner.typing_env(), input_type, arg_ty) {
+                trace!(?arg_ty, ?input_type);
+                debug!("failed to normalize tuple argument type");
+                return Err("implementation limitation");
             }
         }
-        if let Some(block) = return_block {
-            // To avoid repeated O(n) insert, push any new statements to the end and rotate
-            // the slice once.
-            let mut n = 0;
-            if remap_destination {
-                caller_body[block].statements.push(Statement {
-                    source_info: callsite.source_info,
-                    kind: StatementKind::Assign(Box::new((
-                        dest,
-                        Rvalue::Use(Operand::Move(destination_local.into())),
-                    ))),
-                });
-                n += 1;
+    } else {
+        for (arg, input) in args.iter().zip(callee_body.args_iter()) {
+            let input_type = callee_body.local_decls[input].ty;
+            let arg_ty = arg.node.ty(&caller_body.local_decls, tcx);
+            if !util::sub_types(tcx, inliner.typing_env(), input_type, arg_ty) {
+                trace!(?arg_ty, ?input_type);
+                debug!("failed to normalize argument type");
+                return Err("implementation limitation");
             }
-            for local in callee_body.vars_and_temps_iter().rev() {
-                if integrator.always_live_locals.contains(local) {
-                    let new_local = integrator.map_local(local);
-                    caller_body[block].statements.push(Statement {
-                        source_info: callsite.source_info,
-                        kind: StatementKind::StorageDead(new_local),
-                    });
-                    n += 1;
-                }
+        }
+    }
+
     // The returned range spans the block indices added to `caller_body` by `inline_call`.
+    let old_blocks = caller_body.basic_blocks.next_index();
+    inline_call(inliner, caller_body, callsite, callee_body);
+    let new_blocks = old_blocks..caller_body.basic_blocks.next_index();
+
+    Ok(new_blocks)
+}
+
 /// Checks whether the MIR of `callee` may be requested for inlining into `caller_body`.
 ///
 /// Returns `Err` with a reason for direct self-recursion, for items without available MIR,
 /// for intrinsic and virtual instances, for drop glue whose type still has const parameters,
 /// and for local callees from which `mir_callgraph_reachable` reports the caller as reachable.
 /// That last check is skipped when the parent of the caller is the `FnOnce` lang item.
 /// Shims, constructors and callees from other crates are accepted.
+fn check_mir_is_available<'tcx, I: Inliner<'tcx>>(
+    inliner: &I,
+    caller_body: &Body<'tcx>,
+    callee: Instance<'tcx>,
+) -> Result<(), &'static str> {
+    let caller_def_id = caller_body.source.def_id();
+    let callee_def_id = callee.def_id();
+    if callee_def_id == caller_def_id {
+        return Err("self-recursion");
+    }
+
+    match callee.def {
+        InstanceKind::Item(_) => {
+            // If there is no MIR available (either because it was not in metadata or
+            // because it has no MIR because it's an extern function), then the inliner
+            // won't cause cycles on this.
+            if !inliner.tcx().is_mir_available(callee_def_id) {
+                debug!("item MIR unavailable");
+                return Err("implementation limitation");
             }
-            caller_body[block].statements.rotate_right(n);
+        }
+        // These have no callable MIR of their own.
+        InstanceKind::Intrinsic(_) | InstanceKind::Virtual(..) => {
+            debug!("instance without MIR (intrinsic / virtual)");
+            return Err("implementation limitation");
         }
 
-        // Insert all of the (mapped) parts of the callee body into the caller.
-        caller_body.local_decls.extend(callee_body.drain_vars_and_temps());
-        caller_body.source_scopes.append(&mut callee_body.source_scopes);
-        if self
-            .tcx
-            .sess
-            .opts
-            .unstable_opts
-            .inline_mir_preserve_debug
-            .unwrap_or(self.tcx.sess.opts.debuginfo != DebugInfo::None)
-        {
-            // Note that we need to preserve these in the standard library so that
-            // people working on rust can build with or without debuginfo while
-            // still getting consistent results from the mir-opt tests.
-            caller_body.var_debug_info.append(&mut callee_body.var_debug_info);
+        // FIXME(#127030): `ConstParamHasTy` has bad interactions with
+        // the drop shim builder, which does not evaluate predicates in
+        // the correct param-env for types being dropped. Stall resolving
+        // the MIR for this instance until all of its const params are
+        // substituted.
+        InstanceKind::DropGlue(_, Some(ty)) if ty.has_type_flags(TypeFlags::HAS_CT_PARAM) => {
+            debug!("still needs substitution");
+            return Err("implementation limitation");
         }
-        caller_body.basic_blocks_mut().append(callee_body.basic_blocks_mut());
 
-        caller_body[callsite.block].terminator = Some(Terminator {
-            source_info: callsite.source_info,
-            kind: TerminatorKind::Goto { target: integrator.map_block(START_BLOCK) },
-        });
+        // This cannot result in an immediate cycle since the callee MIR is a shim, which does
+        // not get any optimizations run on it. Any subsequent inlining may cause cycles, but we
+        // do not need to catch this here, we can wait until the inliner decides to continue
+        // inlining a second time.
+        InstanceKind::VTableShim(_)
+        | InstanceKind::ReifyShim(..)
+        | InstanceKind::FnPtrShim(..)
+        | InstanceKind::ClosureOnceShim { .. }
+        | InstanceKind::ConstructCoroutineInClosureShim { .. }
+        | InstanceKind::DropGlue(..)
+        | InstanceKind::CloneShim(..)
+        | InstanceKind::ThreadLocalShim(..)
+        | InstanceKind::FnPtrAddrShim(..)
+        | InstanceKind::AsyncDropGlueCtorShim(..) => return Ok(()),
+    }
 
-        // Copy required constants from the callee_body into the caller_body. Although we are only
-        // pushing unevaluated consts to `required_consts`, here they may have been evaluated
-        // because we are calling `instantiate_and_normalize_erasing_regions` -- so we filter again.
-        caller_body.required_consts.as_mut().unwrap().extend(
-            callee_body.required_consts().into_iter().filter(|ct| ct.const_.is_required_const()),
-        );
-        // Now that we incorporated the callee's `required_consts`, we can remove the callee from
-        // `mentioned_items` -- but we have to take their `mentioned_items` in return. This does
-        // some extra work here to save the monomorphization collector work later. It helps a lot,
-        // since monomorphization can avoid a lot of work when the "mentioned items" are similar to
-        // the actually used items. By doing this we can entirely avoid visiting the callee!
-        // We need to reconstruct the `required_item` for the callee so that we can find and
-        // remove it.
-        let callee_item = MentionedItem::Fn(func.ty(caller_body, self.tcx));
-        let caller_mentioned_items = caller_body.mentioned_items.as_mut().unwrap();
-        if let Some(idx) = caller_mentioned_items.iter().position(|item| item.node == callee_item) {
-            // We found the callee, so remove it and add its items instead.
-            caller_mentioned_items.remove(idx);
-            caller_mentioned_items.extend(callee_body.mentioned_items());
-        } else {
-            // If we can't find the callee, there's no point in adding its items. Probably it
-            // already got removed by being inlined elsewhere in the same function, so we already
-            // took its items.
     // Only `InstanceKind::Item` callees with available MIR fall through to this point.
+    if inliner.tcx().is_constructor(callee_def_id) {
+        trace!("constructors always have MIR");
+        // Constructor functions cannot cause a query cycle.
+        return Ok(());
+    }
+
+    if callee_def_id.is_local()
+        && !inliner
+            .tcx()
+            .is_lang_item(inliner.tcx().parent(caller_def_id), rustc_hir::LangItem::FnOnce)
+    {
+        // If we know for sure that the function we're calling will itself try to
+        // call us, then we avoid inlining that function.
+        if inliner.tcx().mir_callgraph_reachable((callee, caller_def_id.expect_local())) {
+            debug!("query cycle avoidance");
+            return Err("caller might be reachable from callee");
         }
+
+        Ok(())
+    } else {
+        // This cannot result in an immediate cycle since the callee MIR is from another crate
+        // and is already optimized. Any subsequent inlining may cause cycles, but we do
+        // not need to catch this here, we can wait until the inliner decides to continue
+        // inlining a second time.
+        trace!("functions from other crates always have MIR");
+        Ok(())
     }
+}
 
-    fn make_call_args(
-        &self,
-        args: Box<[Spanned<Operand<'tcx>>]>,
-        callsite: &CallSite<'tcx>,
-        caller_body: &mut Body<'tcx>,
-        callee_body: &Body<'tcx>,
-        return_block: Option<BasicBlock>,
-    ) -> Box<[Local]> {
-        let tcx = self.tcx;
-
-        // There is a bit of a mismatch between the *caller* of a closure and the *callee*.
-        // The caller provides the arguments wrapped up in a tuple:
-        //
-        //     tuple_tmp = (a, b, c)
-        //     Fn::call(closure_ref, tuple_tmp)
-        //
-        // meanwhile the closure body expects the arguments (here, `a`, `b`, and `c`)
-        // as distinct arguments. (This is the "rust-call" ABI hack.) Normally, codegen has
-        // the job of unpacking this tuple. But here, we are codegen. =) So we want to create
-        // a vector like
-        //
-        //     [closure_ref, tuple_tmp.0, tuple_tmp.1, tuple_tmp.2]
-        //
-        // Except for one tiny wrinkle: we don't actually want `tuple_tmp.0`. It's more convenient
-        // if we "spill" that into *another* temporary, so that we can map the argument
-        // variable in the callee MIR directly to an argument variable on our side.
-        // So we introduce temporaries like:
-        //
-        //     tmp0 = tuple_tmp.0
-        //     tmp1 = tuple_tmp.1
-        //     tmp2 = tuple_tmp.2
-        //
-        // and the vector is `[closure_ref, tmp0, tmp1, tmp2]`.
-        if callsite.fn_sig.abi() == ExternAbi::RustCall && callee_body.spread_arg.is_none() {
-            // FIXME(edition_2024): switch back to a normal method call.
-            let mut args = <_>::into_iter(args);
-            let self_ = self.create_temp_if_necessary(
-                args.next().unwrap().node,
-                callsite,
-                caller_body,
-                return_block,
-            );
-            let tuple = self.create_temp_if_necessary(
-                args.next().unwrap().node,
-                callsite,
-                caller_body,
-                return_block,
-            );
-            assert!(args.next().is_none());
-
-            let tuple = Place::from(tuple);
-            let ty::Tuple(tuple_tys) = tuple.ty(caller_body, tcx).ty.kind() else {
-                bug!("Closure arguments are not passed as a tuple");
-            };
+/// Returns an error if inlining is not possible based on codegen attributes alone. Success
+/// indicates that the inlining decision should be based on other criteria.
 ///
 /// `callee_attrs` are the callee's codegen attributes; the caller's are looked up through
 /// `inliner.caller_def_id()`. The checks cover `InlineAttr::Never`, cross-crate inlinability
 /// of non-generic callees, and agreement of the `no_sanitize` set, instruction set and target
 /// features between caller and callee.
+fn check_codegen_attributes<'tcx, I: Inliner<'tcx>>(
+    inliner: &I,
+    callsite: &CallSite<'tcx>,
+    callee_attrs: &CodegenFnAttrs,
+) -> Result<(), &'static str> {
+    let tcx = inliner.tcx();
+    if let InlineAttr::Never = callee_attrs.inline {
+        return Err("never inline attribute");
+    }
 
-            // The `closure_ref` in our example above.
-            let closure_ref_arg = iter::once(self_);
+    // Reachability pass defines which functions are eligible for inlining. Generally inlining
+    // other functions is incorrect because they could reference symbols that aren't exported.
+    let is_generic = callsite.callee.args.non_erasable_generics().next().is_some();
+    if !is_generic && !tcx.cross_crate_inlinable(callsite.callee.def_id()) {
+        return Err("not exported");
+    }
 
-            // The `tmp0`, `tmp1`, and `tmp2` in our example above.
-            let tuple_tmp_args = tuple_tys.iter().enumerate().map(|(i, ty)| {
-                // This is e.g., `tuple_tmp.0` in our example above.
-                let tuple_field = Operand::Move(tcx.mk_place_field(tuple, FieldIdx::new(i), ty));
     // `codegen_fn_attrs` below are the attributes of the caller.
+    let codegen_fn_attrs = tcx.codegen_fn_attrs(inliner.caller_def_id());
+    if callee_attrs.no_sanitize != codegen_fn_attrs.no_sanitize {
+        return Err("incompatible sanitizer set");
+    }
 
-                // Spill to a local to make e.g., `tmp0`.
-                self.create_temp_if_necessary(tuple_field, callsite, caller_body, return_block)
-            });
+    // Two functions are compatible if the callee has no attribute (meaning
+    // that it's codegen agnostic), or sets an attribute that is identical
+    // to this function's attribute.
+    if callee_attrs.instruction_set.is_some()
+        && callee_attrs.instruction_set != codegen_fn_attrs.instruction_set
+    {
+        return Err("incompatible instruction set");
+    }
 
-            closure_ref_arg.chain(tuple_tmp_args).collect()
-        } else {
-            // FIXME(edition_2024): switch back to a normal method call.
-            <_>::into_iter(args)
-                .map(|a| self.create_temp_if_necessary(a.node, callsite, caller_body, return_block))
-                .collect()
-        }
     // The feature names are compared element by element, in iteration order.
+    let callee_feature_names = callee_attrs.target_features.iter().map(|f| f.name);
+    let this_feature_names = codegen_fn_attrs.target_features.iter().map(|f| f.name);
+    if callee_feature_names.ne(this_feature_names) {
+        // In general it is not correct to inline a callee with target features that are a
+        // subset of the caller. This is because the callee might contain calls, and the ABI of
+        // those calls depends on the target features of the surrounding function. By moving a
+        // `Call` terminator from one MIR body to another with more target features, we might
+        // change the ABI of that call!
+        return Err("incompatible target features");
     }
 
-    /// If `arg` is already a temporary, returns it. Otherwise, introduces a fresh
-    /// temporary `T` and an instruction `T = arg`, and returns `T`.
-    fn create_temp_if_necessary(
-        &self,
-        arg: Operand<'tcx>,
-        callsite: &CallSite<'tcx>,
-        caller_body: &mut Body<'tcx>,
-        return_block: Option<BasicBlock>,
-    ) -> Local {
-        // Reuse the operand if it is a moved temporary.
-        if let Operand::Move(place) = &arg
-            && let Some(local) = place.as_local()
-            && caller_body.local_kind(local) == LocalKind::Temp
-        {
-            return local;
+    Ok(())
+}
+
+fn inline_call<'tcx, I: Inliner<'tcx>>(
+    inliner: &I,
+    caller_body: &mut Body<'tcx>,
+    callsite: &CallSite<'tcx>,
+    mut callee_body: Body<'tcx>,
+) {
+    let tcx = inliner.tcx();
+    let terminator = caller_body[callsite.block].terminator.take().unwrap();
+    let TerminatorKind::Call { func, args, destination, unwind, target, .. } = terminator.kind
+    else {
+        bug!("unexpected terminator kind {:?}", terminator.kind);
+    };
+
+    let return_block = if let Some(block) = target {
+        // Prepare a new block for code that should execute when call returns. We don't use
+        // target block directly since it might have other predecessors.
+        let data = BasicBlockData::new(
+            Some(Terminator {
+                source_info: terminator.source_info,
+                kind: TerminatorKind::Goto { target: block },
+            }),
+            caller_body[block].is_cleanup,
+        );
+        Some(caller_body.basic_blocks_mut().push(data))
+    } else {
+        None
+    };
+
+    // If the call is something like `a[*i] = f(i)`, where
+    // `i : &mut usize`, then just duplicating the `a[*i]`
+    // Place could result in two different locations if `f`
+    // writes to `i`. To prevent this we need to create a temporary
+    // borrow of the place and pass the destination as `*temp` instead.
+    fn dest_needs_borrow(place: Place<'_>) -> bool {
+        for elem in place.projection.iter() {
+            match elem {
+                ProjectionElem::Deref | ProjectionElem::Index(_) => return true,
+                _ => {}
+            }
         }
 
-        // Otherwise, create a temporary for the argument.
-        trace!("creating temp for argument {:?}", arg);
-        let arg_ty = arg.ty(caller_body, self.tcx);
-        let local = self.new_call_temp(caller_body, callsite, arg_ty, return_block);
-        caller_body[callsite.block].statements.push(Statement {
-            source_info: callsite.source_info,
-            kind: StatementKind::Assign(Box::new((Place::from(local), Rvalue::Use(arg)))),
-        });
-        local
+        false
     }
 
-    /// Introduces a new temporary into the caller body that is live for the duration of the call.
-    fn new_call_temp(
-        &self,
-        caller_body: &mut Body<'tcx>,
-        callsite: &CallSite<'tcx>,
-        ty: Ty<'tcx>,
-        return_block: Option<BasicBlock>,
-    ) -> Local {
-        let local = caller_body.local_decls.push(LocalDecl::new(ty, callsite.source_info.span));
-
+    let dest = if dest_needs_borrow(destination) {
+        trace!("creating temp for return destination");
+        let dest = Rvalue::Ref(
+            tcx.lifetimes.re_erased,
+            BorrowKind::Mut { kind: MutBorrowKind::Default },
+            destination,
+        );
+        let dest_ty = dest.ty(caller_body, tcx);
+        let temp = Place::from(new_call_temp(caller_body, callsite, dest_ty, return_block));
         caller_body[callsite.block].statements.push(Statement {
             source_info: callsite.source_info,
-            kind: StatementKind::StorageLive(local),
+            kind: StatementKind::Assign(Box::new((temp, dest))),
         });
+        tcx.mk_place_deref(temp)
+    } else {
+        destination
+    };
+
+    // Always create a local to hold the destination, as `RETURN_PLACE` may appear
+    // where a full `Place` is not allowed.
+    let (remap_destination, destination_local) = if let Some(d) = dest.as_local() {
+        (false, d)
+    } else {
+        (
+            true,
+            new_call_temp(caller_body, callsite, destination.ty(caller_body, tcx).ty, return_block),
+        )
+    };
 
-        if let Some(block) = return_block {
-            caller_body[block].statements.insert(0, Statement {
+    // Copy the arguments if needed.
+    let args = make_call_args(inliner, args, callsite, caller_body, &callee_body, return_block);
+
+    let mut integrator = Integrator {
+        args: &args,
+        new_locals: Local::new(caller_body.local_decls.len())..,
+        new_scopes: SourceScope::new(caller_body.source_scopes.len())..,
+        new_blocks: BasicBlock::new(caller_body.basic_blocks.len())..,
+        destination: destination_local,
+        callsite_scope: caller_body.source_scopes[callsite.source_info.scope].clone(),
+        callsite,
+        cleanup_block: unwind,
+        in_cleanup_block: false,
+        return_block,
+        tcx,
+        always_live_locals: BitSet::new_filled(callee_body.local_decls.len()),
+    };
+
+    // Map all `Local`s, `SourceScope`s and `BasicBlock`s to new ones
+    // (or existing ones, in a few special cases) in the caller.
+    integrator.visit_body(&mut callee_body);
+
+    // If there are any locals without storage markers, give them storage only for the
+    // duration of the call.
+    for local in callee_body.vars_and_temps_iter() {
+        if integrator.always_live_locals.contains(local) {
+            let new_local = integrator.map_local(local);
+            caller_body[callsite.block].statements.push(Statement {
+                source_info: callsite.source_info,
+                kind: StatementKind::StorageLive(new_local),
+            });
+        }
+    }
+    if let Some(block) = return_block {
+        // To avoid repeated O(n) insert, push any new statements to the end and rotate
+        // the slice once.
+        let mut n = 0;
+        if remap_destination {
+            caller_body[block].statements.push(Statement {
                 source_info: callsite.source_info,
-                kind: StatementKind::StorageDead(local),
+                kind: StatementKind::Assign(Box::new((
+                    dest,
+                    Rvalue::Use(Operand::Move(destination_local.into())),
+                ))),
             });
+            n += 1;
+        }
+        for local in callee_body.vars_and_temps_iter().rev() {
+            if integrator.always_live_locals.contains(local) {
+                let new_local = integrator.map_local(local);
+                caller_body[block].statements.push(Statement {
+                    source_info: callsite.source_info,
+                    kind: StatementKind::StorageDead(new_local),
+                });
+                n += 1;
+            }
         }
+        caller_body[block].statements.rotate_right(n);
+    }
 
-        local
+    // Insert all of the (mapped) parts of the callee body into the caller.
+    caller_body.local_decls.extend(callee_body.drain_vars_and_temps());
+    caller_body.source_scopes.append(&mut callee_body.source_scopes);
+    if tcx
+        .sess
+        .opts
+        .unstable_opts
+        .inline_mir_preserve_debug
+        .unwrap_or(tcx.sess.opts.debuginfo != DebugInfo::None)
+    {
+        // Note that we need to preserve these in the standard library so that
+        // people working on rust can build with or without debuginfo while
+        // still getting consistent results from the mir-opt tests.
+        caller_body.var_debug_info.append(&mut callee_body.var_debug_info);
     }
+    caller_body.basic_blocks_mut().append(callee_body.basic_blocks_mut());
+
+    caller_body[callsite.block].terminator = Some(Terminator {
+        source_info: callsite.source_info,
+        kind: TerminatorKind::Goto { target: integrator.map_block(START_BLOCK) },
+    });
+
+    // Copy required constants from the callee_body into the caller_body. Although we are only
+    // pushing unevaluated consts to `required_consts`, here they may have been evaluated
+    // because we are calling `instantiate_and_normalize_erasing_regions` -- so we filter again.
+    caller_body.required_consts.as_mut().unwrap().extend(
+        callee_body.required_consts().into_iter().filter(|ct| ct.const_.is_required_const()),
+    );
+    // Now that we incorporated the callee's `required_consts`, we can remove the callee from
+    // `mentioned_items` -- but we have to take their `mentioned_items` in return. This does
+    // some extra work here to save the monomorphization collector work later. It helps a lot,
+    // since monomorphization can avoid a lot of work when the "mentioned items" are similar to
+    // the actually used items. By doing this we can entirely avoid visiting the callee!
+    // We need to reconstruct the `MentionedItem` for the callee so that we can find and
+    // remove it.
+    let callee_item = MentionedItem::Fn(func.ty(caller_body, tcx));
+    let caller_mentioned_items = caller_body.mentioned_items.as_mut().unwrap();
+    if let Some(idx) = caller_mentioned_items.iter().position(|item| item.node == callee_item) {
+        // We found the callee, so remove it and add its items instead.
+        caller_mentioned_items.remove(idx);
+        caller_mentioned_items.extend(callee_body.mentioned_items());
+    } else {
+        // If we can't find the callee, there's no point in adding its items. Probably it
+        // already got removed by being inlined elsewhere in the same function, so we already
+        // took its items.
+    }
+}
+
+fn make_call_args<'tcx, I: Inliner<'tcx>>(
+    inliner: &I,
+    args: Box<[Spanned<Operand<'tcx>>]>,
+    callsite: &CallSite<'tcx>,
+    caller_body: &mut Body<'tcx>,
+    callee_body: &Body<'tcx>,
+    return_block: Option<BasicBlock>,
+) -> Box<[Local]> {
+    let tcx = inliner.tcx();
+
+    // There is a bit of a mismatch between the *caller* of a closure and the *callee*.
+    // The caller provides the arguments wrapped up in a tuple:
+    //
+    //     tuple_tmp = (a, b, c)
+    //     Fn::call(closure_ref, tuple_tmp)
+    //
+    // meanwhile the closure body expects the arguments (here, `a`, `b`, and `c`)
+    // as distinct arguments. (This is the "rust-call" ABI hack.) Normally, codegen has
+    // the job of unpacking this tuple. But here, we are codegen. =) So we want to create
+    // a vector like
+    //
+    //     [closure_ref, tuple_tmp.0, tuple_tmp.1, tuple_tmp.2]
+    //
+    // Except for one tiny wrinkle: we don't actually want `tuple_tmp.0`. It's more convenient
+    // if we "spill" that into *another* temporary, so that we can map the argument
+    // variable in the callee MIR directly to an argument variable on our side.
+    // So we introduce temporaries like:
+    //
+    //     tmp0 = tuple_tmp.0
+    //     tmp1 = tuple_tmp.1
+    //     tmp2 = tuple_tmp.2
+    //
+    // and the vector is `[closure_ref, tmp0, tmp1, tmp2]`.
+    if callsite.fn_sig.abi() == ExternAbi::RustCall && callee_body.spread_arg.is_none() {
+        // FIXME(edition_2024): switch back to a normal method call.
+        let mut args = <_>::into_iter(args);
+        let self_ = create_temp_if_necessary(
+            inliner,
+            args.next().unwrap().node,
+            callsite,
+            caller_body,
+            return_block,
+        );
+        let tuple = create_temp_if_necessary(
+            inliner,
+            args.next().unwrap().node,
+            callsite,
+            caller_body,
+            return_block,
+        );
+        assert!(args.next().is_none());
+
+        let tuple = Place::from(tuple);
+        let ty::Tuple(tuple_tys) = tuple.ty(caller_body, tcx).ty.kind() else {
+            bug!("Closure arguments are not passed as a tuple");
+        };
+
+        // The `closure_ref` in our example above.
+        let closure_ref_arg = iter::once(self_);
+
+        // The `tmp0`, `tmp1`, and `tmp2` in our example above.
+        let tuple_tmp_args = tuple_tys.iter().enumerate().map(|(i, ty)| {
+            // This is e.g., `tuple_tmp.0` in our example above.
+            let tuple_field = Operand::Move(tcx.mk_place_field(tuple, FieldIdx::new(i), ty));
+
+            // Spill to a local to make e.g., `tmp0`.
+            create_temp_if_necessary(inliner, tuple_field, callsite, caller_body, return_block)
+        });
+
+        closure_ref_arg.chain(tuple_tmp_args).collect()
+    } else {
+        // FIXME(edition_2024): switch back to a normal method call.
+        <_>::into_iter(args)
+            .map(|a| create_temp_if_necessary(inliner, a.node, callsite, caller_body, return_block))
+            .collect()
+    }
+}
+
+/// If `arg` is a move out of a temporary local, returns that local. Otherwise, introduces a fresh
+/// temporary `T` and an instruction `T = arg`, and returns `T`.
+fn create_temp_if_necessary<'tcx, I: Inliner<'tcx>>(
+    inliner: &I,
+    arg: Operand<'tcx>,
+    callsite: &CallSite<'tcx>,
+    caller_body: &mut Body<'tcx>,
+    return_block: Option<BasicBlock>,
+) -> Local {
+    // Reuse the operand if it is a moved temporary.
+    if let Operand::Move(place) = &arg
+        && let Some(local) = place.as_local()
+        && caller_body.local_kind(local) == LocalKind::Temp
+    {
+        return local;
+    }
+
+    // Otherwise, create a temporary for the argument.
+    trace!("creating temp for argument {:?}", arg);
+    let arg_ty = arg.ty(caller_body, inliner.tcx());
+    let local = new_call_temp(caller_body, callsite, arg_ty, return_block);
+    caller_body[callsite.block].statements.push(Statement {
+        source_info: callsite.source_info,
+        kind: StatementKind::Assign(Box::new((Place::from(local), Rvalue::Use(arg)))),
+    });
+    local
+}
+
+/// Introduces a new temporary into the caller body that is live for the duration of the call.
+fn new_call_temp<'tcx>(
+    caller_body: &mut Body<'tcx>,
+    callsite: &CallSite<'tcx>,
+    ty: Ty<'tcx>,
+    return_block: Option<BasicBlock>,
+) -> Local {
+    let local = caller_body.local_decls.push(LocalDecl::new(ty, callsite.source_info.span));
+
+    caller_body[callsite.block].statements.push(Statement {
+        source_info: callsite.source_info,
+        kind: StatementKind::StorageLive(local),
+    });
+
+    if let Some(block) = return_block {
+        caller_body[block].statements.insert(0, Statement {
+            source_info: callsite.source_info,
+            kind: StatementKind::StorageDead(local),
+        });
+    }
+
+    local
 }
 
 /**
diff --git a/compiler/rustc_mir_transform/src/lib.rs b/compiler/rustc_mir_transform/src/lib.rs
index e1fba9be5bb..350929ffaa5 100644
--- a/compiler/rustc_mir_transform/src/lib.rs
+++ b/compiler/rustc_mir_transform/src/lib.rs
@@ -141,7 +141,7 @@ declare_passes! {
     mod gvn : GVN;
     // Made public so that `mir_drops_elaborated_and_const_checked` can be overridden
     // by custom rustc drivers, running all the steps by themselves. See #114628.
-    pub mod inline : Inline;
+    pub mod inline : Inline, ForceInline;
     mod instsimplify : InstSimplify { BeforeInline, AfterSimplifyCfg };
     mod jump_threading : JumpThreading;
     mod known_panics_lint : KnownPanicsLint;
@@ -488,7 +488,9 @@ fn mir_drops_elaborated_and_const_checked(tcx: TyCtxt<'_>, def: LocalDefId) -> &
     let is_fn_like = tcx.def_kind(def).is_fn_like();
     if is_fn_like {
         // Do not compute the mir call graph without said call graph actually being used.
-        if pm::should_run_pass(tcx, &inline::Inline) {
+        if pm::should_run_pass(tcx, &inline::Inline)
+            || inline::ForceInline::should_run_pass_for_callee(tcx, def.to_def_id())
+        {
             tcx.ensure_with_value().mir_inliner_callees(ty::InstanceKind::Item(def.to_def_id()));
         }
     }
@@ -664,6 +666,8 @@ fn run_optimization_passes<'tcx>(tcx: TyCtxt<'tcx>, body: &mut Body<'tcx>) {
             // Perform instsimplify before inline to eliminate some trivial calls (like clone
             // shims).
             &instsimplify::InstSimplify::BeforeInline,
+            // Perform inlining of `#[rustc_force_inline]`-annotated callees.
+            &inline::ForceInline,
             // Perform inlining, which may add a lot of code.
             &inline::Inline,
             // Code from other crates may have storage markers, so this needs to happen after
diff --git a/compiler/rustc_mir_transform/src/pass_manager.rs b/compiler/rustc_mir_transform/src/pass_manager.rs
index 8a45ce0762d..c3f0a989ce1 100644
--- a/compiler/rustc_mir_transform/src/pass_manager.rs
+++ b/compiler/rustc_mir_transform/src/pass_manager.rs
@@ -79,6 +79,12 @@ pub(super) trait MirPass<'tcx> {
         true
     }
 
+    /// Returns `true` if this pass can be overridden by `-Zmir-enable-passes`. This should be
+    /// true for basically every pass other than those that are necessary for correctness.
+    fn can_be_overridden(&self) -> bool {
+        true
+    }
+
     fn run_pass(&self, tcx: TyCtxt<'tcx>, body: &mut Body<'tcx>);
 
     fn is_mir_dump_enabled(&self) -> bool {
@@ -176,6 +182,10 @@ where
 {
     let name = pass.name();
 
+    if !pass.can_be_overridden() {
+        return pass.is_enabled(tcx.sess);
+    }
+
     let overridden_passes = &tcx.sess.opts.unstable_opts.mir_enable_passes;
     let overridden =
         overridden_passes.iter().rev().find(|(s, _)| s == &*name).map(|(_name, polarity)| {
diff --git a/compiler/rustc_mir_transform/src/shim.rs b/compiler/rustc_mir_transform/src/shim.rs
index 722da3c420d..4648ec33c93 100644
--- a/compiler/rustc_mir_transform/src/shim.rs
+++ b/compiler/rustc_mir_transform/src/shim.rs
@@ -20,7 +20,7 @@ use rustc_span::{DUMMY_SP, Span};
 use tracing::{debug, instrument};
 
 use crate::{
-    abort_unwinding_calls, add_call_guards, add_moves_for_packed_drops, deref_separator,
+    abort_unwinding_calls, add_call_guards, add_moves_for_packed_drops, deref_separator, inline,
     instsimplify, mentioned_items, pass_manager as pm, remove_noop_landing_pads, simplify,
 };
 
@@ -155,6 +155,8 @@ fn make_shim<'tcx>(tcx: TyCtxt<'tcx>, instance: ty::InstanceKind<'tcx>) -> Body<
             &remove_noop_landing_pads::RemoveNoopLandingPads,
             &simplify::SimplifyCfg::MakeShim,
             &instsimplify::InstSimplify::BeforeInline,
+            // Perform inlining of `#[rustc_force_inline]`-annotated callees.
+            &inline::ForceInline,
             &abort_unwinding_calls::AbortUnwindingCalls,
             &add_call_guards::CriticalCallEdges,
         ],
diff --git a/compiler/rustc_mir_transform/src/validate.rs b/compiler/rustc_mir_transform/src/validate.rs
index a670da94fcc..035670d4903 100644
--- a/compiler/rustc_mir_transform/src/validate.rs
+++ b/compiler/rustc_mir_transform/src/validate.rs
@@ -1,6 +1,7 @@
 //! Validates the MIR to ensure that invariants are upheld.
 
 use rustc_abi::{ExternAbi, FIRST_VARIANT, Size};
+use rustc_attr_parsing::InlineAttr;
 use rustc_data_structures::fx::{FxHashMap, FxHashSet};
 use rustc_hir::LangItem;
 use rustc_index::IndexVec;
@@ -366,7 +367,8 @@ impl<'a, 'tcx> Visitor<'tcx> for CfgChecker<'a, 'tcx> {
                 self.check_edge(location, *target, EdgeKind::Normal);
                 self.check_unwind_edge(location, *unwind);
             }
-            TerminatorKind::Call { args, .. } | TerminatorKind::TailCall { args, .. } => {
+            TerminatorKind::Call { func, args, .. }
+            | TerminatorKind::TailCall { func, args, .. } => {
                 // FIXME(explicit_tail_calls): refactor this & add tail-call specific checks
                 if let TerminatorKind::Call { target, unwind, destination, .. } = terminator.kind {
                     if let Some(target) = target {
@@ -419,6 +421,13 @@ impl<'a, 'tcx> Visitor<'tcx> for CfgChecker<'a, 'tcx> {
                         }
                     }
                 }
+
+                if let ty::FnDef(did, ..) = func.ty(&self.body.local_decls, self.tcx).kind()
+                    && self.body.phase >= MirPhase::Runtime(RuntimePhase::Optimized)
+                    && matches!(self.tcx.codegen_fn_attrs(did).inline, InlineAttr::Force { .. })
+                {
+                    self.fail(location, "`#[rustc_force_inline]`-annotated function not inlined");
+                }
             }
             TerminatorKind::Assert { target, unwind, .. } => {
                 self.check_edge(location, *target, EdgeKind::Normal);