about summary refs log tree commit diff
diff options
context:
space:
mode:
authorMarcelo Domínguez <dmmarcelo27@gmail.com>2025-08-14 15:27:57 +0000
committerMarcelo Domínguez <dmmarcelo27@gmail.com>2025-08-14 16:30:15 +0000
commit250d77e5d72fde69a6406050a3b037635f685378 (patch)
tree67749136fca27852b5fb784c864f7d3564a42a09
parent5c631041aa0b0ad9e161b966b78e6dfdb8011023 (diff)
downloadrust-250d77e5d72fde69a6406050a3b037635f685378.tar.gz
rust-250d77e5d72fde69a6406050a3b037635f685378.zip
Complete functionality and general cleanup
-rw-r--r--Cargo.lock2
-rw-r--r--compiler/rustc_builtin_macros/src/autodiff.rs492
-rw-r--r--compiler/rustc_codegen_gcc/src/lib.rs6
-rw-r--r--compiler/rustc_codegen_llvm/src/builder/autodiff.rs178
-rw-r--r--compiler/rustc_codegen_llvm/src/context.rs5
-rw-r--r--compiler/rustc_codegen_llvm/src/intrinsic.rs212
-rw-r--r--compiler/rustc_codegen_llvm/src/lib.rs6
-rw-r--r--compiler/rustc_codegen_ssa/src/back/write.rs18
-rw-r--r--compiler/rustc_codegen_ssa/src/base.rs7
-rw-r--r--compiler/rustc_codegen_ssa/src/codegen_attrs.rs10
-rw-r--r--compiler/rustc_codegen_ssa/src/traits/write.rs2
-rw-r--r--compiler/rustc_hir_analysis/src/check/intrinsic.rs3
-rw-r--r--compiler/rustc_middle/src/middle/codegen_fn_attrs.rs4
-rw-r--r--compiler/rustc_middle/src/mir/mono.rs2
-rw-r--r--compiler/rustc_monomorphize/Cargo.toml2
-rw-r--r--compiler/rustc_monomorphize/src/collector.rs5
-rw-r--r--compiler/rustc_monomorphize/src/collector/autodiff.rs48
-rw-r--r--compiler/rustc_monomorphize/src/partitioning.rs34
-rw-r--r--compiler/rustc_monomorphize/src/partitioning/autodiff.rs143
-rw-r--r--library/core/src/intrinsics/mod.rs34
-rw-r--r--library/core/src/macros/mod.rs2
-rw-r--r--src/doc/rustc-dev-guide/src/SUMMARY.md1
-rw-r--r--src/doc/rustc-dev-guide/src/autodiff/limitations.md27
-rw-r--r--triagebot.toml3
24 files changed, 419 insertions, 827 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 8a878faecbc..e8590fa484b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4308,7 +4308,6 @@ name = "rustc_monomorphize"
 version = "0.0.0"
 dependencies = [
  "rustc_abi",
- "rustc_ast",
  "rustc_data_structures",
  "rustc_errors",
  "rustc_fluent_macro",
@@ -4317,7 +4316,6 @@ dependencies = [
  "rustc_middle",
  "rustc_session",
  "rustc_span",
- "rustc_symbol_mangling",
  "rustc_target",
  "serde",
  "serde_json",
diff --git a/compiler/rustc_builtin_macros/src/autodiff.rs b/compiler/rustc_builtin_macros/src/autodiff.rs
index 3f8585d35bc..c260dca87c0 100644
--- a/compiler/rustc_builtin_macros/src/autodiff.rs
+++ b/compiler/rustc_builtin_macros/src/autodiff.rs
@@ -15,11 +15,12 @@ mod llvm_enzyme {
     use rustc_ast::tokenstream::*;
     use rustc_ast::visit::AssocCtxt::*;
     use rustc_ast::{
-        self as ast, AssocItemKind, BindingMode, ExprKind, FnRetTy, FnSig, Generics, ItemKind,
-        MetaItemInner, PatKind, QSelf, TyKind, Visibility,
+        self as ast, AngleBracketedArg, AngleBracketedArgs, AnonConst, AssocItemKind, BindingMode,
+        FnRetTy, FnSig, GenericArg, GenericArgs, GenericParamKind, Generics, ItemKind,
+        MetaItemInner, PatKind, Path, PathSegment, TyKind, Visibility,
     };
     use rustc_expand::base::{Annotatable, ExtCtxt};
-    use rustc_span::{Ident, Span, Symbol, kw, sym};
+    use rustc_span::{Ident, Span, Symbol, sym};
     use thin_vec::{ThinVec, thin_vec};
     use tracing::{debug, trace};
 
@@ -179,11 +180,8 @@ mod llvm_enzyme {
     }
 
     /// We expand the autodiff macro to generate a new placeholder function which passes
-    /// type-checking and can be called by users. The function body of the placeholder function will
-    /// later be replaced on LLVM-IR level, so the design of the body is less important and for now
-    /// should just prevent early inlining and optimizations which alter the function signature.
-    /// The exact signature of the generated function depends on the configuration provided by the
-    /// user, but here is an example:
+    /// type-checking and can be called by users. The exact signature of the generated function
+    /// depends on the configuration provided by the user, but here is an example:
     ///
     /// ```
     /// #[autodiff(cos_box, Reverse, Duplicated, Active)]
@@ -199,14 +197,8 @@ mod llvm_enzyme {
     ///     f32::sin(**x)
     /// }
     /// #[rustc_autodiff(Reverse, Duplicated, Active)]
-    /// #[inline(never)]
     /// fn cos_box(x: &Box<f32>, dx: &mut Box<f32>, dret: f32) -> f32 {
-    ///     unsafe {
-    ///         asm!("NOP");
-    ///     };
-    ///     ::core::hint::black_box(sin(x));
-    ///     ::core::hint::black_box((dx, dret));
-    ///     ::core::hint::black_box(sin(x))
+    ///     std::intrinsics::autodiff(sin::<>, cos_box::<>, (x, dx, dret))
     /// }
     /// ```
     /// FIXME(ZuseZ4): Once autodiff is enabled by default, make this a doc comment which is checked
@@ -227,16 +219,24 @@ mod llvm_enzyme {
         // first get information about the annotable item: visibility, signature, name and generic
         // parameters.
         // these will be used to generate the differentiated version of the function
-        let Some((vis, sig, primal, generics)) = (match &item {
-            Annotatable::Item(iitem) => extract_item_info(iitem),
+        let Some((vis, sig, primal, generics, impl_of_trait)) = (match &item {
+            Annotatable::Item(iitem) => {
+                extract_item_info(iitem).map(|(v, s, p, g)| (v, s, p, g, false))
+            }
             Annotatable::Stmt(stmt) => match &stmt.kind {
-                ast::StmtKind::Item(iitem) => extract_item_info(iitem),
+                ast::StmtKind::Item(iitem) => {
+                    extract_item_info(iitem).map(|(v, s, p, g)| (v, s, p, g, false))
+                }
                 _ => None,
             },
-            Annotatable::AssocItem(assoc_item, Impl { .. }) => match &assoc_item.kind {
-                ast::AssocItemKind::Fn(box ast::Fn { sig, ident, generics, .. }) => {
-                    Some((assoc_item.vis.clone(), sig.clone(), ident.clone(), generics.clone()))
-                }
+            Annotatable::AssocItem(assoc_item, Impl { of_trait }) => match &assoc_item.kind {
+                ast::AssocItemKind::Fn(box ast::Fn { sig, ident, generics, .. }) => Some((
+                    assoc_item.vis.clone(),
+                    sig.clone(),
+                    ident.clone(),
+                    generics.clone(),
+                    *of_trait,
+                )),
                 _ => None,
             },
             _ => None,
@@ -254,7 +254,6 @@ mod llvm_enzyme {
         };
 
         let has_ret = has_ret(&sig.decl.output);
-        let sig_span = ecx.with_call_site_ctxt(sig.span);
 
         // create TokenStream from vec elemtents:
         // meta_item doesn't have a .tokens field
@@ -323,28 +322,27 @@ mod llvm_enzyme {
         }
         let span = ecx.with_def_site_ctxt(expand_span);
 
-        let n_active: u32 = x
-            .input_activity
-            .iter()
-            .filter(|a| **a == DiffActivity::Active || **a == DiffActivity::ActiveOnly)
-            .count() as u32;
-        let (d_sig, new_args, idents, errored) = gen_enzyme_decl(ecx, &sig, &x, span);
-
-        // TODO(Sa4dUs): Remove this and all the related logic
-        let _d_body = gen_enzyme_body(
-            ecx, &x, n_active, &sig, &d_sig, primal, &new_args, span, sig_span, idents, errored,
-            &generics,
-        );
+        let d_sig = gen_enzyme_decl(ecx, &sig, &x, span);
 
-        let d_body =
-            call_autodiff(ecx, primal, first_ident(&meta_item_vec[0]), span, &d_sig);
+        let d_body = ecx.block(
+            span,
+            thin_vec![call_autodiff(
+                ecx,
+                primal,
+                first_ident(&meta_item_vec[0]),
+                span,
+                &d_sig,
+                &generics,
+                impl_of_trait,
+            )],
+        );
 
         // The first element of it is the name of the function to be generated
-        let asdf = Box::new(ast::Fn {
+        let d_fn = Box::new(ast::Fn {
             defaultness: ast::Defaultness::Final,
             sig: d_sig,
             ident: first_ident(&meta_item_vec[0]),
-            generics: generics.clone(),
+            generics,
             contract: None,
             body: Some(d_body),
             define_opaque: None,
@@ -433,13 +431,11 @@ mod llvm_enzyme {
             tokens: ts,
         });
 
-        let vis_clone = vis.clone();
-
         let new_id = ecx.sess.psess.attr_id_generator.mk_attr_id();
         let d_attr = outer_normal_attr(&rustc_ad_attr, new_id, span);
         let d_annotatable = match &item {
             Annotatable::AssocItem(_, _) => {
-                let assoc_item: AssocItemKind = ast::AssocItemKind::Fn(asdf);
+                let assoc_item: AssocItemKind = ast::AssocItemKind::Fn(d_fn);
                 let d_fn = Box::new(ast::AssocItem {
                     attrs: thin_vec![d_attr],
                     id: ast::DUMMY_NODE_ID,
@@ -451,13 +447,13 @@ mod llvm_enzyme {
                 Annotatable::AssocItem(d_fn, Impl { of_trait: false })
             }
             Annotatable::Item(_) => {
-                let mut d_fn = ecx.item(span, thin_vec![d_attr], ItemKind::Fn(asdf));
+                let mut d_fn = ecx.item(span, thin_vec![d_attr], ItemKind::Fn(d_fn));
                 d_fn.vis = vis;
 
                 Annotatable::Item(d_fn)
             }
             Annotatable::Stmt(_) => {
-                let mut d_fn = ecx.item(span, thin_vec![d_attr], ItemKind::Fn(asdf));
+                let mut d_fn = ecx.item(span, thin_vec![d_attr], ItemKind::Fn(d_fn));
                 d_fn.vis = vis;
 
                 Annotatable::Stmt(Box::new(ast::Stmt {
@@ -471,9 +467,7 @@ mod llvm_enzyme {
             }
         };
 
-        let dummy_const_annotatable = gen_dummy_const(ecx, span, primal, sig, generics, vis_clone);
-
-        return vec![orig_annotatable, dummy_const_annotatable, d_annotatable];
+        return vec![orig_annotatable, d_annotatable];
     }
 
     // shadow arguments (the extra ones which were not in the original (primal) function), in reverse mode must be
@@ -504,9 +498,11 @@ mod llvm_enzyme {
         diff: Ident,
         span: Span,
         d_sig: &FnSig,
-    ) -> P<ast::Block> {
-        let primal_path_expr = ecx.expr_path(ecx.path_ident(span, primal));
-        let diff_path_expr = ecx.expr_path(ecx.path_ident(span, diff));
+        generics: &Generics,
+        is_impl: bool,
+    ) -> rustc_ast::Stmt {
+        let primal_path_expr = gen_turbofish_expr(ecx, primal, generics, span, is_impl);
+        let diff_path_expr = gen_turbofish_expr(ecx, diff, generics, span, is_impl);
 
         let tuple_expr = ecx.expr_tuple(
             span,
@@ -522,371 +518,65 @@ mod llvm_enzyme {
                 .into(),
         );
 
-        let enzyme_path = ecx.path(
-            span,
-            vec![
-                Ident::from_str("std"),
-                Ident::from_str("intrinsics"),
-                Ident::from_str("autodiff"),
-            ],
-        );
+        let enzyme_path_idents = ecx.std_path(&[sym::intrinsics, sym::autodiff]);
+        let enzyme_path = ecx.path(span, enzyme_path_idents);
         let call_expr = ecx.expr_call(
             span,
             ecx.expr_path(enzyme_path),
             vec![primal_path_expr, diff_path_expr, tuple_expr].into(),
         );
 
-        let block = ecx.block_expr(call_expr);
-
-        block
-    }
-
-    // Generate dummy const to prevent primal function
-    // from being optimized away before applying enzyme
-    // ```
-    // const _: () =
-    // {
-    //     #[used]
-    //     pub static DUMMY_PTR: fn_type = primal_fn;
-    // };
-    // ```
-    fn gen_dummy_const(
-        ecx: &ExtCtxt<'_>,
-        span: Span,
-        primal: Ident,
-        sig: FnSig,
-        generics: Generics,
-        vis: Visibility,
-    ) -> Annotatable {
-        // #[used]
-        let used_attr = P(ast::NormalAttr::from_ident(Ident::with_dummy_span(sym::used)));
-        let new_id = ecx.sess.psess.attr_id_generator.mk_attr_id();
-        let used_attr = outer_normal_attr(&used_attr, new_id, span);
-
-        // static DUMMY_PTR: <fn_type> = <primal_ident>
-        let static_ident = Ident::from_str_and_span("DUMMY_PTR", span);
-        let fn_ptr_ty = ast::TyKind::BareFn(Box::new(ast::BareFnTy {
-            safety: sig.header.safety,
-            ext: sig.header.ext,
-            generic_params: generics.params,
-            decl: sig.decl,
-            decl_span: sig.span,
-        }));
-        let static_ty = ecx.ty(span, fn_ptr_ty);
-
-        let static_expr = ecx.expr_path(ecx.path(span, vec![primal]));
-        let static_item_kind = ast::ItemKind::Static(Box::new(ast::StaticItem {
-            ident: static_ident,
-            ty: static_ty,
-            safety: ast::Safety::Default,
-            mutability: ast::Mutability::Not,
-            expr: Some(static_expr),
-            define_opaque: None,
-        }));
-
-        let static_item = ast::Item {
-            attrs: thin_vec![used_attr],
-            id: ast::DUMMY_NODE_ID,
-            span,
-            vis,
-            kind: static_item_kind,
-            tokens: None,
-        };
-
-        let block_expr = ecx.expr_block(Box::new(ast::Block {
-            stmts: thin_vec![ecx.stmt_item(span, P(static_item))],
-            id: ast::DUMMY_NODE_ID,
-            rules: ast::BlockCheckMode::Default,
-            span,
-            tokens: None,
-        }));
-
-        let const_item = ecx.item_const(
-            span,
-            Ident::from_str_and_span("_", span),
-            ecx.ty(span, ast::TyKind::Tup(thin_vec![])),
-            block_expr,
-        );
-
-        Annotatable::Item(const_item)
+        ecx.stmt_expr(call_expr)
     }
 
-    // Will generate a body of the type:
-    // ```
-    // {
-    //   unsafe {
-    //   asm!("NOP");
-    //   }
-    //   ::core::hint::black_box(primal(args));
-    //   ::core::hint::black_box((args, ret));
-    //   <This part remains to be done by following function>
-    // }
-    // ```
-    fn init_body_helper(
+    // Generate turbofish expression from fn name and generics
+    // Given `foo` and `<A, B, C>` params, gen `foo::<A, B, C>`
+    // We use this expression when passing primal and diff function to the autodiff intrinsic
+    fn gen_turbofish_expr(
         ecx: &ExtCtxt<'_>,
-        span: Span,
-        primal: Ident,
-        new_names: &[String],
-        sig_span: Span,
-        new_decl_span: Span,
-        idents: &[Ident],
-        errored: bool,
+        ident: Ident,
         generics: &Generics,
-    ) -> (Box<ast::Block>, Box<ast::Expr>, Box<ast::Expr>, Box<ast::Expr>) {
-        let blackbox_path = ecx.std_path(&[sym::hint, sym::black_box]);
-        let noop = ast::InlineAsm {
-            asm_macro: ast::AsmMacro::Asm,
-            template: vec![ast::InlineAsmTemplatePiece::String("NOP".into())],
-            template_strs: Box::new([]),
-            operands: vec![],
-            clobber_abis: vec![],
-            options: ast::InlineAsmOptions::PURE | ast::InlineAsmOptions::NOMEM,
-            line_spans: vec![],
-        };
-        let noop_expr = ecx.expr_asm(span, Box::new(noop));
-        let unsf = ast::BlockCheckMode::Unsafe(ast::UnsafeSource::CompilerGenerated);
-        let unsf_block = ast::Block {
-            stmts: thin_vec![ecx.stmt_semi(noop_expr)],
-            id: ast::DUMMY_NODE_ID,
-            tokens: None,
-            rules: unsf,
-            span,
-        };
-        let unsf_expr = ecx.expr_block(Box::new(unsf_block));
-        let blackbox_call_expr = ecx.expr_path(ecx.path(span, blackbox_path));
-        let primal_call = gen_primal_call(ecx, span, primal, idents, generics);
-        let black_box_primal_call = ecx.expr_call(
-            new_decl_span,
-            blackbox_call_expr.clone(),
-            thin_vec![primal_call.clone()],
-        );
-        let tup_args = new_names
-            .iter()
-            .map(|arg| ecx.expr_path(ecx.path_ident(span, Ident::from_str(arg))))
-            .collect();
-
-        let black_box_remaining_args = ecx.expr_call(
-            sig_span,
-            blackbox_call_expr.clone(),
-            thin_vec![ecx.expr_tuple(sig_span, tup_args)],
-        );
-
-        let mut body = ecx.block(span, ThinVec::new());
-        body.stmts.push(ecx.stmt_semi(unsf_expr));
-
-        // This uses primal args which won't be available if we errored before
-        if !errored {
-            body.stmts.push(ecx.stmt_semi(black_box_primal_call.clone()));
-        }
-        body.stmts.push(ecx.stmt_semi(black_box_remaining_args));
-
-        (body, primal_call, black_box_primal_call, blackbox_call_expr)
-    }
-
-    /// We only want this function to type-check, since we will replace the body
-    /// later on llvm level. Using `loop {}` does not cover all return types anymore,
-    /// so instead we manually build something that should pass the type checker.
-    /// We also add a inline_asm line, as one more barrier for rustc to prevent inlining
-    /// or const propagation. inline_asm will also triggers an Enzyme crash if due to another
-    /// bug would ever try to accidentally differentiate this placeholder function body.
-    /// Finally, we also add back_box usages of all input arguments, to prevent rustc
-    /// from optimizing any arguments away.
-    fn gen_enzyme_body(
-        ecx: &ExtCtxt<'_>,
-        x: &AutoDiffAttrs,
-        n_active: u32,
-        sig: &ast::FnSig,
-        d_sig: &ast::FnSig,
-        primal: Ident,
-        new_names: &[String],
         span: Span,
-        sig_span: Span,
-        idents: Vec<Ident>,
-        errored: bool,
-        generics: &Generics,
-    ) -> Box<ast::Block> {
-        let new_decl_span = d_sig.span;
-
-        // Just adding some default inline-asm and black_box usages to prevent early inlining
-        // and optimizations which alter the function signature.
-        //
-        // The bb_primal_call is the black_box call of the primal function. We keep it around,
-        // since it has the convenient property of returning the type of the primal function,
-        // Remember, we only care to match types here.
-        // No matter which return we pick, we always wrap it into a std::hint::black_box call,
-        // to prevent rustc from propagating it into the caller.
-        let (mut body, primal_call, bb_primal_call, bb_call_expr) = init_body_helper(
-            ecx,
-            span,
-            primal,
-            new_names,
-            sig_span,
-            new_decl_span,
-            &idents,
-            errored,
-            generics,
-        );
-
-        if !has_ret(&d_sig.decl.output) {
-            // there is no return type that we have to match, () works fine.
-            return body;
-        }
-
-        // Everything from here onwards just tries to fulfil the return type. Fun!
-
-        // having an active-only return means we'll drop the original return type.
-        // So that can be treated identical to not having one in the first place.
-        let primal_ret = has_ret(&sig.decl.output) && !x.has_active_only_ret();
-
-        if primal_ret && n_active == 0 && x.mode.is_rev() {
-            // We only have the primal ret.
-            body.stmts.push(ecx.stmt_expr(bb_primal_call));
-            return body;
-        }
-
-        if !primal_ret && n_active == 1 {
-            // Again no tuple return, so return default float val.
-            let ty = match d_sig.decl.output {
-                FnRetTy::Ty(ref ty) => ty.clone(),
-                FnRetTy::Default(span) => {
-                    panic!("Did not expect Default ret ty: {:?}", span);
+        is_impl: bool,
+    ) -> Box<ast::Expr> {
+        let generic_args = generics
+            .params
+            .iter()
+            .filter_map(|p| match &p.kind {
+                GenericParamKind::Type { .. } => {
+                    let path = ast::Path::from_ident(p.ident);
+                    let ty = ecx.ty_path(path);
+                    Some(AngleBracketedArg::Arg(GenericArg::Type(ty)))
                 }
-            };
-            let arg = ty.kind.is_simple_path().unwrap();
-            let tmp = ecx.def_site_path(&[arg, kw::Default]);
-            let default_call_expr = ecx.expr_path(ecx.path(span, tmp));
-            let default_call_expr = ecx.expr_call(new_decl_span, default_call_expr, thin_vec![]);
-            body.stmts.push(ecx.stmt_expr(default_call_expr));
-            return body;
-        }
-
-        let mut exprs: Box<ast::Expr> = primal_call;
-        let d_ret_ty = match d_sig.decl.output {
-            FnRetTy::Ty(ref ty) => ty.clone(),
-            FnRetTy::Default(span) => {
-                panic!("Did not expect Default ret ty: {:?}", span);
-            }
-        };
-        if x.mode.is_fwd() {
-            // Fwd mode is easy. If the return activity is Const, we support arbitrary types.
-            // Otherwise, we only support a scalar, a pair of scalars, or an array of scalars.
-            // We checked that (on a best-effort base) in the preceding gen_enzyme_decl function.
-            // In all three cases, we can return `std::hint::black_box(<T>::default())`.
-            if x.ret_activity == DiffActivity::Const {
-                // Here we call the primal function, since our dummy function has the same return
-                // type due to the Const return activity.
-                exprs = ecx.expr_call(new_decl_span, bb_call_expr, thin_vec![exprs]);
-            } else {
-                let q = QSelf { ty: d_ret_ty, path_span: span, position: 0 };
-                let y = ExprKind::Path(
-                    Some(Box::new(q)),
-                    ecx.path_ident(span, Ident::with_dummy_span(kw::Default)),
-                );
-                let default_call_expr = ecx.expr(span, y);
-                let default_call_expr =
-                    ecx.expr_call(new_decl_span, default_call_expr, thin_vec![]);
-                exprs = ecx.expr_call(new_decl_span, bb_call_expr, thin_vec![default_call_expr]);
-            }
-        } else if x.mode.is_rev() {
-            if x.width == 1 {
-                // We either have `-> ArbitraryType` or `-> (ArbitraryType, repeated_float_scalars)`.
-                match d_ret_ty.kind {
-                    TyKind::Tup(ref args) => {
-                        // We have a tuple return type. We need to create a tuple of the same size
-                        // and fill it with default values.
-                        let mut exprs2 = thin_vec![exprs];
-                        for arg in args.iter().skip(1) {
-                            let arg = arg.kind.is_simple_path().unwrap();
-                            let tmp = ecx.def_site_path(&[arg, kw::Default]);
-                            let default_call_expr = ecx.expr_path(ecx.path(span, tmp));
-                            let default_call_expr =
-                                ecx.expr_call(new_decl_span, default_call_expr, thin_vec![]);
-                            exprs2.push(default_call_expr);
-                        }
-                        exprs = ecx.expr_tuple(new_decl_span, exprs2);
-                    }
-                    _ => {
-                        // Interestingly, even the `-> ArbitraryType` case
-                        // ends up getting matched and handled correctly above,
-                        // so we don't have to handle any other case for now.
-                        panic!("Unsupported return type: {:?}", d_ret_ty);
-                    }
+                GenericParamKind::Const { .. } => {
+                    let expr = ecx.expr_path(ast::Path::from_ident(p.ident));
+                    let anon_const = AnonConst { id: ast::DUMMY_NODE_ID, value: expr };
+                    Some(AngleBracketedArg::Arg(GenericArg::Const(anon_const)))
                 }
-            }
-            exprs = ecx.expr_call(new_decl_span, bb_call_expr, thin_vec![exprs]);
-        } else {
-            unreachable!("Unsupported mode: {:?}", x.mode);
-        }
-
-        body.stmts.push(ecx.stmt_expr(exprs));
+                GenericParamKind::Lifetime { .. } => None,
+            })
+            .collect::<ThinVec<_>>();
 
-        body
-    }
+        let args: AngleBracketedArgs = AngleBracketedArgs { span, args: generic_args };
 
-    fn gen_primal_call(
-        ecx: &ExtCtxt<'_>,
-        span: Span,
-        primal: Ident,
-        idents: &[Ident],
-        generics: &Generics,
-    ) -> Box<ast::Expr> {
-        let has_self = idents.len() > 0 && idents[0].name == kw::SelfLower;
+        let segment = PathSegment {
+            ident,
+            id: ast::DUMMY_NODE_ID,
+            args: Some(Box::new(GenericArgs::AngleBracketed(args))),
+        };
 
-        if has_self {
-            let args: ThinVec<_> =
-                idents[1..].iter().map(|arg| ecx.expr_path(ecx.path_ident(span, *arg))).collect();
-            let self_expr = ecx.expr_self(span);
-            ecx.expr_method_call(span, self_expr, primal, args)
+        let segments = if is_impl {
+            thin_vec![
+                PathSegment { ident: Ident::from_str("Self"), id: ast::DUMMY_NODE_ID, args: None },
+                segment,
+            ]
         } else {
-            let args: ThinVec<_> =
-                idents.iter().map(|arg| ecx.expr_path(ecx.path_ident(span, *arg))).collect();
-            let mut primal_path = ecx.path_ident(span, primal);
-
-            let is_generic = !generics.params.is_empty();
-
-            match (is_generic, primal_path.segments.last_mut()) {
-                (true, Some(function_path)) => {
-                    let primal_generic_types = generics
-                        .params
-                        .iter()
-                        .filter(|param| matches!(param.kind, ast::GenericParamKind::Type { .. }));
-
-                    let generated_generic_types = primal_generic_types
-                        .map(|type_param| {
-                            let generic_param = TyKind::Path(
-                                None,
-                                ast::Path {
-                                    span,
-                                    segments: thin_vec![ast::PathSegment {
-                                        ident: type_param.ident,
-                                        args: None,
-                                        id: ast::DUMMY_NODE_ID,
-                                    }],
-                                    tokens: None,
-                                },
-                            );
-
-                            ast::AngleBracketedArg::Arg(ast::GenericArg::Type(Box::new(ast::Ty {
-                                id: type_param.id,
-                                span,
-                                kind: generic_param,
-                                tokens: None,
-                            })))
-                        })
-                        .collect();
-
-                    function_path.args =
-                        Some(Box::new(ast::GenericArgs::AngleBracketed(ast::AngleBracketedArgs {
-                            span,
-                            args: generated_generic_types,
-                        })));
-                }
-                _ => {}
-            }
+            thin_vec![segment]
+        };
 
-            let primal_call_expr = ecx.expr_path(primal_path);
-            ecx.expr_call(span, primal_call_expr, args)
-        }
+        let path = Path { span, segments, tokens: None };
+
+        ecx.expr_path(path)
     }
 
     // Generate the new function declaration. Const arguments are kept as is. Duplicated arguments must
@@ -905,7 +595,7 @@ mod llvm_enzyme {
         sig: &ast::FnSig,
         x: &AutoDiffAttrs,
         span: Span,
-    ) -> (ast::FnSig, Vec<String>, Vec<Ident>, bool) {
+    ) -> ast::FnSig {
         let dcx = ecx.sess.dcx();
         let has_ret = has_ret(&sig.decl.output);
         let sig_args = sig.decl.inputs.len() + if has_ret { 1 } else { 0 };
@@ -917,7 +607,7 @@ mod llvm_enzyme {
                 found: num_activities,
             });
             // This is not the right signature, but we can continue parsing.
-            return (sig.clone(), vec![], vec![], true);
+            return sig.clone();
         }
         assert!(sig.decl.inputs.len() == x.input_activity.len());
         assert!(has_ret == x.has_ret_activity());
@@ -960,7 +650,7 @@ mod llvm_enzyme {
 
         if errors {
             // This is not the right signature, but we can continue parsing.
-            return (sig.clone(), new_inputs, idents, true);
+            return sig.clone();
         }
 
         let unsafe_activities = x
@@ -1174,7 +864,7 @@ mod llvm_enzyme {
         }
         let d_sig = FnSig { header: d_header, decl: d_decl, span };
         trace!("Generated signature: {:?}", d_sig);
-        (d_sig, new_inputs, idents, false)
+        d_sig
     }
 }
 
diff --git a/compiler/rustc_codegen_gcc/src/lib.rs b/compiler/rustc_codegen_gcc/src/lib.rs
index b11f11d38e3..4025aba82da 100644
--- a/compiler/rustc_codegen_gcc/src/lib.rs
+++ b/compiler/rustc_codegen_gcc/src/lib.rs
@@ -93,7 +93,6 @@ use gccjit::{CType, Context, OptimizationLevel};
 #[cfg(feature = "master")]
 use gccjit::{TargetInfo, Version};
 use rustc_ast::expand::allocator::AllocatorKind;
-use rustc_ast::expand::autodiff_attrs::AutoDiffItem;
 use rustc_codegen_ssa::back::lto::{SerializedModule, ThinModule};
 use rustc_codegen_ssa::back::write::{
     CodegenContext, FatLtoInput, ModuleConfig, TargetMachineFactoryFn,
@@ -363,12 +362,7 @@ impl WriteBackendMethods for GccCodegenBackend {
         _exported_symbols_for_lto: &[String],
         each_linked_rlib_for_lto: &[PathBuf],
         modules: Vec<FatLtoInput<Self>>,
-        diff_functions: Vec<AutoDiffItem>,
     ) -> Result<ModuleCodegen<Self::Module>, FatalError> {
-        if !diff_functions.is_empty() {
-            unimplemented!();
-        }
-
         back::lto::run_fat(cgcx, each_linked_rlib_for_lto, modules)
     }
 
diff --git a/compiler/rustc_codegen_llvm/src/builder/autodiff.rs b/compiler/rustc_codegen_llvm/src/builder/autodiff.rs
index 66c34fbcfb1..56116959a62 100644
--- a/compiler/rustc_codegen_llvm/src/builder/autodiff.rs
+++ b/compiler/rustc_codegen_llvm/src/builder/autodiff.rs
@@ -1,40 +1,93 @@
 use std::ptr;
 
-use rustc_ast::expand::autodiff_attrs::{AutoDiffAttrs, AutoDiffItem, DiffActivity, DiffMode};
-use rustc_codegen_ssa::ModuleCodegen;
+use rustc_ast::expand::autodiff_attrs::{AutoDiffAttrs, DiffActivity, DiffMode};
 use rustc_codegen_ssa::common::TypeKind;
 use rustc_codegen_ssa::traits::{BaseTypeCodegenMethods, BuilderMethods};
-use rustc_errors::FatalError;
-use rustc_middle::bug;
-use tracing::{debug, trace};
+use rustc_middle::ty::{PseudoCanonicalInput, Ty, TyCtxt, TypingEnv};
+use rustc_middle::{bug, ty};
+use tracing::debug;
 
-use crate::back::write::llvm_err;
 use crate::builder::{Builder, PlaceRef, UNNAMED};
 use crate::context::SimpleCx;
 use crate::declare::declare_simple_fn;
-use crate::errors::{AutoDiffWithoutEnable, LlvmError};
 use crate::llvm::AttributePlace::Function;
 use crate::llvm::{Metadata, True, Type};
 use crate::value::Value;
-use crate::{CodegenContext, LlvmCodegenBackend, ModuleLlvm, attributes, llvm};
+use crate::{attributes, llvm};
 
-fn _get_params(fnc: &Value) -> Vec<&Value> {
-    let param_num = llvm::LLVMCountParams(fnc) as usize;
-    let mut fnc_args: Vec<&Value> = vec![];
-    fnc_args.reserve(param_num);
-    unsafe {
-        llvm::LLVMGetParams(fnc, fnc_args.as_mut_ptr());
-        fnc_args.set_len(param_num);
+pub(crate) fn adjust_activity_to_abi<'tcx>(
+    tcx: TyCtxt<'tcx>,
+    fn_ty: Ty<'tcx>,
+    da: &mut Vec<DiffActivity>,
+) {
+    if !matches!(fn_ty.kind(), ty::FnDef(..)) {
+        bug!("expected fn def for autodiff, got {:?}", fn_ty);
     }
-    fnc_args
-}
 
-fn _has_sret(fnc: &Value) -> bool {
-    let num_args = llvm::LLVMCountParams(fnc) as usize;
-    if num_args == 0 {
-        false
-    } else {
-        unsafe { llvm::LLVMRustHasAttributeAtIndex(fnc, 0, llvm::AttributeKind::StructRet) }
+    // We don't actually pass the types back into the type system.
+    // All we do is decide how to handle the arguments.
+    let sig = fn_ty.fn_sig(tcx).skip_binder();
+
+    let mut new_activities = vec![];
+    let mut new_positions = vec![];
+    for (i, ty) in sig.inputs().iter().enumerate() {
+        if let Some(inner_ty) = ty.builtin_deref(true) {
+            if inner_ty.is_slice() {
+                // Now we need to figure out the size of each slice element in memory to allow
+                // safety checks and usability improvements in the backend.
+                let sty = match inner_ty.builtin_index() {
+                    Some(sty) => sty,
+                    None => {
+                        panic!("slice element type unknown");
+                    }
+                };
+                let pci = PseudoCanonicalInput {
+                    typing_env: TypingEnv::fully_monomorphized(),
+                    value: sty,
+                };
+
+                let layout = tcx.layout_of(pci);
+                let elem_size = match layout {
+                    Ok(layout) => layout.size,
+                    Err(_) => {
+                        bug!("autodiff failed to compute slice element size");
+                    }
+                };
+                let elem_size: u32 = elem_size.bytes() as u32;
+
+                // We know that the length will be passed as an extra argument.
+                if !da.is_empty() {
+                    // We are looking at a slice. The length of that slice will become an
+                    // extra integer at the LLVM level. Integers are always const.
+                    // However, if the slice gets duplicated, we need to know that so we can
+                    // check the size later. So we mark the new size argument as FakeActivitySize.
+                    // There is one FakeActivitySize per slice, so for convenience we store the
+                    // slice element size in bytes in it. We will use the size in the backend.
+                    let activity = match da[i] {
+                        DiffActivity::DualOnly
+                        | DiffActivity::Dual
+                        | DiffActivity::Dualv
+                        | DiffActivity::DuplicatedOnly
+                        | DiffActivity::Duplicated => {
+                            DiffActivity::FakeActivitySize(Some(elem_size))
+                        }
+                        DiffActivity::Const => DiffActivity::Const,
+                        _ => bug!("unexpected activity for ptr/ref"),
+                    };
+                    new_activities.push(activity);
+                    new_positions.push(i + 1);
+                }
+
+                continue;
+            }
+        }
+    }
+    // Now add the extra activities coming from slices.
+    // Insert in reverse order so that the remaining positions are not invalidated.
+    for _ in 0..new_activities.len() {
+        let pos = new_positions.pop().unwrap();
+        let activity = new_activities.pop().unwrap();
+        da.insert(pos, activity);
     }
 }
 
@@ -66,12 +119,12 @@ fn match_args_from_caller_to_enzyme<'ll, 'tcx>(
     let mut outer_pos: usize = 0;
     let mut activity_pos = 0;
 
-    let enzyme_const = cx.create_metadata("enzyme_const".to_string()).unwrap();
-    let enzyme_out = cx.create_metadata("enzyme_out".to_string()).unwrap();
-    let enzyme_dup = cx.create_metadata("enzyme_dup".to_string()).unwrap();
-    let enzyme_dupv = cx.create_metadata("enzyme_dupv".to_string()).unwrap();
-    let enzyme_dupnoneed = cx.create_metadata("enzyme_dupnoneed".to_string()).unwrap();
-    let enzyme_dupnoneedv = cx.create_metadata("enzyme_dupnoneedv".to_string()).unwrap();
+    let enzyme_const = cx.create_metadata(b"enzyme_const");
+    let enzyme_out = cx.create_metadata(b"enzyme_out");
+    let enzyme_dup = cx.create_metadata(b"enzyme_dup");
+    let enzyme_dupv = cx.create_metadata(b"enzyme_dupv");
+    let enzyme_dupnoneed = cx.create_metadata(b"enzyme_dupnoneed");
+    let enzyme_dupnoneedv = cx.create_metadata(b"enzyme_dupnoneedv");
 
     while activity_pos < inputs.len() {
         let diff_activity = inputs[activity_pos as usize];
@@ -223,7 +276,7 @@ pub(crate) fn generate_enzyme_call<'ll, 'tcx>(
     //  %0 = fmul double %x, %x
     //  ret double %0
     // }
-    // ```
+    //
     // define double @dsquare(double %x) {
     //  return 0.0;
     // }
@@ -245,8 +298,7 @@ pub(crate) fn generate_enzyme_call<'ll, 'tcx>(
 
     // FIXME(ZuseZ4): the CC/Addr/Vis values are best effort guesses, we should look at tests and
     // think a bit more about what should go here.
-    // FIXME(Sa4dUs): have to find a way to get the cc, using `FastCallConv` for now
-    let cc = 8;
+    let cc = unsafe { llvm::LLVMGetFunctionCallConv(fn_to_diff) };
     let ad_fn = declare_simple_fn(
         cx,
         &ad_name,
@@ -265,12 +317,12 @@ pub(crate) fn generate_enzyme_call<'ll, 'tcx>(
     let mut args = Vec::with_capacity(num_args as usize + 1);
     args.push(fn_to_diff);
 
-    let enzyme_primal_ret = cx.create_metadata("enzyme_primal_return".to_string()).unwrap();
+    let enzyme_primal_ret = cx.create_metadata(b"enzyme_primal_return");
     if matches!(attrs.ret_activity, DiffActivity::Dual | DiffActivity::Active) {
         args.push(cx.get_metadata_value(enzyme_primal_ret));
     }
     if attrs.width > 1 {
-        let enzyme_width = cx.create_metadata("enzyme_width".to_string()).unwrap();
+        let enzyme_width = cx.create_metadata(b"enzyme_width");
         args.push(cx.get_metadata_value(enzyme_width));
         args.push(cx.get_const_int(cx.type_i64(), attrs.width as u64));
     }
@@ -288,61 +340,3 @@ pub(crate) fn generate_enzyme_call<'ll, 'tcx>(
 
     builder.store_to_place(call, dest.val);
 }
-
-pub(crate) fn differentiate<'ll>(
-    module: &'ll ModuleCodegen<ModuleLlvm>,
-    cgcx: &CodegenContext<LlvmCodegenBackend>,
-    diff_items: Vec<AutoDiffItem>,
-) -> Result<(), FatalError> {
-    // TODO(Sa4dUs): delete all this logic
-    for item in &diff_items {
-        trace!("{}", item);
-    }
-
-    let diag_handler = cgcx.create_dcx();
-
-    let cx = SimpleCx::new(module.module_llvm.llmod(), module.module_llvm.llcx, cgcx.pointer_size);
-
-    // First of all, did the user try to use autodiff without using the -Zautodiff=Enable flag?
-    if !diff_items.is_empty()
-        && !cgcx.opts.unstable_opts.autodiff.contains(&rustc_session::config::AutoDiff::Enable)
-    {
-        return Err(diag_handler.handle().emit_almost_fatal(AutoDiffWithoutEnable));
-    }
-
-    // Here we replace the placeholder code with the actual autodiff code, which calls Enzyme.
-    for item in diff_items.iter() {
-        let name = item.source.clone();
-        let fn_def: Option<&llvm::Value> = cx.get_function(&name);
-        let Some(_fn_def) = fn_def else {
-            return Err(llvm_err(
-                diag_handler.handle(),
-                LlvmError::PrepareAutoDiff {
-                    src: item.source.clone(),
-                    target: item.target.clone(),
-                    error: "could not find source function".to_owned(),
-                },
-            ));
-        };
-        debug!(?item.target);
-        let fn_target: Option<&llvm::Value> = cx.get_function(&item.target);
-        let Some(_fn_target) = fn_target else {
-            return Err(llvm_err(
-                diag_handler.handle(),
-                LlvmError::PrepareAutoDiff {
-                    src: item.source.clone(),
-                    target: item.target.clone(),
-                    error: "could not find target function".to_owned(),
-                },
-            ));
-        };
-
-        // generate_enzyme_call(&cx, fn_def, fn_target, item.attrs.clone());
-    }
-
-    // FIXME(ZuseZ4): support SanitizeHWAddress and prevent illegal/unsupported opts
-
-    trace!("done with differentiate()");
-
-    Ok(())
-}
diff --git a/compiler/rustc_codegen_llvm/src/context.rs b/compiler/rustc_codegen_llvm/src/context.rs
index 8eb15571e82..da8c1e5f47b 100644
--- a/compiler/rustc_codegen_llvm/src/context.rs
+++ b/compiler/rustc_codegen_llvm/src/context.rs
@@ -8,7 +8,6 @@ use std::str;
 use rustc_abi::{HasDataLayout, Size, TargetDataLayout, VariantIdx};
 use rustc_codegen_ssa::back::versioned_llvm_target;
 use rustc_codegen_ssa::base::{wants_msvc_seh, wants_wasm_eh};
-use rustc_codegen_ssa::common::TypeKind;
 use rustc_codegen_ssa::errors as ssa_errors;
 use rustc_codegen_ssa::traits::*;
 use rustc_data_structures::base_n::{ALPHANUMERIC_ONLY, ToBaseN};
@@ -660,10 +659,6 @@ impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> {
     }
 }
 impl<'ll> SimpleCx<'ll> {
-    pub(crate) fn _get_return_type(&self, ty: &'ll Type) -> &'ll Type {
-        assert_eq!(self.type_kind(ty), TypeKind::Function);
-        unsafe { llvm::LLVMGetReturnType(ty) }
-    }
     pub(crate) fn get_type_of_global(&self, val: &'ll Value) -> &'ll Type {
         unsafe { llvm::LLVMGlobalGetValueType(val) }
     }
diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs
index 1102fc1d0c8..4935f8d7dff 100644
--- a/compiler/rustc_codegen_llvm/src/intrinsic.rs
+++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs
@@ -9,21 +9,23 @@ use rustc_codegen_ssa::errors::{ExpectedPointerMutability, InvalidMonomorphizati
 use rustc_codegen_ssa::mir::operand::{OperandRef, OperandValue};
 use rustc_codegen_ssa::mir::place::{PlaceRef, PlaceValue};
 use rustc_codegen_ssa::traits::*;
-use rustc_hir as hir;
 use rustc_hir::def_id::LOCAL_CRATE;
+use rustc_hir::{self as hir};
 use rustc_middle::mir::BinOp;
 use rustc_middle::ty::layout::{FnAbiOf, HasTyCtxt, HasTypingEnv, LayoutOf};
-use rustc_middle::ty::{self, GenericArgsRef, Instance, Ty};
+use rustc_middle::ty::{self, GenericArgsRef, Instance, Ty, TyCtxt, TypingEnv};
 use rustc_middle::{bug, span_bug};
 use rustc_span::{Span, Symbol, sym};
 use rustc_symbol_mangling::{mangle_internal_symbol, symbol_name_for_instance_in_crate};
+use rustc_target::callconv::PassMode;
 use rustc_target::spec::PanicStrategy;
 use tracing::debug;
 
 use crate::abi::FnAbiLlvmExt;
 use crate::builder::Builder;
-use crate::builder::autodiff::generate_enzyme_call;
+use crate::builder::autodiff::{adjust_activity_to_abi, generate_enzyme_call};
 use crate::context::CodegenCx;
+use crate::errors::AutoDiffWithoutEnable;
 use crate::llvm::{self, Metadata};
 use crate::type_::Type;
 use crate::type_of::LayoutLlvmExt;
@@ -177,16 +179,9 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
         span: Span,
     ) -> Result<(), ty::Instance<'tcx>> {
         let tcx = self.tcx;
-        let callee_ty = instance.ty(tcx, self.typing_env());
 
-        let fn_args = instance.args;
-
-        let sig = callee_ty.fn_sig(tcx);
-        let sig = tcx.normalize_erasing_late_bound_regions(self.typing_env(), sig);
-        let ret_ty = sig.output();
         let name = tcx.item_name(instance.def_id());
-
-        let llret_ty = self.layout_of(ret_ty).llvm_type(self);
+        let fn_args = instance.args;
 
         let simple = call_simple_intrinsic(self, name, args);
         let llval = match name {
@@ -200,63 +195,7 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
                 )
             }
             sym::autodiff => {
-                let val_arr: Vec<&'ll Value> = match args[2].val {
-                    crate::intrinsic::OperandValue::Ref(ref place_value) => {
-                        let mut ret_arr = vec![];
-                        let tuple_place = PlaceRef { val: *place_value, layout: args[2].layout };
-
-                        for i in 0..tuple_place.layout.layout.0.fields.count() {
-                            let field_place = tuple_place.project_field(self, i);
-                            let field_layout = tuple_place.layout.field(self, i);
-                            let llvm_ty = field_layout.llvm_type(self.cx);
-
-                            let field_val =
-                                self.load(llvm_ty, field_place.val.llval, field_place.val.align);
-
-                            ret_arr.push(field_val)
-                        }
-
-                        ret_arr
-                    }
-                    crate::intrinsic::OperandValue::Pair(v1, v2) => vec![v1, v2],
-                    OperandValue::Immediate(v) => vec![v],
-                    OperandValue::ZeroSized => bug!("unexpected `ZeroSized` arg"),
-                };
-
-                // Get source, diff, and attrs
-                let source_id = match fn_args.into_type_list(tcx)[0].kind() {
-                    ty::FnDef(def_id, _) => def_id,
-                    _ => bug!("invalid args"),
-                };
-                let fn_source = Instance::mono(tcx, *source_id);
-                let source_symbol =
-                    symbol_name_for_instance_in_crate(tcx, fn_source.clone(), LOCAL_CRATE);
-                let fn_to_diff: Option<&'ll llvm::Value> = self.cx.get_function(&source_symbol);
-                let Some(fn_to_diff) = fn_to_diff else { bug!("could not find source function") };
-
-                let diff_id = match fn_args.into_type_list(tcx)[1].kind() {
-                    ty::FnDef(def_id, _) => def_id,
-                    _ => bug!("invalid args"),
-                };
-                let fn_diff = Instance::mono(tcx, *diff_id);
-                let diff_symbol =
-                    symbol_name_for_instance_in_crate(tcx, fn_diff.clone(), LOCAL_CRATE);
-
-                let diff_attrs = autodiff_attrs(tcx, *diff_id);
-                let Some(diff_attrs) = diff_attrs else { bug!("could not find autodiff attrs") };
-
-                // Build body
-                generate_enzyme_call(
-                    self,
-                    self.cx,
-                    fn_to_diff,
-                    &diff_symbol,
-                    llret_ty,
-                    &val_arr,
-                    diff_attrs.clone(),
-                    result,
-                );
-
+                codegen_autodiff(self, tcx, instance, args, result);
                 return Ok(());
             }
             sym::is_val_statically_known => {
@@ -1183,6 +1122,143 @@ fn get_rust_try_fn<'a, 'll, 'tcx>(
     rust_try
 }
 
+fn codegen_autodiff<'ll, 'tcx>(
+    bx: &mut Builder<'_, 'll, 'tcx>,
+    tcx: TyCtxt<'tcx>,
+    instance: ty::Instance<'tcx>,
+    args: &[OperandRef<'tcx, &'ll Value>],
+    result: PlaceRef<'tcx, &'ll Value>,
+) {
+    if !tcx.sess.opts.unstable_opts.autodiff.contains(&rustc_session::config::AutoDiff::Enable) {
+        let _ = tcx.dcx().emit_almost_fatal(AutoDiffWithoutEnable);
+    }
+
+    let fn_args = instance.args;
+    let callee_ty = instance.ty(tcx, bx.typing_env());
+
+    let sig = callee_ty.fn_sig(tcx).skip_binder();
+
+    let ret_ty = sig.output();
+    let llret_ty = bx.layout_of(ret_ty).llvm_type(bx);
+
+    // Get source, diff, and attrs
+    let (source_id, source_args) = match fn_args.into_type_list(tcx)[0].kind() {
+        ty::FnDef(def_id, source_params) => (def_id, source_params),
+        _ => bug!("invalid autodiff intrinsic args"),
+    };
+
+    let fn_source = match Instance::try_resolve(tcx, bx.cx.typing_env(), *source_id, source_args) {
+        Ok(Some(instance)) => instance,
+        Ok(None) => bug!(
+            "could not resolve ({:?}, {:?}) to a specific autodiff instance",
+            source_id,
+            source_args
+        ),
+        Err(_) => {
+            // An error has already been emitted
+            return;
+        }
+    };
+
+    let source_symbol = symbol_name_for_instance_in_crate(tcx, fn_source.clone(), LOCAL_CRATE);
+    let Some(fn_to_diff) = bx.cx.get_function(&source_symbol) else {
+        bug!("could not find source function")
+    };
+
+    let (diff_id, diff_args) = match fn_args.into_type_list(tcx)[1].kind() {
+        ty::FnDef(def_id, diff_args) => (def_id, diff_args),
+        _ => bug!("invalid args"),
+    };
+
+    let fn_diff = match Instance::try_resolve(tcx, bx.cx.typing_env(), *diff_id, diff_args) {
+        Ok(Some(instance)) => instance,
+        Ok(None) => bug!(
+            "could not resolve ({:?}, {:?}) to a specific autodiff instance",
+            diff_id,
+            diff_args
+        ),
+        Err(_) => {
+            // An error has already been emitted
+            return;
+        }
+    };
+
+    let val_arr = get_args_from_tuple(bx, args[2], fn_diff);
+    let diff_symbol = symbol_name_for_instance_in_crate(tcx, fn_diff.clone(), LOCAL_CRATE);
+
+    let Some(mut diff_attrs) = autodiff_attrs(tcx, fn_diff.def_id()) else {
+        bug!("could not find autodiff attrs")
+    };
+
+    adjust_activity_to_abi(
+        tcx,
+        fn_source.ty(tcx, TypingEnv::fully_monomorphized()),
+        &mut diff_attrs.input_activity,
+    );
+
+    // Build body
+    generate_enzyme_call(
+        bx,
+        bx.cx,
+        fn_to_diff,
+        &diff_symbol,
+        llret_ty,
+        &val_arr,
+        diff_attrs.clone(),
+        result,
+    );
+}
+
+fn get_args_from_tuple<'ll, 'tcx>(
+    bx: &mut Builder<'_, 'll, 'tcx>,
+    tuple_op: OperandRef<'tcx, &'ll Value>,
+    fn_instance: Instance<'tcx>,
+) -> Vec<&'ll Value> {
+    let cx = bx.cx;
+    let fn_abi = cx.fn_abi_of_instance(fn_instance, ty::List::empty());
+
+    match tuple_op.val {
+        OperandValue::Immediate(val) => vec![val],
+        OperandValue::Pair(v1, v2) => vec![v1, v2],
+        OperandValue::Ref(ptr) => {
+            let tuple_place = PlaceRef { val: ptr, layout: tuple_op.layout };
+
+            let mut result = Vec::with_capacity(fn_abi.args.len());
+            let mut tuple_index = 0;
+
+            for arg in &fn_abi.args {
+                match arg.mode {
+                    PassMode::Ignore => {}
+                    PassMode::Direct(_) | PassMode::Cast { .. } => {
+                        let field = tuple_place.project_field(bx, tuple_index);
+                        let llvm_ty = field.layout.llvm_type(bx.cx);
+                        let val = bx.load(llvm_ty, field.val.llval, field.val.align);
+                        result.push(val);
+                        tuple_index += 1;
+                    }
+                    PassMode::Pair(_, _) => {
+                        let field = tuple_place.project_field(bx, tuple_index);
+                        let llvm_ty = field.layout.llvm_type(bx.cx);
+                        let pair_val = bx.load(llvm_ty, field.val.llval, field.val.align);
+                        result.push(bx.extract_value(pair_val, 0));
+                        result.push(bx.extract_value(pair_val, 1));
+                        tuple_index += 1;
+                    }
+                    PassMode::Indirect { .. } => {
+                        let field = tuple_place.project_field(bx, tuple_index);
+                        result.push(field.val.llval);
+                        tuple_index += 1;
+                    }
+                }
+            }
+
+            result
+        }
+
+        OperandValue::ZeroSized => vec![],
+    }
+}
+
 fn generic_simd_intrinsic<'ll, 'tcx>(
     bx: &mut Builder<'_, 'll, 'tcx>,
     name: Symbol,
diff --git a/compiler/rustc_codegen_llvm/src/lib.rs b/compiler/rustc_codegen_llvm/src/lib.rs
index ca84b6de8b1..79e80db6f55 100644
--- a/compiler/rustc_codegen_llvm/src/lib.rs
+++ b/compiler/rustc_codegen_llvm/src/lib.rs
@@ -30,7 +30,6 @@ use context::SimpleCx;
 use errors::ParseTargetMachineConfig;
 use llvm_util::target_config;
 use rustc_ast::expand::allocator::AllocatorKind;
-use rustc_ast::expand::autodiff_attrs::AutoDiffItem;
 use rustc_codegen_ssa::back::lto::{SerializedModule, ThinModule};
 use rustc_codegen_ssa::back::write::{
     CodegenContext, FatLtoInput, ModuleConfig, TargetMachineFactoryConfig, TargetMachineFactoryFn,
@@ -173,15 +172,10 @@ impl WriteBackendMethods for LlvmCodegenBackend {
         exported_symbols_for_lto: &[String],
         each_linked_rlib_for_lto: &[PathBuf],
         modules: Vec<FatLtoInput<Self>>,
-        diff_fncs: Vec<AutoDiffItem>,
     ) -> Result<ModuleCodegen<Self::Module>, FatalError> {
         let mut module =
             back::lto::run_fat(cgcx, exported_symbols_for_lto, each_linked_rlib_for_lto, modules)?;
 
-        if !diff_fncs.is_empty() {
-            builder::autodiff::differentiate(&module, cgcx, diff_fncs)?;
-        }
-
         let dcx = cgcx.create_dcx();
         let dcx = dcx.handle();
         back::lto::run_pass_manager(cgcx, dcx, &mut module, false)?;
diff --git a/compiler/rustc_codegen_ssa/src/back/write.rs b/compiler/rustc_codegen_ssa/src/back/write.rs
index aa29afb7f5b..2e8122798d1 100644
--- a/compiler/rustc_codegen_ssa/src/back/write.rs
+++ b/compiler/rustc_codegen_ssa/src/back/write.rs
@@ -7,7 +7,6 @@ use std::{fs, io, mem, str, thread};
 
 use rustc_abi::Size;
 use rustc_ast::attr;
-use rustc_ast::expand::autodiff_attrs::AutoDiffItem;
 use rustc_data_structures::fx::FxIndexMap;
 use rustc_data_structures::jobserver::{self, Acquired};
 use rustc_data_structures::memmap::Mmap;
@@ -38,7 +37,7 @@ use tracing::debug;
 use super::link::{self, ensure_removed};
 use super::lto::{self, SerializedModule};
 use crate::back::lto::check_lto_allowed;
-use crate::errors::{AutodiffWithoutLto, ErrorCreatingRemarkDir};
+use crate::errors::ErrorCreatingRemarkDir;
 use crate::traits::*;
 use crate::{
     CachedModuleCodegen, CodegenResults, CompiledModule, CrateInfo, ModuleCodegen, ModuleKind,
@@ -454,7 +453,6 @@ pub(crate) fn start_async_codegen<B: ExtraBackendMethods>(
     backend: B,
     tcx: TyCtxt<'_>,
     target_cpu: String,
-    autodiff_items: &[AutoDiffItem],
 ) -> OngoingCodegen<B> {
     let (coordinator_send, coordinator_receive) = channel();
 
@@ -473,7 +471,6 @@ pub(crate) fn start_async_codegen<B: ExtraBackendMethods>(
         backend.clone(),
         tcx,
         &crate_info,
-        autodiff_items,
         shared_emitter,
         codegen_worker_send,
         coordinator_receive,
@@ -728,7 +725,6 @@ pub(crate) enum WorkItem<B: WriteBackendMethods> {
         each_linked_rlib_for_lto: Vec<PathBuf>,
         needs_fat_lto: Vec<FatLtoInput<B>>,
         import_only_modules: Vec<(SerializedModule<B::ModuleBuffer>, WorkProduct)>,
-        autodiff: Vec<AutoDiffItem>,
     },
     /// Performs thin-LTO on the given module.
     ThinLto(lto::ThinModule<B>),
@@ -1001,7 +997,6 @@ fn execute_fat_lto_work_item<B: ExtraBackendMethods>(
     each_linked_rlib_for_lto: &[PathBuf],
     mut needs_fat_lto: Vec<FatLtoInput<B>>,
     import_only_modules: Vec<(SerializedModule<B::ModuleBuffer>, WorkProduct)>,
-    autodiff: Vec<AutoDiffItem>,
     module_config: &ModuleConfig,
 ) -> Result<WorkItemResult<B>, FatalError> {
     for (module, wp) in import_only_modules {
@@ -1013,7 +1008,6 @@ fn execute_fat_lto_work_item<B: ExtraBackendMethods>(
         exported_symbols_for_lto,
         each_linked_rlib_for_lto,
         needs_fat_lto,
-        autodiff,
     )?;
     let module = B::codegen(cgcx, module, module_config)?;
     Ok(WorkItemResult::Finished(module))
@@ -1105,7 +1099,6 @@ fn start_executing_work<B: ExtraBackendMethods>(
     backend: B,
     tcx: TyCtxt<'_>,
     crate_info: &CrateInfo,
-    autodiff_items: &[AutoDiffItem],
     shared_emitter: SharedEmitter,
     codegen_worker_send: Sender<CguMessage>,
     coordinator_receive: Receiver<Message<B>>,
@@ -1115,7 +1108,6 @@ fn start_executing_work<B: ExtraBackendMethods>(
 ) -> thread::JoinHandle<Result<CompiledModules, ()>> {
     let coordinator_send = tx_to_llvm_workers;
     let sess = tcx.sess;
-    let autodiff_items = autodiff_items.to_vec();
 
     let mut each_linked_rlib_for_lto = Vec::new();
     let mut each_linked_rlib_file_for_lto = Vec::new();
@@ -1448,7 +1440,6 @@ fn start_executing_work<B: ExtraBackendMethods>(
                                 each_linked_rlib_for_lto: each_linked_rlib_file_for_lto,
                                 needs_fat_lto,
                                 import_only_modules,
-                                autodiff: autodiff_items.clone(),
                             },
                             0,
                         ));
@@ -1456,11 +1447,6 @@ fn start_executing_work<B: ExtraBackendMethods>(
                             helper.request_token();
                         }
                     } else {
-                        if !autodiff_items.is_empty() {
-                            let dcx = cgcx.create_dcx();
-                            dcx.handle().emit_fatal(AutodiffWithoutLto {});
-                        }
-
                         for (work, cost) in generate_thin_lto_work(
                             &cgcx,
                             &exported_symbols_for_lto,
@@ -1795,7 +1781,6 @@ fn spawn_work<'a, B: ExtraBackendMethods>(
                     each_linked_rlib_for_lto,
                     needs_fat_lto,
                     import_only_modules,
-                    autodiff,
                 } => {
                     let _timer = cgcx
                         .prof
@@ -1806,7 +1791,6 @@ fn spawn_work<'a, B: ExtraBackendMethods>(
                         &each_linked_rlib_for_lto,
                         needs_fat_lto,
                         import_only_modules,
-                        autodiff,
                         module_config,
                     )
                 }
diff --git a/compiler/rustc_codegen_ssa/src/base.rs b/compiler/rustc_codegen_ssa/src/base.rs
index b4556ced0b3..b483c01da59 100644
--- a/compiler/rustc_codegen_ssa/src/base.rs
+++ b/compiler/rustc_codegen_ssa/src/base.rs
@@ -647,7 +647,7 @@ pub fn codegen_crate<B: ExtraBackendMethods>(
 ) -> OngoingCodegen<B> {
     // Skip crate items and just output metadata in -Z no-codegen mode.
     if tcx.sess.opts.unstable_opts.no_codegen || !tcx.sess.opts.output_types.should_codegen() {
-        let ongoing_codegen = start_async_codegen(backend, tcx, target_cpu, &[]);
+        let ongoing_codegen = start_async_codegen(backend, tcx, target_cpu);
 
         ongoing_codegen.codegen_finished(tcx);
 
@@ -665,8 +665,7 @@ pub fn codegen_crate<B: ExtraBackendMethods>(
 
     // Run the monomorphization collector and partition the collected items into
     // codegen units.
-    let MonoItemPartitions { codegen_units, autodiff_items, .. } =
-        tcx.collect_and_partition_mono_items(());
+    let MonoItemPartitions { codegen_units, .. } = tcx.collect_and_partition_mono_items(());
 
     // Force all codegen_unit queries so they are already either red or green
     // when compile_codegen_unit accesses them. We are not able to re-execute
@@ -679,7 +678,7 @@ pub fn codegen_crate<B: ExtraBackendMethods>(
         }
     }
 
-    let ongoing_codegen = start_async_codegen(backend.clone(), tcx, target_cpu, autodiff_items);
+    let ongoing_codegen = start_async_codegen(backend.clone(), tcx, target_cpu);
 
     // Codegen an allocator shim, if necessary.
     if let Some(kind) = allocator_kind_for_codegen(tcx) {
diff --git a/compiler/rustc_codegen_ssa/src/codegen_attrs.rs b/compiler/rustc_codegen_ssa/src/codegen_attrs.rs
index a36a772bc97..af70f0deb07 100644
--- a/compiler/rustc_codegen_ssa/src/codegen_attrs.rs
+++ b/compiler/rustc_codegen_ssa/src/codegen_attrs.rs
@@ -177,14 +177,6 @@ fn process_builtin_attrs(
     let mut interesting_spans = InterestingAttributeDiagnosticSpans::default();
     let rust_target_features = tcx.rust_target_features(LOCAL_CRATE);
 
-    // If our rustc version supports autodiff/enzyme, then we call our handler
-    // to check for any `#[rustc_autodiff(...)]` attributes.
-    // FIXME(jdonszelmann): merge with loop below
-    if cfg!(llvm_enzyme) {
-        let ad = autodiff_attrs(tcx, did.into());
-        codegen_fn_attrs.autodiff_item = ad;
-    }
-
     for attr in attrs.iter() {
         if let hir::Attribute::Parsed(p) = attr {
             match p {
@@ -612,7 +604,7 @@ fn inherited_align<'tcx>(tcx: TyCtxt<'tcx>, def_id: DefId) -> Option<Align> {
 /// placeholder functions. We wrote the rustc_autodiff attributes ourself, so this should never
 /// panic, unless we introduced a bug when parsing the autodiff macro.
 //FIXME(jdonszelmann): put in the main loop. No need to have two..... :/ Let's do that when we make autodiff parsed.
-fn autodiff_attrs(tcx: TyCtxt<'_>, id: DefId) -> Option<AutoDiffAttrs> {
+pub fn autodiff_attrs(tcx: TyCtxt<'_>, id: DefId) -> Option<AutoDiffAttrs> {
     let attrs = tcx.get_attrs(id, sym::rustc_autodiff);
 
     let attrs = attrs.filter(|attr| attr.has_name(sym::rustc_autodiff)).collect::<Vec<_>>();
diff --git a/compiler/rustc_codegen_ssa/src/traits/write.rs b/compiler/rustc_codegen_ssa/src/traits/write.rs
index f391c198e1a..c29ad90735b 100644
--- a/compiler/rustc_codegen_ssa/src/traits/write.rs
+++ b/compiler/rustc_codegen_ssa/src/traits/write.rs
@@ -1,6 +1,5 @@
 use std::path::PathBuf;
 
-use rustc_ast::expand::autodiff_attrs::AutoDiffItem;
 use rustc_errors::{DiagCtxtHandle, FatalError};
 use rustc_middle::dep_graph::WorkProduct;
 
@@ -23,7 +22,6 @@ pub trait WriteBackendMethods: Clone + 'static {
         exported_symbols_for_lto: &[String],
         each_linked_rlib_for_lto: &[PathBuf],
         modules: Vec<FatLtoInput<Self>>,
-        diff_fncs: Vec<AutoDiffItem>,
     ) -> Result<ModuleCodegen<Self::Module>, FatalError>;
     /// Performs thin LTO by performing necessary global analysis and returning two
     /// lists, one of the modules that need optimization and another for modules that
diff --git a/compiler/rustc_hir_analysis/src/check/intrinsic.rs b/compiler/rustc_hir_analysis/src/check/intrinsic.rs
index 46371cfe591..f50aed0b3c2 100644
--- a/compiler/rustc_hir_analysis/src/check/intrinsic.rs
+++ b/compiler/rustc_hir_analysis/src/check/intrinsic.rs
@@ -172,8 +172,6 @@ pub(crate) fn check_intrinsic_type(
         }
     };
 
-    let has_autodiff = tcx.has_attr(intrinsic_id, sym::rustc_autodiff);
-
     let bound_vars = tcx.mk_bound_variable_kinds(&[
         ty::BoundVariableKind::Region(ty::BoundRegionKind::Anon),
         ty::BoundVariableKind::Region(ty::BoundRegionKind::Anon),
@@ -198,6 +196,7 @@ pub(crate) fn check_intrinsic_type(
         (Ty::new_ref(tcx, env_region, va_list_ty, mutbl), va_list_ty)
     };
 
+    let safety = intrinsic_operation_unsafety(tcx, intrinsic_id);
     let n_lts = 0;
     let (n_tps, n_cts, inputs, output) = match intrinsic_name {
         sym::autodiff => (4, 0, vec![param(0), param(1), param(2)], param(3)),
diff --git a/compiler/rustc_middle/src/middle/codegen_fn_attrs.rs b/compiler/rustc_middle/src/middle/codegen_fn_attrs.rs
index 2852c4cbd34..7d2fc0995aa 100644
--- a/compiler/rustc_middle/src/middle/codegen_fn_attrs.rs
+++ b/compiler/rustc_middle/src/middle/codegen_fn_attrs.rs
@@ -1,7 +1,6 @@
 use std::borrow::Cow;
 
 use rustc_abi::Align;
-use rustc_ast::expand::autodiff_attrs::AutoDiffAttrs;
 use rustc_hir::attrs::{InlineAttr, InstructionSetAttr, Linkage, OptimizeAttr};
 use rustc_hir::def_id::DefId;
 use rustc_macros::{HashStable, TyDecodable, TyEncodable};
@@ -75,8 +74,6 @@ pub struct CodegenFnAttrs {
     /// The `#[patchable_function_entry(...)]` attribute. Indicates how many nops should be around
     /// the function entry.
     pub patchable_function_entry: Option<PatchableFunctionEntry>,
-    /// For the `#[autodiff]` macros.
-    pub autodiff_item: Option<AutoDiffAttrs>,
 }
 
 #[derive(Copy, Clone, Debug, TyEncodable, TyDecodable, HashStable)]
@@ -182,7 +179,6 @@ impl CodegenFnAttrs {
             instruction_set: None,
             alignment: None,
             patchable_function_entry: None,
-            autodiff_item: None,
         }
     }
 
diff --git a/compiler/rustc_middle/src/mir/mono.rs b/compiler/rustc_middle/src/mir/mono.rs
index 440771b3d68..0e6f797b1e4 100644
--- a/compiler/rustc_middle/src/mir/mono.rs
+++ b/compiler/rustc_middle/src/mir/mono.rs
@@ -2,7 +2,6 @@ use std::borrow::Cow;
 use std::fmt;
 use std::hash::Hash;
 
-use rustc_ast::expand::autodiff_attrs::AutoDiffItem;
 use rustc_data_structures::base_n::{BaseNString, CASE_INSENSITIVE, ToBaseN};
 use rustc_data_structures::fingerprint::Fingerprint;
 use rustc_data_structures::fx::FxIndexMap;
@@ -336,7 +335,6 @@ impl ToStableHashKey<StableHashingContext<'_>> for MonoItem<'_> {
 pub struct MonoItemPartitions<'tcx> {
     pub codegen_units: &'tcx [CodegenUnit<'tcx>],
     pub all_mono_items: &'tcx DefIdSet,
-    pub autodiff_items: &'tcx [AutoDiffItem],
 }
 
 #[derive(Debug, HashStable)]
diff --git a/compiler/rustc_monomorphize/Cargo.toml b/compiler/rustc_monomorphize/Cargo.toml
index 0ed5b4fc0d0..09a55f0b5f8 100644
--- a/compiler/rustc_monomorphize/Cargo.toml
+++ b/compiler/rustc_monomorphize/Cargo.toml
@@ -6,7 +6,6 @@ edition = "2024"
 [dependencies]
 # tidy-alphabetical-start
 rustc_abi = { path = "../rustc_abi" }
-rustc_ast = { path = "../rustc_ast" }
 rustc_data_structures = { path = "../rustc_data_structures" }
 rustc_errors = { path = "../rustc_errors" }
 rustc_fluent_macro = { path = "../rustc_fluent_macro" }
@@ -15,7 +14,6 @@ rustc_macros = { path = "../rustc_macros" }
 rustc_middle = { path = "../rustc_middle" }
 rustc_session = { path = "../rustc_session" }
 rustc_span = { path = "../rustc_span" }
-rustc_symbol_mangling = { path = "../rustc_symbol_mangling" }
 rustc_target = { path = "../rustc_target" }
 serde = "1"
 serde_json = "1"
diff --git a/compiler/rustc_monomorphize/src/collector.rs b/compiler/rustc_monomorphize/src/collector.rs
index 26ca8518434..af2c3177067 100644
--- a/compiler/rustc_monomorphize/src/collector.rs
+++ b/compiler/rustc_monomorphize/src/collector.rs
@@ -205,6 +205,8 @@
 //! this is not implemented however: a mono item will be produced
 //! regardless of whether it is actually needed or not.
 
+mod autodiff;
+
 use std::cell::OnceCell;
 
 use rustc_data_structures::fx::FxIndexMap;
@@ -235,6 +237,7 @@ use rustc_span::source_map::{Spanned, dummy_spanned, respan};
 use rustc_span::{DUMMY_SP, Span};
 use tracing::{debug, instrument, trace};
 
+use crate::collector::autodiff::collect_autodiff_fn;
 use crate::errors::{
     self, EncounteredErrorWhileInstantiating, EncounteredErrorWhileInstantiatingGlobalAsm,
     NoOptimizedMir, RecursionLimit,
@@ -911,6 +914,8 @@ fn visit_instance_use<'tcx>(
         return;
     }
     if let Some(intrinsic) = tcx.intrinsic(instance.def_id()) {
+        collect_autodiff_fn(tcx, instance, intrinsic, output);
+
         if let Some(_requirement) = ValidityRequirement::from_intrinsic(intrinsic.name) {
             // The intrinsics assert_inhabited, assert_zero_valid, and assert_mem_uninitialized_valid will
             // be lowered in codegen to nothing or a call to panic_nounwind. So if we encounter any
diff --git a/compiler/rustc_monomorphize/src/collector/autodiff.rs b/compiler/rustc_monomorphize/src/collector/autodiff.rs
new file mode 100644
index 00000000000..13868cca944
--- /dev/null
+++ b/compiler/rustc_monomorphize/src/collector/autodiff.rs
@@ -0,0 +1,48 @@
+use rustc_middle::bug;
+use rustc_middle::ty::{self, GenericArg, IntrinsicDef, TyCtxt};
+
+use crate::collector::{MonoItems, create_fn_mono_item};
+
+// Here, we force both the primal and the diff function to be collected
+// in mono so this does not interfere with the `autodiff` intrinsic's
+// codegen process. If they are unused, LLVM will remove them when
+// compiling with O3.
+pub(crate) fn collect_autodiff_fn<'tcx>(
+    tcx: TyCtxt<'tcx>,
+    instance: ty::Instance<'tcx>,
+    intrinsic: IntrinsicDef,
+    output: &mut MonoItems<'tcx>,
+) {
+    if intrinsic.name != rustc_span::sym::autodiff {
+        return;
+    };
+
+    collect_autodiff_fn_from_arg(instance.args[0], tcx, output);
+}
+
+fn collect_autodiff_fn_from_arg<'tcx>(
+    arg: GenericArg<'tcx>,
+    tcx: TyCtxt<'tcx>,
+    output: &mut MonoItems<'tcx>,
+) {
+    let (instance, span) = match arg.kind() {
+        ty::GenericArgKind::Type(ty) => match ty.kind() {
+            ty::FnDef(def_id, substs) => {
+                let span = tcx.def_span(def_id);
+                let instance = ty::Instance::expect_resolve(
+                    tcx,
+                    ty::TypingEnv::non_body_analysis(tcx, def_id),
+                    *def_id,
+                    substs,
+                    span,
+                );
+
+                (instance, span)
+            }
+            _ => bug!("expected autodiff function"),
+        },
+        _ => bug!("expected type when matching autodiff arg"),
+    };
+
+    output.push(create_fn_mono_item(tcx, instance, span));
+}
diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs
index 628ea2b63de..d784d3540c4 100644
--- a/compiler/rustc_monomorphize/src/partitioning.rs
+++ b/compiler/rustc_monomorphize/src/partitioning.rs
@@ -92,8 +92,6 @@
 //! source-level module, functions from the same module will be available for
 //! inlining, even when they are not marked `#[inline]`.
 
-mod autodiff;
-
 use std::cmp;
 use std::collections::hash_map::Entry;
 use std::fs::{self, File};
@@ -251,17 +249,7 @@ where
             always_export_generics,
         );
 
-        // We can't differentiate a function that got inlined.
-        let autodiff_active = cfg!(llvm_enzyme)
-            && matches!(mono_item, MonoItem::Fn(_))
-            && cx
-                .tcx
-                .codegen_fn_attrs(mono_item.def_id())
-                .autodiff_item
-                .as_ref()
-                .is_some_and(|ad| ad.is_active());
-
-        if !autodiff_active && visibility == Visibility::Hidden && can_be_internalized {
+        if visibility == Visibility::Hidden && can_be_internalized {
             internalization_candidates.insert(mono_item);
         }
         let size_estimate = mono_item.size_estimate(cx.tcx);
@@ -1157,27 +1145,15 @@ fn collect_and_partition_mono_items(tcx: TyCtxt<'_>, (): ()) -> MonoItemPartitio
         }
     }
 
-    #[cfg(not(llvm_enzyme))]
-    let autodiff_mono_items: Vec<_> = vec![];
-    #[cfg(llvm_enzyme)]
-    let mut autodiff_mono_items: Vec<_> = vec![];
     let mono_items: DefIdSet = items
         .iter()
         .filter_map(|mono_item| match *mono_item {
-            MonoItem::Fn(ref instance) => {
-                #[cfg(llvm_enzyme)]
-                autodiff_mono_items.push((mono_item, instance));
-                Some(instance.def_id())
-            }
+            MonoItem::Fn(ref instance) => Some(instance.def_id()),
             MonoItem::Static(def_id) => Some(def_id),
             _ => None,
         })
         .collect();
 
-    let autodiff_items =
-        autodiff::find_autodiff_source_functions(tcx, &usage_map, autodiff_mono_items);
-    let autodiff_items = tcx.arena.alloc_from_iter(autodiff_items);
-
     // Output monomorphization stats per def_id
     if let SwitchWithOptPath::Enabled(ref path) = tcx.sess.opts.unstable_opts.dump_mono_stats
         && let Err(err) =
@@ -1235,11 +1211,7 @@ fn collect_and_partition_mono_items(tcx: TyCtxt<'_>, (): ()) -> MonoItemPartitio
         }
     }
 
-    MonoItemPartitions {
-        all_mono_items: tcx.arena.alloc(mono_items),
-        codegen_units,
-        autodiff_items,
-    }
+    MonoItemPartitions { all_mono_items: tcx.arena.alloc(mono_items), codegen_units }
 }
 
 /// Outputs stats about instantiation counts and estimated size, per `MonoItem`'s
diff --git a/compiler/rustc_monomorphize/src/partitioning/autodiff.rs b/compiler/rustc_monomorphize/src/partitioning/autodiff.rs
deleted file mode 100644
index 22d593b80b8..00000000000
--- a/compiler/rustc_monomorphize/src/partitioning/autodiff.rs
+++ /dev/null
@@ -1,143 +0,0 @@
-use rustc_ast::expand::autodiff_attrs::{AutoDiffItem, DiffActivity};
-use rustc_hir::def_id::LOCAL_CRATE;
-use rustc_middle::bug;
-use rustc_middle::mir::mono::MonoItem;
-use rustc_middle::ty::{self, Instance, PseudoCanonicalInput, Ty, TyCtxt, TypingEnv};
-use rustc_symbol_mangling::symbol_name_for_instance_in_crate;
-use tracing::{debug, trace};
-
-use crate::partitioning::UsageMap;
-
-fn adjust_activity_to_abi<'tcx>(tcx: TyCtxt<'tcx>, fn_ty: Ty<'tcx>, da: &mut Vec<DiffActivity>) {
-    if !matches!(fn_ty.kind(), ty::FnDef(..)) {
-        bug!("expected fn def for autodiff, got {:?}", fn_ty);
-    }
-
-    // We don't actually pass the types back into the type system.
-    // All we do is decide how to handle the arguments.
-    let sig = fn_ty.fn_sig(tcx).skip_binder();
-
-    let mut new_activities = vec![];
-    let mut new_positions = vec![];
-    for (i, ty) in sig.inputs().iter().enumerate() {
-        if let Some(inner_ty) = ty.builtin_deref(true) {
-            if inner_ty.is_slice() {
-                // Now we need to figure out the size of each slice element in memory to allow
-                // safety checks and usability improvements in the backend.
-                let sty = match inner_ty.builtin_index() {
-                    Some(sty) => sty,
-                    None => {
-                        panic!("slice element type unknown");
-                    }
-                };
-                let pci = PseudoCanonicalInput {
-                    typing_env: TypingEnv::fully_monomorphized(),
-                    value: sty,
-                };
-
-                let layout = tcx.layout_of(pci);
-                let elem_size = match layout {
-                    Ok(layout) => layout.size,
-                    Err(_) => {
-                        bug!("autodiff failed to compute slice element size");
-                    }
-                };
-                let elem_size: u32 = elem_size.bytes() as u32;
-
-                // We know that the length will be passed as extra arg.
-                if !da.is_empty() {
-                    // We are looking at a slice. The length of that slice will become an
-                    // extra integer on llvm level. Integers are always const.
-                    // However, if the slice get's duplicated, we want to know to later check the
-                    // size. So we mark the new size argument as FakeActivitySize.
-                    // There is one FakeActivitySize per slice, so for convenience we store the
-                    // slice element size in bytes in it. We will use the size in the backend.
-                    let activity = match da[i] {
-                        DiffActivity::DualOnly
-                        | DiffActivity::Dual
-                        | DiffActivity::Dualv
-                        | DiffActivity::DuplicatedOnly
-                        | DiffActivity::Duplicated => {
-                            DiffActivity::FakeActivitySize(Some(elem_size))
-                        }
-                        DiffActivity::Const => DiffActivity::Const,
-                        _ => bug!("unexpected activity for ptr/ref"),
-                    };
-                    new_activities.push(activity);
-                    new_positions.push(i + 1);
-                }
-
-                continue;
-            }
-        }
-    }
-    // now add the extra activities coming from slices
-    // Reverse order to not invalidate the indices
-    for _ in 0..new_activities.len() {
-        let pos = new_positions.pop().unwrap();
-        let activity = new_activities.pop().unwrap();
-        da.insert(pos, activity);
-    }
-}
-
-pub(crate) fn find_autodiff_source_functions<'tcx>(
-    tcx: TyCtxt<'tcx>,
-    usage_map: &UsageMap<'tcx>,
-    autodiff_mono_items: Vec<(&MonoItem<'tcx>, &Instance<'tcx>)>,
-) -> Vec<AutoDiffItem> {
-    let mut autodiff_items: Vec<AutoDiffItem> = vec![];
-    for (item, instance) in autodiff_mono_items {
-        let target_id = instance.def_id();
-        let cg_fn_attr = &tcx.codegen_fn_attrs(target_id).autodiff_item;
-        let Some(target_attrs) = cg_fn_attr else {
-            continue;
-        };
-        let mut input_activities: Vec<DiffActivity> = target_attrs.input_activity.clone();
-        if target_attrs.is_source() {
-            trace!("source found: {:?}", target_id);
-        }
-        if !target_attrs.apply_autodiff() {
-            continue;
-        }
-
-        let target_symbol = symbol_name_for_instance_in_crate(tcx, instance.clone(), LOCAL_CRATE);
-
-        let source =
-            usage_map.used_map.get(&item).unwrap().into_iter().find_map(|item| match *item {
-                MonoItem::Fn(ref instance_s) => {
-                    let source_id = instance_s.def_id();
-                    if let Some(ad) = &tcx.codegen_fn_attrs(source_id).autodiff_item
-                        && ad.is_active()
-                    {
-                        return Some(instance_s);
-                    }
-                    None
-                }
-                _ => None,
-            });
-        let inst = match source {
-            Some(source) => source,
-            None => continue,
-        };
-
-        debug!("source_id: {:?}", inst.def_id());
-        let fn_ty = inst.ty(tcx, ty::TypingEnv::fully_monomorphized());
-        assert!(fn_ty.is_fn());
-        adjust_activity_to_abi(tcx, fn_ty, &mut input_activities);
-        let symb = symbol_name_for_instance_in_crate(tcx, inst.clone(), LOCAL_CRATE);
-
-        let mut new_target_attrs = target_attrs.clone();
-        new_target_attrs.input_activity = input_activities;
-        let itm = new_target_attrs.into_item(symb, target_symbol);
-        autodiff_items.push(itm);
-    }
-
-    if !autodiff_items.is_empty() {
-        trace!("AUTODIFF ITEMS EXIST");
-        for item in &mut *autodiff_items {
-            trace!("{}", &item);
-        }
-    }
-
-    autodiff_items
-}
diff --git a/library/core/src/intrinsics/mod.rs b/library/core/src/intrinsics/mod.rs
index 6c389c55a5f..dd838d494bc 100644
--- a/library/core/src/intrinsics/mod.rs
+++ b/library/core/src/intrinsics/mod.rs
@@ -3157,6 +3157,40 @@ pub const unsafe fn copysignf64(x: f64, y: f64) -> f64;
 #[rustc_intrinsic]
 pub const unsafe fn copysignf128(x: f128, y: f128) -> f128;
 
+/// Generates the LLVM body for the automatic differentiation of `f` using Enzyme,
+/// with `df` as the derivative function and `args` as its arguments.
+///
+/// Used internally as the body of `df` when expanding the `#[autodiff_forward]`
+/// and `#[autodiff_reverse]` attribute macros.
+///
+/// Type Parameters:
+/// - `F`: The original function to differentiate. Must be a function item.
+/// - `G`: The derivative function. Must be a function item.
+/// - `T`: A tuple of arguments passed to `df`.
+/// - `R`: The return type of the derivative function.
+///
+/// This shows where the `autodiff` intrinsic is used during macro expansion:
+///
+/// ```rust,ignore (macro example)
+/// #[autodiff_forward(df1, Dual, Const, Dual)]
+/// pub fn f1(x: &[f64], y: f64) -> f64 {
+///     unimplemented!()
+/// }
+/// ```
+///
+/// expands to:
+///
+/// ```rust,ignore (macro example)
+/// #[rustc_autodiff]
+/// #[inline(never)]
+/// pub fn f1(x: &[f64], y: f64) -> f64 {
+///     ::core::panicking::panic("not implemented")
+/// }
+/// #[rustc_autodiff(Forward, 1, Dual, Const, Dual)]
+/// pub fn df1(x: &[f64], bx_0: &[f64], y: f64) -> (f64, f64) {
+///     ::core::intrinsics::autodiff(f1::<>, df1::<>, (x, bx_0, y))
+/// }
+/// ```
 #[rustc_nounwind]
 #[rustc_intrinsic]
 pub const fn autodiff<F, G, T: crate::marker::Tuple, R>(f: F, df: G, args: T) -> R;
diff --git a/library/core/src/macros/mod.rs b/library/core/src/macros/mod.rs
index c59290a757b..888369d73f4 100644
--- a/library/core/src/macros/mod.rs
+++ b/library/core/src/macros/mod.rs
@@ -1494,6 +1494,7 @@ pub(crate) mod builtin {
     ///   (or explicitly returns `-> ()`). Otherwise, it must be set to one of the allowed activities.
     #[unstable(feature = "autodiff", issue = "124509")]
     #[allow_internal_unstable(rustc_attrs)]
+    #[allow_internal_unstable(core_intrinsics)]
     #[rustc_builtin_macro]
     pub macro autodiff_forward($item:item) {
         /* compiler built-in */
@@ -1512,6 +1513,7 @@ pub(crate) mod builtin {
     ///   (or explicitly returns `-> ()`). Otherwise, it must be set to one of the allowed activities.
     #[unstable(feature = "autodiff", issue = "124509")]
     #[allow_internal_unstable(rustc_attrs)]
+    #[allow_internal_unstable(core_intrinsics)]
     #[rustc_builtin_macro]
     pub macro autodiff_reverse($item:item) {
         /* compiler built-in */
diff --git a/src/doc/rustc-dev-guide/src/SUMMARY.md b/src/doc/rustc-dev-guide/src/SUMMARY.md
index 9ded467d5cd..025a078ae5b 100644
--- a/src/doc/rustc-dev-guide/src/SUMMARY.md
+++ b/src/doc/rustc-dev-guide/src/SUMMARY.md
@@ -107,7 +107,6 @@
     - [Installation](./autodiff/installation.md)
     - [How to debug](./autodiff/debugging.md)
     - [Autodiff flags](./autodiff/flags.md)
-    - [Current limitations](./autodiff/limitations.md)
 
 # Source Code Representation
 
diff --git a/src/doc/rustc-dev-guide/src/autodiff/limitations.md b/src/doc/rustc-dev-guide/src/autodiff/limitations.md
deleted file mode 100644
index 90afbd51f3f..00000000000
--- a/src/doc/rustc-dev-guide/src/autodiff/limitations.md
+++ /dev/null
@@ -1,27 +0,0 @@
-# Current limitations
- 
-## Safety and Soundness
-
-Enzyme currently assumes that the user passes shadow arguments (`dx`, `dy`, ...) of appropriate size. Under Reverse Mode, we additionally assume that shadow arguments are mutable. In Reverse Mode we adjust the outermost pointer or reference to be mutable. Therefore `&f32` will receive the shadow type `&mut f32`. However, we do not check length for other types than slices (e.g. enums, Vec). We also do not enforce mutability of inner references, but will warn if we recognize them. We do intend to add additional checks over time.
-
-## ABI adjustments
-
-In some cases, a function parameter might get lowered in a way that we currently don't handle correctly, leading to a compile time type mismatch in the `rustc_codegen_llvm` backend. Here are some [examples](https://github.com/EnzymeAD/rust/issues/105).
-
-## Compile Times
-
-Enzyme will often achieve excellent runtime performance, but might increase your compile time by a large factor. For Rust, we already have made significant improvements and have a list of further improvements planed - please reach out if you have time to help here.
-
-### Type Analysis
-
-Most of the times, Type Analysis (TA) is the reason of large (>5x) compile time increases when using Enzyme. This poster explains why we need to run Type Analysis in the bottom left part: [Poster Link](https://c.wsmoses.com/posters/Enzyme-llvmdev.pdf).
-
-We intend to increase the number of locations where we pass down Type information based on Rust types, which in turn will reduce the number of locations where Enzyme has to run Type Analysis, which will help compile times.
-
-### Duplicated Optimizations
-
-The key reason for Enzyme offering often excellent performance is that Enzyme differentiates already optimized LLVM-IR. However, we also (have to) run LLVM's optimization pipeline after differentiating, to make sure that the code which Enzyme generates is optimized properly. As a result you should have excellent runtime performance (please fill an issue if not), but at a compile time cost for running optimizations twice.
-
-### Fat-LTO 
-
-The usage of `#[autodiff(...)]` currently requires compiling your project with Fat-LTO. We technically only need LTO if the function being differentiated calls functions in other compilation units. Therefore, other solutions are possible, but this is the most simple one to get started. 
diff --git a/triagebot.toml b/triagebot.toml
index 6f6e95c5b50..df81bb71160 100644
--- a/triagebot.toml
+++ b/triagebot.toml
@@ -283,7 +283,6 @@ trigger_files = [
     "src/tools/enzyme",
     "src/doc/unstable-book/src/compiler-flags/autodiff.md",
     "compiler/rustc_ast/src/expand/autodiff_attrs.rs",
-    "compiler/rustc_monomorphize/src/partitioning/autodiff.rs",
     "compiler/rustc_codegen_llvm/src/builder/autodiff.rs",
     "compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs",
 ]
@@ -1285,8 +1284,6 @@ cc = ["@ZuseZ4"]
 cc = ["@ZuseZ4"]
 [mentions."compiler/rustc_builtin_macros/src/autodiff.rs"]
 cc = ["@ZuseZ4"]
-[mentions."compiler/rustc_monomorphize/src/partitioning/autodiff.rs"]
-cc = ["@ZuseZ4"]
 [mentions."compiler/rustc_codegen_llvm/src/builder/autodiff.rs"]
 cc = ["@ZuseZ4"]
 [mentions."compiler/rustc_codegen_llvm/src/llvm/enzyme_ffi.rs"]