diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 6a4189a8..a7defcfd 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -61,6 +61,7 @@ jobs:
       matrix:
         label:
           - ext/differentiationinterface
+          - ext/forwarddiff
           - ext/mooncake
         version:
           - '1'
diff --git a/HISTORY.md b/HISTORY.md
index 7f5b2346..12112c61 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,3 +1,7 @@
+## 0.15.2
+
+Added `AbstractPPLForwardDiffExt`, a direct ForwardDiff path for `AutoForwardDiff` (gradient, Jacobian, Hessian, `context`, chunk size, custom `tag`).
+
 ## 0.15.1
 
 Added Hessian support to the AD interface. Pass `order=2` to `prepare(adtype, problem, x)` to build a Hessian-capable evaluator. The new `value_gradient_and_hessian!!(prepared, x)` then returns `(value, gradient, hessian)` in a single call. Both the DifferentiationInterface and Mooncake extensions implement this.
diff --git a/Project.toml b/Project.toml
index e0b3fa4c..57c13443 100644
--- a/Project.toml
+++ b/Project.toml
@@ -3,7 +3,7 @@ uuid = "7a57a42e-76ec-4ea3-a279-07e840d6d9cf"
 keywords = ["probabilistic programming"]
 license = "MIT"
 desc = "Common interfaces for probabilistic programming"
-version = "0.15.1"
+version = "0.15.2"
 
 [deps]
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
@@ -19,14 +19,17 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 
 [weakdeps]
+DiffResults = "163ba53b-c6d8-5494-b064-1a9d43ac40c5"
 DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [extensions]
 AbstractPPLDifferentiationInterfaceExt = ["DifferentiationInterface"]
 AbstractPPLDistributionsExt = ["Distributions", "LinearAlgebra"]
+AbstractPPLForwardDiffExt = ["ForwardDiff", "DiffResults"]
 AbstractPPLMooncakeExt = ["Mooncake"]
 AbstractPPLTestExt = ["Test"]
 
@@ -36,8 +39,10 @@ AbstractMCMC = "2, 3, 4, 5"
 Accessors = "0.1"
 BangBang = "0.4"
 DensityInterface = "0.4"
+DiffResults = "1"
 DifferentiationInterface = "0.6, 0.7"
 Distributions = "0.25"
+ForwardDiff = "0.10, 1"
 JSON = "0.19 - 0.21, 1"
 LinearAlgebra = "<0.0.1, 1"
 MacroTools = "0.5"
diff --git a/ext/AbstractPPLForwardDiffExt.jl b/ext/AbstractPPLForwardDiffExt.jl
new file mode 100644
index 00000000..da8f7c5c
--- /dev/null
+++ b/ext/AbstractPPLForwardDiffExt.jl
@@ -0,0 +1,225 @@
+module AbstractPPLForwardDiffExt
+
+using AbstractPPL: AbstractPPL
+using AbstractPPL.Evaluators: Evaluators, Prepared, VectorEvaluator, _ad_output_arity
+using ADTypes: AutoForwardDiff
+using ForwardDiff: ForwardDiff
+using DiffResults: DiffResults
+
+# Chunk-size selection: an explicit `CS` in `AutoForwardDiff{CS}` is used as-is,
+# while `CS === nothing` lets `ForwardDiff.Chunk(x)` choose from the input.
+_fd_chunk(::AutoForwardDiff{CS}, _) where {CS} = ForwardDiff.Chunk{CS}()
+_fd_chunk(::AutoForwardDiff{nothing}, input) = ForwardDiff.Chunk(input)
+
+# Tag handed to the `*Config` constructors. A user-supplied `adtype.tag` (for
+# nested differentiation) is passed through; `nothing` (the ADTypes default)
+# reproduces ForwardDiff's per-constructor default of `Tag(target, eltype(x))`.
+@inline _fd_tag(adtype::AutoForwardDiff, target, x) =
+    adtype.tag !== nothing ? adtype.tag : ForwardDiff.Tag(target, eltype(x))
+
+# Cache built by `prepare` for the ForwardDiff hot paths. `A::Symbol` ∈
+# `(:scalar, :vector, :hessian)` encodes output arity (order=1) and order
+# (order=2 ≡ `:hessian`), so dispatch — not a runtime branch — picks the hot
+# path and the arity-mismatch errors. `gradient_result`/`gradient_config` are
+# set only on `:hessian` caches so `value_and_gradient!!` on an order=2 prep
+# skips the O(n²) Hessian work. `result::Nothing` is the empty-input sentinel:
+# methods on `FDCache{A,Nothing}` return before any ForwardDiff call (chunk
+# selection `BoundsError`s on length-zero inputs). The stored `result` aliases
+# the arrays returned by `value_and_*!!`, per the `!!` contract.
+struct FDCache{A,R,C,GR,GC}
+    result::R  # `DiffResults` buffer for the main path; `nothing` for empty input
+    config::C  # matching ForwardDiff `*Config`; `nothing` for empty input
+    gradient_result::GR  # gradient-only buffer on `:hessian` caches, else `nothing`
+    gradient_config::GC  # gradient-only config on `:hessian` caches, else `nothing`
+    function FDCache{A}(
+        result::R, config::C, gradient_result::GR=nothing, gradient_config::GC=nothing
+    ) where {A,R,C,GR,GC}
+        return new{A,R,C,GR,GC}(result, config, gradient_result, gradient_config)
+    end
+end
+
+"""
+    prepare(adtype::AutoForwardDiff, problem, x; check_dims=true, context::Tuple=(), order=1)
+
+Prepare a ForwardDiff gradient, Jacobian, or Hessian evaluator for a vector
+input. `order=1` (default) picks gradient/Jacobian by output arity; `order=2`
+builds Hessian machinery and requires a scalar-valued problem. `context` and
+`check_dims` follow the base `prepare` contract.
+"""
+function AbstractPPL.prepare(
+    adtype::AutoForwardDiff,
+    problem,
+    x::AbstractVector{<:Real};
+    check_dims::Bool=true,
+    context::Tuple=(),
+    order::Int=1,
+)
+    Evaluators._validate_ad_order(order)
+    evaluator = AbstractPPL.prepare(problem, x; check_dims, context)::VectorEvaluator
+    # Single prep-time call into `problem` (promised by the base `prepare`
+    # contract): its value classifies the arity and, for vector outputs, is
+    # reused as the prototype for the Jacobian result buffers.
+    y_probe = evaluator(x)
+    arity = _ad_output_arity(y_probe)
+    n = length(x)
+    target = Base.Fix2(_fd_call, evaluator)
+    chunk = _fd_chunk(adtype, x)
+    tag = _fd_tag(adtype, target, x)
+
+    if order == 2
+        arity === :scalar || Evaluators._throw_hessian_needs_scalar()
+        if n == 0
+            return Prepared(adtype, evaluator, FDCache{:hessian}(nothing, nothing), Val(2))
+        end
+        hess_buffers = (similar(x), similar(x, n, n))
+        hess_result = DiffResults.MutableDiffResult(zero(eltype(x)), hess_buffers)
+        hess_config = ForwardDiff.HessianConfig(target, hess_result, x, chunk, tag)
+        grad_result = DiffResults.MutableDiffResult(zero(eltype(x)), (similar(x),))
+        grad_config = ForwardDiff.GradientConfig(target, x, chunk, tag)
+        cache = FDCache{:hessian}(hess_result, hess_config, grad_result, grad_config)
+        return Prepared(adtype, evaluator, cache, Val(2))
+    end
+
+    # Non-scalar (vector-valued) output: Jacobian machinery.
+    if arity !== :scalar
+        n == 0 && return Prepared(adtype, evaluator, FDCache{:vector}(nothing, nothing))
+        jac_buffer = similar(y_probe, length(y_probe), n)
+        result = DiffResults.MutableDiffResult(similar(y_probe), (jac_buffer,))
+        config = ForwardDiff.JacobianConfig(target, x, chunk, tag)
+        return Prepared(adtype, evaluator, FDCache{:vector}(result, config))
+    end
+
+    # Scalar-valued output: gradient machinery.
+    n == 0 && return Prepared(adtype, evaluator, FDCache{:scalar}(nothing, nothing))
+    result = DiffResults.MutableDiffResult(zero(eltype(x)), (similar(x),))
+    config = ForwardDiff.GradientConfig(target, x, chunk, tag)
+    return Prepared(adtype, evaluator, FDCache{:scalar}(result, config))
+end
+
+# Defined at top level so `Base.Fix2(_fd_call, evaluator)` has the same type in
+# `prepare` and in every hot path: ForwardDiff's `*Config` keys its `Tag` on the
+# target type, and closures created in different methods would differ in type.
+@inline function _fd_call(x, evaluator::VectorEvaluator)
+    return evaluator.f(x, evaluator.context...)
+end
+
+# `Val(false)` on every hot-path call below skips `ForwardDiff.checktag`. A
+# user-supplied `adtype.tag` (e.g. DynamicPPL's `DynamicPPLTag` sentinel for
+# nested AD) has a tag-type parameter that does not equal `typeof(target)`, so
+# the default check would error. The tag's role is only to label the outer
+# Dual scope; the config we built at prep time already encodes the right tag.
+
+@inline function AbstractPPL.value_and_gradient!!(
+    prep::Prepared{
+        <:AutoForwardDiff,
+        <:VectorEvaluator,
+        <:Union{FDCache{:scalar,Nothing},FDCache{:hessian,Nothing}},
+    },
+    x::AbstractVector{T},
+) where {T<:Real}
+    Evaluators._check_ad_input(prep.evaluator, x)
+    return (prep.evaluator(x), Vector{T}())  # empty input: nothing to differentiate
+end
+
+@inline function AbstractPPL.value_and_gradient!!(
+    prep::Prepared{<:AutoForwardDiff,<:VectorEvaluator,<:FDCache{:scalar}},
+    x::AbstractVector{<:Real},
+)
+    Evaluators._check_ad_input(prep.evaluator, x)
+    cache, target = prep.cache, Base.Fix2(_fd_call, prep.evaluator)
+    # In-place: the returned value and gradient are read from the cached result.
+    ForwardDiff.gradient!(cache.result, target, x, cache.config, Val(false))
+    return (DiffResults.value(cache.result), DiffResults.gradient(cache.result))
+end
+
+# An order=2 prep also honours the order=1 gradient contract. It runs on the
+# gradient-only result/config pair stored at prep time, so the O(n²) Hessian
+# work is skipped on this path.
+@inline function AbstractPPL.value_and_gradient!!(
+    prep::Prepared{<:AutoForwardDiff,<:VectorEvaluator,<:FDCache{:hessian}},
+    x::AbstractVector{<:Real},
+)
+    Evaluators._check_ad_input(prep.evaluator, x)
+
+    grad_result = prep.cache.gradient_result
+    grad_config = prep.cache.gradient_config
+    target = Base.Fix2(_fd_call, prep.evaluator)
+
+    # `Val(false)` skips ForwardDiff's tag check; `grad_config` was built at
+    # prep time with the tag to use.
+    ForwardDiff.gradient!(grad_result, target, x, grad_config, Val(false))
+
+    return (DiffResults.value(grad_result), DiffResults.gradient(grad_result))
+end
+
+# Arity-mismatch rejections get their own methods keyed on the cache tag, so
+# dispatch picks the failure at compile time; here: gradient on a `:vector` prep.
+@inline function AbstractPPL.value_and_gradient!!(
+    ::Prepared{<:AutoForwardDiff,<:VectorEvaluator,<:FDCache{:vector}},
+    ::AbstractVector{<:Real},
+)
+    return Evaluators._throw_gradient_needs_scalar()
+end
+
+@inline function AbstractPPL.value_and_jacobian!!(
+    ::Prepared{
+        <:AutoForwardDiff,<:VectorEvaluator,<:Union{FDCache{:scalar},FDCache{:hessian}}
+    },
+    ::AbstractVector{<:Real},
+)
+    return Evaluators._throw_jacobian_needs_vector()  # scalar-output prep: no Jacobian
+end
+
+@inline function AbstractPPL.value_and_jacobian!!(
+    prep::Prepared{<:AutoForwardDiff,<:VectorEvaluator,<:FDCache{:vector,Nothing}},
+    x::AbstractVector{<:Real},
+)
+    Evaluators._check_ad_input(prep.evaluator, x)
+    y = prep.evaluator(x)
+    return (y, similar(x, length(y), 0))  # empty input: `length(y)`×0 Jacobian
+end
+
+@inline function AbstractPPL.value_and_jacobian!!(
+    prep::Prepared{<:AutoForwardDiff,<:VectorEvaluator,<:FDCache{:vector}},
+    x::AbstractVector{<:Real},
+)
+    Evaluators._check_ad_input(prep.evaluator, x)
+    cache, target = prep.cache, Base.Fix2(_fd_call, prep.evaluator)
+    # In-place: the returned value and Jacobian are read from the cached result.
+    ForwardDiff.jacobian!(cache.result, target, x, cache.config, Val(false))
+    return (DiffResults.value(cache.result), DiffResults.jacobian(cache.result))
+end
+
+@inline function AbstractPPL.value_gradient_and_hessian!!(
+    ::Prepared{
+        <:AutoForwardDiff,<:VectorEvaluator,<:Union{FDCache{:scalar},FDCache{:vector}}
+    },
+    ::AbstractVector{<:Real},
+)
+    return Evaluators._throw_hessian_needs_order_2_prep()  # order=1 prep: no Hessian
+end
+
+@inline function AbstractPPL.value_gradient_and_hessian!!(
+    prep::Prepared{<:AutoForwardDiff,<:VectorEvaluator,<:FDCache{:hessian,Nothing}},
+    x::AbstractVector{T},
+) where {T<:Real}
+    Evaluators._check_ad_input(prep.evaluator, x)
+    return (prep.evaluator(x), Vector{T}(), similar(x, 0, 0))  # empty derivatives
+end
+
+@inline function AbstractPPL.value_gradient_and_hessian!!(
+    prep::Prepared{<:AutoForwardDiff,<:VectorEvaluator,<:FDCache{:hessian}},
+    x::AbstractVector{<:Real},
+)
+    Evaluators._check_ad_input(prep.evaluator, x)
+    res, target = prep.cache.result, Base.Fix2(_fd_call, prep.evaluator)
+    # `hessian!` writes into the cached result; value, gradient and Hessian are
+    # then read back from it, so the returned arrays alias the cache.
+    ForwardDiff.hessian!(res, target, x, prep.cache.config, Val(false))
+    val = DiffResults.value(res)
+    grad = DiffResults.gradient(res)
+    hess = DiffResults.hessian(res)
+    return (val, grad, hess)
+end
+
+end # module
diff --git a/ext/AbstractPPLTestExt.jl b/ext/AbstractPPLTestExt.jl
index c5c74727..05f9cdd1 100644
--- a/ext/AbstractPPLTestExt.jl
+++ b/ext/AbstractPPLTestExt.jl
@@ -1,333 +1,487 @@
 module AbstractPPLTestExt
 
-using AbstractPPL: AbstractPPL, generate_testcases, run_testcases
-using Test: @test, @test_throws, @testset
+using AbstractPPL: AbstractPPL, generate_testcases, run_testcase
+using Test: @inferred, @test, @test_broken, @test_throws, @testset
 
-struct QuadraticProblem end
-(::QuadraticProblem)(x::AbstractVector{<:Real}) = sum(xi -> xi^2, x)
+"""
+    TestCase(name, tag, f, x_proto; x, value, gradient, jacobian, hessian,
+             context=(), op, exception, inputs, allocations_safe=true)
 
-struct VectorValuedProblem end
-(::VectorValuedProblem)(x::AbstractVector{<:Real}) = [x[1] * x[2], x[2] + x[3]]
+Single tagged case for AD conformance testing. The `tag::Symbol` selects how
+the case is run; the kwargs populate only the fields the tag uses.
 
-struct ValueCase
-    name::String
-    f::Any
-    x_proto::Any
-    x::Any
-    value::Any
-    gradient::Any
-    jacobian::Any
-end
+Reserved tags (recognised by [`run_testcase`](@ref)):
 
-struct HessianCase
+  - `:vector`      — vector input, scalar output (`gradient`) or vector output
+                     (`jacobian`).
+  - `:hessian`     — order=2 round-trip on scalar output.
+  - `:context`     — scalar-output gradient with a non-empty `context::Tuple`
+                     passed to `prepare`.
+  - `:edge`        — error case; `op(prepared, x)` must throw `exception`.
+  - `:cache_reuse` — multiple inputs against a single prepared evaluator
+                     (`inputs::Vector{<:NamedTuple}`, with `(x=, value=,
+                     gradient=)` or `(x=, value=, jacobian=)` per row).
+  - `:namedtuple`  — NamedTuple input and gradient; Mooncake-only.
+
+`allocations_safe=false` opts the case out of the alloc check
+(cases with an allocating primal or empty-input shortcuts that allocate).
+"""
+struct TestCase
     name::String
+    tag::Symbol
     f::Any
     x_proto::Any
     x::Any
     value::Any
     gradient::Any
+    jacobian::Any
     hessian::Any
-end
-
-struct ErrorCase
-    name::String
-    f::Any
-    x_proto::Any
-    x::Any
+    context::Tuple
     op::Any
     exception::Any
+    inputs::Any
+    allocations_safe::Bool
+end
+function TestCase(
+    name,
+    tag::Symbol,
+    f,
+    x_proto;
+    x=nothing,
+    value=nothing,
+    gradient=nothing,
+    jacobian=nothing,
+    hessian=nothing,
+    context::Tuple=(),
+    op=nothing,
+    exception=nothing,
+    inputs=nothing,
+    allocations_safe::Bool=true,
+)
+    return TestCase(
+        name,
+        tag,
+        f,
+        x_proto,
+        x,
+        value,
+        gradient,
+        jacobian,
+        hessian,
+        context,
+        op,
+        exception,
+        inputs,
+        allocations_safe,
+    )
 end
 
+struct QuadraticProblem end
+(::QuadraticProblem)(x::AbstractVector{<:Real}) = sum(xi -> xi^2, x)
+
+struct VectorValuedProblem end
+(::VectorValuedProblem)(x::AbstractVector{<:Real}) = [x[1] * x[2], x[2] + x[3]]
+
+_context_problem(y::AbstractVector{<:Real}, offset) = -0.5 * (y[1] - offset)^2
+
 function AbstractPPL.generate_testcases(::Val{:vector})
     return (
-        ValueCase(
+        TestCase(
             "quadratic (scalar output)",
+            :vector,
             QuadraticProblem(),
-            zeros(3),
-            [3.0, 1.0, 2.0],
-            14.0,
-            [6.0, 2.0, 4.0],
-            nothing,
+            zeros(3);
+            x=[3.0, 1.0, 2.0],
+            value=14.0,
+            gradient=[6.0, 2.0, 4.0],
         ),
-        ValueCase(
+        TestCase(
             "vector-valued (vector output)",
+            :vector,
             VectorValuedProblem(),
-            zeros(3),
-            [2.0, 3.0, 4.0],
-            [6.0, 7.0],
-            nothing,
-            [3.0 2.0 0.0; 0.0 1.0 1.0],
+            zeros(3);
+            x=[2.0, 3.0, 4.0],
+            value=[6.0, 7.0],
+            jacobian=[3.0 2.0 0.0; 0.0 1.0 1.0],
+            allocations_safe=false,  # primal allocates its result vector
         ),
-        ValueCase(
+        TestCase(
             "empty input, scalar output",
+            :vector,
             x -> 7.5,
-            Float64[],
-            Float64[],
-            7.5,
-            Float64[],
-            nothing,
+            Float64[];
+            x=Float64[],
+            value=7.5,
+            gradient=Float64[],
+            allocations_safe=false,  # empty-input shortcut returns fresh `T[]`
         ),
-        ValueCase(
+        TestCase(
             "empty input, vector output",
+            :vector,
             x -> [2.0, 3.0],
-            Float64[],
-            Float64[],
-            [2.0, 3.0],
-            nothing,
-            zeros(2, 0),
+            Float64[];
+            x=Float64[],
+            value=[2.0, 3.0],
+            jacobian=zeros(2, 0),
+            allocations_safe=false,  # empty-input shortcut allocates empty matrix
         ),
-    )
-end
-
-function AbstractPPL.generate_testcases(::Val{:hessian})
-    return (
-        HessianCase(
-            "quadratic (scalar output)",
+        TestCase(
+            "scalar gradient with context",
+            :context,
+            _context_problem,
+            [0.3];
+            x=[0.3],
+            value=_context_problem([0.3], 0.1),
+            gradient=[-(0.3 - 0.1)],
+            context=(0.1,),
+        ),
+        TestCase(
+            "quadratic (hessian)",
+            :hessian,
             QuadraticProblem(),
-            zeros(3),
-            [3.0, 1.0, 2.0],
-            14.0,
-            [6.0, 2.0, 4.0],
-            [2.0 0.0 0.0; 0.0 2.0 0.0; 0.0 0.0 2.0],
+            zeros(3);
+            x=[3.0, 1.0, 2.0],
+            value=14.0,
+            gradient=[6.0, 2.0, 4.0],
+            hessian=[2.0 0.0 0.0; 0.0 2.0 0.0; 0.0 0.0 2.0],
+            allocations_safe=false,  # ForwardDiff/Mooncake hessian path allocates scratch
         ),
-        HessianCase(
-            "empty input, scalar output",
+        TestCase(
+            "empty input, hessian",
+            :hessian,
             x -> 7.5,
-            Float64[],
-            Float64[],
-            7.5,
-            Float64[],
-            zeros(0, 0),
+            Float64[];
+            x=Float64[],
+            value=7.5,
+            gradient=Float64[],
+            hessian=zeros(0, 0),
+            allocations_safe=false,  # empty-input hessian shortcut allocates
         ),
-    )
-end
-
-function AbstractPPL.generate_testcases(::Val{:hessian_edge})
-    return (
-        # `value_gradient_and_hessian!!` rejects order=1 preps regardless of
-        # the underlying problem arity — both paths share the same dispatch
-        # so one case suffices.
-        ErrorCase(
+        # value_gradient_and_hessian!! rejects order=1 preps regardless of arity;
+        # both paths share the dispatch so one case suffices.
+        TestCase(
             "value_gradient_and_hessian!! on order=1 prep",
+            :edge,
             QuadraticProblem(),
-            zeros(3),
-            [3.0, 1.0, 2.0],
-            (prepared, x) -> AbstractPPL.value_gradient_and_hessian!!(prepared, x),
-            r"order=2",
+            zeros(3);
+            x=[3.0, 1.0, 2.0],
+            op=(prepared, x) -> AbstractPPL.value_gradient_and_hessian!!(prepared, x),
+            exception=r"order=2",
         ),
-    )
-end
-
-function AbstractPPL.generate_testcases(::Val{:edge})
-    return (
-        ErrorCase(
+        TestCase(
             "wrong vector length",
+            :edge,
             QuadraticProblem(),
-            zeros(3),
-            [3.0, 1.0, 2.0, 99.0],
-            (prepared, x) -> prepared(x),
-            DimensionMismatch,
+            zeros(3);
+            x=[3.0, 1.0, 2.0, 99.0],
+            op=(prepared, x) -> prepared(x),
+            exception=DimensionMismatch,
         ),
-        ErrorCase(
+        TestCase(
             "non-floating-point vector",
+            :edge,
             QuadraticProblem(),
-            zeros(3),
-            [3, 1, 2],
-            (prepared, x) -> prepared(x),
-            r"floating-point",
+            zeros(3);
+            x=[3, 1, 2],
+            op=(prepared, x) -> prepared(x),
+            exception=r"floating-point",
         ),
-        ErrorCase(
+        TestCase(
             "gradient of vector-valued output",
+            :edge,
             VectorValuedProblem(),
-            zeros(3),
-            [2.0, 3.0, 4.0],
-            (prepared, x) -> AbstractPPL.value_and_gradient!!(prepared, x),
-            r"scalar-valued",
+            zeros(3);
+            x=[2.0, 3.0, 4.0],
+            op=(prepared, x) -> AbstractPPL.value_and_gradient!!(prepared, x),
+            exception=r"scalar-valued",
         ),
-        ErrorCase(
+        TestCase(
             "jacobian of scalar output",
+            :edge,
             QuadraticProblem(),
-            zeros(3),
-            [3.0, 1.0, 2.0],
-            (prepared, x) -> AbstractPPL.value_and_jacobian!!(prepared, x),
-            r"vector-valued",
+            zeros(3);
+            x=[3.0, 1.0, 2.0],
+            op=(prepared, x) -> AbstractPPL.value_and_jacobian!!(prepared, x),
+            exception=r"vector-valued",
         ),
-        ErrorCase(
+        TestCase(
             "gradient of vector-valued output, empty input",
+            :edge,
             x -> [2.0, 3.0],
-            Float64[],
-            Float64[],
-            (prepared, x) -> AbstractPPL.value_and_gradient!!(prepared, x),
-            r"scalar-valued",
+            Float64[];
+            x=Float64[],
+            op=(prepared, x) -> AbstractPPL.value_and_gradient!!(prepared, x),
+            exception=r"scalar-valued",
         ),
-        ErrorCase(
+        TestCase(
             "jacobian of scalar output, empty input",
+            :edge,
             x -> 7.5,
-            Float64[],
-            Float64[],
-            (prepared, x) -> AbstractPPL.value_and_jacobian!!(prepared, x),
-            r"vector-valued",
+            Float64[];
+            x=Float64[],
+            op=(prepared, x) -> AbstractPPL.value_and_jacobian!!(prepared, x),
+            exception=r"vector-valued",
         ),
-        ErrorCase(
+        TestCase(
             "value_and_gradient!! wrong vector length",
+            :edge,
             QuadraticProblem(),
-            zeros(3),
-            [3.0, 1.0, 2.0, 99.0],
-            (prepared, x) -> AbstractPPL.value_and_gradient!!(prepared, x),
-            DimensionMismatch,
+            zeros(3);
+            x=[3.0, 1.0, 2.0, 99.0],
+            op=(prepared, x) -> AbstractPPL.value_and_gradient!!(prepared, x),
+            exception=DimensionMismatch,
         ),
-        ErrorCase(
+        TestCase(
             "value_and_jacobian!! wrong vector length",
+            :edge,
             VectorValuedProblem(),
-            zeros(3),
-            [2.0, 3.0, 4.0, 5.0],
-            (prepared, x) -> AbstractPPL.value_and_jacobian!!(prepared, x),
-            DimensionMismatch,
+            zeros(3);
+            x=[2.0, 3.0, 4.0, 5.0],
+            op=(prepared, x) -> AbstractPPL.value_and_jacobian!!(prepared, x),
+            exception=DimensionMismatch,
         ),
-        ErrorCase(
+        TestCase(
             "value_and_gradient!! non-floating-point vector",
+            :edge,
             QuadraticProblem(),
-            zeros(3),
-            [3, 1, 2],
-            (prepared, x) -> AbstractPPL.value_and_gradient!!(prepared, x),
-            r"floating-point",
+            zeros(3);
+            x=[3, 1, 2],
+            op=(prepared, x) -> AbstractPPL.value_and_gradient!!(prepared, x),
+            exception=r"floating-point",
         ),
-        ErrorCase(
+        TestCase(
             "value_and_jacobian!! non-floating-point vector",
+            :edge,
+            VectorValuedProblem(),
+            zeros(3);
+            x=[2, 3, 4],
+            op=(prepared, x) -> AbstractPPL.value_and_jacobian!!(prepared, x),
+            exception=r"floating-point",
+        ),
+        TestCase(
+            "scalar output, cache reuse",
+            :cache_reuse,
+            QuadraticProblem(),
+            zeros(3);
+            inputs=[
+                (x=[1.0, 2.0, 3.0], value=14.0, gradient=[2.0, 4.0, 6.0]),
+                (x=[4.0, 5.0, 6.0], value=77.0, gradient=[8.0, 10.0, 12.0]),
+                (x=[0.5, -1.0, 2.0], value=5.25, gradient=[1.0, -2.0, 4.0]),
+            ],
+            allocations_safe=false,  # cache-reuse loops aren't single-call alloc tests
+        ),
+        TestCase(
+            "vector output, cache reuse",
+            :cache_reuse,
             VectorValuedProblem(),
-            zeros(3),
-            [2, 3, 4],
-            (prepared, x) -> AbstractPPL.value_and_jacobian!!(prepared, x),
-            r"floating-point",
+            zeros(3);
+            inputs=[
+                (x=[2.0, 3.0, 4.0], value=[6.0, 7.0], jacobian=[3.0 2.0 0.0; 0.0 1.0 1.0]),
+                (x=[5.0, 1.0, 7.0], value=[5.0, 8.0], jacobian=[1.0 5.0 0.0; 0.0 1.0 1.0]),
+                (x=[0.0, 4.0, -2.0], value=[0.0, 2.0], jacobian=[4.0 0.0 0.0; 0.0 1.0 1.0]),
+            ],
+            allocations_safe=false,  # cache-reuse loops aren't single-call alloc tests
         ),
     )
 end
 
 function AbstractPPL.generate_testcases(::Val{:namedtuple})
     return (
-        ValueCase(
+        TestCase(
             "scalar output over (x::Real, y::Vector)",
+            :namedtuple,
             vs -> vs.x^2 + sum(abs2, vs.y),
-            (x=0.0, y=zeros(2)),
-            (x=3.0, y=[1.0, 2.0]),
-            14.0,
-            (x=6.0, y=[2.0, 4.0]),
-            nothing,
+            (x=0.0, y=zeros(2));
+            x=(x=3.0, y=[1.0, 2.0]),
+            value=14.0,
+            gradient=(x=6.0, y=[2.0, 4.0]),
         ),
     )
 end
 
-function AbstractPPL.run_testcases(
-    ::Val{:vector}, prepare_fn=AbstractPPL.prepare; adtype, atol=0, rtol=1e-10
-)
-    for case in generate_testcases(Val(:vector))
-        @testset "$(case.name)" begin
-            prepared = prepare_fn(adtype, case.f, case.x_proto)
-            @test AbstractPPL.order(prepared) == 1
-            @test prepared(case.x) ≈ case.value atol = atol rtol = rtol
-            if case.gradient !== nothing
-                val, grad = AbstractPPL.value_and_gradient!!(prepared, case.x)
-                @test val ≈ case.value atol = atol rtol = rtol
-                @test grad ≈ case.gradient atol = atol rtol = rtol
-            end
-            if case.jacobian !== nothing
-                val, jac = AbstractPPL.value_and_jacobian!!(prepared, case.x)
-                @test val ≈ case.value atol = atol rtol = rtol
-                @test jac ≈ case.jacobian atol = atol rtol = rtol
-            end
-        end
+# ----- helpers -----
+
+# NamedTuple gradients compare per-key (some backends return Mooncake-tagged
+# tangents that aren't directly `≈`-comparable as a whole).
+function _compare_derivative(actual::NamedTuple, expected::NamedTuple; atol, rtol)
+    for k in keys(expected)
+        @test getproperty(actual, k) ≈ getproperty(expected, k) atol = atol rtol = rtol
     end
-    return nothing
+end
+function _compare_derivative(actual, expected; atol, rtol)
+    @test actual ≈ expected atol = atol rtol = rtol
 end
 
-function AbstractPPL.run_testcases(
-    ::Val{:hessian}, prepare_fn=AbstractPPL.prepare; adtype, atol=0, rtol=1e-10
-)
-    for case in generate_testcases(Val(:hessian))
-        @testset "$(case.name)" begin
-            prepared = prepare_fn(adtype, case.f, case.x_proto; order=2)
-            @test AbstractPPL.order(prepared) == 2
-            @test prepared(case.x) ≈ case.value atol = atol rtol = rtol
-            val, grad, hess = AbstractPPL.value_gradient_and_hessian!!(prepared, case.x)
-            @test val ≈ case.value atol = atol rtol = rtol
-            @test grad ≈ case.gradient atol = atol rtol = rtol
-            @test hess ≈ case.hessian atol = atol rtol = rtol
-            # Order=2 prep also satisfies the order=1 gradient contract.
-            val1, grad1 = AbstractPPL.value_and_gradient!!(prepared, case.x)
-            @test val1 ≈ case.value atol = atol rtol = rtol
-            @test grad1 ≈ case.gradient atol = atol rtol = rtol
-        end
-    end
-    for case in generate_testcases(Val(:hessian_edge))
-        @testset "$(case.name)" begin
-            prepared = prepare_fn(adtype, case.f, case.x_proto)
-            @test_throws case.exception case.op(prepared, case.x)
-        end
-    end
+function _record_alloc!(state::Symbol, allocs::Integer)
+    state === :test && @test allocs == 0
+    state === :broken && @test_broken allocs == 0
     return nothing
 end
 
-function AbstractPPL.run_testcases(::Val{:edge}, prepare_fn=AbstractPPL.prepare; adtype)
-    for case in generate_testcases(Val(:edge))
-        @testset "$(case.name)" begin
-            prepared = prepare_fn(adtype, case.f, case.x_proto)
-            @test_throws case.exception case.op(prepared, case.x)
-        end
+# `@inferred` is syntactic and throws on failure; wrap so we can pin `op`'s
+# type via an F-parameter and convert the throw into a Bool.
+function _is_inferred(op::F, args...) where {F}
+    try
+        @inferred op(args...)
+        return true
+    catch
+        return false
     end
+end
+
+function _record_inferred!(state::Symbol, inferred::Bool)
+    state === :test && @test inferred
+    state === :broken && @test_broken inferred
     return nothing
 end
 
-function AbstractPPL.run_testcases(
-    ::Val{:namedtuple}, prepare_fn=AbstractPPL.prepare; adtype, atol=0, rtol=1e-10
-)
-    for case in generate_testcases(Val(:namedtuple))
-        @testset "$(case.name)" begin
-            prepared = prepare_fn(adtype, case.f, case.x_proto)
-            @test prepared(case.x) ≈ case.value atol = atol rtol = rtol
-            if case.gradient !== nothing
-                val, grad = AbstractPPL.value_and_gradient!!(prepared, case.x)
-                @test val ≈ case.value atol = atol rtol = rtol
-                for k in keys(case.gradient)
-                    @test getproperty(grad, k) ≈ getproperty(case.gradient, k) atol = atol rtol =
-                        rtol
-                end
-            end
-        end
+# ----- runner -----
+
+function AbstractPPL.run_testcase(case::TestCase; kwargs...)
+    @testset "$(case.name)" begin
+        _run(Val(case.tag), case; kwargs...)
     end
     return nothing
 end
 
-# Drive `value_and_{gradient,jacobian}!!` twice with different inputs against
-# the same `prepared` evaluator to exercise cache reuse — catches backends
-# whose cache state is corrupted by a prior call.
-function AbstractPPL.run_testcases(
-    ::Val{:cache_reuse}, prepare_fn=AbstractPPL.prepare; adtype, atol=0, rtol=1e-10
+# `:vector` and `:context` share a runner — `case.context` defaults to `()` so
+# threading it through `prepare` is a no-op on `:vector` cases that don't set
+# it.
+function _run(
+    ::Union{Val{:vector},Val{:context}},
+    case;
+    adtype,
+    prepare_fn=AbstractPPL.prepare,
+    atol=0,
+    rtol=1e-10,
+    check_dims::Bool=true,
+    type_stability::Symbol=:skip,
+    allocations::Symbol=:skip,
 )
-    @testset "scalar output, repeated calls" begin
-        prepared = prepare_fn(adtype, QuadraticProblem(), zeros(3))
-        for (x, value, gradient) in (
-            ([1.0, 2.0, 3.0], 14.0, [2.0, 4.0, 6.0]),
-            ([4.0, 5.0, 6.0], 77.0, [8.0, 10.0, 12.0]),
-            ([0.5, -1.0, 2.0], 5.25, [1.0, -2.0, 4.0]),
+    prepared = prepare_fn(adtype, case.f, case.x_proto; check_dims, context=case.context)
+    @test AbstractPPL.order(prepared) == 1
+    @test prepared(case.x) ≈ case.value atol = atol rtol = rtol
+
+    if case.gradient !== nothing
+        val, grad = AbstractPPL.value_and_gradient!!(prepared, case.x)
+        @test val ≈ case.value atol = atol rtol = rtol
+        _compare_derivative(grad, case.gradient; atol, rtol)
+        _maybe_check_alloc!(
+            case, allocations, AbstractPPL.value_and_gradient!!, prepared, case.x
+        )
+        _maybe_check_inferred!(
+            type_stability, AbstractPPL.value_and_gradient!!, prepared, case.x
         )
-            val, grad = AbstractPPL.value_and_gradient!!(prepared, x)
-            @test val ≈ value atol = atol rtol = rtol
-            @test grad ≈ gradient atol = atol rtol = rtol
-        end
     end
-    @testset "vector output, repeated calls" begin
-        prepared = prepare_fn(adtype, VectorValuedProblem(), zeros(3))
-        for (x, value, jacobian) in (
-            ([2.0, 3.0, 4.0], [6.0, 7.0], [3.0 2.0 0.0; 0.0 1.0 1.0]),
-            ([5.0, 1.0, 7.0], [5.0, 8.0], [1.0 5.0 0.0; 0.0 1.0 1.0]),
-            ([0.0, 4.0, -2.0], [0.0, 2.0], [4.0 0.0 0.0; 0.0 1.0 1.0]),
+
+    if case.jacobian !== nothing
+        val, jac = AbstractPPL.value_and_jacobian!!(prepared, case.x)
+        @test val ≈ case.value atol = atol rtol = rtol
+        @test jac ≈ case.jacobian atol = atol rtol = rtol
+        _maybe_check_alloc!(
+            case, allocations, AbstractPPL.value_and_jacobian!!, prepared, case.x
         )
-            val, jac = AbstractPPL.value_and_jacobian!!(prepared, x)
-            @test val ≈ value atol = atol rtol = rtol
-            @test jac ≈ jacobian atol = atol rtol = rtol
+        _maybe_check_inferred!(
+            type_stability, AbstractPPL.value_and_jacobian!!, prepared, case.x
+        )
+    end
+    return nothing
+end
+
+function _run(
+    ::Val{:hessian},
+    case;
+    adtype,
+    prepare_fn=AbstractPPL.prepare,
+    atol=0,
+    rtol=1e-10,
+    check_dims::Bool=true,
+    type_stability::Symbol=:skip,
+    allocations::Symbol=:skip,
+)
+    prepared = prepare_fn(adtype, case.f, case.x_proto; check_dims, order=2)
+    @test AbstractPPL.order(prepared) == 2
+    @test prepared(case.x) ≈ case.value atol = atol rtol = rtol
+
+    val, grad, hess = AbstractPPL.value_gradient_and_hessian!!(prepared, case.x)
+    @test val ≈ case.value atol = atol rtol = rtol
+    @test grad ≈ case.gradient atol = atol rtol = rtol
+    @test hess ≈ case.hessian atol = atol rtol = rtol
+
+    # Order=2 prep also satisfies the order=1 gradient contract.
+    val1, grad1 = AbstractPPL.value_and_gradient!!(prepared, case.x)
+    @test val1 ≈ case.value atol = atol rtol = rtol
+    @test grad1 ≈ case.gradient atol = atol rtol = rtol
+
+    _maybe_check_alloc!(
+        case, allocations, AbstractPPL.value_gradient_and_hessian!!, prepared, case.x
+    )
+    _maybe_check_inferred!(
+        type_stability, AbstractPPL.value_gradient_and_hessian!!, prepared, case.x
+    )
+    return nothing
+end
+
+function _run(::Val{:edge}, case; adtype, prepare_fn=AbstractPPL.prepare, kwargs...)
+    prepared = prepare_fn(adtype, case.f, case.x_proto)
+    @test_throws case.exception case.op(prepared, case.x)
+    return nothing
+end
+
+function _run(
+    ::Val{:cache_reuse},
+    case;
+    adtype,
+    prepare_fn=AbstractPPL.prepare,
+    atol=0,
+    rtol=1e-10,
+    kwargs...,
+)
+    prepared = prepare_fn(adtype, case.f, case.x_proto)
+    for input in case.inputs
+        if haskey(input, :gradient)
+            val, grad = AbstractPPL.value_and_gradient!!(prepared, input.x)
+            @test val ≈ input.value atol = atol rtol = rtol
+            @test grad ≈ input.gradient atol = atol rtol = rtol
+        else
+            val, jac = AbstractPPL.value_and_jacobian!!(prepared, input.x)
+            @test val ≈ input.value atol = atol rtol = rtol
+            @test jac ≈ input.jacobian atol = atol rtol = rtol
         end
     end
     return nothing
 end
 
+function _run(
+    ::Val{:namedtuple},
+    case;
+    adtype,
+    prepare_fn=AbstractPPL.prepare,
+    atol=0,
+    rtol=1e-10,
+    kwargs...,
+)
+    prepared = prepare_fn(adtype, case.f, case.x_proto)
+    @test prepared(case.x) ≈ case.value atol = atol rtol = rtol
+    val, grad = AbstractPPL.value_and_gradient!!(prepared, case.x)
+    @test val ≈ case.value atol = atol rtol = rtol
+    _compare_derivative(grad, case.gradient; atol, rtol)
+    return nothing
+end
+
+_resolve_alloc_state(case::TestCase, state::Symbol) = case.allocations_safe ? state : :skip
+
+function _maybe_check_alloc!(case::TestCase, state::Symbol, op::F, prepared, x) where {F}
+    effective = _resolve_alloc_state(case, state)
+    effective === :skip && return nothing
+    op(prepared, x)  # warm up
+    allocs = @allocated op(prepared, x)
+    return _record_alloc!(effective, allocs)
+end
+
+function _maybe_check_inferred!(state::Symbol, op::F, prepared, x) where {F}
+    state === :skip && return nothing
+    return _record_inferred!(state, _is_inferred(op, prepared, x))
+end
+
 end # module
diff --git a/src/AbstractPPL.jl b/src/AbstractPPL.jl
index 2a32494d..78f0748f 100644
--- a/src/AbstractPPL.jl
+++ b/src/AbstractPPL.jl
@@ -17,31 +17,30 @@ using .Evaluators:
 """
     generate_testcases(::Val{group})
 
-Return a tuple of test cases for the conformance `group`. Implemented by the
-`Test` extension (`AbstractPPLTestExt`). Reserved group keys (extensions must
-not redefine these): `:vector` for value/gradient/jacobian round-trips on
-vector-input evaluators; `:hessian` for `order=2` value/gradient/Hessian
-round-trips on vector-input scalar-output evaluators; `:namedtuple` for
-`NamedTuple`-input evaluators; `:edge` for error-path cases; `:cache_reuse`
-for repeated calls against a single prepared evaluator. Downstream packages
-may add other keys.
+Return a tuple of AD conformance test cases for the input-shape `group`. Reserved
+groups: `:vector` (vector input) and `:namedtuple` (NamedTuple input; Mooncake-only).
+Each case carries a `tag` (e.g. `:hessian`, `:edge`) callers may filter on; pass each to
+[`run_testcase`](@ref). Implemented by the `Test` extension (`AbstractPPLTestExt`).
 """
 function generate_testcases end
 
 """
-    run_testcases(::Val{group}, prepare_fn=AbstractPPL.prepare; adtype, kwargs...)
+    run_testcase(case; adtype, prepare_fn=AbstractPPL.prepare, atol=0, rtol=1e-10,
+                 check_dims=true, type_stability=:skip, allocations=:skip)
 
-Run the test cases produced by [`generate_testcases`](@ref) against an AD
-backend, using `prepare_fn` (default `AbstractPPL.prepare`) to construct each
-prepared evaluator. Implemented by the `Test` extension. See
-[`generate_testcases`](@ref) for reserved group keys.
+Run a single conformance case against an AD backend. `type_stability` and `allocations`
+accept `:skip` / `:test` / `:broken` — `:skip` omits the check, `:test` asserts the
+invariant, `:broken` marks it `@test_broken` (use for backends with known
+regressions). Implemented by the `Test` extension.
 """
-function run_testcases end
+function run_testcase end
 
 @static if VERSION >= v"1.11.0"
     eval(
         Meta.parse(
-            "public prepare, value_and_gradient!!, value_and_jacobian!!, value_gradient_and_hessian!!, order, generate_testcases, run_testcases",
+            "public prepare, value_and_gradient!!, value_and_jacobian!!, " *
+            "value_gradient_and_hessian!!, order, " *
+            "generate_testcases, run_testcase",
         ),
     )
 end
diff --git a/test/ext/differentiationinterface/main.jl b/test/ext/differentiationinterface/main.jl
index dff6473e..e709bbd9 100644
--- a/test/ext/differentiationinterface/main.jl
+++ b/test/ext/differentiationinterface/main.jl
@@ -6,7 +6,8 @@ Pkg.instantiate()
 using AbstractPPL:
     AbstractPPL,
     prepare,
-    run_testcases,
+    generate_testcases,
+    run_testcase,
     value_and_gradient!!,
     value_gradient_and_hessian!!,
     order
@@ -22,32 +23,35 @@ quadratic(x::AbstractVector{<:Real}) = sum(xi -> xi^2, x)
 
 @testset "AbstractPPLDifferentiationInterfaceExt" begin
     @testset "ForwardDiff" begin
-        run_testcases(Val(:vector); adtype=AutoForwardDiff(), atol=1e-6, rtol=1e-6)
-        run_testcases(Val(:hessian); adtype=AutoForwardDiff(), atol=1e-6, rtol=1e-6)
-        run_testcases(Val(:cache_reuse); adtype=AutoForwardDiff(), atol=1e-6, rtol=1e-6)
-        run_testcases(Val(:edge); adtype=AutoForwardDiff())
+        for case in generate_testcases(Val(:vector))
+            run_testcase(case; adtype=AutoForwardDiff(), atol=1e-6, rtol=1e-6)
+        end
     end
 
-    # Compiled-tape ReverseDiff goes through the `_di_call_shape(::AutoReverseDiff{true}, …)`
-    # specialisation that closes the evaluator into a `Base.Fix2` target — the
-    # `:cache_reuse` group exercises that path across multiple inputs.
+    # Compiled-tape ReverseDiff closes the evaluator into a `Base.Fix2` target
+    # via `_di_call_shape(::AutoReverseDiff{true}, …)`; the `:cache_reuse`
+    # cases exercise that path across multiple inputs. Skip `:hessian`
+    # (compiled tape doesn't support `prepare_hessian`).
     @testset "ReverseDiff (compiled tape)" begin
         adtype = AutoReverseDiff(; compile=true)
-        run_testcases(Val(:vector); adtype=adtype, atol=1e-6, rtol=1e-6)
-        run_testcases(Val(:cache_reuse); adtype=adtype, atol=1e-6, rtol=1e-6)
-        run_testcases(Val(:edge); adtype=adtype)
+        for case in generate_testcases(Val(:vector))
+            case.tag === :hessian && continue
+            run_testcase(case; adtype, atol=1e-6, rtol=1e-6)
+        end
     end
 
     # The DI cache types' `Mode` parameter is either `:closure` (compiled-tape
     # ReverseDiff) or the integer context length on the constants path. The
     # constants-path integer also documents how many `DI.Constant`s the AD
-    # call passes.
+    # call passes. `AutoReverseDiff()` (non-compiled) is used here because the
+    # direct `AbstractPPLForwardDiffExt` path takes precedence over DI for
+    # `AutoForwardDiff` when both extensions are loaded.
     @testset "DI cache encodes the call mode as a type parameter" begin
         x = [1.0, 2.0, 3.0]
-        prep_noctx = prepare(AutoForwardDiff(), quadratic, x)
+        prep_noctx = prepare(AutoReverseDiff(), quadratic, x)
         prep_closure = prepare(AutoReverseDiff(; compile=true), quadratic, x)
         affine(y, a, b) = a * sum(abs2, y) + b
-        prep_ctx = prepare(AutoForwardDiff(), affine, x; context=(2.0, 1.0))
+        prep_ctx = prepare(AutoReverseDiff(), affine, x; context=(2.0, 1.0))
 
         @test prep_noctx.cache isa DIExt.DIGradientCache{0}
         @test prep_closure.cache isa DIExt.DIGradientCache{:closure}
diff --git a/test/ext/forwarddiff/Project.toml b/test/ext/forwarddiff/Project.toml
new file mode 100644
index 00000000..7666f241
--- /dev/null
+++ b/test/ext/forwarddiff/Project.toml
@@ -0,0 +1,13 @@
+[deps]
+AbstractPPL = "7a57a42e-76ec-4ea3-a279-07e840d6d9cf"
+ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
+DiffResults = "163ba53b-c6d8-5494-b064-1a9d43ac40c5"
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[compat]
+ADTypes = "1"
+DiffResults = "1"
+ForwardDiff = "0.10, 1"
+julia = "1.10"
diff --git a/test/ext/forwarddiff/main.jl b/test/ext/forwarddiff/main.jl
new file mode 100644
index 00000000..5173339b
--- /dev/null
+++ b/test/ext/forwarddiff/main.jl
@@ -0,0 +1,50 @@
+using Pkg
+Pkg.activate(@__DIR__)
+Pkg.develop(; path=joinpath(@__DIR__, "..", "..", ".."))
+Pkg.instantiate()
+
+using AbstractPPL:
+    AbstractPPL, prepare, generate_testcases, run_testcase, value_and_gradient!!
+using ADTypes: AutoForwardDiff
+using ForwardDiff
+using Test
+
+@testset "AbstractPPLForwardDiffExt" begin
+    @testset "ForwardDiff (default chunk)" begin
+        for case in generate_testcases(Val(:vector))
+            run_testcase(
+                case;
+                adtype=AutoForwardDiff(),
+                atol=1e-6,
+                rtol=1e-6,
+                allocations=:test,
+                type_stability=:test,
+            )
+        end
+    end
+
+    # `chunksize=2` needs x with at least two elements; skip the `:context`
+    # case (x of length 1) and `:edge` cases (chunk doesn't apply).
+    @testset "ForwardDiff (explicit chunk)" begin
+        ad = AutoForwardDiff(; chunksize=2)
+        for case in generate_testcases(Val(:vector))
+            case.tag ∈ (:vector, :cache_reuse, :hessian) || continue
+            run_testcase(case; adtype=ad, atol=1e-6, rtol=1e-6)
+        end
+    end
+
+    # `AutoForwardDiff(; tag=...)` exists for nested differentiation. The tag's
+    # type parameter is a sentinel chosen by the caller (e.g. DynamicPPL's
+    # `DynamicPPLTag`); it intentionally does not equal `typeof(target)`, so
+    # the hot path must skip `ForwardDiff.checktag` to avoid a false error.
+    @testset "custom AutoForwardDiff tag" begin
+        struct OuterTag end
+        custom = ForwardDiff.Tag{OuterTag,Float64}()
+        x = [1.0, 2.0]
+        prep = prepare(AutoForwardDiff(; tag=custom), x -> sum(abs2, x), x)
+        @test typeof(prep.cache.config).parameters[1] === typeof(custom)
+        val, grad = value_and_gradient!!(prep, x)
+        @test val ≈ 5.0
+        @test grad ≈ [2.0, 4.0]
+    end
+end
diff --git a/test/ext/mooncake/main.jl b/test/ext/mooncake/main.jl
index 855d5f5b..50c2f2d3 100644
--- a/test/ext/mooncake/main.jl
+++ b/test/ext/mooncake/main.jl
@@ -3,25 +3,62 @@ Pkg.activate(@__DIR__)
 Pkg.develop(; path=joinpath(@__DIR__, "..", "..", ".."))
 Pkg.instantiate()
 
-using AbstractPPL: AbstractPPL, prepare, run_testcases, value_and_gradient!!
+using AbstractPPL:
+    AbstractPPL, prepare, generate_testcases, run_testcase, value_and_gradient!!
 using ADTypes: AutoMooncake, AutoMooncakeForward
 using Mooncake
 using Test
 
+# Known-broken paths in Mooncake:
+#   * `value_and_jacobian!!` allocates fresh cotangent/Jacobian buffers on
+#     every call (both modes); forward-mode Jacobian return type infers as
+#     `Tuple{Any, Union{Array{T,3}, Matrix}}`.
+#   * `value_and_gradient!!` on a forward-mode context-lowered prep splats
+#     `args_to_zero` per call and allocates; forward mode also fails inference.
+function _mooncake_alloc(case, adtype)
+    if case.tag === :vector && case.jacobian !== nothing
+        return :broken
+    elseif case.tag === :context && adtype isa AutoMooncakeForward
+        return :broken
+    elseif VERSION < v"1.11"
+        # Mooncake's value_and_gradient!! allocations are flaky on Julia 1.10
+        # (resolver-dependent: some Mooncake versions alloc, others don't).
+        return :skip
+    else
+        return :test
+    end
+end
+# The forward-mode Jacobian inference issue only affects non-empty input;
+# the empty-input shortcut bypasses Mooncake and is inferable on either mode.
+function _mooncake_inferred(case, adtype)
+    is_jac_inf_broken =
+        case.tag === :vector &&
+        case.jacobian !== nothing &&
+        length(case.x) > 0 &&
+        adtype isa AutoMooncakeForward
+    is_ctx_inf_broken = case.tag === :context && adtype isa AutoMooncakeForward
+    return (is_jac_inf_broken || is_ctx_inf_broken) ? :broken : :test
+end
+
 @testset "AbstractPPLMooncakeExt" begin
     for (label, adtype) in (
         ("Mooncake (reverse)", AutoMooncake()),
         ("Mooncake (forward)", AutoMooncakeForward()),
     )
         @testset "$label" begin
-            run_testcases(Val(:vector); adtype=adtype, atol=1e-6, rtol=1e-6)
-            run_testcases(Val(:namedtuple); adtype=adtype, atol=1e-6, rtol=1e-6)
-            run_testcases(Val(:cache_reuse); adtype=adtype, atol=1e-6, rtol=1e-6)
-            run_testcases(Val(:edge); adtype=adtype)
-            # Hessian (`order=2`) is reverse-mode only on the AutoMooncake side;
-            # AutoMooncakeForward routes through the same generic Hessian path
-            # since `Mooncake.prepare_hessian_cache` is mode-agnostic.
-            run_testcases(Val(:hessian); adtype=adtype, atol=1e-6, rtol=1e-6)
+            for case in generate_testcases(Val(:vector))
+                run_testcase(
+                    case;
+                    adtype,
+                    atol=1e-6,
+                    rtol=1e-6,
+                    allocations=_mooncake_alloc(case, adtype),
+                    type_stability=_mooncake_inferred(case, adtype),
+                )
+            end
+            for case in generate_testcases(Val(:namedtuple))
+                run_testcase(case; adtype, atol=1e-6, rtol=1e-6)
+            end
         end
     end
 
diff --git a/test/run_extras.jl b/test/run_extras.jl
index cd2c157e..e84b5ea9 100644
--- a/test/run_extras.jl
+++ b/test/run_extras.jl
@@ -2,9 +2,10 @@
 #
 # Usage (from the repo root):
 #   LABEL=ext/differentiationinterface julia test/run_extras.jl
+#   LABEL=ext/forwarddiff              julia test/run_extras.jl
 #   LABEL=ext/mooncake                 julia test/run_extras.jl
 
-const VALID_LABELS = ("ext/differentiationinterface", "ext/mooncake")
+const VALID_LABELS = ("ext/differentiationinterface", "ext/forwarddiff", "ext/mooncake")
 
 label = get(ENV, "LABEL", nothing)
 label in VALID_LABELS ||