From 35d4f9269d595efc5c8507892a69c3c05f9a4065 Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Fri, 3 Jul 2026 12:18:31 -0400
Subject: [PATCH 1/2] Seed dense arrays of isbits duals without scalar indexing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ForwardDiff 1.x seeds duals with scalar setindex! loops over
structural_eachindex, which errors on GPU arrays ("scalar indexing is
disallowed") — a regression vs 0.10's broadcast seeding. Add a fast path
to the four seed! methods gated on

    duals isa DenseArray && isbitstype(V) && !Base.has_offset_axes(duals, x)

using broadcast for the bulk writes and map! over contiguous views for
the chunk writes. AbstractGPUArray <: DenseArray, so this restores GPU
jacobians without a weak dependency, and it is faster than the scalar
loop on the CPU as well: the structural path pays an O(index)
Iterators.drop walk per chunk, i.e. O(n^2/N) per chunked sweep.

map! (not broadcast) is used for the chunk writes because slicing the
seeds tuple at runtime allocates, and for the index:end write because
the broadcast dotview allocates under --check-bounds=yes on Julia 1.10.

Structural wrappers, non-isbits values (unset-element handling), and
offset axes keep using the structural path unchanged.

Co-Authored-By: Chris Rackauckas <accounts@chrisrackauckas.com>
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01Vx7zQ96NYk4VV4ML2s3kAC
---
 Project.toml          |  6 +++--
 src/apiutils.jl       | 37 ++++++++++++++++++++++++++--
 test/GPUArraysTest.jl | 56 +++++++++++++++++++++++++++++++++++++++++++
 test/runtests.jl      |  5 ++++
 4 files changed, 100 insertions(+), 4 deletions(-)
 create mode 100644 test/GPUArraysTest.jl

diff --git a/Project.toml b/Project.toml
index 77f40590..48ef2fb3 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "ForwardDiff"
 uuid = "f6369f11-7733-5829-9624-2563aa707210"
-version = "1.4.1"
+version = "1.4.2"
 
 [deps]
 CommonSubexpressions = "bbf7d656-a473-5ed7-a52c-81e309532950"
@@ -28,6 +28,7 @@ DiffRules = "1.4"
 DiffTests = "0.1"
 IrrationalConstants = "0.1, 0.2"
 JET = "0.9, 0.10, 0.11"
+JLArrays = "0.1, 0.2"
 LogExpFunctions = "0.3, 1"
 NaNMath = "1"
 Preferences = "1"
@@ -41,9 +42,10 @@ DiffTests = "de460e47-3fe3-5279-bb4a-814414816d5d"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 IrrationalConstants = "92d709cd-6900-40b7-9082-c6be49f344b6"
 JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b"
+JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Calculus", "DiffTests", "IrrationalConstants", "JET", "SparseArrays", "StaticArrays", "Test", "InteractiveUtils"]
+test = ["Calculus", "DiffTests", "IrrationalConstants", "JET", "JLArrays", "SparseArrays", "StaticArrays", "Test", "InteractiveUtils"]
diff --git a/src/apiutils.jl b/src/apiutils.jl
index f401a3fc..bcfd6258 100644
--- a/src/apiutils.jl
+++ b/src/apiutils.jl
@@ -70,9 +70,21 @@ function structural_eachindex(x::Diagonal, y::AbstractArray)
     return diagind(x)
 end
 
+# Dense arrays with isbits values need no structural-index or unset-element
+# handling, so they can be seeded with broadcast/`map!` over contiguous views.
+# This keeps GPU arrays (`AbstractGPUArray <: DenseArray`) free of scalar
+# indexing and avoids the O(index) `Iterators.drop` walk of the structural
+# path. Broadcast is fastest for the bulk writes; the chunk writes use `map!`
+# with an index range because slicing the seeds tuple at runtime allocates.
+@inline function dense_seedable(duals, x, ::Type{V}) where {V}
+    duals isa DenseArray && isbitstype(V) && !Base.has_offset_axes(duals, x)
+end
+
 function seed!(duals::AbstractArray{Dual{T,V,N}}, x,
                seed::Partials{N,V} = zero(Partials{N,V})) where {T,V,N}
-    if isbitstype(V)
+    if dense_seedable(duals, x, V) && axes(duals) == axes(x)
+        duals .= Dual{T,V,N}.(x, Ref(seed))
+    elseif isbitstype(V)
         for idx in structural_eachindex(duals, x)
             duals[idx] = Dual{T,V,N}(x[idx], seed)
         end
@@ -90,7 +102,12 @@ end
 
 function seed!(duals::AbstractArray{Dual{T,V,N}}, x,
                seeds::NTuple{N,Partials{N,V}}) where {T,V,N}
-    if isbitstype(V)
+    if dense_seedable(duals, x, V)
+        length(duals) == length(x) || throw(DimensionMismatch())
+        dual_inds = 1:min(N, length(duals))
+        map!((xi, i) -> Dual{T,V,N}(xi, seeds[i]),
+             view(duals, dual_inds), view(x, dual_inds), dual_inds)
+    elseif isbitstype(V)
         for (i, idx) in zip(1:N, structural_eachindex(duals, x))
             duals[idx] = Dual{T,V,N}(x[idx], seeds[i])
         end
@@ -108,6 +125,14 @@ end
 
 function seed!(duals::AbstractArray{Dual{T,V,N}}, x, index,
                seed::Partials{N,V} = zero(Partials{N,V})) where {T,V,N}
+    if dense_seedable(duals, x, V)
+        length(duals) == length(x) || throw(DimensionMismatch())
+        dual_inds = index:length(duals)
+        # map! rather than broadcast: the dotview allocates under
+        # --check-bounds=yes on Julia 1.10
+        map!(xi -> Dual{T,V,N}(xi, seed), view(duals, dual_inds), view(x, dual_inds))
+        return duals
+    end
     offset = index - 1
     idxs = Iterators.drop(structural_eachindex(duals, x), offset)
     if isbitstype(V)
@@ -128,6 +153,14 @@ end
 
 function seed!(duals::AbstractArray{Dual{T,V,N}}, x, index,
                seeds::NTuple{N,Partials{N,V}}, chunksize = N) where {T,V,N}
+    if dense_seedable(duals, x, V)
+        length(duals) == length(x) || throw(DimensionMismatch())
+        shift = index - 1
+        dual_inds = (1 + shift):min(shift + chunksize, length(duals))
+        map!((xi, i) -> Dual{T,V,N}(xi, seeds[i - shift]),
+             view(duals, dual_inds), view(x, dual_inds), dual_inds)
+        return duals
+    end
     offset = index - 1
     idxs = Iterators.drop(structural_eachindex(duals, x), offset)
     if isbitstype(V)
diff --git a/test/GPUArraysTest.jl b/test/GPUArraysTest.jl
new file mode 100644
index 00000000..cbec58fd
--- /dev/null
+++ b/test/GPUArraysTest.jl
@@ -0,0 +1,56 @@
+module GPUArraysTest
+
+using ForwardDiff, Test
+using JLArrays
+
+# JLArrays emulates GPU array semantics (including the scalar-indexing ban) on
+# the CPU, so the dense-array `seed!` fast paths can be exercised on a GPU-like
+# array type without a physical GPU. GPU arrays are covered by the dense fast
+# paths because `AbstractGPUArray <: DenseArray`.
+JLArrays.allowscalar(false)
+
+@testset "ForwardDiff seeding on GPU arrays" begin
+    f(x) = x .^ 2 .+ 2 .* x
+
+    @testset "jacobian, vector mode (length $n)" for n in (1, 4, 8)
+        x = collect(Float64, 1:n)
+        @test Array(ForwardDiff.jacobian(f, JLArray(x))) == ForwardDiff.jacobian(f, x)
+    end
+
+    # lengths above the chunk size exercise the chunked `seed!` methods
+    @testset "jacobian, chunk mode (length $n, chunk $c)" for n in (16, 20, 27), c in (4, 8)
+        x = collect(Float64, 1:n)
+        cfg = ForwardDiff.JacobianConfig(f, JLArray(x), ForwardDiff.Chunk{c}())
+        @test Array(ForwardDiff.jacobian(f, JLArray(x), cfg)) == ForwardDiff.jacobian(f, x)
+    end
+
+    @testset "jacobian! into a GPU array (length $n)" for n in (4, 16)
+        x = collect(Float64, 1:n)
+        out = JLArray(zeros(n, n))
+        ForwardDiff.jacobian!(out, f, JLArray(x))
+        @test Array(out) == ForwardDiff.jacobian(f, x)
+    end
+
+    @testset "jacobian of f! with GPU input and output" begin
+        f!(y, x) = (y .= x .^ 2 .+ 2 .* x; nothing)
+        x = collect(Float64, 1:8)
+        y = zeros(8)
+        J = ForwardDiff.jacobian(f!, JLArray(y), JLArray(x))
+        @test Array(J) == ForwardDiff.jacobian(f!, y, x)
+    end
+
+    @testset "jacobian with matrix input (chunk $c)" for c in (3, 6)
+        X = reshape(collect(Float64, 1:12), 4, 3)
+        g(x) = x .* sum(x)
+        cfg = ForwardDiff.JacobianConfig(g, JLArray(X), ForwardDiff.Chunk{c}())
+        @test Array(ForwardDiff.jacobian(g, JLArray(X), cfg)) ≈ ForwardDiff.jacobian(g, X)
+    end
+
+    @testset "jacobian with view input" begin
+        X = JLArray(reshape(collect(Float64, 1:18), 6, 3))
+        xv = view(X, :, 2)
+        @test Array(ForwardDiff.jacobian(f, xv)) == ForwardDiff.jacobian(f, Array(xv))
+    end
+end
+
+end # module
diff --git a/test/runtests.jl b/test/runtests.jl
index 2193242d..dea551ce 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -43,6 +43,11 @@ Random.seed!(SEED)
         t = @elapsed include("ConfusionTest.jl")
         println("##### done (took $t seconds).")
     end
+    @testset "GPUArrays" begin
+        println("##### Testing seeding on GPU arrays...")
+        t = @elapsed include("GPUArraysTest.jl")
+        println("##### done (took $t seconds).")
+    end
     @testset "Miscellaneous" begin
         println("##### Testing miscellaneous functionality...")
         t = @elapsed include("MiscTest.jl")

From bacc1a22b67772b173bc02687dceb5f14877e835 Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Fri, 3 Jul 2026 12:45:04 -0400
Subject: [PATCH 2/2] Unseed only the chunk in 3-arg seed!

The chunk-mode unseed call seed!(xdual, x, i) only needs to clear the
N-wide chunk seeded at i: the rest of the array is zeroed up front and
every other chunk clears itself. ForwardDiff 0.10 wrote exactly N
elements here; the 1.x rewrite made it write from i to the end of the
array, i.e. O(n^2/2N) redundant dual writes per chunked
gradient/jacobian sweep (~40 GB of memory traffic for gradient! of
100000 elements at chunk 12).

Write at most N elements starting at index: Iterators.take(_, N) on the
structural path, a clamped range on the dense path. The 3-arg seed!
form has no other callers.

gradient! n=1000 (chunk 12): 344 -> 222 us; n=100000: 3.96 -> 2.61 s.

Co-Authored-By: Chris Rackauckas <accounts@chrisrackauckas.com>
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01Vx7zQ96NYk4VV4ML2s3kAC
---
 src/apiutils.jl | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/apiutils.jl b/src/apiutils.jl
index bcfd6258..94c1d612 100644
--- a/src/apiutils.jl
+++ b/src/apiutils.jl
@@ -123,18 +123,21 @@ function seed!(duals::AbstractArray{Dual{T,V,N}}, x,
     return duals
 end
 
+# Writes at most N elements starting at `index`: chunk mode only ever needs to
+# clear the N-wide chunk it just seeded, so writing through to the end of the
+# array would be O(n) redundant work per chunk (O(n^2) per sweep).
 function seed!(duals::AbstractArray{Dual{T,V,N}}, x, index,
                seed::Partials{N,V} = zero(Partials{N,V})) where {T,V,N}
     if dense_seedable(duals, x, V)
         length(duals) == length(x) || throw(DimensionMismatch())
-        dual_inds = index:length(duals)
+        dual_inds = index:min(index + N - 1, length(duals))
         # map! rather than broadcast: the dotview allocates under
         # --check-bounds=yes on Julia 1.10
         map!(xi -> Dual{T,V,N}(xi, seed), view(duals, dual_inds), view(x, dual_inds))
         return duals
     end
     offset = index - 1
-    idxs = Iterators.drop(structural_eachindex(duals, x), offset)
+    idxs = Iterators.take(Iterators.drop(structural_eachindex(duals, x), offset), N)
     if isbitstype(V)
         for idx in idxs
             duals[idx] = Dual{T,V,N}(x[idx], seed)