From 35d4f9269d595efc5c8507892a69c3c05f9a4065 Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Fri, 3 Jul 2026 12:18:31 -0400 Subject: [PATCH 1/2] Seed dense arrays of isbits duals without scalar indexing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ForwardDiff 1.x seeds duals with scalar setindex! loops over structural_eachindex, which errors on GPU arrays ("scalar indexing is disallowed") — a regression vs 0.10's broadcast seeding. Add a fast path to the four seed! methods gated on duals isa DenseArray && isbitstype(V) && !Base.has_offset_axes(duals, x) using broadcast for the bulk writes and map! over contiguous views for the chunk writes. AbstractGPUArray <: DenseArray, so this restores GPU jacobians without a weak dependency, and it is faster than the scalar loop on the CPU as well: the structural path pays an O(index) Iterators.drop walk per chunk, i.e. O(n^2/N) per chunked sweep. map! (not broadcast) is used for the chunk writes because slicing the seeds tuple at runtime allocates, and for the index:end write because the broadcast dotview allocates under --check-bounds=yes on Julia 1.10. Structural wrappers, non-isbits values (unset-element handling), and offset axes keep using the structural path unchanged. Co-Authored-By: Chris Rackauckas Co-Authored-By: Claude Fable 5 Claude-Session: https://claude.ai/code/session_01Vx7zQ96NYk4VV4ML2s3kAC --- Project.toml | 6 +++-- src/apiutils.jl | 37 ++++++++++++++++++++++++++-- test/GPUArraysTest.jl | 56 +++++++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 5 ++++ 4 files changed, 100 insertions(+), 4 deletions(-) create mode 100644 test/GPUArraysTest.jl diff --git a/Project.toml b/Project.toml index 77f40590..48ef2fb3 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "ForwardDiff" uuid = "f6369f11-7733-5829-9624-2563aa707210" -version = "1.4.1" +version = "1.4.2" [deps] CommonSubexpressions = "bbf7d656-a473-5ed7-a52c-81e309532950" @@ -28,6 +28,7 @@ DiffRules = "1.4" DiffTests = "0.1" IrrationalConstants = "0.1, 0.2" JET = "0.9, 0.10, 0.11" +JLArrays = "0.1, 0.2" LogExpFunctions = "0.3, 1" NaNMath = "1" Preferences = "1" @@ -41,9 +42,10 @@ DiffTests = "de460e47-3fe3-5279-bb4a-814414816d5d" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" IrrationalConstants = "92d709cd-6900-40b7-9082-c6be49f344b6" JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b" +JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Calculus", "DiffTests", "IrrationalConstants", "JET", "SparseArrays", "StaticArrays", "Test", "InteractiveUtils"] +test = ["Calculus", "DiffTests", "IrrationalConstants", "JET", "JLArrays", "SparseArrays", "StaticArrays", "Test", "InteractiveUtils"] diff --git a/src/apiutils.jl b/src/apiutils.jl index f401a3fc..bcfd6258 100644 --- a/src/apiutils.jl +++ b/src/apiutils.jl @@ -70,9 +70,21 @@ function structural_eachindex(x::Diagonal, y::AbstractArray) return diagind(x) end +# Dense arrays with isbits values need no structural-index or unset-element +# handling, so they can be seeded with broadcast/`map!` over contiguous views. +# This keeps GPU arrays (`AbstractGPUArray <: DenseArray`) free of scalar +# indexing and avoids the O(index) `Iterators.drop` walk of the structural +# path. Broadcast is fastest for the bulk writes; the chunk writes use `map!` +# with an index range because slicing the seeds tuple at runtime allocates. +@inline function dense_seedable(duals, x, ::Type{V}) where {V} + duals isa DenseArray && isbitstype(V) && !Base.has_offset_axes(duals, x) +end + function seed!(duals::AbstractArray{Dual{T,V,N}}, x, seed::Partials{N,V} = zero(Partials{N,V})) where {T,V,N} - if isbitstype(V) + if dense_seedable(duals, x, V) && axes(duals) == axes(x) + duals .= Dual{T,V,N}.(x, Ref(seed)) + elseif isbitstype(V) for idx in structural_eachindex(duals, x) duals[idx] = Dual{T,V,N}(x[idx], seed) end @@ -90,7 +102,12 @@ end function seed!(duals::AbstractArray{Dual{T,V,N}}, x, seeds::NTuple{N,Partials{N,V}}) where {T,V,N} - if isbitstype(V) + if dense_seedable(duals, x, V) + length(duals) == length(x) || throw(DimensionMismatch()) + dual_inds = 1:min(N, length(duals)) + map!((xi, i) -> Dual{T,V,N}(xi, seeds[i]), + view(duals, dual_inds), view(x, dual_inds), dual_inds) + elseif isbitstype(V) for (i, idx) in zip(1:N, structural_eachindex(duals, x)) duals[idx] = Dual{T,V,N}(x[idx], seeds[i]) end @@ -108,6 +125,14 @@ end function seed!(duals::AbstractArray{Dual{T,V,N}}, x, index, seed::Partials{N,V} = zero(Partials{N,V})) where {T,V,N} + if dense_seedable(duals, x, V) + length(duals) == length(x) || throw(DimensionMismatch()) + dual_inds = index:length(duals) + # map! rather than broadcast: the dotview allocates under + # --check-bounds=yes on Julia 1.10 + map!(xi -> Dual{T,V,N}(xi, seed), view(duals, dual_inds), view(x, dual_inds)) + return duals + end offset = index - 1 idxs = Iterators.drop(structural_eachindex(duals, x), offset) if isbitstype(V) @@ -128,6 +153,14 @@ end function seed!(duals::AbstractArray{Dual{T,V,N}}, x, index, seeds::NTuple{N,Partials{N,V}}, chunksize = N) where {T,V,N} + if dense_seedable(duals, x, V) + length(duals) == length(x) || throw(DimensionMismatch()) + shift = index - 1 + dual_inds = (1 + shift):min(shift + chunksize, length(duals)) + map!((xi, i) -> Dual{T,V,N}(xi, seeds[i - shift]), + view(duals, dual_inds), view(x, dual_inds), dual_inds) + return duals + end offset = index - 1 idxs = Iterators.drop(structural_eachindex(duals, x), offset) if isbitstype(V) diff --git a/test/GPUArraysTest.jl b/test/GPUArraysTest.jl new file mode 100644 index 00000000..cbec58fd --- /dev/null +++ b/test/GPUArraysTest.jl @@ -0,0 +1,56 @@ +module GPUArraysTest + +using ForwardDiff, Test +using JLArrays + +# JLArrays emulates GPU array semantics (including the scalar-indexing ban) on +# the CPU, so the dense-array `seed!` fast paths can be exercised on a GPU-like +# array type without a physical GPU. GPU arrays are covered by the dense fast +# paths because `AbstractGPUArray <: DenseArray`. +JLArrays.allowscalar(false) + +@testset "ForwardDiff seeding on GPU arrays" begin + f(x) = x .^ 2 .+ 2 .* x + + @testset "jacobian, vector mode (length $n)" for n in (1, 4, 8) + x = collect(Float64, 1:n) + @test Array(ForwardDiff.jacobian(f, JLArray(x))) == ForwardDiff.jacobian(f, x) + end + + # lengths above the chunk size exercise the chunked `seed!` methods + @testset "jacobian, chunk mode (length $n, chunk $c)" for n in (16, 20, 27), c in (4, 8) + x = collect(Float64, 1:n) + cfg = ForwardDiff.JacobianConfig(f, JLArray(x), ForwardDiff.Chunk{c}()) + @test Array(ForwardDiff.jacobian(f, JLArray(x), cfg)) == ForwardDiff.jacobian(f, x) + end + + @testset "jacobian! into a GPU array (length $n)" for n in (4, 16) + x = collect(Float64, 1:n) + out = JLArray(zeros(n, n)) + ForwardDiff.jacobian!(out, f, JLArray(x)) + @test Array(out) == ForwardDiff.jacobian(f, x) + end + + @testset "jacobian of f! with GPU input and output" begin + f!(y, x) = (y .= x .^ 2 .+ 2 .* x; nothing) + x = collect(Float64, 1:8) + y = zeros(8) + J = ForwardDiff.jacobian(f!, JLArray(y), JLArray(x)) + @test Array(J) == ForwardDiff.jacobian(f!, y, x) + end + + @testset "jacobian with matrix input (chunk $c)" for c in (3, 6) + X = reshape(collect(Float64, 1:12), 4, 3) + g(x) = x .* sum(x) + cfg = ForwardDiff.JacobianConfig(g, JLArray(X), ForwardDiff.Chunk{c}()) + @test Array(ForwardDiff.jacobian(g, JLArray(X), cfg)) ≈ ForwardDiff.jacobian(g, X) + end + + @testset "jacobian with view input" begin + X = JLArray(reshape(collect(Float64, 1:18), 6, 3)) + xv = view(X, :, 2) + @test Array(ForwardDiff.jacobian(f, xv)) == ForwardDiff.jacobian(f, Array(xv)) + end +end + +end # module diff --git a/test/runtests.jl b/test/runtests.jl index 2193242d..dea551ce 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -43,6 +43,11 @@ Random.seed!(SEED) t = @elapsed include("ConfusionTest.jl") println("##### done (took $t seconds).") end + @testset "GPUArrays" begin + println("##### Testing seeding on GPU arrays...") + t = @elapsed include("GPUArraysTest.jl") + println("##### done (took $t seconds).") + end @testset "Miscellaneous" begin println("##### Testing miscellaneous functionality...") t = @elapsed include("MiscTest.jl") From bacc1a22b67772b173bc02687dceb5f14877e835 Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Fri, 3 Jul 2026 12:45:04 -0400 Subject: [PATCH 2/2] Unseed only the chunk in 3-arg seed! The chunk-mode unseed call seed!(xdual, x, i) only needs to clear the N-wide chunk seeded at i: the rest of the array is zeroed up front and every other chunk clears itself. ForwardDiff 0.10 wrote exactly N elements here; the 1.x rewrite made it write from i to the end of the array, i.e. O(n^2/2N) redundant dual writes per chunked gradient/jacobian sweep (~40 GB of memory traffic for gradient! of 100000 elements at chunk 12). Write at most N elements starting at index: Iterators.take(_, N) on the structural path, a clamped range on the dense path. The 3-arg seed! form has no other callers. gradient! n=1000 (chunk 12): 344 -> 222 us; n=100000: 3.96 -> 2.61 s. Co-Authored-By: Chris Rackauckas Co-Authored-By: Claude Fable 5 Claude-Session: https://claude.ai/code/session_01Vx7zQ96NYk4VV4ML2s3kAC --- src/apiutils.jl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/apiutils.jl b/src/apiutils.jl index bcfd6258..94c1d612 100644 --- a/src/apiutils.jl +++ b/src/apiutils.jl @@ -123,18 +123,21 @@ function seed!(duals::AbstractArray{Dual{T,V,N}}, x, return duals end +# Writes at most N elements starting at `index`: chunk mode only ever needs to +# clear the N-wide chunk it just seeded, so writing through to the end of the +# array would be O(n) redundant work per chunk (O(n^2) per sweep). function seed!(duals::AbstractArray{Dual{T,V,N}}, x, index, seed::Partials{N,V} = zero(Partials{N,V})) where {T,V,N} if dense_seedable(duals, x, V) length(duals) == length(x) || throw(DimensionMismatch()) - dual_inds = index:length(duals) + dual_inds = index:min(index + N - 1, length(duals)) # map! rather than broadcast: the dotview allocates under # --check-bounds=yes on Julia 1.10 map!(xi -> Dual{T,V,N}(xi, seed), view(duals, dual_inds), view(x, dual_inds)) return duals end offset = index - 1 - idxs = Iterators.drop(structural_eachindex(duals, x), offset) + idxs = Iterators.take(Iterators.drop(structural_eachindex(duals, x), offset), N) if isbitstype(V) for idx in idxs duals[idx] = Dual{T,V,N}(x[idx], seed)