From dfa507142b137b7edecc5a04cd7dd56a6b74ef1a Mon Sep 17 00:00:00 2001
From: lkdvos
Date: Thu, 18 Jun 2026 18:37:07 -0400
Subject: [PATCH] Erase op types from mapreduce bookkeeping for precompilation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The mapreduce setup chain (`_mapreducedim!`, `_mapreduce_fuse!`, `_mapreduce_order!`, `_mapreduce_block!`, `_mapreduce_threaded!`, and the public `map`/`map!`/`mapreduce`/`mapreducedim!`/`_mapreduce` entry points) previously specialized on the map/reduce function types `f`/`op`/`initop`. None of the bookkeeping logic depends on what those functions are — it only fuses dimensions, sorts the loop order by cache-importance and computes cache blocks from the array shapes/strides — yet a workload that calls mapreduce with many distinct ops (as TensorOperations does) forced a fresh compilation of the entire chain per (op, eltype) combination.

`@nospecialize` the function arguments throughout so the bookkeeping specializes on the array-shape signature (ndims/n-arrays/eltypes) but no longer on the ops. A precompile workload can then compile it once per shape signature and reuse it across every op. The bookkeeping runs once per mapreduce call (coarse granularity), so erasing the op types is free at runtime; the only function that still specializes on the op is the monolithic `@generated _mapreduce_kernel!`, which is kept untouched.

The per-array data is deliberately kept as stack-allocated `Tuple`s rather than `M`-erased `Vector`s; the latter (a pure-`N` bookkeeping variant) was prototyped and rejected because it roughly doubled small-array call overhead for no additional spec-count reduction (see the note in `mapreduce.jl`).

The GPU `_mapreduce_block!` extension hook is unchanged: this commit does not alter `_mapreduce_block!`'s signature, so the extension's dispatch boundary is preserved.

Adds a `benchmark/` harness (compile / many-op / runtime / spec-count) used to validate the change.

Method specializations after a multi-op × multi-eltype × multi-ndims workload:

  function              baseline  branch
  _mapreduce_block!          346      75
  _mapreduce_fuse!           202      75
  _mapreduce_order!          202      75
  _mapreduce_threaded!       515     124
  _mapreduce_kernel!         659     412

Compile time: grid 28.8s -> 26.5s; many-distinct-ops 14.5s -> 11.0s.
Runtime (single-thread, BenchmarkTools) neutral-to-better on both tiny (4^N) and large (~4M-element) arrays; per-call allocations drop (e.g. 768 -> 464 B).

`Pkg.test` passes single- and multi-threaded, including the JLArray and CuArray GPU mapreduce/reduce tests.

Co-Authored-By: Claude Opus 4.8 (1M context)
---
 benchmark/Project.toml       |  3 ++
 benchmark/cases.jl           | 67 +++++++++++++++++++++++++++++++++
 benchmark/compile_bench.jl   | 57 ++++++++++++++++++++++++++++
 benchmark/manyops_compile.jl | 49 ++++++++++++++++++++++++
 benchmark/runtime_bench.jl   | 56 ++++++++++++++++++++++++++++
 benchmark/runtime_small.jl   | 39 +++++++++++++++++++
 benchmark/setup_env.jl       | 17 +++++++++
 benchmark/spec_count.jl      | 72 ++++++++++++++++++++++++++++++++++++
 src/mapreduce.jl             | 57 +++++++++++++++++++++-------
 9 files changed, 403 insertions(+), 14 deletions(-)
 create mode 100644 benchmark/Project.toml
 create mode 100644 benchmark/cases.jl
 create mode 100644 benchmark/compile_bench.jl
 create mode 100644 benchmark/manyops_compile.jl
 create mode 100644 benchmark/runtime_bench.jl
 create mode 100644 benchmark/runtime_small.jl
 create mode 100644 benchmark/setup_env.jl
 create mode 100644 benchmark/spec_count.jl

diff --git a/benchmark/Project.toml b/benchmark/Project.toml
new file mode 100644
index 0000000..c13013b
--- /dev/null
+++ b/benchmark/Project.toml
@@ -0,0 +1,3 @@
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+Strided = "5e0ebb24-38b0-5f93-81fe-25c709ecae67"
diff --git a/benchmark/cases.jl b/benchmark/cases.jl
new file mode 100644
index 0000000..bc1045c
--- /dev/null
+++ b/benchmark/cases.jl
@@ -0,0 +1,67 @@
+# Shared benchmark cases for the Strided mapreduce machinery.
+#
+# A `Case` describes one operation that exercises `_mapreducedim!` / the kernel,
+# parameterised by ndims `N`, element type `T`, and operation kind `kind`.
+# `make_runner(c, sz)` returns a zero-argument closure that performs the op
+# in-place on freshly allocated arrays of size `sz`. The same runner is used by
+# both the runtime benchmark and the compile/TTFX benchmark, so the two measure
+# exactly the same specializations.
+
+using Strided
+using Strided: StridedView
+
+@enum OpKind permute add reduce_inner reduce_outer reduce_full  # operation kinds dispatched on by `make_runner`
+
+struct Case
+    N::Int          # number of array dimensions (ndims)
+    T::DataType     # element type of the arrays
+    kind::OpKind    # which operation this case runs
+end
+
+name(c::Case) = "$(c.kind)_N$(c.N)_$(c.T)"  # label built from the case's kind, N and T
+
+# A non-trivial size tuple of N dims with roughly `total` elements, avoiding
+# size-1 dims (which would be pushed to the back / fused away).
+function sizetuple(N::Int, total::Int)
+    d = max(2, round(Int, total^(1 / N)))  # per-dim extent: rounded N-th root of `total`, never below 2
+    return ntuple(_ -> d, N)  # the same extent `d` in each of the N dims
+end
+
+function make_runner(c::Case, sz::NTuple{N,Int}) where {N}  # arrays are built here once; the returned closure reuses them
+    T = c.T
+    if c.kind == permute
+        p = reverse(ntuple(identity, Val(N))) # reverse perm: defeats fusion
+        src = StridedView(rand(T, sz))
+        dst = StridedView(zeros(T, getindex.(Ref(sz), p)))  # destination size is `sz` reordered by `p`
+        return () -> permutedims!(dst, src, p)
+    elseif c.kind == add
+        a = StridedView(rand(T, sz))
+        b = StridedView(rand(T, sz))
+        dst = StridedView(zeros(T, sz))
+        return () -> map!(+, dst, a, b)  # applies `+` over `a` and `b`, writing into `dst`
+    elseif c.kind == reduce_inner
+        A = StridedView(rand(T, sz))
+        outsz = ntuple(i -> i == 1 ? 1 : sz[i], Val(N))  # first dim collapsed to size 1
+        dst = StridedView(zeros(T, outsz))
+        return () -> (fill!(dst, zero(T)); Base.mapreducedim!(identity, +, dst, A))  # reset `dst` before each reduction
+    elseif c.kind == reduce_outer
+        A = StridedView(rand(T, sz))
+        outsz = ntuple(i -> i == N ? 1 : sz[i], Val(N))  # last dim collapsed to size 1
+        dst = StridedView(zeros(T, outsz))
+        return () -> (fill!(dst, zero(T)); Base.mapreducedim!(identity, +, dst, A))  # reset `dst` before each reduction
+    elseif c.kind == reduce_full
+        A = StridedView(rand(T, sz))
+        return () -> sum(A)  # full reduction; no destination array is allocated
+    else
+        error("unknown kind $(c.kind)")
+    end
+end
+
+# Build the full case grid.
+function all_cases(; Ns, Ts, kinds) + cs = Case[] + for kind in kinds, T in Ts, N in Ns + push!(cs, Case(N, T, kind)) + end + return cs +end diff --git a/benchmark/compile_bench.jl b/benchmark/compile_bench.jl new file mode 100644 index 0000000..b0d2def --- /dev/null +++ b/benchmark/compile_bench.jl @@ -0,0 +1,57 @@ +# Compile / TTFX benchmark. +# +# Run in a FRESH Julia process. Strided's kernels are not part of any precompile +# workload, so the first call to each (N, T, op) specialization triggers +# inference + codegen. `Base.@timed` reports `.compile_time` per call, which we +# sum across all cases to get the total cold-compile cost — the headline number +# we want to drive down. +# +# julia --project=benchmark benchmark/compile_bench.jl [label] +# +# Writes results to benchmark/results/compile_