Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions benchmark/Project.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
Strided = "5e0ebb24-38b0-5f93-81fe-25c709ecae67"
67 changes: 67 additions & 0 deletions benchmark/cases.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Shared benchmark cases for the Strided mapreduce machinery.
#
# A `Case` describes one operation that exercises `_mapreducedim!` / the kernel,
# parameterised by ndims `N`, element type `T`, and operation kind `kind`.
# `make_runner(c, sz)` returns a zero-argument closure that performs the op
# in-place on freshly allocated arrays of size `sz`. The same runner is used by
# both the runtime benchmark and the compile/TTFX benchmark, so the two measure
# exactly the same specializations.

using Strided
using Strided: StridedView

# The operation kinds a benchmark case can exercise.
@enum OpKind permute add reduce_inner reduce_outer reduce_full

# One benchmark case: number of dimensions `N`, element type `T`, and the
# operation `kind` to run.
struct Case
    N::Int
    T::DataType
    kind::OpKind
end

# Label of the form "<kind>_N<N>_<T>", used as the row key in result files.
name(c::Case) = string(c.kind, "_N", c.N, "_", c.T)

# Build an `N`-dimensional cubic size tuple holding roughly `total` elements.
# Every extent is the rounded N-th root of `total`, clamped to at least 2 so
# that no size-1 dims appear (those would be pushed to the back / fused away).
function sizetuple(N::Int, total::Int)
    side = round(Int, total^(1 / N))
    if side < 2
        side = 2
    end
    return ntuple(_ -> side, N)
end

# Return a zero-argument closure that performs the operation described by `c`
# in-place on freshly allocated arrays of size `sz`.
#
# The arrays are allocated once, here; every call of the returned closure reuses
# them. The element type is taken from `c.T`, the dimensionality from `sz`.
# Throws an `ErrorException` for a kind that has no runner.
#
# Each kind is built by its own helper. Assigning `dst` / `A` in several
# branches of a single function and then capturing them in closures makes Julia
# box those variables, and capturing `T` as a plain `DataType` value hides the
# element type from inference. Both would add dynamic-dispatch overhead to every
# timed call. In the helpers every captured variable is assigned exactly once
# and `T` is a static parameter, so the runners are type-stable.
function make_runner(c::Case, sz::NTuple{N,Int}) where {N}
    kind = c.kind
    kind == permute && return _permute_runner(c.T, sz)
    kind == add && return _add_runner(c.T, sz)
    kind == reduce_inner && return _reduce_runner(c.T, sz, 1)
    kind == reduce_outer && return _reduce_runner(c.T, sz, N)
    kind == reduce_full && return _sum_runner(c.T, sz)
    return error("unknown kind $(c.kind)")
end

# Runner for `permutedims!` with the fully reversed permutation.
function _permute_runner(::Type{T}, sz::NTuple{N,Int}) where {T,N}
    p = reverse(ntuple(identity, Val(N)))  # reverse perm: defeats fusion
    src = StridedView(rand(T, sz))
    dst = StridedView(zeros(T, getindex.(Ref(sz), p)))
    return () -> permutedims!(dst, src, p)
end

# Runner for the elementwise sum of two arrays via `map!`.
function _add_runner(::Type{T}, sz::NTuple{N,Int}) where {T,N}
    a = StridedView(rand(T, sz))
    b = StridedView(rand(T, sz))
    dst = StridedView(zeros(T, sz))
    return () -> map!(+, dst, a, b)
end

# Runner for a `+` reduction over dimension `dim` (the destination has extent 1
# there). The destination is re-zeroed on every call before reducing into it.
function _reduce_runner(::Type{T}, sz::NTuple{N,Int}, dim::Int) where {T,N}
    A = StridedView(rand(T, sz))
    outsz = ntuple(i -> i == dim ? 1 : sz[i], Val(N))
    dst = StridedView(zeros(T, outsz))
    return () -> (fill!(dst, zero(T)); Base.mapreducedim!(identity, +, dst, A))
end

# Runner for a full reduction with `sum`.
function _sum_runner(::Type{T}, sz::NTuple{N,Int}) where {T,N}
    A = StridedView(rand(T, sz))
    return () -> sum(A)
end

# Build the full case grid: one `Case` per (kind, T, N) combination, with `N`
# varying fastest and `kind` slowest.
function all_cases(; Ns, Ts, kinds)
    return Case[Case(N, T, kind) for kind in kinds for T in Ts for N in Ns]
end
57 changes: 57 additions & 0 deletions benchmark/compile_bench.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Compile / TTFX benchmark.
#
# Run in a FRESH Julia process. Strided's kernels are not part of any precompile
# workload, so the first call to each (N, T, op) specialization triggers
# inference + codegen. `Base.@timed` reports `.compile_time` per call, which we
# sum across all cases to get the total cold-compile cost — the headline number
# we want to drive down.
#
# julia --project=benchmark benchmark/compile_bench.jl [label]
#
# Writes results to benchmark/results/compile_<label>.tsv

include(joinpath(@__DIR__, "setup_env.jl"))
include(joinpath(@__DIR__, "cases.jl"))

const LABEL = length(ARGS) >= 1 ? ARGS[1] : "baseline"

# Small arrays: we want to isolate compile time, not run time.
const SMALL_TOTAL = 1 << 12 # 4096 elements

# Time the first (cold) call of every case, write a per-case TSV including a
# TOTAL row to benchmark/results/compile_<label>.tsv, and echo a summary table
# to stdout.
function main()
    grid = all_cases(;
        Ns = 2:7,
        Ts = (Float64, ComplexF64),
        kinds = (permute, add, reduce_inner, reduce_outer, reduce_full),
    )

    # One (name, compile_time, total_time) record per case, in grid order.
    records = Tuple{String,Float64,Float64}[]
    compile_sum = 0.0
    for c in grid
        runner = make_runner(c, sizetuple(c.N, SMALL_TOTAL))
        timing = Base.@timed runner()  # first (cold) call
        compile_sum += timing.compile_time
        push!(records, (name(c), timing.compile_time, timing.time))
    end

    resultsdir = mkpath(joinpath(@__DIR__, "results"))
    out = joinpath(resultsdir, "compile_$(LABEL).tsv")
    open(out, "w") do io
        println(io, "case\tcompile_s\ttotal_s")
        for (label, ctime, ttime) in records
            fields = (label, round(ctime; digits = 5), round(ttime; digits = 5))
            println(io, join(fields, '\t'))
        end
        println(io, "TOTAL\t", round(compile_sum; digits = 5), "\t")
    end

    println("== compile benchmark [$LABEL] ==")
    for (label, ctime, ttime) in records
        println(
            rpad(label, 32),
            " compile=",
            rpad(round(ctime; digits = 4), 9),
            " total=",
            round(ttime; digits = 4),
        )
    end
    println("-"^56)
    println(rpad("TOTAL compile_time (s)", 32), " ", round(compile_sum; digits = 4))
    println("\nwrote $out")
end

main()
49 changes: 49 additions & 0 deletions benchmark/manyops_compile.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Compile benchmark across MANY distinct op TYPES — simulating the real
# combinatorial explosion (TensorOperations generates many distinct map/reduce
# closures). Each `@eval`'d function is a distinct type, forcing a fresh
# specialization of the whole call chain.
#
# julia --project=benchmark benchmark/manyops_compile.jl [label] [Kops] [Nmax]

include(joinpath(@__DIR__, "setup_env.jl"))
using Strided
using Strided: StridedView

const LABEL = length(ARGS) >= 1 ? ARGS[1] : "baseline"
const KOPS = length(ARGS) >= 2 ? parse(Int, ARGS[2]) : 8
const NMAXD = length(ARGS) >= 3 ? parse(Int, ARGS[3]) : 5

# KOPS distinct unary map functions and KOPS distinct binary reduce ops. Each is
# a separately `@eval`'d named function (`mapf_<i>` / `redf_<i>`), so each has
# its own type.
const MAPFNS = Function[]
const REDFNS = Function[]
for i in 1:KOPS
    push!(MAPFNS, @eval ($(Symbol(:mapf_, i)))(x) = x * $i - $i)
    push!(REDFNS, @eval ($(Symbol(:redf_, i)))(x, y) = x + y * $(i % 3 + 1))
end

# Size tuple with `N` dims, each of extent 3.
function sz(N)
    return ntuple(Returns(3), N)
end

# Sum the compile time reported by `Base.@timed` for the first `map!` and
# `mapreducedim!` call of every (N, op) pair, single-threaded, then write the
# total to benchmark/results/manyops_<label>.txt and print it.
function main()
    Strided.set_num_threads(1)
    compile_sum = 0.0
    for N in 2:NMAXD, k in 1:KOPS
        src = StridedView(rand(Float64, sz(N)))
        dst = StridedView(zeros(Float64, sz(N)))
        mapfn = MAPFNS[k]
        compile_sum += Base.@timed(map!(mapfn, dst, src)).compile_time

        # The destination of the reduction has extent 1 in the first dim.
        red = StridedView(zeros(Float64, ntuple(i -> i == 1 ? 1 : 3, N)))
        redfn = REDFNS[k]
        compile_sum += Base.@timed(Base.mapreducedim!(identity, redfn, red, src)).compile_time
    end

    resultsdir = mkpath(joinpath(@__DIR__, "results"))
    rounded = round(compile_sum; digits = 4)
    open(joinpath(resultsdir, "manyops_$(LABEL).txt"), "w") do io
        println(io, "label=$LABEL Kops=$KOPS Nmax=$NMAXD total_compile_s=$(rounded)")
    end
    println("[$LABEL] Kops=$KOPS Nmax=$NMAXD TOTAL compile_time = $(rounded) s")
end

main()
56 changes: 56 additions & 0 deletions benchmark/runtime_bench.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Runtime benchmark.
#
# Measures steady-state (compiled) performance of the mapreduce machinery, so we
# can guard against runtime regressions — permutations especially. Runs each
# case single-threaded and (if available) multi-threaded.
#
# julia --project=benchmark -t auto benchmark/runtime_bench.jl [label]
#
# Writes results to benchmark/results/runtime_<label>.tsv

include(joinpath(@__DIR__, "setup_env.jl"))
include(joinpath(@__DIR__, "cases.jl"))
using BenchmarkTools

const LABEL = length(ARGS) >= 1 ? ARGS[1] : "baseline"

# Large enough that the kernel, not call overhead, dominates.
const BIG_TOTAL = 1 << 22 # ~4M elements

# Benchmark case `c` on arrays of roughly BIG_TOTAL elements and return the
# `@belapsed` result (30 samples, 1 evaluation each).
function bench_one(c::Case)
    runner = make_runner(c, sizetuple(c.N, BIG_TOTAL))
    runner()  # warm up / compile before any sample is taken
    elapsed = @belapsed $runner() samples = 30 evals = 1
    return elapsed
end

# Run every case single-threaded and, when Julia was started with more than one
# thread, again with all available threads. One row per (case, nthreads) is
# written to benchmark/results/runtime_<label>.tsv and echoed to stdout.
#
# The results file is opened with the `open(...) do io` form so the handle is
# closed, and buffered rows are flushed, even if a benchmark throws part-way
# through the run.
function main()
    cases = all_cases(;
        Ns = 2:6,
        Ts = (Float64, ComplexF64),
        kinds = (permute, add, reduce_inner, reduce_outer, reduce_full),
    )

    nthreads_available = Base.Threads.nthreads()
    thread_settings = nthreads_available > 1 ? (1, nthreads_available) : (1,)

    mkpath(joinpath(@__DIR__, "results"))
    out = joinpath(@__DIR__, "results", "runtime_$(LABEL).tsv")
    open(out, "w") do io
        println(io, "case\tnthreads\ttime_us")
        for nt in thread_settings
            Strided.set_num_threads(nt)
            println("== runtime [$LABEL] nthreads=$nt ==")
            for c in cases
                t = bench_one(c)
                us = t * 1e6  # the column is labelled in microseconds
                println(io, "$(name(c))\t$nt\t$(round(us; digits = 3))")
                println(rpad(name(c), 32), " nt=$nt ", round(us; digits = 3), " us")
            end
        end
    end
    println("\nwrote $out")
end

main()
39 changes: 39 additions & 0 deletions benchmark/runtime_small.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Small-array runtime benchmark: fixed bookkeeping overhead dominates here, so
# this is the sensitive guard against de-specialization regressions.
#
# julia --project=benchmark -t 1 benchmark/runtime_small.jl [label]
include(joinpath(@__DIR__, "setup_env.jl"))
include(joinpath(@__DIR__, "cases.jl"))
using BenchmarkTools

const LABEL = length(ARGS) >= 1 ? ARGS[1] : "baseline"

# Benchmark case `c` on arrays of size `sz` and return the `@belapsed` result
# (200 samples, 5 evaluations each).
function bench_one(c::Case, sz)
    runner = make_runner(c, sz)
    runner()  # untimed first call, made before any sample is taken
    elapsed = @belapsed $runner() samples = 200 evals = 5
    return elapsed
end

# Run every case single-threaded on tiny arrays (extent 4 in every dimension),
# write one row per case to benchmark/results/runtime_small_<label>.tsv and echo
# it to stdout.
#
# The results file is opened with the `open(...) do io` form so the handle is
# closed, and buffered rows are flushed, even if a benchmark throws part-way
# through the run.
function main()
    Strided.set_num_threads(1)
    cases = all_cases(;
        Ns = 2:4,
        Ts = (Float64, ComplexF64),
        kinds = (permute, add, reduce_inner, reduce_outer, reduce_full),
    )
    mkpath(joinpath(@__DIR__, "results"))
    out = joinpath(@__DIR__, "results", "runtime_small_$(LABEL).tsv")
    open(out, "w") do io
        println(io, "case\ttime_ns")
        println("== runtime small [$LABEL] nt=1 ==")
        for c in cases
            # Derive the shape from the case itself instead of a hand-written
            # lookup table, so changing `Ns` above cannot cause a missing key.
            t = bench_one(c, ntuple(_ -> 4, c.N))
            ns = t * 1e9  # the column is labelled in nanoseconds
            println(io, "$(name(c))\t$(round(ns; digits = 2))")
            println(rpad(name(c), 32), " ", round(ns; digits = 2), " ns")
        end
    end
    println("\nwrote $out")
end
main()
17 changes: 17 additions & 0 deletions benchmark/setup_env.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Activate a benchmark environment that uses the *local* Strided checkout
# (the package living one directory up). Works unchanged inside any worktree.
import Pkg
# Make the directory containing this file the active project.
Pkg.activate(@__DIR__)
let root = normpath(joinpath(@__DIR__, ".."))
# `root` is the parent directory of this file's directory.
# `develop` is idempotent; re-pointing to the local path each run guarantees
# we benchmark this worktree's source rather than a registered version.
try
Pkg.develop(Pkg.PackageSpec(path = root); io = devnull)
catch
# Any failure of the quiet call is retried once without the `io` keyword; an
# error from this second attempt propagates.
# NOTE(review): presumably a fallback for Pkg versions that reject `io` — confirm.
Pkg.develop(Pkg.PackageSpec(path = root))
end
# Add BenchmarkTools only when the active project does not already list it.
if !haskey(Pkg.project().dependencies, "BenchmarkTools")
Pkg.add("BenchmarkTools"; io = devnull)
end
end
# Instantiate the active project with Pkg output discarded.
Pkg.instantiate(; io = devnull)
72 changes: 72 additions & 0 deletions benchmark/spec_count.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Count method specializations of the bookkeeping functions after a
# multi-op / multi-eltype / multi-ndims workload. This is the headline
# precompile-effectiveness metric: fewer specializations => precompile once
# per N and reuse.
#
# julia --project=benchmark benchmark/spec_count.jl [label]
include(joinpath(@__DIR__, "setup_env.jl"))
include(joinpath(@__DIR__, "cases.jl"))
using Strided

const LABEL = length(ARGS) >= 1 ? ARGS[1] : "baseline"

# Total number of (non-`nothing`) specializations across all methods of `f`.
function nspecs(f)
    return sum(methods(f); init = 0) do m
        count(!isnothing, Base.specializations(m))
    end
end

# Six distinct map functions and six distinct reduce ops. Each is a separately
# `@eval`'d named function (`mf_<i>` / `rf_<i>`), so each has its own type.
const MAPFNS = Function[]
const REDFNS = Function[]
for i in 1:6
    mapfn = @eval ($(Symbol(:mf_, i)))(x) = x * $i - $i
    redfn = @eval ($(Symbol(:rf_, i)))(x, y) = x + y * $(i % 3 + 1)
    push!(MAPFNS, mapfn)
    push!(REDFNS, redfn)
end
# Size tuple with `N` dims, each of extent 3.
function sz(N)
    return ntuple(Returns(3), N)
end

# Single-threaded workload: for every N in 2:7, each of four element types and
# each of the six op pairs, run one `map!` and one `mapreducedim!` (reducing
# into a destination with extent 1 in the first dim).
function workload()
    Strided.set_num_threads(1)
    eltypes = (Float64, ComplexF64, Float32, ComplexF32)
    for N in 2:7, T in eltypes, k in 1:6
        src = StridedView(rand(T, sz(N)))
        dst = StridedView(zeros(T, sz(N)))
        map!(MAPFNS[k], dst, src)

        red = StridedView(zeros(T, ntuple(i -> i == 1 ? 1 : 3, N)))
        Base.mapreducedim!(identity, REDFNS[k], red, src)
    end
end

# Run the workload, then report the specialization count of each tracked
# Strided function, sorted by name, to benchmark/results/specs_<label>.tsv and
# to stdout.
function main()
    workload()
    tracked = Pair{String,Function}[
        "_mapreduce_fuse!" => Strided._mapreduce_fuse!,
        "_mapreduce_order!" => Strided._mapreduce_order!,
        "_mapreduce_block!" => Strided._mapreduce_block!,
        "_computeblocks" => Strided._computeblocks,
        "_mapreduce_kernel!" => Strided._mapreduce_kernel!,
        "_mapreduce_threaded!" => Strided._mapreduce_threaded!,
        "indexorder" => Strided.indexorder,
        "totalmemoryregion" => Strided.totalmemoryregion,
    ]
    sort!(tracked; by = first)

    resultsdir = mkpath(joinpath(@__DIR__, "results"))
    out = joinpath(resultsdir, "specs_$(LABEL).tsv")
    open(out, "w") do io
        println(io, "function\tnspecs")
        for (label, fn) in tracked
            n = nspecs(fn)
            println(io, label, "\t", n)
            println(rpad(label, 24), " ", n)
        end
    end
    println("wrote $out")
end
main()
Loading
Loading