QuantumKitHub · lkdvos · Jun 18, 2026
diff --git a/benchmark/Project.toml b/benchmark/Project.toml
@@ -0,0 +1,3 @@
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+Strided = "5e0ebb24-38b0-5f93-81fe-25c709ecae67"
diff --git a/benchmark/cases.jl b/benchmark/cases.jl
@@ -0,0 +1,67 @@
+# Shared benchmark cases for the Strided mapreduce machinery.
+#
+# A `Case` describes one operation that exercises `_mapreducedim!` / the kernel,
+# parameterised by ndims `N`, element type `T`, and operation kind `kind`.
+# `make_runner(c, sz)` returns a zero-argument closure that performs the op
+# in-place on freshly allocated arrays of size `sz`. The same runner is used by
+# both the runtime benchmark and the compile/TTFX benchmark, so the two measure
+# exactly the same specializations.
+
+using Strided
+using Strided: StridedView
+
+@enum OpKind permute add reduce_inner reduce_outer reduce_full   # one value per branch of `make_runner`
+
+struct Case               # one benchmark case: (ndims, element type, operation kind)
+    N::Int                # number of array dimensions
+    T::DataType           # element type passed to `rand` / `zeros` when building the arrays
+    kind::OpKind          # which operation `make_runner` builds for this case
+end
+
+name(c::Case) = string(c.kind, "_N", c.N, "_", c.T)   # label of the form "<kind>_N<ndims>_<eltype>"
+
+# A non-trivial size tuple of N dims with roughly `total` elements, avoiding
+# size-1 dims (which would be pushed to the back / fused away).
+function sizetuple(N::Int, total::Int)
+    root = round(Int, total^(1 / N))        # rounded N-th root of the element budget
+    return ntuple(_ -> max(2, root), N)     # same extent in every dim, never below 2
+end
+
+function make_runner(c::Case, sz::NTuple{N,Int}) where {N}   # returns a zero-argument closure running `c` on arrays of size `sz`
+    T = c.T                                          # runtime DataType: only used to build arrays, never captured
+    if c.kind == permute
+        p = reverse(ntuple(identity, Val(N)))        # reverse perm: defeats fusion
+        src = StridedView(rand(T, sz))
+        pdst = StridedView(zeros(T, getindex.(Ref(sz), p)))   # branch-unique name: a name assigned in several branches is captured as a Core.Box
+        return () -> permutedims!(pdst, src, p)
+    elseif c.kind == add
+        a = StridedView(rand(T, sz))
+        b = StridedView(rand(T, sz))
+        sdst = StridedView(zeros(T, sz))
+        return () -> map!(+, sdst, a, b)
+    elseif c.kind == reduce_inner
+        Ain = StridedView(rand(T, sz))
+        insz = ntuple(i -> i == 1 ? 1 : sz[i], Val(N))         # output keeps every dim except the first, which becomes 1
+        indst = StridedView(zeros(T, insz))
+        return () -> (fill!(indst, zero(eltype(indst))); Base.mapreducedim!(identity, +, indst, Ain))   # eltype is inferable; a captured `T` is not
+    elseif c.kind == reduce_outer
+        Aout = StridedView(rand(T, sz))
+        outsz = ntuple(i -> i == N ? 1 : sz[i], Val(N))        # output keeps every dim except the last, which becomes 1
+        outdst = StridedView(zeros(T, outsz))
+        return () -> (fill!(outdst, zero(eltype(outdst))); Base.mapreducedim!(identity, +, outdst, Aout))
+    elseif c.kind == reduce_full
+        Afull = StridedView(rand(T, sz))
+        return () -> sum(Afull)
+    else
+        error("unknown kind $(c.kind)")              # defensive: every current OpKind value is handled above
+    end
+end
+
+# Build the full case grid.
+function all_cases(; Ns, Ts, kinds)
+    # Cartesian product of the inputs; `kinds` varies slowest and `Ns` fastest.
+    grid = Case[
+        Case(N, T, kind) for kind in kinds for T in Ts for N in Ns
+    ]
+    return grid
+end
diff --git a/benchmark/compile_bench.jl b/benchmark/compile_bench.jl
@@ -0,0 +1,57 @@
+# Compile / TTFX benchmark.
+#
+# Run in a FRESH Julia process. Strided's kernels are not part of any precompile
+# workload, so the first call to each (N, T, op) specialization triggers
+# inference + codegen. `Base.@timed` reports `.compile_time` per call, which we
+# sum across all cases to get the total cold-compile cost — the headline number
+# we want to drive down.
+#
+#   julia --project=benchmark benchmark/compile_bench.jl [label]
+#
+# Writes results to benchmark/results/compile_<label>.tsv
+
+include(joinpath(@__DIR__, "setup_env.jl"))
+include(joinpath(@__DIR__, "cases.jl"))
+
+const LABEL = length(ARGS) >= 1 ? ARGS[1] : "baseline"   # first CLI argument; becomes part of the results file name
+
+# Small arrays: we want to isolate compile time, not run time.
+const SMALL_TOTAL = 1 << 12   # 4096 elements
+
+function main()
+    grid = all_cases(;
+        Ns = 2:7,
+        Ts = (Float64, ComplexF64),
+        kinds = (permute, add, reduce_inner, reduce_outer, reduce_full),
+    )
+
+    rows = Tuple{String,Float64,Float64}[]   # (case name, compile seconds, elapsed seconds)
+    total_compile = 0.0
+    for case in grid
+        runner = make_runner(case, sizetuple(case.N, SMALL_TOTAL))
+        timing = Base.@timed runner()        # only this first (cold) call is timed
+        total_compile += timing.compile_time
+        push!(rows, (name(case), timing.compile_time, timing.time))
+    end
+
+    resdir = joinpath(@__DIR__, "results")
+    mkpath(resdir)
+    out = joinpath(resdir, "compile_$(LABEL).tsv")
+    open(out, "w") do io
+        println(io, "case\tcompile_s\ttotal_s")
+        for (label, comp, wall) in rows
+            println(io, "$label\t$(round(comp; digits = 5))\t$(round(wall; digits = 5))")
+        end
+        println(io, "TOTAL\t$(round(total_compile; digits = 5))\t")
+    end
+
+    println("== compile benchmark [$LABEL] ==")
+    for (label, comp, wall) in rows
+        println(rpad(label, 32), "  compile=", rpad(round(comp; digits = 4), 9), " total=", round(wall; digits = 4))
+    end
+    println("-"^56)
+    println(rpad("TOTAL compile_time (s)", 32), "  ", round(total_compile; digits = 4))
+    println("\nwrote $out")
+end
+
+main()
diff --git a/benchmark/manyops_compile.jl b/benchmark/manyops_compile.jl
@@ -0,0 +1,49 @@
+# Compile benchmark across MANY distinct op TYPES — simulating the real
+# combinatorial explosion (TensorOperations generates many distinct map/reduce
+# closures). Each `@eval`'d function is a distinct type, forcing a fresh
+# specialization of the whole call chain.
+#
+#   julia --project=benchmark benchmark/manyops_compile.jl [label] [Kops] [Nmax]
+
+include(joinpath(@__DIR__, "setup_env.jl"))
+using Strided
+using Strided: StridedView
+
+const LABEL = length(ARGS) >= 1 ? ARGS[1] : "baseline"      # 1st CLI argument: label used in the results file name
+const KOPS = length(ARGS) >= 2 ? parse(Int, ARGS[2]) : 8    # 2nd CLI argument: how many map / reduce functions to generate
+const NMAXD = length(ARGS) >= 3 ? parse(Int, ARGS[3]) : 5   # 3rd CLI argument: upper end of the ndims loop `2:NMAXD`
+
+# K distinct unary map functions (distinct types) and K distinct binary reduce ops.
+const MAPFNS = Function[]
+const REDFNS = Function[]
+for i in 1:KOPS
+    f = @eval ($(Symbol(:mapf_, i)))(x) = x * $i - $i                    # defines mapf_<i>; `i` is spliced in as a literal
+    g = @eval ($(Symbol(:redf_, i)))(x, y) = x + y * $(i % 3 + 1)        # defines redf_<i>; multiplier cycles through 2, 3, 1
+    push!(MAPFNS, f)
+    push!(REDFNS, g)
+end
+
+sz(N) = ntuple(Returns(3), N)   # size tuple with extent 3 in each of the N dimensions
+
+function main()
+    Strided.set_num_threads(1)
+    total = 0.0
+    # One map! and one reduction per (ndims, function index); only compile_time is summed.
+    for N in 2:NMAXD, k in 1:KOPS
+        src = StridedView(rand(Float64, sz(N)))
+        dst = StridedView(zeros(Float64, sz(N)))
+        f, g = MAPFNS[k], REDFNS[k]
+        total += (Base.@timed map!(f, dst, src)).compile_time
+        acc = StridedView(zeros(Float64, ntuple(i -> i == 1 ? 1 : 3, N)))
+        total += (Base.@timed Base.mapreducedim!(identity, g, acc, src)).compile_time
+    end
+    resdir = joinpath(@__DIR__, "results")
+    mkpath(resdir)
+    open(joinpath(resdir, "manyops_$(LABEL).txt"), "w") do io
+        println(io, "label=$LABEL Kops=$KOPS Nmax=$NMAXD total_compile_s=$(round(total; digits = 4))")
+    end
+    println("[$LABEL] Kops=$KOPS Nmax=$NMAXD  TOTAL compile_time = $(round(total; digits = 4)) s")
+end
+
+main()
diff --git a/benchmark/runtime_bench.jl b/benchmark/runtime_bench.jl
@@ -0,0 +1,56 @@
+# Runtime benchmark.
+#
+# Measures steady-state (compiled) performance of the mapreduce machinery, so we
+# can guard against runtime regressions — permutations especially. Runs each
+# case single-threaded and (if available) multi-threaded.
+#
+#   julia --project=benchmark -t auto benchmark/runtime_bench.jl [label]
+#
+# Writes results to benchmark/results/runtime_<label>.tsv
+
+include(joinpath(@__DIR__, "setup_env.jl"))
+include(joinpath(@__DIR__, "cases.jl"))
+using BenchmarkTools
+
+const LABEL = length(ARGS) >= 1 ? ARGS[1] : "baseline"   # first CLI argument; becomes part of the results file name
+
+# Large enough that the kernel, not call overhead, dominates.
+const BIG_TOTAL = 1 << 22   # ~4M elements
+
+function bench_one(c::Case)
+    dims = sizetuple(c.N, BIG_TOTAL)         # equal extents, roughly BIG_TOTAL elements in total
+    runner = make_runner(c, dims)
+    runner()                                 # untimed first call, so compilation is not sampled
+    return @belapsed $runner() samples = 30 evals = 1
+end
+
+function main()   # benchmark every case once per thread setting; write a TSV and echo each row to stdout
+    cases = all_cases(;
+        Ns = 2:6,
+        Ts = (Float64, ComplexF64),
+        kinds = (permute, add, reduce_inner, reduce_outer, reduce_full),
+    )
+
+    nthreads_available = Base.Threads.nthreads()
+    thread_settings = nthreads_available > 1 ? (1, nthreads_available) : (1,)   # always 1; all threads too when more than one
+
+    mkpath(joinpath(@__DIR__, "results"))
+    out = joinpath(@__DIR__, "results", "runtime_$(LABEL).tsv")
+    open(out, "w") do io                     # do-block form: the file is closed even if a benchmark throws
+        println(io, "case\tnthreads\ttime_us")
+
+        for nt in thread_settings
+            Strided.set_num_threads(nt)
+            println("== runtime [$LABEL] nthreads=$nt ==")
+            for c in cases
+                t = bench_one(c)
+                us = t * 1e6                 # scaled by 1e6 for the `time_us` column
+                println(io, "$(name(c))\t$nt\t$(round(us; digits = 3))")
+                println(rpad(name(c), 32), "  nt=$nt  ", round(us; digits = 3), " us")
+            end
+        end
+    end
+    println("\nwrote $out")
+end
+
+main()
diff --git a/benchmark/runtime_small.jl b/benchmark/runtime_small.jl
@@ -0,0 +1,39 @@
+# Small-array runtime benchmark: fixed bookkeeping overhead dominates here, so
+# this is the sensitive guard against de-specialization regressions.
+#
+#   julia --project=benchmark -t 1 benchmark/runtime_small.jl [label]
+include(joinpath(@__DIR__, "setup_env.jl"))
+include(joinpath(@__DIR__, "cases.jl"))
+using BenchmarkTools
+
+const LABEL = length(ARGS) >= 1 ? ARGS[1] : "baseline"   # first CLI argument; becomes part of the results file name
+
+function bench_one(c::Case, sz)
+    runner = make_runner(c, sz)
+    runner()                                 # untimed first call, so compilation is not sampled
+    return @belapsed $runner() samples = 200 evals = 5
+end
+
+function main()   # single-threaded small-array timings; write a TSV and echo each row to stdout
+    Strided.set_num_threads(1)
+    sizes = Dict(2 => (4, 4), 3 => (4, 4, 4), 4 => (4, 4, 4, 4))   # ndims => fixed size with extent 4 per dim
+    cases = all_cases(;
+        Ns = 2:4,
+        Ts = (Float64, ComplexF64),
+        kinds = (permute, add, reduce_inner, reduce_outer, reduce_full),
+    )
+    mkpath(joinpath(@__DIR__, "results"))
+    out = joinpath(@__DIR__, "results", "runtime_small_$(LABEL).tsv")
+    open(out, "w") do io                     # do-block form: the file is closed even if a benchmark throws
+        println(io, "case\ttime_ns")
+        println("== runtime small [$LABEL] nt=1 ==")
+        for c in cases
+            t = bench_one(c, sizes[c.N])
+            ns = t * 1e9                     # scaled by 1e9 for the `time_ns` column
+            println(io, "$(name(c))\t$(round(ns; digits = 2))")
+            println(rpad(name(c), 32), "  ", round(ns; digits = 2), " ns")
+        end
+    end
+    println("\nwrote $out")
+end
+main()
diff --git a/benchmark/setup_env.jl b/benchmark/setup_env.jl
@@ -0,0 +1,17 @@
+# Activate a benchmark environment that uses the *local* Strided checkout
+# (the package living one directory up). Works unchanged inside any worktree.
+import Pkg
+Pkg.activate(@__DIR__)
+let root = normpath(joinpath(@__DIR__, ".."))   # parent directory of this benchmark folder
+    # `develop` is idempotent; re-pointing to the local path each run guarantees
+    # we benchmark this worktree's source rather than a registered version.
+    try
+        Pkg.develop(Pkg.PackageSpec(path = root); io = devnull)   # first attempt: Pkg output discarded
+    catch   # NOTE(review): any error triggers a retry with default output — presumably for Pkg versions lacking `io`; confirm
+        Pkg.develop(Pkg.PackageSpec(path = root))
+    end
+    if !haskey(Pkg.project().dependencies, "BenchmarkTools")   # add only when the active project does not list it
+        Pkg.add("BenchmarkTools"; io = devnull)
+    end
+end
+Pkg.instantiate(; io = devnull)
diff --git a/benchmark/spec_count.jl b/benchmark/spec_count.jl
@@ -0,0 +1,72 @@
+# Count method specializations of the bookkeeping functions after a
+# multi-op / multi-eltype / multi-ndims workload. This is the headline
+# precompile-effectiveness metric: fewer specializations => precompile once
+# per N and reuse.
+#
+#   julia --project=benchmark benchmark/spec_count.jl [label]
+include(joinpath(@__DIR__, "setup_env.jl"))
+include(joinpath(@__DIR__, "cases.jl"))
+using Strided
+
+const LABEL = length(ARGS) >= 1 ? ARGS[1] : "baseline"   # first CLI argument; becomes part of the results file name
+
+function nspecs(f)
+    # For each method of `f`, count the entries yielded by
+    # `Base.specializations` that are not `nothing`, and add the
+    # per-method counts together.
+    total = 0
+    for meth in methods(f)
+        total += count(!isnothing, Base.specializations(meth))
+    end
+    return total
+end
+
+# distinct map fns and reduce ops (distinct types)
+const MAPFNS = Function[]   # filled below with mf_1 .. mf_6
+const REDFNS = Function[]   # filled below with rf_1 .. rf_6
+for i in 1:6
+    push!(MAPFNS, @eval ($(Symbol(:mf_, i)))(x) = x * $i - $i)                  # defines mf_<i>; `i` is spliced in as a literal
+    push!(REDFNS, @eval ($(Symbol(:rf_, i)))(x, y) = x + y * $(i % 3 + 1))      # defines rf_<i>; multiplier cycles through 2, 3, 1
+end
+sz(N) = ntuple(Returns(3), N)   # size tuple with extent 3 in each of the N dimensions
+
+function workload()
+    Strided.set_num_threads(1)
+    eltypes = (Float64, ComplexF64, Float32, ComplexF32)
+    # Every (ndims, eltype, function index) combination runs one map! and one
+    # reduction into an output whose first dimension has size 1.
+    for N in 2:7, T in eltypes, k in 1:6
+        dims = sz(N)
+        src = StridedView(rand(T, dims))
+        dst = StridedView(zeros(T, dims))
+        map!(MAPFNS[k], dst, src)
+        acc = StridedView(zeros(T, ntuple(i -> i == 1 ? 1 : 3, N)))
+        Base.mapreducedim!(identity, REDFNS[k], acc, src)
+    end
+end
+
+function main()
+    workload()
+    tracked = [
+        "_mapreduce_fuse!" => Strided._mapreduce_fuse!,
+        "_mapreduce_order!" => Strided._mapreduce_order!,
+        "_mapreduce_block!" => Strided._mapreduce_block!,
+        "_computeblocks" => Strided._computeblocks,
+        "_mapreduce_kernel!" => Strided._mapreduce_kernel!,
+        "_mapreduce_threaded!" => Strided._mapreduce_threaded!,
+        "indexorder" => Strided.indexorder,
+        "totalmemoryregion" => Strided.totalmemoryregion,
+    ]
+    mkpath(joinpath(@__DIR__, "results"))
+    out = joinpath(@__DIR__, "results", "specs_$(LABEL).tsv")
+    open(out, "w") do io
+        println(io, "function\tnspecs")
+        for (nm, fn) in sort(tracked; by = first)   # rows are emitted sorted by function name
+            n = nspecs(fn)
+            println(io, "$nm\t$n")
+            println(rpad(nm, 24), "  ", n)
+        end
+    end
+    println("wrote $out")
+end
+main()