diff --git a/benchmark/Project.toml b/benchmark/Project.toml
new file mode 100644
index 0000000..c13013b
--- /dev/null
+++ b/benchmark/Project.toml
@@ -0,0 +1,3 @@
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+Strided = "5e0ebb24-38b0-5f93-81fe-25c709ecae67"
diff --git a/benchmark/cases.jl b/benchmark/cases.jl
new file mode 100644
index 0000000..bc1045c
--- /dev/null
+++ b/benchmark/cases.jl
@@ -0,0 +1,67 @@
+# Shared benchmark cases for the Strided mapreduce machinery.
+#
+# A `Case` describes one operation that exercises `_mapreducedim!` / the kernel,
+# parameterised by ndims `N`, element type `T`, and operation kind `kind`.
+# `make_runner(c, sz)` returns a zero-argument closure that performs the op on
+# freshly allocated arrays of size `sz` (in-place, except `reduce_full`, which
+# returns a scalar). The same runner is used by both the runtime benchmark and the
+# compile/TTFX benchmark, so the two measure exactly the same specializations.
+
+using Strided
+using Strided: StridedView
+
+@enum OpKind permute add reduce_inner reduce_outer reduce_full  # one value per branch of `make_runner`
+
+struct Case              # one benchmark configuration
+    N::Int               # number of array dimensions (ndims)
+    T::DataType          # element type of the arrays
+    kind::OpKind         # which operation is benchmarked
+end
+
+name(c::Case) = string(c.kind, "_N", c.N, "_", c.T)   # e.g. "permute_N3_Float64"
+
+# A non-trivial size tuple of N dims with roughly `total` elements, avoiding
+# size-1 dims (which would be pushed to the back / fused away).
+function sizetuple(N::Int, total::Int)
+    side = round(Int, total^(1 / N))      # N-th root of `total`, to the nearest integer
+    return ntuple(_ -> max(2, side), N)   # clamp so that no dimension is smaller than 2
+end
+
+function make_runner(c::Case, sz::NTuple{N,Int}) where {N}
+    # Captured locals get branch-unique names: a local assigned in several branches
+    # and then captured is boxed (`Core.Box`), making the timed closure type-unstable.
+    T = c.T
+    if c.kind == permute
+        p = reverse(ntuple(identity, Val(N)))        # reverse perm: defeats fusion
+        psrc = StridedView(rand(T, sz))
+        pdst = StridedView(zeros(T, getindex.(Ref(sz), p)))
+        return () -> permutedims!(pdst, psrc, p)
+    elseif c.kind == add
+        a = StridedView(rand(T, sz))
+        b = StridedView(rand(T, sz))
+        sdst = StridedView(zeros(T, sz))
+        return () -> map!(+, sdst, a, b)
+    elseif c.kind == reduce_inner                    # output has size 1 along dim 1
+        Ain = StridedView(rand(T, sz))
+        din = StridedView(zeros(T, ntuple(i -> i == 1 ? 1 : sz[i], Val(N))))
+        return () -> (fill!(din, zero(eltype(din))); Base.mapreducedim!(identity, +, din, Ain))
+    elseif c.kind == reduce_outer                    # output has size 1 along dim N
+        Aout = StridedView(rand(T, sz))
+        dout = StridedView(zeros(T, ntuple(i -> i == N ? 1 : sz[i], Val(N))))
+        return () -> (fill!(dout, zero(eltype(dout))); Base.mapreducedim!(identity, +, dout, Aout))
+    elseif c.kind == reduce_full
+        Afull = StridedView(rand(T, sz))
+        return () -> sum(Afull)
+    else
+        error("unknown kind $(c.kind)")
+    end
+end
+
+# Build the full case grid.
+function all_cases(; Ns, Ts, kinds)
+    # Flattened comprehension; loop nesting (and hence case order) is
+    # kinds (outermost) -> Ts -> Ns (innermost).
+    grid = Case[Case(N, T, kind) for kind in kinds for T in Ts for N in Ns]
+    # `grid` is a freshly built `Vector{Case}` with one entry per combination.
+    return grid
+end
diff --git a/benchmark/compile_bench.jl b/benchmark/compile_bench.jl
new file mode 100644
index 0000000..b0d2def
--- /dev/null
+++ b/benchmark/compile_bench.jl
@@ -0,0 +1,57 @@
+# Compile / TTFX benchmark.
+#
+# Run in a FRESH Julia process. Strided's kernels are not part of any precompile
+# workload, so the first call to each (N, T, op) specialization triggers
+# inference + codegen. `Base.@timed` reports `.compile_time` per call, which we
+# sum across all cases to get the total cold-compile cost — the headline number
+# we want to drive down.
+#
+#   julia --project=benchmark benchmark/compile_bench.jl [label]
+#
+# Writes results to benchmark/results/compile_<label>.tsv
+
+include(joinpath(@__DIR__, "setup_env.jl"))
+include(joinpath(@__DIR__, "cases.jl"))
+
+const LABEL = length(ARGS) >= 1 ? ARGS[1] : "baseline"
+
+# Small arrays: we want to isolate compile time, not run time.
+const SMALL_TOTAL = 1 << 12   # 4096 elements
+
+function main()
+    grid = all_cases(;
+        Ns = 2:7,
+        Ts = (Float64, ComplexF64),
+        kinds = (permute, add, reduce_inner, reduce_outer, reduce_full),
+    )
+    # Per-case measurements as (name, compile_time, total_time), in grid order.
+    rows = Tuple{String,Float64,Float64}[]
+    for case in grid
+        runner = make_runner(case, sizetuple(case.N, SMALL_TOTAL))
+        timing = Base.@timed runner()        # first (cold) call
+        push!(rows, (name(case), timing.compile_time, timing.time))
+    end
+    total_compile = sum(row -> row[2], rows; init = 0.0)
+    r4(x) = round(x; digits = 4)             # precision used on the console
+    r5(x) = round(x; digits = 5)             # precision used in the TSV file
+    resultsdir = joinpath(@__DIR__, "results")
+    mkpath(resultsdir)
+    out = joinpath(resultsdir, "compile_$(LABEL).tsv")
+    open(out, "w") do io
+        println(io, "case\tcompile_s\ttotal_s")
+        for (label, compile_s, total_s) in rows
+            println(io, "$label\t$(r5(compile_s))\t$(r5(total_s))")
+        end
+        println(io, "TOTAL\t$(r5(total_compile))\t")
+    end
+
+    println("== compile benchmark [$LABEL] ==")
+    for (label, compile_s, total_s) in rows
+        println(rpad(label, 32), "  compile=", rpad(r4(compile_s), 9), " total=", r4(total_s))
+    end
+    println("-"^56)
+    println(rpad("TOTAL compile_time (s)", 32), "  ", r4(total_compile))
+    println("\nwrote $out")
+end
+
+main()
diff --git a/benchmark/manyops_compile.jl b/benchmark/manyops_compile.jl
new file mode 100644
index 0000000..9195281
--- /dev/null
+++ b/benchmark/manyops_compile.jl
@@ -0,0 +1,49 @@
+# Compile benchmark across MANY distinct op TYPES — simulating the real
+# combinatorial explosion (TensorOperations generates many distinct map/reduce
+# closures). Each `@eval`'d function is a distinct type, forcing a fresh
+# specialization of the whole call chain.
+#
+#   julia --project=benchmark benchmark/manyops_compile.jl [label] [Kops] [Nmax]
+
+include(joinpath(@__DIR__, "setup_env.jl"))
+using Strided
+using Strided: StridedView
+
+const LABEL = length(ARGS) >= 1 ? ARGS[1] : "baseline"
+const KOPS = length(ARGS) >= 2 ? parse(Int, ARGS[2]) : 8
+const NMAXD = length(ARGS) >= 3 ? parse(Int, ARGS[3]) : 5
+
+# K distinct unary map functions (distinct types) and K distinct binary reduce ops.
+const MAPFNS, REDFNS = Function[], Function[]
+for i in 1:KOPS
+    # `@eval` defines a new global generic function per iteration, so every entry
+    # pushed below has its own distinct function type.
+    mapname, redname = Symbol(:mapf_, i), Symbol(:redf_, i)
+    push!(MAPFNS, @eval $mapname(x) = x * $i - $i)
+    push!(REDFNS, @eval $redname(x, y) = x + y * $(i % 3 + 1))
+end
+
+sz(N) = ntuple(Returns(3), N)   # size tuple with extent 3 along each of the N dimensions
+
+function main()
+    Strided.set_num_threads(1)
+    total = 0.0
+    # Accumulate the compile time of one `map!` and one reduction per (N, op) pair.
+    for N in 2:NMAXD, k in 1:KOPS
+        A = StridedView(rand(Float64, sz(N)))
+        B = StridedView(zeros(Float64, sz(N)))
+        mapfn, redfn = MAPFNS[k], REDFNS[k]
+        mapstats = Base.@timed map!(mapfn, B, A)
+        total += mapstats.compile_time
+        r = StridedView(zeros(Float64, ntuple(i -> i == 1 ? 1 : 3, N)))
+        redstats = Base.@timed Base.mapreducedim!(identity, redfn, r, A)
+        total += redstats.compile_time
+    end
+    mkpath(joinpath(@__DIR__, "results"))
+    open(joinpath(@__DIR__, "results", "manyops_$(LABEL).txt"), "w") do io
+        println(io, "label=$LABEL Kops=$KOPS Nmax=$NMAXD total_compile_s=$(round(total; digits = 4))")
+    end
+    println("[$LABEL] Kops=$KOPS Nmax=$NMAXD  TOTAL compile_time = $(round(total; digits = 4)) s")
+end
+
+main()
diff --git a/benchmark/runtime_bench.jl b/benchmark/runtime_bench.jl
new file mode 100644
index 0000000..d01557a
--- /dev/null
+++ b/benchmark/runtime_bench.jl
@@ -0,0 +1,56 @@
+# Runtime benchmark.
+#
+# Measures steady-state (compiled) performance of the mapreduce machinery, so we
+# can guard against runtime regressions — permutations especially. Runs each
+# case single-threaded and (if available) multi-threaded.
+#
+#   julia --project=benchmark -t auto benchmark/runtime_bench.jl [label]
+#
+# Writes results to benchmark/results/runtime_<label>.tsv
+
+include(joinpath(@__DIR__, "setup_env.jl"))
+include(joinpath(@__DIR__, "cases.jl"))
+using BenchmarkTools
+
+const LABEL = length(ARGS) >= 1 ? ARGS[1] : "baseline"
+
+# Large enough that the kernel, not call overhead, dominates.
+const BIG_TOTAL = 1 << 22   # ~4M elements
+
+function bench_one(c::Case)
+    # Build the runner once so that array allocation stays outside the timed region.
+    runner = make_runner(c, sizetuple(c.N, BIG_TOTAL))
+    runner()                                 # warm up / compile
+    return @belapsed $runner() samples = 30 evals = 1
+end
+
+function main()
+    cases = all_cases(;
+        Ns = 2:6,
+        Ts = (Float64, ComplexF64),
+        kinds = (permute, add, reduce_inner, reduce_outer, reduce_full),
+    )
+    # Always measure single-threaded; add a second pass only if Julia has > 1 thread.
+    nthreads_available = Base.Threads.nthreads()
+    thread_settings = nthreads_available > 1 ? (1, nthreads_available) : (1,)
+
+    mkpath(joinpath(@__DIR__, "results"))
+    out = joinpath(@__DIR__, "results", "runtime_$(LABEL).tsv")
+    # `open ... do` closes the file even if a benchmark case throws part-way through.
+    open(out, "w") do io
+        println(io, "case\tnthreads\ttime_us")
+        for nt in thread_settings
+            Strided.set_num_threads(nt)
+            println("== runtime [$LABEL] nthreads=$nt ==")
+            for c in cases
+                t = bench_one(c)
+                us = t * 1e6
+                println(io, "$(name(c))\t$nt\t$(round(us; digits = 3))")
+                println(rpad(name(c), 32), "  nt=$nt  ", round(us; digits = 3), " us")
+            end
+        end
+    end
+    println("\nwrote $out")
+end
+
+main()
diff --git a/benchmark/runtime_small.jl b/benchmark/runtime_small.jl
new file mode 100644
index 0000000..a90be81
--- /dev/null
+++ b/benchmark/runtime_small.jl
@@ -0,0 +1,39 @@
+# Small-array runtime benchmark: fixed bookkeeping overhead dominates here, so
+# this is the sensitive guard against de-specialization regressions.
+#
+#   julia --project=benchmark -t 1 benchmark/runtime_small.jl [label]
+include(joinpath(@__DIR__, "setup_env.jl"))
+include(joinpath(@__DIR__, "cases.jl"))
+using BenchmarkTools
+
+const LABEL = length(ARGS) >= 1 ? ARGS[1] : "baseline"
+
+function bench_one(c::Case, sz)
+    runner = make_runner(c, sz)              # closure over freshly allocated arrays
+    runner()                                 # untimed first call, before measuring
+    return @belapsed $runner() samples = 200 evals = 5
+end
+
+function main()
+    Strided.set_num_threads(1)
+    sizes = Dict(2 => (4, 4), 3 => (4, 4, 4), 4 => (4, 4, 4, 4))   # ndims => array size
+    cases = all_cases(;
+        Ns = 2:4,
+        Ts = (Float64, ComplexF64),
+        kinds = (permute, add, reduce_inner, reduce_outer, reduce_full),
+    )
+    mkpath(joinpath(@__DIR__, "results"))
+    out = joinpath(@__DIR__, "results", "runtime_small_$(LABEL).tsv")
+    open(out, "w") do io    # do-block closes the file even if a case throws
+        println(io, "case\ttime_ns")
+        println("== runtime small [$LABEL] nt=1 ==")
+        for c in cases
+            t = bench_one(c, sizes[c.N])
+            ns = t * 1e9
+            println(io, "$(name(c))\t$(round(ns; digits = 2))")
+            println(rpad(name(c), 32), "  ", round(ns; digits = 2), " ns")
+        end
+    end
+    println("\nwrote $out")
+end
+main()
diff --git a/benchmark/setup_env.jl b/benchmark/setup_env.jl
new file mode 100644
index 0000000..23dc826
--- /dev/null
+++ b/benchmark/setup_env.jl
@@ -0,0 +1,17 @@
+# Activate a benchmark environment that uses the *local* Strided checkout
+# (the package living one directory up). Works unchanged inside any worktree.
+import Pkg
+Pkg.activate(@__DIR__)                           # activate the project in this file's directory
+let root = normpath(joinpath(@__DIR__, ".."))    # parent directory: the local Strided checkout
+    # `develop` is idempotent; re-pointing to the local path each run guarantees
+    # we benchmark this worktree's source rather than a registered version.
+    try
+        Pkg.develop(Pkg.PackageSpec(path = root); io = devnull)   # first attempt: output to devnull
+    catch                                        # on any error: retry without the `io` keyword
+        Pkg.develop(Pkg.PackageSpec(path = root))
+    end
+    if !haskey(Pkg.project().dependencies, "BenchmarkTools")   # add only when not yet a dependency
+        Pkg.add("BenchmarkTools"; io = devnull)
+    end
+end
+Pkg.instantiate(; io = devnull)                  # NOTE(review): unlike `develop`, no fallback without `io`
diff --git a/benchmark/spec_count.jl b/benchmark/spec_count.jl
new file mode 100644
index 0000000..78e4a4c
--- /dev/null
+++ b/benchmark/spec_count.jl
@@ -0,0 +1,72 @@
+# Count method specializations of the bookkeeping functions after a
+# multi-op / multi-eltype / multi-ndims workload. This is the headline
+# precompile-effectiveness metric: fewer specializations => precompile once
+# per N and reuse.
+#
+#   julia --project=benchmark benchmark/spec_count.jl [label]
+include(joinpath(@__DIR__, "setup_env.jl"))
+include(joinpath(@__DIR__, "cases.jl"))
+using Strided
+
+const LABEL = length(ARGS) >= 1 ? ARGS[1] : "baseline"
+
+function nspecs(f)
+    # Number of specializations summed over every method of `f`.
+    # `nothing` entries yielded by `Base.specializations` are skipped,
+    # not counted.
+    total = 0
+    for m in methods(f)
+        total += count(!isnothing, Base.specializations(m))
+    end
+    return total
+end
+
+# distinct map fns and reduce ops (distinct types)
+const MAPFNS, REDFNS = Function[], Function[]
+for i in 1:6
+    mapname, redname = Symbol(:mf_, i), Symbol(:rf_, i)   # mf_1 .. mf_6 / rf_1 .. rf_6
+    push!(MAPFNS, @eval $mapname(x) = x * $i - $i)
+    push!(REDFNS, @eval $redname(x, y) = x + y * $(i % 3 + 1))
+end
+sz(N) = ntuple(Returns(3), N)   # size tuple with extent 3 along each of the N dimensions
+
+function workload()
+    Strided.set_num_threads(1)
+    eltypes = (Float64, ComplexF64, Float32, ComplexF32)
+    # Same nesting as three stacked loops: N outermost, then T, then k innermost.
+    for N in 2:7, T in eltypes, k in 1:6
+        src = StridedView(rand(T, sz(N)))
+        dst = StridedView(zeros(T, sz(N)))
+        map!(MAPFNS[k], dst, src)
+        # Reduction target: size 1 along the first dimension, 3 along all others.
+        acc = StridedView(zeros(T, ntuple(i -> i == 1 ? 1 : 3, N)))
+        Base.mapreducedim!(identity, REDFNS[k], acc, src)
+    end
+    return nothing
+end
+
+function main()
+    workload()
+    tracked = [                              # name => function; sorted by name before output
+        "_mapreduce_fuse!" => Strided._mapreduce_fuse!,
+        "_mapreduce_order!" => Strided._mapreduce_order!,
+        "_mapreduce_block!" => Strided._mapreduce_block!,
+        "_computeblocks" => Strided._computeblocks,
+        "_mapreduce_kernel!" => Strided._mapreduce_kernel!,
+        "_mapreduce_threaded!" => Strided._mapreduce_threaded!,
+        "indexorder" => Strided.indexorder,
+        "totalmemoryregion" => Strided.totalmemoryregion,
+    ]
+    mkpath(joinpath(@__DIR__, "results"))
+    out = joinpath(@__DIR__, "results", "specs_$(LABEL).tsv")
+    open(out, "w") do io
+        println(io, "function\tnspecs")
+        for (nm, fn) in sort!(tracked; by = first)
+            n = nspecs(fn)
+            println(io, "$nm\t$n")
+            println(rpad(nm, 24), "  ", n)
+        end
+    end
+    println("wrote $out")
+end
+main()
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 9182ee8..fce7f11 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -33,17 +33,17 @@ function Base._mapreduce_dim(f, op, ::NamedTuple{()}, A::StridedView, dims)
 end
 
 function Base.map(
-        f::F, a1::StridedView{<:Any, N},
+        @nospecialize(f), a1::StridedView{<:Any, N},
         A::Vararg{StridedView{<:Any, N}}
-    ) where {F, N}
+    ) where {N}
     T = Base.promote_eltype(a1, A...)
     return map!(f, similar(a1, T), a1, A...)
 end
 
 function Base.map!(
-        f::F, b::StridedView{<:Any, N}, a1::StridedView{<:Any, N},
+        @nospecialize(f), b::StridedView{<:Any, N}, a1::StridedView{<:Any, N},
         A::Vararg{StridedView{<:Any, N}}
-    ) where {F, N}
+    ) where {N}
     dims = size(b)
 
     # Check dimesions
@@ -59,7 +59,7 @@ function Base.map!(
     return b
 end
 
-function _mapreduce(f, op, A::StridedView{T}, nt = nothing) where {T}
+function _mapreduce(@nospecialize(f), @nospecialize(op), A::StridedView{T}, nt = nothing) where {T}
     if isempty(A)
         b = Base.mapreduce_empty(f, op, T)
         return nt === nothing ? b : op(b, nt.init)
@@ -79,7 +79,7 @@ function _mapreduce(f, op, A::StridedView{T}, nt = nothing) where {T}
 end
 
 function Base.mapreducedim!(
-        f, op, b::StridedView{<:Any, N},
+        @nospecialize(f), @nospecialize(op), b::StridedView{<:Any, N},
         a1::StridedView{<:Any, N},
         A::Vararg{StridedView{<:Any, N}}
     ) where {N}
@@ -93,7 +93,7 @@ function Base.mapreducedim!(
 end
 
 function _mapreducedim!(
-        (f), (op), (initop),
+        @nospecialize(f), @nospecialize(op), @nospecialize(initop),
         dims::Dims, arrays::Tuple{Vararg{StridedView}}
     )
     if any(isequal(0), dims)
@@ -106,8 +106,40 @@ function _mapreducedim!(
     return arrays[1]
 end
 
+# ---------------------------------------------------------------------------
+# Bookkeeping: dimension fusion, loop-order selection and cache blocking.
+#
+# `_mapreduce_fuse!`, `_mapreduce_order!`, `_mapreduce_block!` and
+# `_computeblocks` set up the mapreduce: they fuse contiguous dimensions, sort
+# the loop order by cache-importance, and compute cache blocks. None of this
+# logic depends on *what* the map/reduce functions are, only on the array shapes
+# and strides, so `f`/`op`/`initop` are `@nospecialize`d and merely forwarded.
+#
+# This matters for precompilation. The functions themselves are the dominant
+# axis of the specialization explosion: a workload that calls mapreduce with
+# many distinct ops (as TensorOperations does) otherwise forces a fresh
+# compilation of this entire bookkeeping chain per (op, eltype) combination.
+# With `@nospecialize`, the bookkeeping specializes on the array-shape signature
+# (ndims `N`, number of arrays `M`, eltypes) but no longer on the ops, so a
+# precompile workload can compile it once per shape signature and reuse it
+# across every op. The bookkeeping runs once per mapreduce call (coarse
+# granularity), so erasing the op types here is free at runtime.
+#
+# NOTE: the per-array data is deliberately kept as `Tuple`s (`map(strides, …)`,
+# `map(offset, …)`, …) rather than `Vector`s. Tuples stay stack-allocated, which
+# keeps the fixed per-call overhead low — important for small arrays. A variant
+# that carried this data in `M`-erased `Vector`s (so the bookkeeping specialized
+# purely on `N`) was prototyped and rejected: it roughly doubled the small-array
+# call overhead through heap allocation and dynamic dispatch, for only a small
+# extra reduction in compile time over `@nospecialize` alone. The number of
+# remaining method specializations was in fact identical, because both are
+# bounded by the distinct `arrays` tuple types in the workload (the `@generated`
+# kernel genuinely needs that concrete type); erasing it further requires a
+# dynamic-dispatch barrier whose runtime cost was not worth it.
+# ---------------------------------------------------------------------------
+
 function _mapreduce_fuse!(
-        (f), (op), (initop),
+        @nospecialize(f), @nospecialize(op), @nospecialize(initop),
         dims::Dims, arrays::Tuple{Vararg{StridedView}}
     )
     # Fuse dimensions if possible: assume that at least one array, e.g. the output array in
@@ -130,7 +162,7 @@ function _mapreduce_fuse!(
 end
 
 function _mapreduce_order!(
-        (f), (op), (initop),
+        @nospecialize(f), @nospecialize(op), @nospecialize(initop),
         dims, strides, arrays
     )
     M = length(arrays)
@@ -155,16 +187,13 @@ end
 
 const MINTHREADLENGTH = 1 << 15 # minimal length before any kind of threading is applied
 function _mapreduce_block!(
-        (f), (op), (initop),
+        @nospecialize(f), @nospecialize(op), @nospecialize(initop),
         dims, strides, offsets, costs, arrays
     )
     bytestrides = map((s, stride) -> s .* stride, sizeof.(eltype.(arrays)), strides)
     strideorders = map(indexorder, strides)
     blocks = _computeblocks(dims, costs, bytestrides, strideorders)
 
-    # t = @elapsed _computeblocks(dims, costs, bytestrides, strideorders)
-    # println("_computeblocks time: $t")
-
     if get_num_threads() == 1 || prod(dims) <= MINTHREADLENGTH
         _mapreduce_kernel!(f, op, initop, dims, blocks, arrays, strides, offsets)
     elseif op !== nothing && _length(dims, strides[1]) == 1 # complete reduction
@@ -214,7 +243,7 @@ end
 # nthreads: number of threads spacing: extra addition to offset of array 1, to account for
 # reduction
 function _mapreduce_threaded!(
-        (f), (op), (initop),
+        @nospecialize(f), @nospecialize(op), @nospecialize(initop),
         dims, blocks, strides, offsets, costs, arrays, nthreads,
         spacing, taskindex
     )