From dfa507142b137b7edecc5a04cd7dd56a6b74ef1a Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Thu, 18 Jun 2026 18:37:07 -0400
Subject: [PATCH] Erase op types from mapreduce bookkeeping for precompilation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The mapreduce setup chain (`_mapreducedim!`, `_mapreduce_fuse!`,
`_mapreduce_order!`, `_mapreduce_block!`, `_mapreduce_threaded!`, and the
public `map`/`map!`/`mapreduce`/`mapreducedim!`/`_mapreduce` entry points)
previously specialized on the map/reduce function types `f`/`op`/`initop`.
None of the bookkeeping logic depends on what those functions are — it only
fuses dimensions, sorts the loop order by cache-importance and computes cache
blocks from the array shapes/strides — yet a workload that calls mapreduce with
many distinct ops (as TensorOperations does) forced a fresh compilation of the
entire chain per (op, eltype) combination.

`@nospecialize` the function arguments throughout so the bookkeeping specializes
on the array-shape signature (ndims/n-arrays/eltypes) but no longer on the ops.
A precompile workload can then compile it once per shape signature and reuse it
across every op. The bookkeeping runs once per mapreduce call (coarse
granularity), so erasing the op types is free at runtime; the only function that
still specializes on the op is the monolithic `@generated _mapreduce_kernel!`,
which is kept untouched.

The per-array data is deliberately kept as stack-allocated `Tuple`s rather than
`M`-erased `Vector`s; the latter (a pure-`N` bookkeeping variant) was prototyped
and rejected because it roughly doubled small-array call overhead for no
additional spec-count reduction (see the note in `mapreduce.jl`).

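The GPU `_mapreduce_block!` extension hook is unaffected: this commit only marks
the function arguments of `_mapreduce_block!` as `@nospecialize` (and removes
commented-out timing code); its argument list is unchanged, so the extension's
dispatch boundary is preserved.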

Adds a `benchmark/` harness (compile / many-op / runtime / spec-count) used to
validate the change.

Method specializations after a multi-op × multi-eltype × multi-ndims workload:

  function              baseline  branch
  _mapreduce_block!     346       75
  _mapreduce_fuse!      202       75
  _mapreduce_order!     202       75
  _mapreduce_threaded!  515       124
  _mapreduce_kernel!    659       412

Compile time: grid 28.8s -> 26.5s; many-distinct-ops 14.5s -> 11.0s.
Runtime (single-thread, BenchmarkTools) neutral-to-better on both tiny (4^N)
and large (~4M-element) arrays; per-call allocations drop (e.g. 768 -> 464 B).
`Pkg.test` passes single- and multi-threaded, including the JLArray and CuArray
GPU mapreduce/reduce tests.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 benchmark/Project.toml       |  3 ++
 benchmark/cases.jl           | 67 +++++++++++++++++++++++++++++++++
 benchmark/compile_bench.jl   | 57 ++++++++++++++++++++++++++++
 benchmark/manyops_compile.jl | 49 ++++++++++++++++++++++++
 benchmark/runtime_bench.jl   | 56 ++++++++++++++++++++++++++++
 benchmark/runtime_small.jl   | 39 +++++++++++++++++++
 benchmark/setup_env.jl       | 17 +++++++++
 benchmark/spec_count.jl      | 72 ++++++++++++++++++++++++++++++++++++
 src/mapreduce.jl             | 57 +++++++++++++++++++++-------
 9 files changed, 403 insertions(+), 14 deletions(-)
 create mode 100644 benchmark/Project.toml
 create mode 100644 benchmark/cases.jl
 create mode 100644 benchmark/compile_bench.jl
 create mode 100644 benchmark/manyops_compile.jl
 create mode 100644 benchmark/runtime_bench.jl
 create mode 100644 benchmark/runtime_small.jl
 create mode 100644 benchmark/setup_env.jl
 create mode 100644 benchmark/spec_count.jl

diff --git a/benchmark/Project.toml b/benchmark/Project.toml
new file mode 100644
index 0000000..c13013b
--- /dev/null
+++ b/benchmark/Project.toml
@@ -0,0 +1,3 @@
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+Strided = "5e0ebb24-38b0-5f93-81fe-25c709ecae67"
diff --git a/benchmark/cases.jl b/benchmark/cases.jl
new file mode 100644
index 0000000..bc1045c
--- /dev/null
+++ b/benchmark/cases.jl
@@ -0,0 +1,67 @@
+# Shared benchmark cases for the Strided mapreduce machinery.
+#
+# A `Case` describes one operation that exercises `_mapreducedim!` / the kernel,
+# parameterised by ndims `N`, element type `T`, and operation kind `kind`.
+# `make_runner(c, sz)` allocates the arrays (size `sz`) once and returns a
+# zero-argument closure that runs the op on them; every kind except
+# `reduce_full` writes its result in-place. Both the runtime benchmark and the
+# compile/TTFX benchmark use this runner, so they measure the same specializations.
+
+using Strided
+using Strided: StridedView
+
+@enum OpKind permute add reduce_inner reduce_outer reduce_full
+
+struct Case
+    N::Int
+    T::DataType
+    kind::OpKind
+end
+
+name(c::Case) = "$(c.kind)_N$(c.N)_$(c.T)"
+
+# Size tuple of `N` equal extents whose product is roughly `total`. The extent is
+# clamped to >= 2: size-1 dims would be pushed to the back / fused away.
+function sizetuple(N::Int, total::Int)
+    extent = max(2, round(Int, total^(1 / N)))
+    return ntuple(_ -> extent, N)
+end
+
+function make_runner(c::Case, sz::NTuple{N,Int}) where {N}
+    T = c.T
+    if c.kind == permute
+        p = reverse(ntuple(identity, Val(N)))        # reverse perm: defeats fusion
+        src = StridedView(rand(T, sz))
+        dst = StridedView(zeros(T, getindex.(Ref(sz), p)))
+        return () -> permutedims!(dst, src, p)
+    elseif c.kind == add
+        a = StridedView(rand(T, sz))
+        b = StridedView(rand(T, sz))
+        dst = StridedView(zeros(T, sz))
+        return () -> map!(+, dst, a, b)
+    elseif c.kind == reduce_inner
+        A = StridedView(rand(T, sz))
+        outsz = ntuple(i -> i == 1 ? 1 : sz[i], Val(N))
+        dst = StridedView(zeros(T, outsz))
+        return () -> (fill!(dst, zero(T)); Base.mapreducedim!(identity, +, dst, A))
+    elseif c.kind == reduce_outer
+        A = StridedView(rand(T, sz))
+        outsz = ntuple(i -> i == N ? 1 : sz[i], Val(N))
+        dst = StridedView(zeros(T, outsz))
+        return () -> (fill!(dst, zero(T)); Base.mapreducedim!(identity, +, dst, A))
+    elseif c.kind == reduce_full
+        A = StridedView(rand(T, sz))
+        return () -> sum(A)
+    else
+        error("unknown kind $(c.kind)")
+    end
+end
+
+# Build the full case grid as a flat `Vector{Case}`. `kind` is the outermost
+# (slowest-varying) axis and `N` the innermost, matching a nested loop order.
+function all_cases(; Ns, Ts, kinds)
+    grid = Case[
+        Case(N, T, kind) for kind in kinds for T in Ts for N in Ns
+    ]
+    return grid
+end
diff --git a/benchmark/compile_bench.jl b/benchmark/compile_bench.jl
new file mode 100644
index 0000000..b0d2def
--- /dev/null
+++ b/benchmark/compile_bench.jl
@@ -0,0 +1,57 @@
+# Compile / TTFX benchmark.
+#
+# Run in a FRESH Julia process. Strided's kernels are not part of any precompile
+# workload, so the first call to each (N, T, op) specialization triggers
+# inference + codegen. `Base.@timed` reports `.compile_time` per call, which we
+# sum across all cases to get the total cold-compile cost — the headline number
+# we want to drive down.
+#
+#   julia --project=benchmark benchmark/compile_bench.jl [label]
+#
+# Writes results to benchmark/results/compile_<label>.tsv
+
+include(joinpath(@__DIR__, "setup_env.jl"))
+include(joinpath(@__DIR__, "cases.jl"))
+
+const LABEL = length(ARGS) >= 1 ? ARGS[1] : "baseline"
+
+# Small arrays: we want to isolate compile time, not run time.
+const SMALL_TOTAL = 1 << 12   # 4096 elements
+
+function main()
+    cases = all_cases(;
+        Ns = 2:7,
+        Ts = (Float64, ComplexF64),
+        kinds = (permute, add, reduce_inner, reduce_outer, reduce_full),
+    )
+
+    rows = Tuple{String,Float64,Float64}[]   # name, compile_time, total_time
+    total_compile = 0.0
+    for c in cases
+        sz = sizetuple(c.N, SMALL_TOTAL)
+        run = make_runner(c, sz)
+        stats = Base.@timed run()            # first (cold) call
+        push!(rows, (name(c), stats.compile_time, stats.time))
+        total_compile += stats.compile_time
+    end
+
+    mkpath(joinpath(@__DIR__, "results"))
+    out = joinpath(@__DIR__, "results", "compile_$(LABEL).tsv")
+    open(out, "w") do io
+        println(io, "case\tcompile_s\ttotal_s")
+        for (nm, ct, tt) in rows
+            println(io, "$nm\t$(round(ct; digits = 5))\t$(round(tt; digits = 5))")
+        end
+        println(io, "TOTAL\t$(round(total_compile; digits = 5))\t")
+    end
+
+    println("== compile benchmark [$LABEL] ==")
+    for (nm, ct, tt) in rows
+        println(rpad(nm, 32), "  compile=", rpad(round(ct; digits = 4), 9), " total=", round(tt; digits = 4))
+    end
+    println("-"^56)
+    println(rpad("TOTAL compile_time (s)", 32), "  ", round(total_compile; digits = 4))
+    println("\nwrote $out")
+end
+
+main()
diff --git a/benchmark/manyops_compile.jl b/benchmark/manyops_compile.jl
new file mode 100644
index 0000000..9195281
--- /dev/null
+++ b/benchmark/manyops_compile.jl
@@ -0,0 +1,49 @@
+# Compile benchmark across MANY distinct op TYPES — simulating the real
+# combinatorial explosion (TensorOperations generates many distinct map/reduce
+# closures). Each `@eval`'d function is a distinct type, forcing a fresh
+# specialization of the whole call chain.
+#
+#   julia --project=benchmark benchmark/manyops_compile.jl [label] [Kops] [Nmax]
+
+include(joinpath(@__DIR__, "setup_env.jl"))
+using Strided
+using Strided: StridedView
+
+const LABEL = length(ARGS) >= 1 ? ARGS[1] : "baseline"
+const KOPS = length(ARGS) >= 2 ? parse(Int, ARGS[2]) : 8
+const NMAXD = length(ARGS) >= 3 ? parse(Int, ARGS[3]) : 5
+
+# K distinct unary map functions (distinct types) and K distinct binary reduce ops.
+const MAPFNS = Function[]
+const REDFNS = Function[]
+for i in 1:KOPS
+    f = @eval ($(Symbol(:mapf_, i)))(x) = x * $i - $i
+    g = @eval ($(Symbol(:redf_, i)))(x, y) = x + y * $(i % 3 + 1)
+    push!(MAPFNS, f)
+    push!(REDFNS, g)
+end
+
+sz(N) = ntuple(_ -> 3, N)
+
+function main()
+    total = 0.0
+    Strided.set_num_threads(1)
+    for N in 2:NMAXD
+        for k in 1:KOPS
+            A = StridedView(rand(Float64, sz(N)))
+            B = StridedView(zeros(Float64, sz(N)))
+            f = MAPFNS[k]
+            total += Base.@timed(map!(f, B, A)).compile_time
+            r = StridedView(zeros(Float64, ntuple(i -> i == 1 ? 1 : 3, N)))
+            g = REDFNS[k]
+            total += Base.@timed(Base.mapreducedim!(identity, g, r, A)).compile_time
+        end
+    end
+    mkpath(joinpath(@__DIR__, "results"))
+    open(joinpath(@__DIR__, "results", "manyops_$(LABEL).txt"), "w") do io
+        println(io, "label=$LABEL Kops=$KOPS Nmax=$NMAXD total_compile_s=$(round(total; digits = 4))")
+    end
+    println("[$LABEL] Kops=$KOPS Nmax=$NMAXD  TOTAL compile_time = $(round(total; digits = 4)) s")
+end
+
+main()
diff --git a/benchmark/runtime_bench.jl b/benchmark/runtime_bench.jl
new file mode 100644
index 0000000..d01557a
--- /dev/null
+++ b/benchmark/runtime_bench.jl
@@ -0,0 +1,56 @@
+# Runtime benchmark.
+#
+# Measures steady-state (compiled) performance of the mapreduce machinery, so we
+# can guard against runtime regressions — permutations especially. Runs each
+# case single-threaded and (if available) multi-threaded.
+#
+#   julia --project=benchmark -t auto benchmark/runtime_bench.jl [label]
+#
+# Writes results to benchmark/results/runtime_<label>.tsv
+
+include(joinpath(@__DIR__, "setup_env.jl"))
+include(joinpath(@__DIR__, "cases.jl"))
+using BenchmarkTools
+
+const LABEL = length(ARGS) >= 1 ? ARGS[1] : "baseline"
+
+# Large enough that the kernel, not call overhead, dominates.
+const BIG_TOTAL = 1 << 22   # ~4M elements
+
+function bench_one(c::Case)
+    sz = sizetuple(c.N, BIG_TOTAL)
+    run = make_runner(c, sz)
+    run()                                    # warm up / compile
+    return @belapsed $run() samples = 30 evals = 1
+end
+
+# Time every case at each thread setting and write one TSV row per (case, nthreads).
+function main()
+    cases = all_cases(;
+        Ns = 2:6,
+        Ts = (Float64, ComplexF64),
+        kinds = (permute, add, reduce_inner, reduce_outer, reduce_full),
+    )
+
+    nthreads_available = Base.Threads.nthreads()
+    thread_settings = nthreads_available > 1 ? (1, nthreads_available) : (1,)
+
+    mkpath(joinpath(@__DIR__, "results"))
+    out = joinpath(@__DIR__, "results", "runtime_$(LABEL).tsv")
+    # The `do` form closes the file even when a benchmark case throws.
+    open(out, "w") do io
+        println(io, "case\tnthreads\ttime_us")
+        for nt in thread_settings
+            Strided.set_num_threads(nt)
+            println("== runtime [$LABEL] nthreads=$nt ==")
+            for c in cases
+                us = bench_one(c) * 1e6   # `bench_one` reports seconds
+                println(io, "$(name(c))\t$nt\t$(round(us; digits = 3))")
+                println(rpad(name(c), 32), "  nt=$nt  ", round(us; digits = 3), " us")
+            end
+        end
+    end
+    println("\nwrote $out")
+end
+
+main()
diff --git a/benchmark/runtime_small.jl b/benchmark/runtime_small.jl
new file mode 100644
index 0000000..a90be81
--- /dev/null
+++ b/benchmark/runtime_small.jl
@@ -0,0 +1,39 @@
+# Small-array runtime benchmark: fixed bookkeeping overhead dominates here, so
+# this is the sensitive guard against de-specialization regressions.
+#
+#   julia --project=benchmark -t 1 benchmark/runtime_small.jl [label]
+include(joinpath(@__DIR__, "setup_env.jl"))
+include(joinpath(@__DIR__, "cases.jl"))
+using BenchmarkTools
+
+const LABEL = length(ARGS) >= 1 ? ARGS[1] : "baseline"
+
+function bench_one(c::Case, sz)
+    run = make_runner(c, sz)
+    run()
+    return @belapsed $run() samples = 200 evals = 5
+end
+
+# Time every case on 4^N arrays, single-threaded, and write one TSV row per case.
+function main()
+    Strided.set_num_threads(1)
+    cases = all_cases(;
+        Ns = 2:4,
+        Ts = (Float64, ComplexF64),
+        kinds = (permute, add, reduce_inner, reduce_outer, reduce_full),
+    )
+    mkpath(joinpath(@__DIR__, "results"))
+    out = joinpath(@__DIR__, "results", "runtime_small_$(LABEL).tsv")
+    # The `do` form closes the file even when a benchmark case throws.
+    open(out, "w") do io
+        println(io, "case\ttime_ns")
+        println("== runtime small [$LABEL] nt=1 ==")
+        for c in cases
+            ns = bench_one(c, ntuple(_ -> 4, c.N)) * 1e9   # 4^N array; s -> ns
+            println(io, "$(name(c))\t$(round(ns; digits = 2))")
+            println(rpad(name(c), 32), "  ", round(ns; digits = 2), " ns")
+        end
+    end
+    println("\nwrote $out")
+end
+main()
diff --git a/benchmark/setup_env.jl b/benchmark/setup_env.jl
new file mode 100644
index 0000000..23dc826
--- /dev/null
+++ b/benchmark/setup_env.jl
@@ -0,0 +1,17 @@
+# Activate a benchmark environment that uses the *local* Strided checkout
+# (the package living one directory up). Works unchanged inside any worktree.
+import Pkg
+Pkg.activate(@__DIR__)
+let root = normpath(joinpath(@__DIR__, ".."))
+    # `develop` is idempotent; re-pointing to the local path each run guarantees
+    # we benchmark this worktree's source rather than a registered version.
+    try
+        Pkg.develop(Pkg.PackageSpec(path = root); io = devnull)
+    catch
+        Pkg.develop(Pkg.PackageSpec(path = root))
+    end
+    if !haskey(Pkg.project().dependencies, "BenchmarkTools")
+        Pkg.add("BenchmarkTools"; io = devnull)
+    end
+end
+Pkg.instantiate(; io = devnull)
diff --git a/benchmark/spec_count.jl b/benchmark/spec_count.jl
new file mode 100644
index 0000000..78e4a4c
--- /dev/null
+++ b/benchmark/spec_count.jl
@@ -0,0 +1,72 @@
+# Count method specializations of the bookkeeping functions after a
+# multi-op / multi-eltype / multi-ndims workload. This is the headline
+# precompile-effectiveness metric: fewer specializations => precompile once
+# per N and reuse.
+#
+#   julia --project=benchmark benchmark/spec_count.jl [label]
+include(joinpath(@__DIR__, "setup_env.jl"))
+include(joinpath(@__DIR__, "cases.jl"))
+using Strided
+
+const LABEL = length(ARGS) >= 1 ? ARGS[1] : "baseline"
+
+"""
+    nspecs(f)
+
+Return the total number of specializations recorded over all methods of `f`.
+"""
+function nspecs(f)
+    return sum(methods(f); init = 0) do m
+        count(!isnothing, Base.specializations(m))
+    end
+end
+
+# distinct map fns and reduce ops (distinct types)
+const MAPFNS = Function[]
+const REDFNS = Function[]
+for i in 1:6
+    push!(MAPFNS, @eval ($(Symbol(:mf_, i)))(x) = x * $i - $i)
+    push!(REDFNS, @eval ($(Symbol(:rf_, i)))(x, y) = x + y * $(i % 3 + 1))
+end
+sz(N) = ntuple(_ -> 3, N)
+
+function workload()
+    Strided.set_num_threads(1)
+    for N in 2:7
+        for T in (Float64, ComplexF64, Float32, ComplexF32)
+            for k in 1:6
+                A = StridedView(rand(T, sz(N)))
+                B = StridedView(zeros(T, sz(N)))
+                map!(MAPFNS[k], B, A)
+                r = StridedView(zeros(T, ntuple(i -> i == 1 ? 1 : 3, N)))
+                Base.mapreducedim!(identity, REDFNS[k], r, A)
+            end
+        end
+    end
+end
+
+function main()
+    workload()
+    fns = Dict(
+        "_mapreduce_fuse!" => Strided._mapreduce_fuse!,
+        "_mapreduce_order!" => Strided._mapreduce_order!,
+        "_mapreduce_block!" => Strided._mapreduce_block!,
+        "_computeblocks" => Strided._computeblocks,
+        "_mapreduce_kernel!" => Strided._mapreduce_kernel!,
+        "_mapreduce_threaded!" => Strided._mapreduce_threaded!,
+        "indexorder" => Strided.indexorder,
+        "totalmemoryregion" => Strided.totalmemoryregion,
+    )
+    mkpath(joinpath(@__DIR__, "results"))
+    out = joinpath(@__DIR__, "results", "specs_$(LABEL).tsv")
+    open(out, "w") do io
+        println(io, "function\tnspecs")
+        for nm in sort(collect(keys(fns)))
+            n = nspecs(fns[nm])
+            println(io, "$nm\t$n")
+            println(rpad(nm, 24), "  ", n)
+        end
+    end
+    println("wrote $out")
+end
+main()
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 9182ee8..fce7f11 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -33,17 +33,17 @@ function Base._mapreduce_dim(f, op, ::NamedTuple{()}, A::StridedView, dims)
 end
 
 function Base.map(
-        f::F, a1::StridedView{<:Any, N},
+        @nospecialize(f), a1::StridedView{<:Any, N},
         A::Vararg{StridedView{<:Any, N}}
-    ) where {F, N}
+    ) where {N}
     T = Base.promote_eltype(a1, A...)
     return map!(f, similar(a1, T), a1, A...)
 end
 
 function Base.map!(
-        f::F, b::StridedView{<:Any, N}, a1::StridedView{<:Any, N},
+        @nospecialize(f), b::StridedView{<:Any, N}, a1::StridedView{<:Any, N},
         A::Vararg{StridedView{<:Any, N}}
-    ) where {F, N}
+    ) where {N}
     dims = size(b)
 
     # Check dimesions
@@ -59,7 +59,7 @@ function Base.map!(
     return b
 end
 
-function _mapreduce(f, op, A::StridedView{T}, nt = nothing) where {T}
+function _mapreduce(@nospecialize(f), @nospecialize(op), A::StridedView{T}, nt = nothing) where {T}
     if isempty(A)
         b = Base.mapreduce_empty(f, op, T)
         return nt === nothing ? b : op(b, nt.init)
@@ -79,7 +79,7 @@ function _mapreduce(f, op, A::StridedView{T}, nt = nothing) where {T}
 end
 
 function Base.mapreducedim!(
-        f, op, b::StridedView{<:Any, N},
+        @nospecialize(f), @nospecialize(op), b::StridedView{<:Any, N},
         a1::StridedView{<:Any, N},
         A::Vararg{StridedView{<:Any, N}}
     ) where {N}
@@ -93,7 +93,7 @@ function Base.mapreducedim!(
 end
 
 function _mapreducedim!(
-        (f), (op), (initop),
+        @nospecialize(f), @nospecialize(op), @nospecialize(initop),
         dims::Dims, arrays::Tuple{Vararg{StridedView}}
     )
     if any(isequal(0), dims)
@@ -106,8 +106,40 @@ function _mapreducedim!(
     return arrays[1]
 end
 
+# ---------------------------------------------------------------------------
+# Bookkeeping: dimension fusion, loop-order selection and cache blocking.
+#
+# `_mapreduce_fuse!`, `_mapreduce_order!`, `_mapreduce_block!` and
+# `_computeblocks` set up the mapreduce: they fuse contiguous dimensions, sort
+# the loop order by cache-importance, and compute cache blocks. None of this
+# logic depends on *what* the map/reduce functions are, only on the array shapes
+# and strides, so `f`/`op`/`initop` are `@nospecialize`d and merely forwarded.
+#
+# This matters for precompilation. The functions themselves are the dominant
+# axis of the specialization explosion: a workload that calls mapreduce with
+# many distinct ops (as TensorOperations does) otherwise forces a fresh
+# compilation of this entire bookkeeping chain per (op, eltype) combination.
+# With `@nospecialize`, the bookkeeping specializes on the array-shape signature
+# (ndims `N`, number of arrays `M`, eltypes) but no longer on the ops, so a
+# precompile workload can compile it once per shape signature and reuse it
+# across every op. The bookkeeping runs once per mapreduce call (coarse
+# granularity), so erasing the op types here is free at runtime.
+#
+# NOTE: the per-array data is deliberately kept as `Tuple`s (`map(strides, …)`,
+# `map(offset, …)`, …) rather than `Vector`s. Tuples stay stack-allocated, which
+# keeps the fixed per-call overhead low — important for small arrays. A variant
+# that carried this data in `M`-erased `Vector`s (so the bookkeeping specialized
+# purely on `N`) was prototyped and rejected: it roughly doubled the small-array
+# call overhead through heap allocation and dynamic dispatch, for only a small
+# extra reduction in compile time over `@nospecialize` alone. The number of
+# remaining method specializations was in fact identical, because both are
+# bounded by the distinct `arrays` tuple types in the workload (the `@generated`
+# kernel genuinely needs that concrete type); erasing it further requires a
+# dynamic-dispatch barrier whose runtime cost was not worth it.
+# ---------------------------------------------------------------------------
+
 function _mapreduce_fuse!(
-        (f), (op), (initop),
+        @nospecialize(f), @nospecialize(op), @nospecialize(initop),
         dims::Dims, arrays::Tuple{Vararg{StridedView}}
     )
     # Fuse dimensions if possible: assume that at least one array, e.g. the output array in
@@ -130,7 +162,7 @@ function _mapreduce_fuse!(
 end
 
 function _mapreduce_order!(
-        (f), (op), (initop),
+        @nospecialize(f), @nospecialize(op), @nospecialize(initop),
         dims, strides, arrays
     )
     M = length(arrays)
@@ -155,16 +187,13 @@ end
 
 const MINTHREADLENGTH = 1 << 15 # minimal length before any kind of threading is applied
 function _mapreduce_block!(
-        (f), (op), (initop),
+        @nospecialize(f), @nospecialize(op), @nospecialize(initop),
         dims, strides, offsets, costs, arrays
     )
     bytestrides = map((s, stride) -> s .* stride, sizeof.(eltype.(arrays)), strides)
     strideorders = map(indexorder, strides)
     blocks = _computeblocks(dims, costs, bytestrides, strideorders)
 
-    # t = @elapsed _computeblocks(dims, costs, bytestrides, strideorders)
-    # println("_computeblocks time: $t")
-
     if get_num_threads() == 1 || prod(dims) <= MINTHREADLENGTH
         _mapreduce_kernel!(f, op, initop, dims, blocks, arrays, strides, offsets)
     elseif op !== nothing && _length(dims, strides[1]) == 1 # complete reduction
@@ -214,7 +243,7 @@ end
 # nthreads: number of threads spacing: extra addition to offset of array 1, to account for
 # reduction
 function _mapreduce_threaded!(
-        (f), (op), (initop),
+        @nospecialize(f), @nospecialize(op), @nospecialize(initop),
         dims, blocks, strides, offsets, costs, arrays, nthreads,
         spacing, taskindex
     )