From 26b00138b459ccb032be463497f0e681bb2aa266 Mon Sep 17 00:00:00 2001 From: lkdvos Date: Tue, 23 Jun 2026 21:04:44 -0400 Subject: [PATCH] Specialize non-reducing kernel: drop reduction-only iszero hoist The dim-1 inner core in `_mapreduce_kernel_expr` carried an `iszero(stride_1_1)` branch that hoists `A1[I1]` out of the `@simd` loop. That hoist only matters for reductions, where the destination stride along the inner dim is zero. For the non-reducing path (`op === nothing`, which the code tests as `op == Nothing`: map!/permute/copy!/fill!/...) every destination element is written exactly once, so the branch is dead and the loop body was needlessly generated twice. Emit the plain `@simd` loop (one body, no runtime branch) for `op === nothing`, keeping the hoist for reductions. Runtime-neutral (no permute regression) and trims a bit of non-reducing compile time. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/mapreduce.jl | 47 ++++++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/src/mapreduce.jl b/src/mapreduce.jl index ff8180a..ce5fab5 100644 --- a/src/mapreduce.jl +++ b/src/mapreduce.jl @@ -340,25 +340,14 @@ function _mapreduce_kernel_expr(f, op, initop, N::Int, M::Int) outerreturnstrideex[i] = returnex end + i = 1 if op == Nothing + # Non-reducing (map!/permute/...): each destination element is written once, + # so skip the reduction-only `iszero` hoist below and emit just the direct + # `@simd` loop. 
ex = Expr(:(=), lhsex, fcallex) - exa = Expr(:(=), :a, fcallex) - else - ex = Expr(:(=), lhsex, Expr(:call, :op, lhsex, fcallex)) - exa = Expr(:(=), :a, Expr(:call, :op, :a, fcallex)) - end - i = 1 - if N >= 1 - ex = quote - if iszero($(stridevars[1, 1])) # explicitly hoist A1[I1] out of loop - a = $lhsex - @simd for $(innerloopvars[i]) in Base.OneTo($(blockdimvars[i])) - $exa - $(stepstride2ex[i]) - end - $lhsex = a - $(returnstride2ex[i]) - else + if N >= 1 + ex = quote @simd for $(innerloopvars[i]) in Base.OneTo($(blockdimvars[i])) $ex $(stepstride1ex[i]) @@ -368,6 +357,30 @@ function _mapreduce_kernel_expr(f, op, initop, N::Int, M::Int) $(returnstride2ex[i]) end end + else + ex = Expr(:(=), lhsex, Expr(:call, :op, lhsex, fcallex)) + exa = Expr(:(=), :a, Expr(:call, :op, :a, fcallex)) + if N >= 1 + ex = quote + if iszero($(stridevars[1, 1])) # explicitly hoist A1[I1] out of loop + a = $lhsex + @simd for $(innerloopvars[i]) in Base.OneTo($(blockdimvars[i])) + $exa + $(stepstride2ex[i]) + end + $lhsex = a + $(returnstride2ex[i]) + else + @simd for $(innerloopvars[i]) in Base.OneTo($(blockdimvars[i])) + $ex + $(stepstride1ex[i]) + $(stepstride2ex[i]) + end + $(returnstride1ex[i]) + $(returnstride2ex[i]) + end + end + end end for outer i in 2:N ex = quote