diff --git a/src/mapreduce.jl b/src/mapreduce.jl index ff8180a..ce5fab5 100644 --- a/src/mapreduce.jl +++ b/src/mapreduce.jl @@ -340,25 +340,14 @@ function _mapreduce_kernel_expr(f, op, initop, N::Int, M::Int) outerreturnstrideex[i] = returnex end + i = 1 if op == Nothing + # Non-reducing (map!/permute/...): each destination element is written once, + # so skip the reduction-only `iszero` hoist below and emit just the direct + # `@simd` loop. ex = Expr(:(=), lhsex, fcallex) - exa = Expr(:(=), :a, fcallex) - else - ex = Expr(:(=), lhsex, Expr(:call, :op, lhsex, fcallex)) - exa = Expr(:(=), :a, Expr(:call, :op, :a, fcallex)) - end - i = 1 - if N >= 1 - ex = quote - if iszero($(stridevars[1, 1])) # explicitly hoist A1[I1] out of loop - a = $lhsex - @simd for $(innerloopvars[i]) in Base.OneTo($(blockdimvars[i])) - $exa - $(stepstride2ex[i]) - end - $lhsex = a - $(returnstride2ex[i]) - else + if N >= 1 + ex = quote @simd for $(innerloopvars[i]) in Base.OneTo($(blockdimvars[i])) $ex $(stepstride1ex[i]) @@ -368,6 +357,30 @@ function _mapreduce_kernel_expr(f, op, initop, N::Int, M::Int) $(returnstride2ex[i]) end end + else + ex = Expr(:(=), lhsex, Expr(:call, :op, lhsex, fcallex)) + exa = Expr(:(=), :a, Expr(:call, :op, :a, fcallex)) + if N >= 1 + ex = quote + if iszero($(stridevars[1, 1])) # explicitly hoist A1[I1] out of loop + a = $lhsex + @simd for $(innerloopvars[i]) in Base.OneTo($(blockdimvars[i])) + $exa + $(stepstride2ex[i]) + end + $lhsex = a + $(returnstride2ex[i]) + else + @simd for $(innerloopvars[i]) in Base.OneTo($(blockdimvars[i])) + $ex + $(stepstride1ex[i]) + $(stepstride2ex[i]) + end + $(returnstride1ex[i]) + $(returnstride2ex[i]) + end + end + end end for outer i in 2:N ex = quote