From 0f828cae59002468bec09eee672ceefb475c7ce8 Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Wed, 24 Jun 2026 20:41:08 -0400
Subject: [PATCH 1/3] Add unit-stride fast path to _mapreduce_kernel!

The innermost (vectorized) loop dimension steps the parent indices by the
arrays' strides, which are runtime values. Even when the data is contiguous,
the compiler cannot prove unit stride, so it auto-vectorizes the loop with
gather/scatter instructions, which address each element separately rather
than issuing contiguous full-width vector loads/stores. For a contiguous
400x400 Float64 `copy!` this runs at ~8.5 GB/s (~300 us) instead of the
~33 GB/s a contiguous SIMD loop achieves.

Add a runtime branch: when every array is contiguous along loop dimension 1
(all innermost strides == 1), step the indices by the literal `1` so the
compiler emits streaming SIMD loads/stores. The post-loop index correction
reuses the existing return-stride expressions unchanged: because every stride
equals 1 in this branch, they rewind the indices by exactly the amount the
literal increments advanced them.

Measured (contiguous 400x400 Float64, single thread): `copy!` 300 us -> 91 us
(~3.3x), matching the compile-time-constant-stride ideal. Non-contiguous
(e.g. transposed) inputs are unaffected and keep the existing path. Full test
suite passes (single- and multi-threaded).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/mapreduce.jl | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index ff8180a..9952134 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -340,6 +340,23 @@ function _mapreduce_kernel_expr(f, op, initop, N::Int, M::Int)
         outerreturnstrideex[i] = returnex
     end
 
+    # Unit-stride fast path for the innermost (vectorized) loop dimension. The strides are
+    # runtime values, so even when the data is contiguous the compiler cannot prove unit
+    # stride and falls back to gather/scatter SIMD (which does not stream memory). When every
+    # array is contiguous along loop dimension 1 we instead step the parent indices by the
+    # literal `1`, letting the compiler emit contiguous SIMD loads/stores — up to ~3x faster
+    # for contiguous data. The post-loop index correction reuses the regular return-stride
+    # expressions, which are numerically identical here because the stride equals 1.
+    unitstep1ex = :($(Ivars[1]) += 1)
+    unitstep2ex = Expr(:block)
+    for j in 2:M
+        push!(unitstep2ex.args, :($(Ivars[j]) += 1))
+    end
+    unitstridecond = reduce(
+        (a, b) -> :($a && $b),
+        [:($(stridevars[1, j]) == 1) for j in 1:M]
+    )
+
     if op == Nothing
         ex = Expr(:(=), lhsex, fcallex)
         exa = Expr(:(=), :a, fcallex)
@@ -358,6 +375,14 @@ function _mapreduce_kernel_expr(f, op, initop, N::Int, M::Int)
                 end
                 $lhsex = a
                 $(returnstride2ex[i])
+            elseif $unitstridecond
+                @simd for $(innerloopvars[i]) in Base.OneTo($(blockdimvars[i]))
+                    $ex
+                    $unitstep1ex
+                    $unitstep2ex
+                end
+                $(returnstride1ex[i])
+                $(returnstride2ex[i])
             else
                 @simd for $(innerloopvars[i]) in Base.OneTo($(blockdimvars[i]))
                     $ex

From a302147b3f3f09fb6eed6f97f114a990ece167f0 Mon Sep 17 00:00:00 2001
From: Lukas Devos <ldevos98@gmail.com>
Date: Wed, 24 Jun 2026 21:35:16 -0400
Subject: [PATCH 2/3] Improve comments in mapreduce.jl

Condense the comment on the unit-stride fast path in
`_mapreduce_kernel_expr` for clarity and conciseness.
---
 src/mapreduce.jl | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 9952134..b08b6aa 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -340,13 +340,9 @@ function _mapreduce_kernel_expr(f, op, initop, N::Int, M::Int)
         outerreturnstrideex[i] = returnex
     end
 
-    # Unit-stride fast path for the innermost (vectorized) loop dimension. The strides are
-    # runtime values, so even when the data is contiguous the compiler cannot prove unit
-    # stride and falls back to gather/scatter SIMD (which does not stream memory). When every
-    # array is contiguous along loop dimension 1 we instead step the parent indices by the
-    # literal `1`, letting the compiler emit contiguous SIMD loads/stores — up to ~3x faster
-    # for contiguous data. The post-loop index correction reuses the regular return-stride
-    # expressions, which are numerically identical here because the stride equals 1.
+    # Unit-stride fast path for the innermost (vectorized) loop dimension.
+    # We special-case for contiguous loads/stores to avoid SIMD gather/scatter
+    # in favor of SIMD load/store, which streams memory more efficiently.
     unitstep1ex = :($(Ivars[1]) += 1)
     unitstep2ex = Expr(:block)
     for j in 2:M

From 3d605d1058487f569d0428dcc52de4e4b3df3ec7 Mon Sep 17 00:00:00 2001
From: lkdvos <ldevos98@gmail.com>
Date: Thu, 25 Jun 2026 20:40:18 -0400
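Subject: [PATCH 3/3] Simplify first-stride condition

Build the unit-stride check as `all(==(1), (strides...))` over a tuple of the
first-dimension strides instead of folding `== 1` comparisons into an `&&`
chain with `reduce`. The condition tested at runtime is unchanged.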
---
 src/mapreduce.jl | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index b08b6aa..d7c4659 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -348,10 +348,8 @@ function _mapreduce_kernel_expr(f, op, initop, N::Int, M::Int)
     for j in 2:M
         push!(unitstep2ex.args, :($(Ivars[j]) += 1))
     end
-    unitstridecond = reduce(
-        (a, b) -> :($a && $b),
-        [:($(stridevars[1, j]) == 1) for j in 1:M]
-    )
+    firststrides = Expr(:tuple, (stridevars[1, j] for j in 1:M)...)
+    unitstridecond = :(all(==(1), $firststrides))
 
     if op == Nothing
         ex = Expr(:(=), lhsex, fcallex)