From d735bbf8c8b3e8fc80262b00cd0f2bda2e16ee79 Mon Sep 17 00:00:00 2001 From: Hendrik Ranocha Date: Fri, 28 Nov 2025 09:12:55 +0100 Subject: [PATCH 1/2] add new test for issue 543 --- test/staticsize.jl | 64 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/test/staticsize.jl b/test/staticsize.jl index 6ff2b0f6..5d14e956 100644 --- a/test/staticsize.jl +++ b/test/staticsize.jl @@ -135,3 +135,67 @@ end @test sum2_10turbo(A) ≈ sum(A) end end + +# Test for Issue #543: W=1 nested VecUnroll store on ARM +# This tests the case where vector width is 1 (scalar) with nested unrolling +function issue543_noavx!(data_out, matrix, data_in) + for j in axes(data_out, 3), i in axes(data_out, 2), v in axes(data_out, 1) + res = zero(eltype(data_out)) + for jj in axes(matrix, 2) + res += matrix[j, jj] * data_in[v, i, jj] + end + data_out[v, i, j] = res + end + return nothing +end + +function issue543_turbo!(data_out, matrix, data_in) + @turbo for j in axes(data_out, 3), i in axes(data_out, 2), v in axes(data_out, 1) + res = zero(eltype(data_out)) + for jj in axes(matrix, 2) + res += matrix[j, jj] * data_in[v, i, jj] + end + data_out[v, i, j] = res + end + return nothing +end + +@testset "Issue #543: W=1 Nested VecUnroll" begin + # Test the specific case that was failing: v=1 (first dim size 1) with n=5 + # This triggers W=1 code paths where VecUnroll stores T instead of Vec{1,T} + for v in [1, 2], n in [4, 5, 6, 7, 8] + data_out_ref = StrideArray(undef, StaticInt(v), StaticInt(n), StaticInt(n)) + data_out_turbo = StrideArray(undef, StaticInt(v), StaticInt(n), StaticInt(n)) + matrix = StrideArray(undef, StaticInt(n), StaticInt(n)) + data_in = rand(v, n, n) + + # Initialize with random data + matrix .= rand.() + + fill!(data_out_ref, 0.0) + fill!(data_out_turbo, 0.0) + + issue543_noavx!(data_out_ref, matrix, data_in) + issue543_turbo!(data_out_turbo, matrix, data_in) + + @test data_out_turbo ≈ data_out_ref + end + + # Also test with non-static 
first dimension but static others + for v in [1, 2], n in [4, 5, 6] + data_out_ref = StrideArray(undef, v, StaticInt(n), StaticInt(n)) + data_out_turbo = StrideArray(undef, v, StaticInt(n), StaticInt(n)) + matrix = StrideArray(undef, StaticInt(n), StaticInt(n)) + data_in = rand(v, n, n) + + matrix .= rand.() + + fill!(data_out_ref, 0.0) + fill!(data_out_turbo, 0.0) + + issue543_noavx!(data_out_ref, matrix, data_in) + issue543_turbo!(data_out_turbo, matrix, data_in) + + @test data_out_turbo ≈ data_out_ref + end +end From f00dff5295f9414890b9ad04bec4271526704498 Mon Sep 17 00:00:00 2001 From: Hendrik Ranocha Date: Thu, 4 Dec 2025 08:28:47 +0100 Subject: [PATCH 2/2] update tests --- test/staticsize.jl | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/test/staticsize.jl b/test/staticsize.jl index 5d14e956..e0902f43 100644 --- a/test/staticsize.jl +++ b/test/staticsize.jl @@ -161,28 +161,32 @@ function issue543_turbo!(data_out, matrix, data_in) end @testset "Issue #543: W=1 Nested VecUnroll" begin - # Test the specific case that was failing: v=1 (first dim size 1) with n=5 - # This triggers W=1 code paths where VecUnroll stores T instead of Vec{1,T} - for v in [1, 2], n in [4, 5, 6, 7, 8] + # Test with static first dimension + for v in 1:4, n in 2:8 data_out_ref = StrideArray(undef, StaticInt(v), StaticInt(n), StaticInt(n)) data_out_turbo = StrideArray(undef, StaticInt(v), StaticInt(n), StaticInt(n)) matrix = StrideArray(undef, StaticInt(n), StaticInt(n)) data_in = rand(v, n, n) - # Initialize with random data matrix .= rand.() fill!(data_out_ref, 0.0) fill!(data_out_turbo, 0.0) issue543_noavx!(data_out_ref, matrix, data_in) - issue543_turbo!(data_out_turbo, matrix, data_in) - @test data_out_turbo ≈ data_out_ref + # This is broken on Apple ARM CPUs (Apple M series) for some reason. + # TODO: Fix the underlying issue! 
+ if (v == 1) && Sys.isapple() && Sys.ARCH == :aarch64 + @test_skip issue543_turbo!(data_out_turbo, matrix, data_in) + else + @test_nowarn issue543_turbo!(data_out_turbo, matrix, data_in) + @test data_out_turbo ≈ data_out_ref + end end - # Also test with non-static first dimension but static others - for v in [1, 2], n in [4, 5, 6] + # Test with non-static first dimension but static other dimensions + for v in 1:4, n in 2:8 data_out_ref = StrideArray(undef, v, StaticInt(n), StaticInt(n)) data_out_turbo = StrideArray(undef, v, StaticInt(n), StaticInt(n)) matrix = StrideArray(undef, StaticInt(n), StaticInt(n))