From 156a51a8f33f8e028009ac47c4887b077962a993 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 15 May 2026 10:57:14 +0200 Subject: [PATCH] cuBLAS: extend norm/norm2 to strided vector views. Generalizes the BLAS-optimized `norm`/`norm2` methods from `DenseCuArray` to `StridedCuVecOrDenseMat`, so 1D strided subarray views also dispatch to `nrm2`. Multi-dim non-contiguous views go through the sum-based fallback in GPUArrays (which now dispatches on `AnyGPUArray`). Resolves JuliaGPU/CUDA.jl#2280, replaces #2302. Co-Authored-By: Claude Opus 4.7 (1M context) --- CUDACore/Project.toml | 2 +- lib/cublas/src/linalg.jl | 6 +++--- lib/cublas/test/level1/core.jl | 13 +++++++++++++ 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/CUDACore/Project.toml b/CUDACore/Project.toml index ad44b0203f..bab293aba3 100644 --- a/CUDACore/Project.toml +++ b/CUDACore/Project.toml @@ -52,7 +52,7 @@ CUDA_Runtime_jll = "0.22" ChainRulesCore = "1" EnzymeCore = "0.8.2" ExprTools = "0.1" -GPUArrays = "11.5" +GPUArrays = "11.5.4" GPUCompiler = "1.10" GPUToolbox = "1.1" KernelAbstractions = "0.9.38" diff --git a/lib/cublas/src/linalg.jl b/lib/cublas/src/linalg.jl index 60cdc49a2c..c138c2ce23 100644 --- a/lib/cublas/src/linalg.jl +++ b/lib/cublas/src/linalg.jl @@ -209,16 +209,16 @@ function LinearAlgebra.:(*)(transx::Transpose{<:Any,<:StridedCuVector{T}}, return dotu(n, x, y) end -function LinearAlgebra.norm(x::DenseCuArray{<:Union{Float16, ComplexF16, CublasFloat}}, +function LinearAlgebra.norm(x::StridedCuVecOrDenseMat{<:Union{Float16, ComplexF16, CublasFloat}}, p::Real=2) if p == 2 return nrm2(x) else - return invoke(norm, Tuple{AbstractGPUArray, Real}, x, p) + return invoke(norm, Tuple{AnyGPUArray, Real}, x, p) end end LinearAlgebra.norm(x::Diagonal{T, <:StridedCuVector{T}}, p::Real=2) where {T<:Union{Float16, ComplexF16, CublasFloat}} = norm(x.diag, p) -LinearAlgebra.norm2(x::DenseCuArray{<:Union{Float16, ComplexF16, CublasFloat}}) = nrm2(x) +LinearAlgebra.norm2(x::StridedCuVecOrDenseMat{<:Union{Float16, ComplexF16, CublasFloat}}) = nrm2(x) LinearAlgebra.BLAS.asum(x::StridedCuArray{<:CublasFloat}) = asum(length(x), x) diff --git a/lib/cublas/test/level1/core.jl b/lib/cublas/test/level1/core.jl index 22e6daec51..0fafe5de92 100644 --- a/lib/cublas/test/level1/core.jl +++ b/lib/cublas/test/level1/core.jl @@ -76,6 +76,19 @@ using LinearAlgebra @test norm(dDx, 2) ≈ norm(Dx, 2) @test norm(dDx, Inf) ≈ norm(Dx, Inf) end + + @testset "norm of strided views" begin # JuliaGPU/CUDA.jl#2280 + # 1D contiguous view: should hit the cuBLAS nrm2 fast path. + x = rand(T, m) + dx = CuArray(x) + @test norm(@view(dx[2:end-1]), 2) ≈ norm(@view(x[2:end-1]), 2) + # Multi-dim non-contiguous view: must avoid scalar iteration. + y = rand(T, 10, 10) + dy = CuArray(y) + @test norm(@view(dy[2:end-1, 2:end-1]), 1) ≈ norm(@view(y[2:end-1, 2:end-1]), 1) + @test norm(@view(dy[2:end-1, 2:end-1]), 2) ≈ norm(@view(y[2:end-1, 2:end-1]), 2) + @test norm(@view(dy[2:end-1, 2:end-1]), Inf) ≈ norm(@view(y[2:end-1, 2:end-1]), Inf) + end end @testset for T in [Float16, ComplexF16]