From 4368f93dc0a1818995bf16944a37530be744f6a9 Mon Sep 17 00:00:00 2001 From: Jackson Mowry Date: Fri, 20 Feb 2026 11:23:15 +0000 Subject: [PATCH 1/4] ger implementation, 32/64 split --- src/__init__.mojo | 2 + src/level2/__init__.mojo | 1 + src/level2/ger_device.mojo | 129 ++++++++++++++++++++++++++++ test-level2.mojo | 166 +++++++++++++++++++++++++++++++++++++ 4 files changed, 298 insertions(+) create mode 100644 src/level2/__init__.mojo create mode 100644 src/level2/ger_device.mojo create mode 100644 test-level2.mojo diff --git a/src/__init__.mojo b/src/__init__.mojo index 014246d..280bb3c 100644 --- a/src/__init__.mojo +++ b/src/__init__.mojo @@ -11,3 +11,5 @@ from .level1.dotc_device import * from .level1.dotu_device import * from .level1.nrm2_device import * from .level1.iamax_device import * + +from .level2.ger_device import * diff --git a/src/level2/__init__.mojo b/src/level2/__init__.mojo new file mode 100644 index 0000000..997c28c --- /dev/null +++ b/src/level2/__init__.mojo @@ -0,0 +1 @@ +from .ger_device import * diff --git a/src/level2/ger_device.mojo b/src/level2/ger_device.mojo new file mode 100644 index 0000000..81349c6 --- /dev/null +++ b/src/level2/ger_device.mojo @@ -0,0 +1,129 @@ +from gpu import thread_idx, block_idx, block_dim, grid_dim +from gpu.host import DeviceContext +from math import ceildiv + +comptime TBsize = 512 + +# level2.sger +# Computes single-precision rank-1 update of given matrix: A := A + αxy' +fn sger_device[ + BLOCK: Int, +]( + m: Int, + n: Int, + alpha: Scalar[DType.float32], + x: UnsafePointer[Scalar[DType.float32], ImmutAnyOrigin], + incx: Int, + y: UnsafePointer[Scalar[DType.float32], ImmutAnyOrigin], + incy: Int, + A: UnsafePointer[Scalar[DType.float32], MutAnyOrigin], + lda: Int, +): + if m < 1 or n < 1: + return + + var global_i = block_dim.x * block_idx.x + thread_idx.x + var n_threads = grid_dim.x * block_dim.x + + var total = m * n + + for idx in range(global_i, total, n_threads): + var row = idx // n + var col = idx % n + + var x_val = x[row * incx] + var y_val = y[col * incy] + + A[row * lda + col] += alpha * x_val * y_val + +fn blas_sger( + m: Int, + n: Int, + alpha: Scalar[DType.float32], + d_x: UnsafePointer[Scalar[DType.float32], ImmutAnyOrigin], + incx: Int, + d_y: UnsafePointer[Scalar[DType.float32], ImmutAnyOrigin], + incy: Int, + d_A: UnsafePointer[Scalar[DType.float32], MutAnyOrigin], + lda: Int, + ctx: DeviceContext +) raises: + if m < 1 or n < 1: + return + + comptime kernel = sger_device[TBsize] + + var total = m * n + + ctx.enqueue_function[kernel, kernel]( + m, n, alpha, + d_x, incx, + d_y, incy, + d_A, lda, + grid_dim=ceildiv(total, TBsize), + block_dim=TBsize, + ) + + ctx.synchronize() + +# level2.dger +# Computes double-precision rank-1 update of given matrix: A := A + αxy' +fn dger_device[ + BLOCK: Int, +]( + m: Int, + n: Int, + alpha: Scalar[DType.float64], + x: UnsafePointer[Scalar[DType.float64], ImmutAnyOrigin], + incx: Int, + y: UnsafePointer[Scalar[DType.float64], ImmutAnyOrigin], + incy: Int, + A: UnsafePointer[Scalar[DType.float64], MutAnyOrigin], + lda: Int, +): + if m < 1 or n < 1: + return + + var global_i = block_dim.x * block_idx.x + thread_idx.x + var n_threads = grid_dim.x * block_dim.x + + var total = m * n + + for idx in range(global_i, total, n_threads): + var row = idx // n + var col = idx % n + + var x_val = x[row * incx] + var y_val = y[col * incy] + + A[row * lda + col] += alpha * x_val * y_val + +fn blas_dger( + m: Int, + n: Int, + alpha: Scalar[DType.float64], + d_x: UnsafePointer[Scalar[DType.float64], ImmutAnyOrigin], + incx: Int, + d_y: UnsafePointer[Scalar[DType.float64], ImmutAnyOrigin], + incy: Int, + d_A: UnsafePointer[Scalar[DType.float64], MutAnyOrigin], + lda: Int, + ctx: DeviceContext +) raises: + if m < 1 or n < 1: + return + + comptime kernel = dger_device[TBsize] + + var total = m * n + + ctx.enqueue_function[kernel, kernel]( + m, n, alpha, + d_x, incx, + d_y, incy, + d_A, lda, + grid_dim=ceildiv(total, TBsize), + block_dim=TBsize, + ) + + ctx.synchronize() diff --git a/test-level2.mojo b/test-level2.mojo new file mode 100644 index 0000000..e2e8491 --- /dev/null +++ b/test-level2.mojo @@ -0,0 +1,166 @@ +from testing import assert_equal, assert_almost_equal, TestSuite +from sys import has_accelerator +from gpu.host import DeviceContext +from gpu import block_dim, grid_dim, thread_idx +from layout import Layout, LayoutTensor +from math import sqrt +from complex import ComplexSIMD + +from src import * +from random import rand, seed, randn_float64 +from math import ceildiv, sin, cos +from python import Python, PythonObject + +comptime TBsize = 512 +comptime atol = 1.0E-5 + +def generate_random_arr[ + dtype: DType, + size: Int +]( + a: UnsafePointer[Scalar[dtype], MutAnyOrigin], + min_value: Scalar[dtype], + max_value: Scalar[dtype] +): + # Generate random values in [0, 1] + seed() + rand[dtype](a, size) + + # Scale to [min, max] + var rng = max_value - min_value + for i in range(size): + a[i] = min_value + a[i] * rng + + +def sger_test[ + m: Int, + n: Int, +](): + with DeviceContext() as ctx: + A_device = ctx.enqueue_create_buffer[DType.float32](m*n) + A = ctx.enqueue_create_host_buffer[DType.float32](m*n) + x_device = ctx.enqueue_create_buffer[DType.float32](m) + x = ctx.enqueue_create_host_buffer[DType.float32](m) + y_device = ctx.enqueue_create_buffer[DType.float32](n) + y = ctx.enqueue_create_host_buffer[DType.float32](n) + + # Generate two arrays of random numbers on CPU + generate_random_arr[DType.float32, m*n](A.unsafe_ptr(), -100, 100) + generate_random_arr[DType.float32, m](x.unsafe_ptr(), -100, 100) + generate_random_arr[DType.float32, n](y.unsafe_ptr(), -100, 100) + + ctx.enqueue_copy(A_device, A) + ctx.enqueue_copy(x_device, x) + ctx.enqueue_copy(y_device, y) + + var alpha = randn_float64(0.0, 1.0) + + # Import SciPy and numpy + sp = Python.import_module("scipy") + np = Python.import_module("numpy") + sp_blas = sp.linalg.blas + + # Move a and b to a SciPy-compatible array and run SciPy BLAS routine + py_a = Python.list() + py_x = Python.list() + py_y = Python.list() + + for i in range(m*n): + py_a.append(A[i]) + for i in range(m): + py_x.append(x[i]) + for i in range(n): + py_y.append(y[i]) + + var sp_res: PythonObject + # sger - float32 + np_a = np.array(py_a, dtype=np.float32).reshape(m,n) + np_x = np.array(py_x, dtype=np.float32) + np_y = np.array(py_y, dtype=np.float32) + sp_res = sp_blas.sger(alpha, np_x, np_y, 1, 1, np_a) + + blas_sger( + m, + n, + Scalar[DType.float32](alpha), + x_device.unsafe_ptr(), 1, + y_device.unsafe_ptr(), 1, + A_device.unsafe_ptr(), n, + ctx) + + with A_device.map_to_host() as res_mojo: + for i in range(m): + for j in range(n): + assert_almost_equal(Scalar[DType.float32](py=sp_res[i][j]), res_mojo[(i*n)+j], atol=atol) + +def dger_test[ + m: Int, + n: Int, +](): + with DeviceContext() as ctx: + A_device = ctx.enqueue_create_buffer[DType.float64](m*n) + A = ctx.enqueue_create_host_buffer[DType.float64](m*n) + x_device = ctx.enqueue_create_buffer[DType.float64](m) + x = ctx.enqueue_create_host_buffer[DType.float64](m) + y_device = ctx.enqueue_create_buffer[DType.float64](n) + y = ctx.enqueue_create_host_buffer[DType.float64](n) + + # Generate two arrays of random numbers on CPU + generate_random_arr[DType.float64, m*n](A.unsafe_ptr(), -100, 100) + generate_random_arr[DType.float64, m](x.unsafe_ptr(), -100, 100) + generate_random_arr[DType.float64, n](y.unsafe_ptr(), -100, 100) + + ctx.enqueue_copy(A_device, A) + ctx.enqueue_copy(x_device, x) + ctx.enqueue_copy(y_device, y) + + var alpha = randn_float64(0.0, 1.0) + + # Import SciPy and numpy + sp = Python.import_module("scipy") + np = Python.import_module("numpy") + sp_blas = sp.linalg.blas + + # Move a and b to a SciPy-compatible array and run SciPy BLAS routine + py_a = Python.list() + py_x = Python.list() + py_y = Python.list() + + for i in range(m*n): + py_a.append(A[i]) + for i in range(m): + py_x.append(x[i]) + for i in range(n): + py_y.append(y[i]) + + var sp_res: PythonObject + # dger - float64 + np_a = np.array(py_a, dtype=np.float64).reshape(m,n) + np_x = np.array(py_x, dtype=np.float64) + np_y = np.array(py_y, dtype=np.float64) + sp_res = sp_blas.dger(alpha, np_x, np_y, 1, 1, np_a) + blas_dger( + m, + n, + Scalar[DType.float64](alpha), + x_device.unsafe_ptr(), 1, + y_device.unsafe_ptr(), 1, + A_device.unsafe_ptr(), n, + ctx) + + with A_device.map_to_host() as res_mojo: + for i in range(m): + for j in range(n): + assert_almost_equal(Scalar[DType.float64](py=sp_res[i][j]), res_mojo[(i*n)+j], atol=atol) + +def test_sger(): + sger_test[64, 64]() + sger_test[256, 256]() + +def test_dger(): + dger_test[64, 64]() + dger_test[256, 256]() + +def main(): + print("--- MojoBLAS Level 2 routines testing ---") + TestSuite.discover_tests[__functions_in_module()]().run() From 22cb32de2cf520fa7c460fbd30718816544cd130 Mon Sep 17 00:00:00 2001 From: Jackson Mowry Date: Fri, 20 Feb 2026 11:37:03 +0000 Subject: [PATCH 2/4] Refactor out random array generation for all levels of testing --- src/__init__.mojo | 2 ++ src/testing_utils/__init__.mojo | 1 + src/testing_utils/testing_utils.mojo | 18 ++++++++++++++++++ test-level1.mojo | 18 ------------------ test-level2.mojo | 22 ++-------------------- 5 files changed, 23 insertions(+), 38 deletions(-) create mode 100644 src/testing_utils/__init__.mojo create mode 100644 src/testing_utils/testing_utils.mojo diff --git a/src/__init__.mojo b/src/__init__.mojo index 280bb3c..e79fd5a 100644 --- a/src/__init__.mojo +++ b/src/__init__.mojo @@ -13,3 +13,5 @@ from .level1.nrm2_device import * from .level1.iamax_device import * from .level2.ger_device import * + +from .testing_utils.testing_utils import * diff --git a/src/testing_utils/__init__.mojo b/src/testing_utils/__init__.mojo new file mode 100644 index 0000000..8975453 --- /dev/null +++ b/src/testing_utils/__init__.mojo @@ -0,0 +1 @@ +from .testing_utils import * diff --git a/src/testing_utils/testing_utils.mojo b/src/testing_utils/testing_utils.mojo new file mode 100644 index 0000000..e7d1117 --- /dev/null +++ b/src/testing_utils/testing_utils.mojo @@ -0,0 +1,18 @@ +from random import rand, seed + +def generate_random_arr[ + dtype: DType, + size: Int +]( + a: UnsafePointer[Scalar[dtype], MutAnyOrigin], + min_value: Scalar[dtype], + max_value: Scalar[dtype] +): + # Generate random values in [0, 1] + seed() + rand[dtype](a, size) + + # Scale to [min, max] + var rng = max_value - min_value + for i in range(size): + a[i] = min_value + a[i] * rng diff --git a/test-level1.mojo b/test-level1.mojo index 01b02a3..665155e 100644 --- a/test-level1.mojo +++ b/test-level1.mojo @@ -14,24 +14,6 @@ from python import Python, PythonObject comptime TBsize = 512 comptime atol = 1.0E-4 -def generate_random_arr[ - dtype: DType, - size: Int -]( - a: UnsafePointer[Scalar[dtype], MutAnyOrigin], - min_value: Scalar[dtype], - max_value: Scalar[dtype] -): - # Generate random values in [0, 1] - seed() - rand[dtype](a, size) - - # Scale to [min, max] - var rng = max_value - min_value - for i in range(size): - a[i] = min_value + a[i] * rng - - def generate_random_scalar[ dtype: DType, ]( diff --git a/test-level2.mojo b/test-level2.mojo index e2e8491..7c1aa21 100644 --- a/test-level2.mojo +++ b/test-level2.mojo @@ -4,7 +4,6 @@ from gpu.host import DeviceContext from gpu import block_dim, grid_dim, thread_idx from layout import Layout, LayoutTensor from math import sqrt -from complex import ComplexSIMD from src import * from random import rand, seed, randn_float64 @@ -14,23 +13,6 @@ from python import Python, PythonObject comptime TBsize = 512 comptime atol = 1.0E-5 -def generate_random_arr[ - dtype: DType, - size: Int -]( - a: UnsafePointer[Scalar[dtype], MutAnyOrigin], - min_value: Scalar[dtype], - max_value: Scalar[dtype] -): - # Generate random values in [0, 1] - seed() - rand[dtype](a, size) - - # Scale to [min, max] - var rng = max_value - min_value - for i in range(size): - a[i] = min_value + a[i] * rng - def sger_test[ m: Int, @@ -44,7 +26,7 @@ def sger_test[ y_device = ctx.enqueue_create_buffer[DType.float32](n) y = ctx.enqueue_create_host_buffer[DType.float32](n) - # Generate two arrays of random numbers on CPU + # Generate three arrays of random numbers on CPU generate_random_arr[DType.float32, m*n](A.unsafe_ptr(), -100, 100) generate_random_arr[DType.float32, m](x.unsafe_ptr(), -100, 100) generate_random_arr[DType.float32, n](y.unsafe_ptr(), -100, 100) @@ -105,7 +87,7 @@ def dger_test[ y_device = ctx.enqueue_create_buffer[DType.float64](n) y = ctx.enqueue_create_host_buffer[DType.float64](n) - # Generate two arrays of random numbers on CPU + # Generate three arrays of random numbers on CPU generate_random_arr[DType.float64, m*n](A.unsafe_ptr(), -100, 100) generate_random_arr[DType.float64, m](x.unsafe_ptr(), -100, 100) generate_random_arr[DType.float64, n](y.unsafe_ptr(), -100, 100) From e00f3ab723e420289ace4f9b795078b2ef8c0f98 Mon Sep 17 00:00:00 2001 From: Jackson Mowry Date: Fri, 20 Feb 2026 19:36:16 +0000 Subject: [PATCH 3/4] Broken state --- src/level2/ger_device.mojo | 76 +++++++++-------------- test-level2.mojo | 119 +++++++++++-------------------------- 2 files changed, 61 insertions(+), 134 deletions(-) diff --git a/src/level2/ger_device.mojo b/src/level2/ger_device.mojo index 81349c6..bd584e4 100644 --- a/src/level2/ger_device.mojo +++ b/src/level2/ger_device.mojo @@ -4,8 +4,8 @@ from math import ceildiv comptime TBsize = 512 -# level2.sger -# Computes single-precision rank-1 update of given matrix: A := A + αxy' +# level2.ger +# Computes rank-1 update of given matrix: A := A + αxy' fn sger_device[ BLOCK: Int, ]( @@ -36,38 +36,6 @@ fn sger_device[ A[row * lda + col] += alpha * x_val * y_val -fn blas_sger( - m: Int, - n: Int, - alpha: Scalar[DType.float32], - d_x: UnsafePointer[Scalar[DType.float32], ImmutAnyOrigin], - incx: Int, - d_y: UnsafePointer[Scalar[DType.float32], ImmutAnyOrigin], - incy: Int, - d_A: UnsafePointer[Scalar[DType.float32], MutAnyOrigin], - lda: Int, - ctx: DeviceContext -) raises: - if m < 1 or n < 1: - return - - comptime kernel = sger_device[TBsize] - - var total = m * n - - ctx.enqueue_function[kernel, kernel]( - m, n, alpha, - d_x, incx, - d_y, incy, - d_A, lda, - grid_dim=ceildiv(total, TBsize), - block_dim=TBsize, - ) - - ctx.synchronize() - -# level2.dger -# Computes double-precision rank-1 update of given matrix: A := A + αxy' fn dger_device[ BLOCK: Int, ]( @@ -98,32 +66,42 @@ fn dger_device[ A[row * lda + col] += alpha * x_val * y_val -fn blas_dger( +fn blas_ger[dtype: DType]( m: Int, n: Int, - alpha: Scalar[DType.float64], - d_x: UnsafePointer[Scalar[DType.float64], ImmutAnyOrigin], + alpha: Scalar[dtype], + d_x: UnsafePointer[Scalar[dtype], ImmutAnyOrigin], incx: Int, - d_y: UnsafePointer[Scalar[DType.float64], ImmutAnyOrigin], + d_y: UnsafePointer[Scalar[dtype], ImmutAnyOrigin], incy: Int, - d_A: UnsafePointer[Scalar[DType.float64], MutAnyOrigin], + d_A: UnsafePointer[Scalar[dtype], MutAnyOrigin], lda: Int, ctx: DeviceContext ) raises: if m < 1 or n < 1: return - comptime kernel = dger_device[TBsize] - var total = m * n - ctx.enqueue_function[kernel, kernel]( - m, n, alpha, - d_x, incx, - d_y, incy, - d_A, lda, - grid_dim=ceildiv(total, TBsize), - block_dim=TBsize, - ) + if dtype == DType.float32: + ctx.enqueue_function[sger_device[TBsize], sger_device[TBsize]]( + m, n, alpha, + d_x, incx, + d_y, incy, + d_A, lda, + grid_dim=ceildiv(total, TBsize), + block_dim=TBsize, + ) + elif dtype == DType.float64: + ctx.enqueue_function[dger_device[TBsize], dger_device[TBsize]]( + m, n, alpha, + d_x, incx, + d_y, incy, + d_A, lda, + grid_dim=ceildiv(total, TBsize), + block_dim=TBsize, + ) + else: + return ctx.synchronize() diff --git a/test-level2.mojo b/test-level2.mojo index 7c1aa21..343b35a 100644 --- a/test-level2.mojo +++ b/test-level2.mojo @@ -14,22 +14,23 @@ comptime TBsize = 512 comptime atol = 1.0E-5 -def sger_test[ +def ger_test[ + dtype: DType, m: Int, n: Int, ](): with DeviceContext() as ctx: - A_device = ctx.enqueue_create_buffer[DType.float32](m*n) - A = ctx.enqueue_create_host_buffer[DType.float32](m*n) - x_device = ctx.enqueue_create_buffer[DType.float32](m) - x = ctx.enqueue_create_host_buffer[DType.float32](m) - y_device = ctx.enqueue_create_buffer[DType.float32](n) - y = ctx.enqueue_create_host_buffer[DType.float32](n) + A_device = ctx.enqueue_create_buffer[dtype](m*n) + A = ctx.enqueue_create_host_buffer[dtype](m*n) + x_device = ctx.enqueue_create_buffer[dtype](m) + x = ctx.enqueue_create_host_buffer[dtype](m) + y_device = ctx.enqueue_create_buffer[dtype](n) + y = ctx.enqueue_create_host_buffer[dtype](n) # Generate three arrays of random numbers on CPU - generate_random_arr[DType.float32, m*n](A.unsafe_ptr(), -100, 100) - generate_random_arr[DType.float32, m](x.unsafe_ptr(), -100, 100) - generate_random_arr[DType.float32, n](y.unsafe_ptr(), -100, 100) + generate_random_arr[dtype, m*n](A.unsafe_ptr(), -100, 100) + generate_random_arr[dtype, m](x.unsafe_ptr(), -100, 100) + generate_random_arr[dtype, n](y.unsafe_ptr(), -100, 100) ctx.enqueue_copy(A_device, A) ctx.enqueue_copy(x_device, x) @@ -55,76 +56,25 @@ def sger_test[ py_y.append(y[i]) var sp_res: PythonObject - # sger - float32 - np_a = np.array(py_a, dtype=np.float32).reshape(m,n) - np_x = np.array(py_x, dtype=np.float32) - np_y = np.array(py_y, dtype=np.float32) - sp_res = sp_blas.sger(alpha, np_x, np_y, 1, 1, np_a) - - blas_sger( - m, - n, - Scalar[DType.float32](alpha), - x_device.unsafe_ptr(), 1, - y_device.unsafe_ptr(), 1, - A_device.unsafe_ptr(), n, - ctx) - - with A_device.map_to_host() as res_mojo: - for i in range(m): - for j in range(n): - assert_almost_equal(Scalar[DType.float32](py=sp_res[i][j]), res_mojo[(i*n)+j], atol=atol) - -def dger_test[ - m: Int, - n: Int, -](): - with DeviceContext() as ctx: - A_device = ctx.enqueue_create_buffer[DType.float64](m*n) - A = ctx.enqueue_create_host_buffer[DType.float64](m*n) - x_device = ctx.enqueue_create_buffer[DType.float64](m) - x = ctx.enqueue_create_host_buffer[DType.float64](m) - y_device = ctx.enqueue_create_buffer[DType.float64](n) - y = ctx.enqueue_create_host_buffer[DType.float64](n) - - # Generate three arrays of random numbers on CPU - generate_random_arr[DType.float64, m*n](A.unsafe_ptr(), -100, 100) - generate_random_arr[DType.float64, m](x.unsafe_ptr(), -100, 100) - generate_random_arr[DType.float64, n](y.unsafe_ptr(), -100, 100) - - ctx.enqueue_copy(A_device, A) - ctx.enqueue_copy(x_device, x) - ctx.enqueue_copy(y_device, y) - - var alpha = randn_float64(0.0, 1.0) - - # Import SciPy and numpy - sp = Python.import_module("scipy") - np = Python.import_module("numpy") - sp_blas = sp.linalg.blas - - # Move a and b to a SciPy-compatible array and run SciPy BLAS routine - py_a = Python.list() - py_x = Python.list() - py_y = Python.list() - - for i in range(m*n): - py_a.append(A[i]) - for i in range(m): - py_x.append(x[i]) - for i in range(n): - py_y.append(y[i]) - - var sp_res: PythonObject - # dger - float64 - np_a = np.array(py_a, dtype=np.float64).reshape(m,n) - np_x = np.array(py_x, dtype=np.float64) - np_y = np.array(py_y, dtype=np.float64) - sp_res = sp_blas.dger(alpha, np_x, np_y, 1, 1, np_a) - blas_dger( + # ger - float32 + if dtype == DType.float32: + np_a = np.array(py_a, dtype=np.float32).reshape(m,n) + np_x = np.array(py_x, dtype=np.float32) + np_y = np.array(py_y, dtype=np.float32) + sp_res = sp_blas.sger(alpha, np_x, np_y, 1, 1, np_a) + if dtype == DType.float64: + np_a = np.array(py_a, dtype=np.float64).reshape(m,n) + np_x = np.array(py_x, dtype=np.float64) + np_y = np.array(py_y, dtype=np.float64) + sp_res = sp_blas.dger(alpha, np_x, np_y, 1, 1, np_a) + else: + print("Unsupported type: ", dtype) + return + + blas_ger[dtype]( m, n, - Scalar[DType.float64](alpha), + Scalar[dtype](alpha), x_device.unsafe_ptr(), 1, y_device.unsafe_ptr(), 1, A_device.unsafe_ptr(), n, @@ -133,15 +83,14 @@ def dger_test[ with A_device.map_to_host() as res_mojo: for i in range(m): for j in range(n): - assert_almost_equal(Scalar[DType.float64](py=sp_res[i][j]), res_mojo[(i*n)+j], atol=atol) + assert_almost_equal(Scalar[dtype](py=sp_res[i][j]), res_mojo[(i*n)+j], atol=atol) -def test_sger(): - sger_test[64, 64]() - sger_test[256, 256]() -def test_dger(): - dger_test[64, 64]() - dger_test[256, 256]() +def test_ger(): + ger_test[DType.float32, 64, 64]() + # ger_test[DType.float32, 256, 256]() + # ger_test[DType.float64, 64, 64]() + # ger_test[DType.float64, 256, 256]() def main(): print("--- MojoBLAS Level 2 routines testing ---") From 07efeb8f82b0c878ff496b207d6b4e75b4d62d3b Mon Sep 17 00:00:00 2001 From: Jackson Mowry Date: Wed, 25 Feb 2026 11:23:13 +0000 Subject: [PATCH 4/4] Add @parameter for compile-time execution, fix and re-enable test for level 2 --- src/level2/ger_device.mojo | 3 ++- test-level2.mojo | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/level2/ger_device.mojo b/src/level2/ger_device.mojo index bd584e4..011e7b9 100644 --- a/src/level2/ger_device.mojo +++ b/src/level2/ger_device.mojo @@ -83,6 +83,7 @@ fn blas_ger[dtype: DType]( var total = m * n + @parameter if dtype == DType.float32: ctx.enqueue_function[sger_device[TBsize], sger_device[TBsize]]( m, n, alpha, @@ -102,6 +103,6 @@ fn blas_ger[dtype: DType]( block_dim=TBsize, ) else: - return + raise Error("blas_ger: Unsupported type") ctx.synchronize() diff --git a/test-level2.mojo b/test-level2.mojo index 343b35a..3466766 100644 --- a/test-level2.mojo +++ b/test-level2.mojo @@ -62,7 +62,7 @@ def ger_test[ np_x = np.array(py_x, dtype=np.float32) np_y = np.array(py_y, dtype=np.float32) sp_res = sp_blas.sger(alpha, np_x, np_y, 1, 1, np_a) - if dtype == DType.float64: + elif dtype == DType.float64: np_a = np.array(py_a, dtype=np.float64).reshape(m,n) np_x = np.array(py_x, dtype=np.float64) np_y = np.array(py_y, dtype=np.float64) @@ -88,9 +88,9 @@ def ger_test[ def test_ger(): ger_test[DType.float32, 64, 64]() - # ger_test[DType.float32, 256, 256]() - # ger_test[DType.float64, 64, 64]() - # ger_test[DType.float64, 256, 256]() + ger_test[DType.float32, 256, 256]() + ger_test[DType.float64, 64, 64]() + ger_test[DType.float64, 256, 256]() def main(): print("--- MojoBLAS Level 2 routines testing ---")