diff --git a/src/__init__.mojo b/src/__init__.mojo index 014246d..e79fd5a 100644 --- a/src/__init__.mojo +++ b/src/__init__.mojo @@ -11,3 +11,7 @@ from .level1.dotc_device import * from .level1.dotu_device import * from .level1.nrm2_device import * from .level1.iamax_device import * + +from .level2.ger_device import * + +from .testing_utils.testing_utils import * diff --git a/src/level2/__init__.mojo b/src/level2/__init__.mojo new file mode 100644 index 0000000..997c28c --- /dev/null +++ b/src/level2/__init__.mojo @@ -0,0 +1 @@ +from .ger_device import * diff --git a/src/level2/ger_device.mojo b/src/level2/ger_device.mojo new file mode 100644 index 0000000..011e7b9 --- /dev/null +++ b/src/level2/ger_device.mojo @@ -0,0 +1,108 @@ +from gpu import thread_idx, block_idx, block_dim, grid_dim +from gpu.host import DeviceContext +from math import ceildiv + +comptime TBsize = 512 + +# level2.ger +# Computes rank-1 update of given matrix: A := A + αxy' +fn sger_device[ + BLOCK: Int, +]( + m: Int, + n: Int, + alpha: Scalar[DType.float32], + x: UnsafePointer[Scalar[DType.float32], ImmutAnyOrigin], + incx: Int, + y: UnsafePointer[Scalar[DType.float32], ImmutAnyOrigin], + incy: Int, + A: UnsafePointer[Scalar[DType.float32], MutAnyOrigin], + lda: Int, +): + if m < 1 or n < 1: + return + + var global_i = block_dim.x * block_idx.x + thread_idx.x + var n_threads = grid_dim.x * block_dim.x + + var total = m * n + + for idx in range(global_i, total, n_threads): + var row = idx // n + var col = idx % n + + var x_val = x[row * incx] + var y_val = y[col * incy] + + A[row * lda + col] += alpha * x_val * y_val + +fn dger_device[ + BLOCK: Int, +]( + m: Int, + n: Int, + alpha: Scalar[DType.float64], + x: UnsafePointer[Scalar[DType.float64], ImmutAnyOrigin], + incx: Int, + y: UnsafePointer[Scalar[DType.float64], ImmutAnyOrigin], + incy: Int, + A: UnsafePointer[Scalar[DType.float64], MutAnyOrigin], + lda: Int, +): + if m < 1 or n < 1: + return + + var global_i = block_dim.x * block_idx.x + thread_idx.x + var n_threads = grid_dim.x * block_dim.x + + var total = m * n + + for idx in range(global_i, total, n_threads): + var row = idx // n + var col = idx % n + + var x_val = x[row * incx] + var y_val = y[col * incy] + + A[row * lda + col] += alpha * x_val * y_val + +fn blas_ger[dtype: DType]( + m: Int, + n: Int, + alpha: Scalar[dtype], + d_x: UnsafePointer[Scalar[dtype], ImmutAnyOrigin], + incx: Int, + d_y: UnsafePointer[Scalar[dtype], ImmutAnyOrigin], + incy: Int, + d_A: UnsafePointer[Scalar[dtype], MutAnyOrigin], + lda: Int, + ctx: DeviceContext +) raises: + if m < 1 or n < 1: + return + + var total = m * n + + @parameter + if dtype == DType.float32: + ctx.enqueue_function[sger_device[TBsize], sger_device[TBsize]]( + m, n, alpha, + d_x, incx, + d_y, incy, + d_A, lda, + grid_dim=ceildiv(total, TBsize), + block_dim=TBsize, + ) + elif dtype == DType.float64: + ctx.enqueue_function[dger_device[TBsize], dger_device[TBsize]]( + m, n, alpha, + d_x, incx, + d_y, incy, + d_A, lda, + grid_dim=ceildiv(total, TBsize), + block_dim=TBsize, + ) + else: + raise Error("blas_ger: Unsupported type") + + ctx.synchronize() diff --git a/src/testing_utils/__init__.mojo b/src/testing_utils/__init__.mojo new file mode 100644 index 0000000..8975453 --- /dev/null +++ b/src/testing_utils/__init__.mojo @@ -0,0 +1 @@ +from .testing_utils import * diff --git a/src/testing_utils/testing_utils.mojo b/src/testing_utils/testing_utils.mojo new file mode 100644 index 0000000..e7d1117 --- /dev/null +++ b/src/testing_utils/testing_utils.mojo @@ -0,0 +1,18 @@ +from random import rand, seed + +def generate_random_arr[ + dtype: DType, + size: Int +]( + a: UnsafePointer[Scalar[dtype], MutAnyOrigin], + min_value: Scalar[dtype], + max_value: Scalar[dtype] +): + # Generate random values in [0, 1] + seed() + rand[dtype](a, size) + + # Scale to [min, max] + var rng = max_value - min_value + for i in range(size): + a[i] = min_value + a[i] * rng diff --git a/test-level1.mojo b/test-level1.mojo index 01b02a3..665155e 100644 --- a/test-level1.mojo +++ b/test-level1.mojo @@ -14,24 +14,6 @@ from python import Python, PythonObject comptime TBsize = 512 comptime atol = 1.0E-4 -def generate_random_arr[ - dtype: DType, - size: Int -]( - a: UnsafePointer[Scalar[dtype], MutAnyOrigin], - min_value: Scalar[dtype], - max_value: Scalar[dtype] -): - # Generate random values in [0, 1] - seed() - rand[dtype](a, size) - - # Scale to [min, max] - var rng = max_value - min_value - for i in range(size): - a[i] = min_value + a[i] * rng - - def generate_random_scalar[ dtype: DType, ]( diff --git a/test-level2.mojo b/test-level2.mojo new file mode 100644 index 0000000..3466766 --- /dev/null +++ b/test-level2.mojo @@ -0,0 +1,97 @@ +from testing import assert_equal, assert_almost_equal, TestSuite +from sys import has_accelerator +from gpu.host import DeviceContext +from gpu import block_dim, grid_dim, thread_idx +from layout import Layout, LayoutTensor +from math import sqrt + +from src import * +from random import rand, seed, randn_float64 +from math import ceildiv, sin, cos +from python import Python, PythonObject + +comptime TBsize = 512 +comptime atol = 1.0E-5 + + +def ger_test[ + dtype: DType, + m: Int, + n: Int, +](): + with DeviceContext() as ctx: + A_device = ctx.enqueue_create_buffer[dtype](m*n) + A = ctx.enqueue_create_host_buffer[dtype](m*n) + x_device = ctx.enqueue_create_buffer[dtype](m) + x = ctx.enqueue_create_host_buffer[dtype](m) + y_device = ctx.enqueue_create_buffer[dtype](n) + y = ctx.enqueue_create_host_buffer[dtype](n) + + # Generate three arrays of random numbers on CPU + generate_random_arr[dtype, m*n](A.unsafe_ptr(), -100, 100) + generate_random_arr[dtype, m](x.unsafe_ptr(), -100, 100) + generate_random_arr[dtype, n](y.unsafe_ptr(), -100, 100) + + ctx.enqueue_copy(A_device, A) + ctx.enqueue_copy(x_device, x) + ctx.enqueue_copy(y_device, y) + + var alpha = randn_float64(0.0, 1.0) + + # Import SciPy and numpy + sp = Python.import_module("scipy") + np = Python.import_module("numpy") + sp_blas = sp.linalg.blas + + # Move a and b to a SciPy-compatible array and run SciPy BLAS routine + py_a = Python.list() + py_x = Python.list() + py_y = Python.list() + + for i in range(m*n): + py_a.append(A[i]) + for i in range(m): + py_x.append(x[i]) + for i in range(n): + py_y.append(y[i]) + + var sp_res: PythonObject + # ger - float32 + if dtype == DType.float32: + np_a = np.array(py_a, dtype=np.float32).reshape(m,n) + np_x = np.array(py_x, dtype=np.float32) + np_y = np.array(py_y, dtype=np.float32) + sp_res = sp_blas.sger(alpha, np_x, np_y, 1, 1, np_a) + elif dtype == DType.float64: + np_a = np.array(py_a, dtype=np.float64).reshape(m,n) + np_x = np.array(py_x, dtype=np.float64) + np_y = np.array(py_y, dtype=np.float64) + sp_res = sp_blas.dger(alpha, np_x, np_y, 1, 1, np_a) + else: + print("Unsupported type: ", dtype) + return + + blas_ger[dtype]( + m, + n, + Scalar[dtype](alpha), + x_device.unsafe_ptr(), 1, + y_device.unsafe_ptr(), 1, + A_device.unsafe_ptr(), n, + ctx) + + with A_device.map_to_host() as res_mojo: + for i in range(m): + for j in range(n): + assert_almost_equal(Scalar[dtype](py=sp_res[i][j]), res_mojo[(i*n)+j], atol=atol) + + +def test_ger(): + ger_test[DType.float32, 64, 64]() + ger_test[DType.float32, 256, 256]() + ger_test[DType.float64, 64, 64]() + ger_test[DType.float64, 256, 256]() + +def main(): + print("--- MojoBLAS Level 2 routines testing ---") + TestSuite.discover_tests[__functions_in_module()]().run()