4 changes: 4 additions & 0 deletions src/__init__.mojo
@@ -11,3 +11,7 @@ from .level1.dotc_device import *
from .level1.dotu_device import *
from .level1.nrm2_device import *
from .level1.iamax_device import *

from .level2.ger_device import *

from .testing_utils.testing_utils import *
1 change: 1 addition & 0 deletions src/level2/__init__.mojo
@@ -0,0 +1 @@
from .ger_device import *
108 changes: 108 additions & 0 deletions src/level2/ger_device.mojo
@@ -0,0 +1,108 @@
from gpu import thread_idx, block_idx, block_dim, grid_dim
from gpu.host import DeviceContext
from math import ceildiv

comptime TBsize = 512

# level2.ger
# Rank-1 update: A := alpha * x * y^T + A, i.e. A[i*lda + j] += alpha * x[i] * y[j]
# for an m x n matrix A stored row-major with row stride lda (note: reference
# BLAS ger assumes column-major storage; here lda is the row stride).
fn sger_device[
    BLOCK: Int,
](
    m: Int,
    n: Int,
    alpha: Scalar[DType.float32],
    x: UnsafePointer[Scalar[DType.float32], ImmutAnyOrigin],
    incx: Int,
    y: UnsafePointer[Scalar[DType.float32], ImmutAnyOrigin],
    incy: Int,
    A: UnsafePointer[Scalar[DType.float32], MutAnyOrigin],
    lda: Int,
):
    if m < 1 or n < 1:
        return

    var global_i = block_dim.x * block_idx.x + thread_idx.x
    var n_threads = grid_dim.x * block_dim.x

    var total = m * n

    # Grid-stride loop: each thread updates elements global_i,
    # global_i + n_threads, ... of the m*n total.
    for idx in range(global_i, total, n_threads):
        var row = idx // n
        var col = idx % n

        var x_val = x[row * incx]
        var y_val = y[col * incy]

        A[row * lda + col] += alpha * x_val * y_val

# Double-precision variant: identical to sger_device except for the element type.
fn dger_device[
    BLOCK: Int,
](
    m: Int,
    n: Int,
    alpha: Scalar[DType.float64],
    x: UnsafePointer[Scalar[DType.float64], ImmutAnyOrigin],
    incx: Int,
    y: UnsafePointer[Scalar[DType.float64], ImmutAnyOrigin],
    incy: Int,
    A: UnsafePointer[Scalar[DType.float64], MutAnyOrigin],
    lda: Int,
):
    if m < 1 or n < 1:
        return

    var global_i = block_dim.x * block_idx.x + thread_idx.x
    var n_threads = grid_dim.x * block_dim.x

    var total = m * n

    for idx in range(global_i, total, n_threads):
        var row = idx // n
        var col = idx % n

        var x_val = x[row * incx]
        var y_val = y[col * incy]

        A[row * lda + col] += alpha * x_val * y_val
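
The two kernels above differ only in element type. A dtype-parameterized variant (a sketch only, not part of this PR) could replace both while keeping the same launch interface:

fn ger_device[
    dtype: DType,
    BLOCK: Int,
](
    m: Int,
    n: Int,
    alpha: Scalar[dtype],
    x: UnsafePointer[Scalar[dtype], ImmutAnyOrigin],
    incx: Int,
    y: UnsafePointer[Scalar[dtype], ImmutAnyOrigin],
    incy: Int,
    A: UnsafePointer[Scalar[dtype], MutAnyOrigin],
    lda: Int,
):
    if m < 1 or n < 1:
        return

    var global_i = block_dim.x * block_idx.x + thread_idx.x
    var n_threads = grid_dim.x * block_dim.x

    # Same grid-stride rank-1 update, generic over the element type
    for idx in range(global_i, m * n, n_threads):
        var row = idx // n
        var col = idx % n
        A[row * lda + col] += alpha * x[row * incx] * y[col * incy]

blas_ger would then dispatch to ger_device[DType.float32, TBsize] and ger_device[DType.float64, TBsize] instead of two hand-written kernels.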

fn blas_ger[dtype: DType](
    m: Int,
    n: Int,
    alpha: Scalar[dtype],
    d_x: UnsafePointer[Scalar[dtype], ImmutAnyOrigin],
    incx: Int,
    d_y: UnsafePointer[Scalar[dtype], ImmutAnyOrigin],
    incy: Int,
    d_A: UnsafePointer[Scalar[dtype], MutAnyOrigin],
    lda: Int,
    ctx: DeviceContext
) raises:
    # Host-side entry point: dispatches to the dtype-specific kernel,
    # launching one thread per matrix element, then synchronizes.
    if m < 1 or n < 1:
        return

    var total = m * n

    @parameter
    if dtype == DType.float32:
        ctx.enqueue_function[sger_device[TBsize], sger_device[TBsize]](
            m, n, alpha,
            d_x, incx,
            d_y, incy,
            d_A, lda,
            grid_dim=ceildiv(total, TBsize),
            block_dim=TBsize,
        )
    elif dtype == DType.float64:
        ctx.enqueue_function[dger_device[TBsize], dger_device[TBsize]](
            m, n, alpha,
            d_x, incx,
            d_y, incy,
            d_A, lda,
            grid_dim=ceildiv(total, TBsize),
            block_dim=TBsize,
        )
    else:
        raise Error("blas_ger: Unsupported type")

    ctx.synchronize()
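
For reference, a minimal host-side usage sketch (hypothetical, not part of this PR; it assumes the same DeviceContext buffer helpers exercised in test-level2.mojo below):

def example_ger() raises:
    comptime m = 4
    comptime n = 3
    with DeviceContext() as ctx:
        var A = ctx.enqueue_create_buffer[DType.float32](m * n)
        var x = ctx.enqueue_create_buffer[DType.float32](m)
        var y = ctx.enqueue_create_buffer[DType.float32](n)
        # ... fill A, x, y (e.g. copy from host buffers) ...
        blas_ger[DType.float32](
            m, n, 2.0,
            x.unsafe_ptr(), 1,
            y.unsafe_ptr(), 1,
            A.unsafe_ptr(), n,  # lda = n for a packed row-major matrix
            ctx)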
1 change: 1 addition & 0 deletions src/testing_utils/__init__.mojo
@@ -0,0 +1 @@
from .testing_utils import *
18 changes: 18 additions & 0 deletions src/testing_utils/testing_utils.mojo
@@ -0,0 +1,18 @@
from random import rand, seed

def generate_random_arr[
    dtype: DType,
    size: Int
](
    a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
    min_value: Scalar[dtype],
    max_value: Scalar[dtype]
):
    # Generate random values in [0, 1)
    seed()
    rand[dtype](a, size)

    # Scale to [min_value, max_value)
    var rng = max_value - min_value
    for i in range(size):
        a[i] = min_value + a[i] * rng
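
A hypothetical usage example (the buffer name and sizes are illustrative only):

comptime size = 8
var buf = UnsafePointer[Scalar[DType.float32]].alloc(size)
generate_random_arr[DType.float32, size](buf, -100, 100)  # values in [-100, 100)
# ... use buf ...
buf.free()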
18 changes: 0 additions & 18 deletions test-level1.mojo
@@ -14,24 +14,6 @@ from python import Python, PythonObject
comptime TBsize = 512
comptime atol = 1.0E-4

def generate_random_arr[
    dtype: DType,
    size: Int
](
    a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
    min_value: Scalar[dtype],
    max_value: Scalar[dtype]
):
    # Generate random values in [0, 1]
    seed()
    rand[dtype](a, size)

    # Scale to [min, max]
    var rng = max_value - min_value
    for i in range(size):
        a[i] = min_value + a[i] * rng


def generate_random_scalar[
    dtype: DType,
](
97 changes: 97 additions & 0 deletions test-level2.mojo
@@ -0,0 +1,97 @@
from testing import assert_equal, assert_almost_equal, TestSuite
from sys import has_accelerator
from gpu.host import DeviceContext
from gpu import block_dim, grid_dim, thread_idx
from layout import Layout, LayoutTensor
from math import sqrt

from src import *
from random import rand, seed, randn_float64
from math import ceildiv, sin, cos
from python import Python, PythonObject

comptime TBsize = 512
comptime atol = 1.0E-5


def ger_test[
    dtype: DType,
    m: Int,
    n: Int,
]():
    with DeviceContext() as ctx:
        A_device = ctx.enqueue_create_buffer[dtype](m*n)
        A = ctx.enqueue_create_host_buffer[dtype](m*n)
        x_device = ctx.enqueue_create_buffer[dtype](m)
        x = ctx.enqueue_create_host_buffer[dtype](m)
        y_device = ctx.enqueue_create_buffer[dtype](n)
        y = ctx.enqueue_create_host_buffer[dtype](n)

        # Generate three arrays of random numbers on the CPU
        generate_random_arr[dtype, m*n](A.unsafe_ptr(), -100, 100)
        generate_random_arr[dtype, m](x.unsafe_ptr(), -100, 100)
        generate_random_arr[dtype, n](y.unsafe_ptr(), -100, 100)

        ctx.enqueue_copy(A_device, A)
        ctx.enqueue_copy(x_device, x)
        ctx.enqueue_copy(y_device, y)

        var alpha = randn_float64(0.0, 1.0)

        # Import SciPy and NumPy
        sp = Python.import_module("scipy")
        np = Python.import_module("numpy")
        sp_blas = sp.linalg.blas

        # Copy A, x, and y into SciPy-compatible arrays and run the
        # reference SciPy BLAS routine
        py_a = Python.list()
        py_x = Python.list()
        py_y = Python.list()

        for i in range(m*n):
            py_a.append(A[i])
        for i in range(m):
            py_x.append(x[i])
        for i in range(n):
            py_y.append(y[i])

        var sp_res: PythonObject
        if dtype == DType.float32:
            np_a = np.array(py_a, dtype=np.float32).reshape(m,n)
            np_x = np.array(py_x, dtype=np.float32)
            np_y = np.array(py_y, dtype=np.float32)
            sp_res = sp_blas.sger(alpha, np_x, np_y, 1, 1, np_a)
        elif dtype == DType.float64:
            np_a = np.array(py_a, dtype=np.float64).reshape(m,n)
            np_x = np.array(py_x, dtype=np.float64)
            np_y = np.array(py_y, dtype=np.float64)
            sp_res = sp_blas.dger(alpha, np_x, np_y, 1, 1, np_a)
        else:
            print("Unsupported type: ", dtype)
            return

        # Run the Mojo kernel on the same inputs
        blas_ger[dtype](
            m,
            n,
            Scalar[dtype](alpha),
            x_device.unsafe_ptr(), 1,
            y_device.unsafe_ptr(), 1,
            A_device.unsafe_ptr(), n,  # lda = n: row stride of the row-major A
            ctx)

        # Compare the Mojo result against SciPy element by element
        with A_device.map_to_host() as res_mojo:
            for i in range(m):
                for j in range(n):
                    assert_almost_equal(Scalar[dtype](py=sp_res[i][j]), res_mojo[(i*n)+j], atol=atol)


def test_ger():
    ger_test[DType.float32, 64, 64]()
    ger_test[DType.float32, 256, 256]()
    ger_test[DType.float64, 64, 64]()
    ger_test[DType.float64, 256, 256]()

def main():
    print("--- MojoBLAS Level 2 routines testing ---")
    TestSuite.discover_tests[__functions_in_module()]().run()
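
Assuming a GPU-enabled Mojo toolchain with SciPy available to the embedded Python interpreter, the suite can be run directly with: mojo test-level2.mojo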