4 changes: 4 additions & 0 deletions src/__init__.mojo
@@ -11,3 +11,7 @@ from .level1.dotc_device import *
from .level1.dotu_device import *
from .level1.nrm2_device import *
from .level1.iamax_device import *

from .level2.ger_device import *

from .testing_utils.testing_utils import *
1 change: 1 addition & 0 deletions src/level2/__init__.mojo
@@ -0,0 +1 @@
from .ger_device import *
108 changes: 108 additions & 0 deletions src/level2/ger_device.mojo
@@ -0,0 +1,108 @@
from gpu import thread_idx, block_idx, block_dim, grid_dim
from gpu.host import DeviceContext
from math import ceildiv

comptime TBsize = 512

# level2.ger
# Rank-1 update: A := alpha * x * y^T + A, i.e. A[i*lda + j] += alpha * x[i] * y[j]
# for an m x n matrix A stored row-major with row stride lda (note: reference
# BLAS ger assumes column-major storage; here lda is the row stride).
fn sger_device[
    BLOCK: Int,
](
    m: Int,
    n: Int,
    alpha: Scalar[DType.float32],
    x: UnsafePointer[Scalar[DType.float32], ImmutAnyOrigin],
    incx: Int,
    y: UnsafePointer[Scalar[DType.float32], ImmutAnyOrigin],
    incy: Int,
    A: UnsafePointer[Scalar[DType.float32], MutAnyOrigin],
    lda: Int,
):
    if m < 1 or n < 1:
        return

    var global_i = block_dim.x * block_idx.x + thread_idx.x
    var n_threads = grid_dim.x * block_dim.x

    var total = m * n

    # Grid-stride loop: each thread updates elements global_i,
    # global_i + n_threads, ... of the m*n total.
    for idx in range(global_i, total, n_threads):
        var row = idx // n
        var col = idx % n

        var x_val = x[row * incx]
        var y_val = y[col * incy]

        A[row * lda + col] += alpha * x_val * y_val

# Double-precision variant: identical to sger_device except for the element type.
fn dger_device[
    BLOCK: Int,
](
    m: Int,
    n: Int,
    alpha: Scalar[DType.float64],
    x: UnsafePointer[Scalar[DType.float64], ImmutAnyOrigin],
    incx: Int,
    y: UnsafePointer[Scalar[DType.float64], ImmutAnyOrigin],
    incy: Int,
    A: UnsafePointer[Scalar[DType.float64], MutAnyOrigin],
    lda: Int,
):
    if m < 1 or n < 1:
        return

    var global_i = block_dim.x * block_idx.x + thread_idx.x
    var n_threads = grid_dim.x * block_dim.x

    var total = m * n

    for idx in range(global_i, total, n_threads):
        var row = idx // n
        var col = idx % n

        var x_val = x[row * incx]
        var y_val = y[col * incy]

        A[row * lda + col] += alpha * x_val * y_val
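
The two kernels above differ only in element type. A dtype-parameterized variant (a sketch only, not part of this PR) could replace both while keeping the same launch interface:

fn ger_device[
    dtype: DType,
    BLOCK: Int,
](
    m: Int,
    n: Int,
    alpha: Scalar[dtype],
    x: UnsafePointer[Scalar[dtype], ImmutAnyOrigin],
    incx: Int,
    y: UnsafePointer[Scalar[dtype], ImmutAnyOrigin],
    incy: Int,
    A: UnsafePointer[Scalar[dtype], MutAnyOrigin],
    lda: Int,
):
    if m < 1 or n < 1:
        return

    var global_i = block_dim.x * block_idx.x + thread_idx.x
    var n_threads = grid_dim.x * block_dim.x

    # Same grid-stride rank-1 update, generic over the element type
    for idx in range(global_i, m * n, n_threads):
        var row = idx // n
        var col = idx % n
        A[row * lda + col] += alpha * x[row * incx] * y[col * incy]

blas_ger would then dispatch to ger_device[DType.float32, TBsize] and ger_device[DType.float64, TBsize] instead of two hand-written kernels.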

fn blas_ger[dtype: DType](
    m: Int,
    n: Int,
    alpha: Scalar[dtype],
    d_x: UnsafePointer[Scalar[dtype], ImmutAnyOrigin],
    incx: Int,
    d_y: UnsafePointer[Scalar[dtype], ImmutAnyOrigin],
    incy: Int,
    d_A: UnsafePointer[Scalar[dtype], MutAnyOrigin],
    lda: Int,
    ctx: DeviceContext
) raises:
    # Host-side entry point: dispatches to the dtype-specific kernel,
    # launching one thread per matrix element, then synchronizes.
    if m < 1 or n < 1:
        return

    var total = m * n

    @parameter
    if dtype == DType.float32:
        ctx.enqueue_function[sger_device[TBsize], sger_device[TBsize]](
            m, n, alpha,
            d_x, incx,
            d_y, incy,
            d_A, lda,
            grid_dim=ceildiv(total, TBsize),
            block_dim=TBsize,
        )
    elif dtype == DType.float64:
        ctx.enqueue_function[dger_device[TBsize], dger_device[TBsize]](
            m, n, alpha,
            d_x, incx,
            d_y, incy,
            d_A, lda,
            grid_dim=ceildiv(total, TBsize),
            block_dim=TBsize,
        )
    else:
        raise Error("blas_ger: Unsupported type")

    ctx.synchronize()
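
For reference, a minimal host-side usage sketch (hypothetical, not part of this PR; it assumes the same DeviceContext buffer helpers exercised in test-level2.mojo below):

def example_ger() raises:
    comptime m = 4
    comptime n = 3
    with DeviceContext() as ctx:
        var A = ctx.enqueue_create_buffer[DType.float32](m * n)
        var x = ctx.enqueue_create_buffer[DType.float32](m)
        var y = ctx.enqueue_create_buffer[DType.float32](n)
        # ... fill A, x, y (e.g. copy from host buffers) ...
        blas_ger[DType.float32](
            m, n, 2.0,
            x.unsafe_ptr(), 1,
            y.unsafe_ptr(), 1,
            A.unsafe_ptr(), n,  # lda = n for a packed row-major matrix
            ctx)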
1 change: 1 addition & 0 deletions src/testing_utils/__init__.mojo
@@ -0,0 +1 @@
from .testing_utils import *
18 changes: 18 additions & 0 deletions src/testing_utils/testing_utils.mojo
@@ -0,0 +1,18 @@
from random import rand, seed

def generate_random_arr[
    dtype: DType,
    size: Int
](
    a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
    min_value: Scalar[dtype],
    max_value: Scalar[dtype]
):
    # Generate random values in [0, 1)
    seed()
    rand[dtype](a, size)

    # Scale to [min_value, max_value)
    var rng = max_value - min_value
    for i in range(size):
        a[i] = min_value + a[i] * rng
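
A hypothetical usage example (the buffer name and sizes are illustrative only):

comptime size = 8
var buf = UnsafePointer[Scalar[DType.float32]].alloc(size)
generate_random_arr[DType.float32, size](buf, -100, 100)  # values in [-100, 100)
# ... use buf ...
buf.free()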
18 changes: 0 additions & 18 deletions test-level1.mojo
@@ -14,24 +14,6 @@ from python import Python, PythonObject
comptime TBsize = 512
comptime atol = 1.0E-4

def generate_random_arr[
    dtype: DType,
    size: Int
](
    a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
    min_value: Scalar[dtype],
    max_value: Scalar[dtype]
):
    # Generate random values in [0, 1]
    seed()
    rand[dtype](a, size)

    # Scale to [min, max]
    var rng = max_value - min_value
    for i in range(size):
        a[i] = min_value + a[i] * rng


def generate_random_scalar[
    dtype: DType,
](
97 changes: 97 additions & 0 deletions test-level2.mojo
@@ -0,0 +1,97 @@
from testing import assert_equal, assert_almost_equal, TestSuite
from sys import has_accelerator
from gpu.host import DeviceContext
from gpu import block_dim, grid_dim, thread_idx
from layout import Layout, LayoutTensor
from math import sqrt

from src import *
from random import rand, seed, randn_float64
from math import ceildiv, sin, cos
from python import Python, PythonObject

comptime TBsize = 512
comptime atol = 1.0E-5


def ger_test[
    dtype: DType,
    m: Int,
    n: Int,
]():
    with DeviceContext() as ctx:
        A_device = ctx.enqueue_create_buffer[dtype](m*n)
        A = ctx.enqueue_create_host_buffer[dtype](m*n)
        x_device = ctx.enqueue_create_buffer[dtype](m)
        x = ctx.enqueue_create_host_buffer[dtype](m)
        y_device = ctx.enqueue_create_buffer[dtype](n)
        y = ctx.enqueue_create_host_buffer[dtype](n)

        # Generate three arrays of random numbers on the CPU
        generate_random_arr[dtype, m*n](A.unsafe_ptr(), -100, 100)
        generate_random_arr[dtype, m](x.unsafe_ptr(), -100, 100)
        generate_random_arr[dtype, n](y.unsafe_ptr(), -100, 100)

        ctx.enqueue_copy(A_device, A)
        ctx.enqueue_copy(x_device, x)
        ctx.enqueue_copy(y_device, y)

        var alpha = randn_float64(0.0, 1.0)

        # Import SciPy and NumPy
        sp = Python.import_module("scipy")
        np = Python.import_module("numpy")
        sp_blas = sp.linalg.blas

        # Copy A, x, and y into SciPy-compatible arrays and run the
        # reference SciPy BLAS routine
        py_a = Python.list()
        py_x = Python.list()
        py_y = Python.list()

        for i in range(m*n):
            py_a.append(A[i])
        for i in range(m):
            py_x.append(x[i])
        for i in range(n):
            py_y.append(y[i])

        var sp_res: PythonObject
        if dtype == DType.float32:
            np_a = np.array(py_a, dtype=np.float32).reshape(m,n)
            np_x = np.array(py_x, dtype=np.float32)
            np_y = np.array(py_y, dtype=np.float32)
            sp_res = sp_blas.sger(alpha, np_x, np_y, 1, 1, np_a)
        elif dtype == DType.float64:
            np_a = np.array(py_a, dtype=np.float64).reshape(m,n)
            np_x = np.array(py_x, dtype=np.float64)
            np_y = np.array(py_y, dtype=np.float64)
            sp_res = sp_blas.dger(alpha, np_x, np_y, 1, 1, np_a)
        else:
            print("Unsupported type: ", dtype)
            return

        # Run the Mojo kernel on the same inputs
        blas_ger[dtype](
            m,
            n,
            Scalar[dtype](alpha),
            x_device.unsafe_ptr(), 1,
            y_device.unsafe_ptr(), 1,
            A_device.unsafe_ptr(), n,  # lda = n: row stride of the row-major A
            ctx)

        # Compare the Mojo result against SciPy element by element
        with A_device.map_to_host() as res_mojo:
            for i in range(m):
                for j in range(n):
                    assert_almost_equal(Scalar[dtype](py=sp_res[i][j]), res_mojo[(i*n)+j], atol=atol)


def test_ger():
    ger_test[DType.float32, 64, 64]()
    ger_test[DType.float32, 256, 256]()
    ger_test[DType.float64, 64, 64]()
    ger_test[DType.float64, 256, 256]()

def main():
    print("--- MojoBLAS Level 2 routines testing ---")
    TestSuite.discover_tests[__functions_in_module()]().run()
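
Assuming a GPU-enabled Mojo toolchain with SciPy available to the embedded Python interpreter, the suite can be run directly with: mojo test-level2.mojo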