From 4368f93dc0a1818995bf16944a37530be744f6a9 Mon Sep 17 00:00:00 2001
From: Jackson Mowry <jacksonm@duck.com>
Date: Fri, 20 Feb 2026 11:23:15 +0000
Subject: [PATCH 1/4] ger implementation, 32/64 split

---
 src/__init__.mojo          |   2 +
 src/level2/__init__.mojo   |   1 +
 src/level2/ger_device.mojo | 129 ++++++++++++++++++++++++++++
 test-level2.mojo           | 166 +++++++++++++++++++++++++++++++++++++
 4 files changed, 298 insertions(+)
 create mode 100644 src/level2/__init__.mojo
 create mode 100644 src/level2/ger_device.mojo
 create mode 100644 test-level2.mojo

diff --git a/src/__init__.mojo b/src/__init__.mojo
index 014246d..280bb3c 100644
--- a/src/__init__.mojo
+++ b/src/__init__.mojo
@@ -11,3 +11,5 @@ from .level1.dotc_device import *
 from .level1.dotu_device import *
 from .level1.nrm2_device import *
 from .level1.iamax_device import *
+
+from .level2.ger_device import *
diff --git a/src/level2/__init__.mojo b/src/level2/__init__.mojo
new file mode 100644
index 0000000..997c28c
--- /dev/null
+++ b/src/level2/__init__.mojo
@@ -0,0 +1 @@
+from .ger_device import *
diff --git a/src/level2/ger_device.mojo b/src/level2/ger_device.mojo
new file mode 100644
index 0000000..81349c6
--- /dev/null
+++ b/src/level2/ger_device.mojo
@@ -0,0 +1,129 @@
+from gpu import thread_idx, block_idx, block_dim, grid_dim
+from gpu.host import DeviceContext
+from math import ceildiv
+
+comptime TBsize = 512
+
+# level2.sger
+# Computes single-precision rank-1 update of given matrix: A := A + αxy'
+fn sger_device[
+    BLOCK: Int,
+](
+    m: Int,
+    n: Int,
+    alpha: Scalar[DType.float32],
+    x: UnsafePointer[Scalar[DType.float32], ImmutAnyOrigin],
+    incx: Int,
+    y: UnsafePointer[Scalar[DType.float32], ImmutAnyOrigin],
+    incy: Int,
+    A: UnsafePointer[Scalar[DType.float32], MutAnyOrigin],
+    lda: Int,
+):
+    if m < 1 or n < 1:
+        return
+
+    var global_i = block_dim.x * block_idx.x + thread_idx.x
+    var n_threads = grid_dim.x * block_dim.x
+
+    var total = m * n
+
+    for idx in range(global_i, total, n_threads):
+        var row = idx // n
+        var col = idx % n
+
+        var x_val = x[row * incx]
+        var y_val = y[col * incy]
+
+        A[row * lda + col] += alpha * x_val * y_val
+
+fn blas_sger(
+    m: Int,
+    n: Int,
+    alpha: Scalar[DType.float32],
+    d_x: UnsafePointer[Scalar[DType.float32], ImmutAnyOrigin],
+    incx: Int,
+    d_y: UnsafePointer[Scalar[DType.float32], ImmutAnyOrigin],
+    incy: Int,
+    d_A: UnsafePointer[Scalar[DType.float32], MutAnyOrigin],
+    lda: Int,
+    ctx: DeviceContext
+) raises:
+    if m < 1 or n < 1:
+        return
+    
+    comptime kernel = sger_device[TBsize]
+
+    var total = m * n
+
+    ctx.enqueue_function[kernel, kernel](
+        m, n, alpha,
+        d_x, incx,
+        d_y, incy,
+        d_A, lda,
+        grid_dim=ceildiv(total, TBsize),
+        block_dim=TBsize,
+    )
+
+    ctx.synchronize()
+
+# level2.dger
+# Computes double-precision rank-1 update of given matrix: A := A + αxy'
+fn dger_device[
+    BLOCK: Int,
+](
+    m: Int,
+    n: Int,
+    alpha: Scalar[DType.float64],
+    x: UnsafePointer[Scalar[DType.float64], ImmutAnyOrigin],
+    incx: Int,
+    y: UnsafePointer[Scalar[DType.float64], ImmutAnyOrigin],
+    incy: Int,
+    A: UnsafePointer[Scalar[DType.float64], MutAnyOrigin],
+    lda: Int,
+):
+    if m < 1 or n < 1:
+        return
+
+    var global_i = block_dim.x * block_idx.x + thread_idx.x
+    var n_threads = grid_dim.x * block_dim.x
+
+    var total = m * n
+
+    for idx in range(global_i, total, n_threads):
+        var row = idx // n
+        var col = idx % n
+
+        var x_val = x[row * incx]
+        var y_val = y[col * incy]
+
+        A[row * lda + col] += alpha * x_val * y_val
+
+fn blas_dger(
+    m: Int,
+    n: Int,
+    alpha: Scalar[DType.float64],
+    d_x: UnsafePointer[Scalar[DType.float64], ImmutAnyOrigin],
+    incx: Int,
+    d_y: UnsafePointer[Scalar[DType.float64], ImmutAnyOrigin],
+    incy: Int,
+    d_A: UnsafePointer[Scalar[DType.float64], MutAnyOrigin],
+    lda: Int,
+    ctx: DeviceContext
+) raises:
+    if m < 1 or n < 1:
+        return
+    
+    comptime kernel = dger_device[TBsize]
+
+    var total = m * n
+
+    ctx.enqueue_function[kernel, kernel](
+        m, n, alpha,
+        d_x, incx,
+        d_y, incy,
+        d_A, lda,
+        grid_dim=ceildiv(total, TBsize),
+        block_dim=TBsize,
+    )
+
+    ctx.synchronize()
diff --git a/test-level2.mojo b/test-level2.mojo
new file mode 100644
index 0000000..e2e8491
--- /dev/null
+++ b/test-level2.mojo
@@ -0,0 +1,166 @@
+from testing import assert_equal, assert_almost_equal, TestSuite
+from sys import has_accelerator
+from gpu.host import DeviceContext
+from gpu import block_dim, grid_dim, thread_idx
+from layout import Layout, LayoutTensor
+from math import sqrt
+from complex import ComplexSIMD
+
+from src import *
+from random import rand, seed, randn_float64
+from math import ceildiv, sin, cos
+from python import Python, PythonObject
+
+comptime TBsize = 512
+comptime atol = 1.0E-5
+
+def generate_random_arr[
+    dtype: DType,
+    size:  Int
+](
+    a:   UnsafePointer[Scalar[dtype], MutAnyOrigin],
+    min_value: Scalar[dtype],
+    max_value: Scalar[dtype]
+):
+    # Generate random values in [0, 1]
+    seed()
+    rand[dtype](a, size)
+
+    # Scale to [min, max]
+    var rng = max_value - min_value
+    for i in range(size):
+        a[i] = min_value + a[i] * rng
+
+
+def sger_test[
+    m:  Int,
+    n: Int,
+]():
+    with DeviceContext() as ctx:
+        A_device = ctx.enqueue_create_buffer[DType.float32](m*n)
+        A = ctx.enqueue_create_host_buffer[DType.float32](m*n)
+        x_device = ctx.enqueue_create_buffer[DType.float32](m)
+        x = ctx.enqueue_create_host_buffer[DType.float32](m)
+        y_device = ctx.enqueue_create_buffer[DType.float32](n)
+        y = ctx.enqueue_create_host_buffer[DType.float32](n)
+
+        # Generate two arrays of random numbers on CPU
+        generate_random_arr[DType.float32, m*n](A.unsafe_ptr(), -100, 100)
+        generate_random_arr[DType.float32, m](x.unsafe_ptr(), -100, 100)
+        generate_random_arr[DType.float32, n](y.unsafe_ptr(), -100, 100)
+
+        ctx.enqueue_copy(A_device, A)
+        ctx.enqueue_copy(x_device, x)
+        ctx.enqueue_copy(y_device, y)
+
+        var alpha = randn_float64(0.0, 1.0)
+
+        # Import SciPy and numpy
+        sp = Python.import_module("scipy")
+        np = Python.import_module("numpy")
+        sp_blas = sp.linalg.blas
+
+        # Move a and b to a SciPy-compatible array and run SciPy BLAS routine
+        py_a = Python.list()
+        py_x = Python.list()
+        py_y = Python.list()
+
+        for i in range(m*n):
+            py_a.append(A[i])
+        for i in range(m):
+            py_x.append(x[i])
+        for i in range(n):
+            py_y.append(y[i])
+
+        var sp_res: PythonObject
+        # sger - float32
+        np_a = np.array(py_a, dtype=np.float32).reshape(m,n)
+        np_x = np.array(py_x, dtype=np.float32)
+        np_y = np.array(py_y, dtype=np.float32)
+        sp_res = sp_blas.sger(alpha, np_x, np_y, 1, 1, np_a)
+
+        blas_sger(
+            m,
+            n,
+            Scalar[DType.float32](alpha),
+            x_device.unsafe_ptr(), 1,
+            y_device.unsafe_ptr(), 1,
+            A_device.unsafe_ptr(), n,
+            ctx)
+
+        with A_device.map_to_host() as res_mojo:
+            for i in range(m):
+                for j in range(n):
+                    assert_almost_equal(Scalar[DType.float32](py=sp_res[i][j]), res_mojo[(i*n)+j], atol=atol)
+
+def dger_test[
+    m:  Int,
+    n: Int,
+]():
+    with DeviceContext() as ctx:
+        A_device = ctx.enqueue_create_buffer[DType.float64](m*n)
+        A = ctx.enqueue_create_host_buffer[DType.float64](m*n)
+        x_device = ctx.enqueue_create_buffer[DType.float64](m)
+        x = ctx.enqueue_create_host_buffer[DType.float64](m)
+        y_device = ctx.enqueue_create_buffer[DType.float64](n)
+        y = ctx.enqueue_create_host_buffer[DType.float64](n)
+
+        # Generate two arrays of random numbers on CPU
+        generate_random_arr[DType.float64, m*n](A.unsafe_ptr(), -100, 100)
+        generate_random_arr[DType.float64, m](x.unsafe_ptr(), -100, 100)
+        generate_random_arr[DType.float64, n](y.unsafe_ptr(), -100, 100)
+
+        ctx.enqueue_copy(A_device, A)
+        ctx.enqueue_copy(x_device, x)
+        ctx.enqueue_copy(y_device, y)
+
+        var alpha = randn_float64(0.0, 1.0)
+
+        # Import SciPy and numpy
+        sp = Python.import_module("scipy")
+        np = Python.import_module("numpy")
+        sp_blas = sp.linalg.blas
+
+        # Move a and b to a SciPy-compatible array and run SciPy BLAS routine
+        py_a = Python.list()
+        py_x = Python.list()
+        py_y = Python.list()
+
+        for i in range(m*n):
+            py_a.append(A[i])
+        for i in range(m):
+            py_x.append(x[i])
+        for i in range(n):
+            py_y.append(y[i])
+
+        var sp_res: PythonObject
+        # dger - float64
+        np_a = np.array(py_a, dtype=np.float64).reshape(m,n)
+        np_x = np.array(py_x, dtype=np.float64)
+        np_y = np.array(py_y, dtype=np.float64)
+        sp_res = sp_blas.dger(alpha, np_x, np_y, 1, 1, np_a)
+        blas_dger(
+            m,
+            n,
+            Scalar[DType.float64](alpha),
+            x_device.unsafe_ptr(), 1,
+            y_device.unsafe_ptr(), 1,
+            A_device.unsafe_ptr(), n,
+            ctx)
+
+        with A_device.map_to_host() as res_mojo:
+            for i in range(m):
+                for j in range(n):
+                    assert_almost_equal(Scalar[DType.float64](py=sp_res[i][j]), res_mojo[(i*n)+j], atol=atol)
+
+def test_sger():
+    sger_test[64, 64]()
+    sger_test[256, 256]()
+
+def test_dger():
+    dger_test[64, 64]()
+    dger_test[256, 256]()
+
+def main():
+    print("--- MojoBLAS Level 2 routines testing ---")
+    TestSuite.discover_tests[__functions_in_module()]().run()

From 22cb32de2cf520fa7c460fbd30718816544cd130 Mon Sep 17 00:00:00 2001
From: Jackson Mowry <jacksonm@duck.com>
Date: Fri, 20 Feb 2026 11:37:03 +0000
Subject: [PATCH 2/4] Refactor out random array generation for all levels of
 testing

---
 src/__init__.mojo                    |  2 ++
 src/testing_utils/__init__.mojo      |  1 +
 src/testing_utils/testing_utils.mojo | 18 ++++++++++++++++++
 test-level1.mojo                     | 18 ------------------
 test-level2.mojo                     | 22 ++--------------------
 5 files changed, 23 insertions(+), 38 deletions(-)
 create mode 100644 src/testing_utils/__init__.mojo
 create mode 100644 src/testing_utils/testing_utils.mojo

diff --git a/src/__init__.mojo b/src/__init__.mojo
index 280bb3c..e79fd5a 100644
--- a/src/__init__.mojo
+++ b/src/__init__.mojo
@@ -13,3 +13,5 @@ from .level1.nrm2_device import *
 from .level1.iamax_device import *
 
 from .level2.ger_device import *
+
+from .testing_utils.testing_utils import *
diff --git a/src/testing_utils/__init__.mojo b/src/testing_utils/__init__.mojo
new file mode 100644
index 0000000..8975453
--- /dev/null
+++ b/src/testing_utils/__init__.mojo
@@ -0,0 +1 @@
+from .testing_utils import *
diff --git a/src/testing_utils/testing_utils.mojo b/src/testing_utils/testing_utils.mojo
new file mode 100644
index 0000000..e7d1117
--- /dev/null
+++ b/src/testing_utils/testing_utils.mojo
@@ -0,0 +1,18 @@
+from random import rand, seed
+
+def generate_random_arr[
+    dtype: DType,
+    size:  Int
+](
+    a:   UnsafePointer[Scalar[dtype], MutAnyOrigin],
+    min_value: Scalar[dtype],
+    max_value: Scalar[dtype]
+):
+    # Generate random values in [0, 1]
+    seed()
+    rand[dtype](a, size)
+
+    # Scale to [min, max]
+    var rng = max_value - min_value
+    for i in range(size):
+        a[i] = min_value + a[i] * rng
diff --git a/test-level1.mojo b/test-level1.mojo
index 01b02a3..665155e 100644
--- a/test-level1.mojo
+++ b/test-level1.mojo
@@ -14,24 +14,6 @@ from python import Python, PythonObject
 comptime TBsize = 512
 comptime atol = 1.0E-4
 
-def generate_random_arr[
-    dtype: DType,
-    size:  Int
-](
-    a:   UnsafePointer[Scalar[dtype], MutAnyOrigin],
-    min_value: Scalar[dtype],
-    max_value: Scalar[dtype]
-):
-    # Generate random values in [0, 1]
-    seed()
-    rand[dtype](a, size)
-
-    # Scale to [min, max]
-    var rng = max_value - min_value
-    for i in range(size):
-        a[i] = min_value + a[i] * rng
-
-
 def generate_random_scalar[
     dtype: DType,
 ](
diff --git a/test-level2.mojo b/test-level2.mojo
index e2e8491..7c1aa21 100644
--- a/test-level2.mojo
+++ b/test-level2.mojo
@@ -4,7 +4,6 @@ from gpu.host import DeviceContext
 from gpu import block_dim, grid_dim, thread_idx
 from layout import Layout, LayoutTensor
 from math import sqrt
-from complex import ComplexSIMD
 
 from src import *
 from random import rand, seed, randn_float64
@@ -14,23 +13,6 @@ from python import Python, PythonObject
 comptime TBsize = 512
 comptime atol = 1.0E-5
 
-def generate_random_arr[
-    dtype: DType,
-    size:  Int
-](
-    a:   UnsafePointer[Scalar[dtype], MutAnyOrigin],
-    min_value: Scalar[dtype],
-    max_value: Scalar[dtype]
-):
-    # Generate random values in [0, 1]
-    seed()
-    rand[dtype](a, size)
-
-    # Scale to [min, max]
-    var rng = max_value - min_value
-    for i in range(size):
-        a[i] = min_value + a[i] * rng
-
 
 def sger_test[
     m:  Int,
@@ -44,7 +26,7 @@ def sger_test[
         y_device = ctx.enqueue_create_buffer[DType.float32](n)
         y = ctx.enqueue_create_host_buffer[DType.float32](n)
 
-        # Generate two arrays of random numbers on CPU
+        # Generate three arrays of random numbers on CPU
         generate_random_arr[DType.float32, m*n](A.unsafe_ptr(), -100, 100)
         generate_random_arr[DType.float32, m](x.unsafe_ptr(), -100, 100)
         generate_random_arr[DType.float32, n](y.unsafe_ptr(), -100, 100)
@@ -105,7 +87,7 @@ def dger_test[
         y_device = ctx.enqueue_create_buffer[DType.float64](n)
         y = ctx.enqueue_create_host_buffer[DType.float64](n)
 
-        # Generate two arrays of random numbers on CPU
+        # Generate three arrays of random numbers on CPU
         generate_random_arr[DType.float64, m*n](A.unsafe_ptr(), -100, 100)
         generate_random_arr[DType.float64, m](x.unsafe_ptr(), -100, 100)
         generate_random_arr[DType.float64, n](y.unsafe_ptr(), -100, 100)

From e00f3ab723e420289ace4f9b795078b2ef8c0f98 Mon Sep 17 00:00:00 2001
From: Jackson Mowry <jacksonm@duck.com>
Date: Fri, 20 Feb 2026 19:36:16 +0000
Subject: [PATCH 3/4] Broken state

---
 src/level2/ger_device.mojo |  76 +++++++++--------------
 test-level2.mojo           | 119 +++++++++++--------------------------
 2 files changed, 61 insertions(+), 134 deletions(-)

diff --git a/src/level2/ger_device.mojo b/src/level2/ger_device.mojo
index 81349c6..bd584e4 100644
--- a/src/level2/ger_device.mojo
+++ b/src/level2/ger_device.mojo
@@ -4,8 +4,8 @@ from math import ceildiv
 
 comptime TBsize = 512
 
-# level2.sger
-# Computes single-precision rank-1 update of given matrix: A := A + αxy'
+# level2.ger
+# Computes rank-1 update of given matrix: A := A + αxy'
 fn sger_device[
     BLOCK: Int,
 ](
@@ -36,38 +36,6 @@ fn sger_device[
 
         A[row * lda + col] += alpha * x_val * y_val
 
-fn blas_sger(
-    m: Int,
-    n: Int,
-    alpha: Scalar[DType.float32],
-    d_x: UnsafePointer[Scalar[DType.float32], ImmutAnyOrigin],
-    incx: Int,
-    d_y: UnsafePointer[Scalar[DType.float32], ImmutAnyOrigin],
-    incy: Int,
-    d_A: UnsafePointer[Scalar[DType.float32], MutAnyOrigin],
-    lda: Int,
-    ctx: DeviceContext
-) raises:
-    if m < 1 or n < 1:
-        return
-    
-    comptime kernel = sger_device[TBsize]
-
-    var total = m * n
-
-    ctx.enqueue_function[kernel, kernel](
-        m, n, alpha,
-        d_x, incx,
-        d_y, incy,
-        d_A, lda,
-        grid_dim=ceildiv(total, TBsize),
-        block_dim=TBsize,
-    )
-
-    ctx.synchronize()
-
-# level2.dger
-# Computes double-precision rank-1 update of given matrix: A := A + αxy'
 fn dger_device[
     BLOCK: Int,
 ](
@@ -98,32 +66,42 @@ fn dger_device[
 
         A[row * lda + col] += alpha * x_val * y_val
 
-fn blas_dger(
+fn blas_ger[dtype: DType](
     m: Int,
     n: Int,
-    alpha: Scalar[DType.float64],
-    d_x: UnsafePointer[Scalar[DType.float64], ImmutAnyOrigin],
+    alpha: Scalar[dtype],
+    d_x: UnsafePointer[Scalar[dtype], ImmutAnyOrigin],
     incx: Int,
-    d_y: UnsafePointer[Scalar[DType.float64], ImmutAnyOrigin],
+    d_y: UnsafePointer[Scalar[dtype], ImmutAnyOrigin],
     incy: Int,
-    d_A: UnsafePointer[Scalar[DType.float64], MutAnyOrigin],
+    d_A: UnsafePointer[Scalar[dtype], MutAnyOrigin],
     lda: Int,
     ctx: DeviceContext
 ) raises:
     if m < 1 or n < 1:
         return
     
-    comptime kernel = dger_device[TBsize]
-
     var total = m * n
 
-    ctx.enqueue_function[kernel, kernel](
-        m, n, alpha,
-        d_x, incx,
-        d_y, incy,
-        d_A, lda,
-        grid_dim=ceildiv(total, TBsize),
-        block_dim=TBsize,
-    )
+    if dtype == DType.float32:
+        ctx.enqueue_function[sger_device[TBsize], sger_device[TBsize]](
+            m, n, alpha,
+            d_x, incx,
+            d_y, incy,
+            d_A, lda,
+            grid_dim=ceildiv(total, TBsize),
+            block_dim=TBsize,
+        )
+    elif dtype == DType.float64:
+        ctx.enqueue_function[dger_device[TBsize], dger_device[TBsize]](
+            m, n, alpha,
+            d_x, incx,
+            d_y, incy,
+            d_A, lda,
+            grid_dim=ceildiv(total, TBsize),
+            block_dim=TBsize,
+        )
+    else:
+        return
 
     ctx.synchronize()
diff --git a/test-level2.mojo b/test-level2.mojo
index 7c1aa21..343b35a 100644
--- a/test-level2.mojo
+++ b/test-level2.mojo
@@ -14,22 +14,23 @@ comptime TBsize = 512
 comptime atol = 1.0E-5
 
 
-def sger_test[
+def ger_test[
+    dtype: DType,
     m:  Int,
     n: Int,
 ]():
     with DeviceContext() as ctx:
-        A_device = ctx.enqueue_create_buffer[DType.float32](m*n)
-        A = ctx.enqueue_create_host_buffer[DType.float32](m*n)
-        x_device = ctx.enqueue_create_buffer[DType.float32](m)
-        x = ctx.enqueue_create_host_buffer[DType.float32](m)
-        y_device = ctx.enqueue_create_buffer[DType.float32](n)
-        y = ctx.enqueue_create_host_buffer[DType.float32](n)
+        A_device = ctx.enqueue_create_buffer[dtype](m*n)
+        A = ctx.enqueue_create_host_buffer[dtype](m*n)
+        x_device = ctx.enqueue_create_buffer[dtype](m)
+        x = ctx.enqueue_create_host_buffer[dtype](m)
+        y_device = ctx.enqueue_create_buffer[dtype](n)
+        y = ctx.enqueue_create_host_buffer[dtype](n)
 
         # Generate three arrays of random numbers on CPU
-        generate_random_arr[DType.float32, m*n](A.unsafe_ptr(), -100, 100)
-        generate_random_arr[DType.float32, m](x.unsafe_ptr(), -100, 100)
-        generate_random_arr[DType.float32, n](y.unsafe_ptr(), -100, 100)
+        generate_random_arr[dtype, m*n](A.unsafe_ptr(), -100, 100)
+        generate_random_arr[dtype, m](x.unsafe_ptr(), -100, 100)
+        generate_random_arr[dtype, n](y.unsafe_ptr(), -100, 100)
 
         ctx.enqueue_copy(A_device, A)
         ctx.enqueue_copy(x_device, x)
@@ -55,76 +56,25 @@ def sger_test[
             py_y.append(y[i])
 
         var sp_res: PythonObject
-        # sger - float32
-        np_a = np.array(py_a, dtype=np.float32).reshape(m,n)
-        np_x = np.array(py_x, dtype=np.float32)
-        np_y = np.array(py_y, dtype=np.float32)
-        sp_res = sp_blas.sger(alpha, np_x, np_y, 1, 1, np_a)
-
-        blas_sger(
-            m,
-            n,
-            Scalar[DType.float32](alpha),
-            x_device.unsafe_ptr(), 1,
-            y_device.unsafe_ptr(), 1,
-            A_device.unsafe_ptr(), n,
-            ctx)
-
-        with A_device.map_to_host() as res_mojo:
-            for i in range(m):
-                for j in range(n):
-                    assert_almost_equal(Scalar[DType.float32](py=sp_res[i][j]), res_mojo[(i*n)+j], atol=atol)
-
-def dger_test[
-    m:  Int,
-    n: Int,
-]():
-    with DeviceContext() as ctx:
-        A_device = ctx.enqueue_create_buffer[DType.float64](m*n)
-        A = ctx.enqueue_create_host_buffer[DType.float64](m*n)
-        x_device = ctx.enqueue_create_buffer[DType.float64](m)
-        x = ctx.enqueue_create_host_buffer[DType.float64](m)
-        y_device = ctx.enqueue_create_buffer[DType.float64](n)
-        y = ctx.enqueue_create_host_buffer[DType.float64](n)
-
-        # Generate three arrays of random numbers on CPU
-        generate_random_arr[DType.float64, m*n](A.unsafe_ptr(), -100, 100)
-        generate_random_arr[DType.float64, m](x.unsafe_ptr(), -100, 100)
-        generate_random_arr[DType.float64, n](y.unsafe_ptr(), -100, 100)
-
-        ctx.enqueue_copy(A_device, A)
-        ctx.enqueue_copy(x_device, x)
-        ctx.enqueue_copy(y_device, y)
-
-        var alpha = randn_float64(0.0, 1.0)
-
-        # Import SciPy and numpy
-        sp = Python.import_module("scipy")
-        np = Python.import_module("numpy")
-        sp_blas = sp.linalg.blas
-
-        # Move a and b to a SciPy-compatible array and run SciPy BLAS routine
-        py_a = Python.list()
-        py_x = Python.list()
-        py_y = Python.list()
-
-        for i in range(m*n):
-            py_a.append(A[i])
-        for i in range(m):
-            py_x.append(x[i])
-        for i in range(n):
-            py_y.append(y[i])
-
-        var sp_res: PythonObject
-        # dger - float64
-        np_a = np.array(py_a, dtype=np.float64).reshape(m,n)
-        np_x = np.array(py_x, dtype=np.float64)
-        np_y = np.array(py_y, dtype=np.float64)
-        sp_res = sp_blas.dger(alpha, np_x, np_y, 1, 1, np_a)
-        blas_dger(
+        # ger - float32
+        if dtype == DType.float32:
+            np_a = np.array(py_a, dtype=np.float32).reshape(m,n)
+            np_x = np.array(py_x, dtype=np.float32)
+            np_y = np.array(py_y, dtype=np.float32)
+            sp_res = sp_blas.sger(alpha, np_x, np_y, 1, 1, np_a)
+        if dtype == DType.float64:
+            np_a = np.array(py_a, dtype=np.float64).reshape(m,n)
+            np_x = np.array(py_x, dtype=np.float64)
+            np_y = np.array(py_y, dtype=np.float64)
+            sp_res = sp_blas.dger(alpha, np_x, np_y, 1, 1, np_a)
+        else:
+            print("Unsupported type: ", dtype)
+            return
+
+        blas_ger[dtype](
             m,
             n,
-            Scalar[DType.float64](alpha),
+            Scalar[dtype](alpha),
             x_device.unsafe_ptr(), 1,
             y_device.unsafe_ptr(), 1,
             A_device.unsafe_ptr(), n,
@@ -133,15 +83,14 @@ def dger_test[
         with A_device.map_to_host() as res_mojo:
             for i in range(m):
                 for j in range(n):
-                    assert_almost_equal(Scalar[DType.float64](py=sp_res[i][j]), res_mojo[(i*n)+j], atol=atol)
+                    assert_almost_equal(Scalar[dtype](py=sp_res[i][j]), res_mojo[(i*n)+j], atol=atol)
 
-def test_sger():
-    sger_test[64, 64]()
-    sger_test[256, 256]()
 
-def test_dger():
-    dger_test[64, 64]()
-    dger_test[256, 256]()
+def test_ger():
+    ger_test[DType.float32, 64, 64]()
+    # ger_test[DType.float32, 256, 256]()
+    # ger_test[DType.float64, 64, 64]()
+    # ger_test[DType.float64, 256, 256]()
 
 def main():
     print("--- MojoBLAS Level 2 routines testing ---")

From 07efeb8f82b0c878ff496b207d6b4e75b4d62d3b Mon Sep 17 00:00:00 2001
From: Jackson Mowry <jacksonm@duck.com>
Date: Wed, 25 Feb 2026 11:23:13 +0000
Subject: [PATCH 4/4] Add @parameter for compile-time execution, fix and
 re-enable test for level 2

---
 src/level2/ger_device.mojo | 3 ++-
 test-level2.mojo           | 8 ++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/level2/ger_device.mojo b/src/level2/ger_device.mojo
index bd584e4..011e7b9 100644
--- a/src/level2/ger_device.mojo
+++ b/src/level2/ger_device.mojo
@@ -83,6 +83,7 @@ fn blas_ger[dtype: DType](
     
     var total = m * n
 
+    @parameter
     if dtype == DType.float32:
         ctx.enqueue_function[sger_device[TBsize], sger_device[TBsize]](
             m, n, alpha,
@@ -102,6 +103,6 @@ fn blas_ger[dtype: DType](
             block_dim=TBsize,
         )
     else:
-        return
+        raise Error("blas_ger: Unsupported type")
 
     ctx.synchronize()
diff --git a/test-level2.mojo b/test-level2.mojo
index 343b35a..3466766 100644
--- a/test-level2.mojo
+++ b/test-level2.mojo
@@ -62,7 +62,7 @@ def ger_test[
             np_x = np.array(py_x, dtype=np.float32)
             np_y = np.array(py_y, dtype=np.float32)
             sp_res = sp_blas.sger(alpha, np_x, np_y, 1, 1, np_a)
-        if dtype == DType.float64:
+        elif dtype == DType.float64:
             np_a = np.array(py_a, dtype=np.float64).reshape(m,n)
             np_x = np.array(py_x, dtype=np.float64)
             np_y = np.array(py_y, dtype=np.float64)
@@ -88,9 +88,9 @@ def ger_test[
 
 def test_ger():
     ger_test[DType.float32, 64, 64]()
-    # ger_test[DType.float32, 256, 256]()
-    # ger_test[DType.float64, 64, 64]()
-    # ger_test[DType.float64, 256, 256]()
+    ger_test[DType.float32, 256, 256]()
+    ger_test[DType.float64, 64, 64]()
+    ger_test[DType.float64, 256, 256]()
 
 def main():
     print("--- MojoBLAS Level 2 routines testing ---")