Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 4 additions & 13 deletions src/__init__.mojo
Original file line number Diff line number Diff line change
@@ -1,13 +1,4 @@
from .level1.asum_device import *
from .level1.axpy_device import *
from .level1.scal_device import *
from .level1.copy_device import *
from .level1.rot_device import *
from .level1.rotg import *
from .level1.rotm_device import *
from .level1.swap_device import *
from .level1.dot_device import *
from .level1.dotc_device import *
from .level1.dotu_device import *
from .level1.nrm2_device import *
from .level1.iamax_device import *
from .testing_utils import *
from .level1 import *
from .level2 import *
# from .level3 import *
2 changes: 2 additions & 0 deletions src/level2/__init__.mojo
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .gemv_device import *
from .ger_device import *
117 changes: 117 additions & 0 deletions src/level2/gemv_device.mojo
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
from gpu import thread_idx, block_idx, block_dim, grid_dim
from gpu.host import DeviceContext
from math import ceildiv

comptime TBsize = 512

# level2.gemv
# Performs matrix-vector multiplication of form
# y := alpha*A*x + beta*y, or y := alpha*A**T*x + beta*y,
# where alpha and beta are scalars, x and y are vectors and A is an m by n matrix.
fn sgemv_device(
trans: Int,
m: Int,
n: Int,
alpha: Float32,
A: UnsafePointer[Float32, ImmutAnyOrigin],
lda: Int,
x: UnsafePointer[Float32, ImmutAnyOrigin],
incx: Int,
beta: Float32,
y: UnsafePointer[Float32, MutAnyOrigin],
incy: Int,
):
var global_i = block_dim.x * block_idx.x + thread_idx.x
var n_threads = grid_dim.x * block_dim.x

if not trans:
for i in range(global_i, m, n_threads):
var sum = Scalar[DType.float32](0)
for j in range(n):
sum += A[i * lda + j] * x[j * incx]
y[i * incy] = alpha * sum + beta * y[i * incy]
else:
for j in range(global_i, n, n_threads):
var sum = Scalar[DType.float32](0)
for i in range(m):
sum += A[i * lda + j] * x[i * incx]
y[j * incy] = alpha * sum + beta * y[j * incy]


fn dgemv_device(
trans: Int,
m: Int,
n: Int,
alpha: Float64,
A: UnsafePointer[Float64, ImmutAnyOrigin],
lda: Int,
x: UnsafePointer[Float64, ImmutAnyOrigin],
incx: Int,
beta: Float64,
y: UnsafePointer[Float64, MutAnyOrigin],
incy: Int,
):
var global_i = block_dim.x * block_idx.x + thread_idx.x
var n_threads = grid_dim.x * block_dim.x

if not trans:
for i in range(global_i, m, n_threads):
var sum = Scalar[DType.float64](0)
for j in range(n):
sum += A[i * lda + j] * x[j * incx]
y[i * incy] = alpha * sum + beta * y[i * incy]
else:
for j in range(global_i, n, n_threads):
var sum = Scalar[DType.float64](0)
for i in range(m):
sum += A[i * lda + j] * x[i * incx]
y[j * incy] = alpha * sum + beta * y[j * incy]


fn blas_gemv[dtype: DType](
trans: Bool,
m: Int,
n: Int,
alpha: Scalar[dtype],
d_A: UnsafePointer[Scalar[dtype], ImmutAnyOrigin],
lda: Int,
d_x: UnsafePointer[Scalar[dtype], ImmutAnyOrigin],
incx: Int,
beta: Scalar[dtype],
d_y: UnsafePointer[Scalar[dtype], MutAnyOrigin],
incy: Int,
ctx: DeviceContext,
) raises:

# NOTE: add error checking here?
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Possibly just revisit with the issue Jonah made #34 ?

Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to meet and discuss where and how much error checking we want to implement

# check m, n > 0
# check incx, incy > 0

var out_len = m if not trans else n

# Can't pass a Bool to GPU kernel
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lol

var trans_i = 1 if trans else 0

@parameter
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see now why this fixes everything haha

if dtype == DType.float32:
ctx.enqueue_function[sgemv_device, sgemv_device](
trans_i, m, n,
alpha, d_A, lda,
d_x, incx,
beta, d_y, incy,
grid_dim=ceildiv(out_len, TBsize),
block_dim=TBsize,
)
elif dtype == DType.float64:
ctx.enqueue_function[dgemv_device, dgemv_device](
trans_i, m, n,
alpha, d_A, lda,
d_x, incx,
beta, d_y, incy,
grid_dim=ceildiv(out_len, TBsize),
block_dim=TBsize,
)
else:
raise Error("blas_gemv: Unsupported type")

ctx.synchronize()
108 changes: 108 additions & 0 deletions src/level2/ger_device.mojo
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
from gpu import thread_idx, block_idx, block_dim, grid_dim
from gpu.host import DeviceContext
from math import ceildiv

comptime TBsize = 512

# level2.ger
# Computes rank-1 update of given matrix: A := A + αxy'
fn sger_device[
BLOCK: Int,
](
m: Int,
n: Int,
alpha: Scalar[DType.float32],
x: UnsafePointer[Scalar[DType.float32], ImmutAnyOrigin],
incx: Int,
y: UnsafePointer[Scalar[DType.float32], ImmutAnyOrigin],
incy: Int,
A: UnsafePointer[Scalar[DType.float32], MutAnyOrigin],
lda: Int,
):
if m < 1 or n < 1:
return

var global_i = block_dim.x * block_idx.x + thread_idx.x
var n_threads = grid_dim.x * block_dim.x

var total = m * n

for idx in range(global_i, total, n_threads):
var row = idx // n
var col = idx % n

var x_val = x[row * incx]
var y_val = y[col * incy]

A[row * lda + col] += alpha * x_val * y_val

fn dger_device[
BLOCK: Int,
](
m: Int,
n: Int,
alpha: Scalar[DType.float64],
x: UnsafePointer[Scalar[DType.float64], ImmutAnyOrigin],
incx: Int,
y: UnsafePointer[Scalar[DType.float64], ImmutAnyOrigin],
incy: Int,
A: UnsafePointer[Scalar[DType.float64], MutAnyOrigin],
lda: Int,
):
if m < 1 or n < 1:
return

var global_i = block_dim.x * block_idx.x + thread_idx.x
var n_threads = grid_dim.x * block_dim.x

var total = m * n

for idx in range(global_i, total, n_threads):
var row = idx // n
var col = idx % n

var x_val = x[row * incx]
var y_val = y[col * incy]

A[row * lda + col] += alpha * x_val * y_val

fn blas_ger[dtype: DType](
m: Int,
n: Int,
alpha: Scalar[dtype],
d_x: UnsafePointer[Scalar[dtype], ImmutAnyOrigin],
incx: Int,
d_y: UnsafePointer[Scalar[dtype], ImmutAnyOrigin],
incy: Int,
d_A: UnsafePointer[Scalar[dtype], MutAnyOrigin],
lda: Int,
ctx: DeviceContext
) raises:
if m < 1 or n < 1:
return

var total = m * n

@parameter
if dtype == DType.float32:
ctx.enqueue_function[sger_device[TBsize], sger_device[TBsize]](
m, n, alpha,
d_x, incx,
d_y, incy,
d_A, lda,
grid_dim=ceildiv(total, TBsize),
block_dim=TBsize,
)
elif dtype == DType.float64:
ctx.enqueue_function[dger_device[TBsize], dger_device[TBsize]](
m, n, alpha,
d_x, incx,
d_y, incy,
d_A, lda,
grid_dim=ceildiv(total, TBsize),
block_dim=TBsize,
)
else:
raise Error("blas_ger: Unsupported type")

ctx.synchronize()
1 change: 1 addition & 0 deletions src/testing_utils/__init__.mojo
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .testing_utils import *
75 changes: 75 additions & 0 deletions src/testing_utils/testing_utils.mojo
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
from random import rand, seed
from math import sqrt

comptime tol32: Float32 = 1e-8
comptime tol64: Float64 = 1e-16

def generate_random_arr[
dtype: DType,
size: Int
](
a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
min_value: Scalar[dtype],
max_value: Scalar[dtype]
):
# Generate random values in [0, 1]
seed()
rand[dtype](a, size)

# Scale to [min, max]
var rng = max_value - min_value
for i in range(size):
a[i] = min_value + a[i] * rng


def generate_random_scalar[
dtype: DType,
](
min_value: Scalar[dtype],
max_value: Scalar[dtype]
) -> Scalar[dtype]:
# Generate random values in [0, 1]
seed()
var result = Scalar[dtype]()
rand[dtype](UnsafePointer(to=result), 1)

range = max_value - min_value
return min_value + result * range


# Error check following BLAS++ check_gemm:
# NOTE: can't get epsilon value in Mojo; using tol32, tol64
fn check_gemm_error[dtype: DType](
m: Int, n: Int, k: Int,
alpha: Scalar[dtype],
beta: Scalar[dtype],
A_norm: Scalar[dtype],
B_norm: Scalar[dtype],
C_ini_norm: Scalar[dtype],
error_norm: Scalar[dtype]
) -> Bool:
var alpha_ = max(abs(alpha), Scalar[dtype](1))
var beta_ = max(abs(beta), Scalar[dtype](1))
var denom = sqrt(Scalar[dtype](k) + Scalar[dtype](2)) * alpha_ * A_norm * B_norm
+ Scalar[dtype](2) * beta_ * C_ini_norm

if denom == Scalar[dtype](0):
return error_norm == Scalar[dtype](0)

var err = error_norm / denom

@parameter
if dtype == DType.float32:
return err < Scalar[dtype](tol32)
else:
return err < Scalar[dtype](tol64)


fn frobenius_norm[dtype: DType](
a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
n: Int
) -> Scalar[dtype]:
var sum = Scalar[dtype](0)
for i in range(n):
sum += a[i] * a[i]
return sqrt(sum)
43 changes: 3 additions & 40 deletions test-level1.mojo
Original file line number Diff line number Diff line change
@@ -1,51 +1,14 @@
from testing import assert_equal, assert_almost_equal, TestSuite
from sys import has_accelerator
from gpu.host import DeviceContext
from gpu import block_dim, grid_dim, thread_idx
from layout import Layout, LayoutTensor
from math import sqrt
from complex import *

from src import *
from random import rand, seed, random_float64
from math import ceildiv, sin, cos
from math import sin, cos
from python import Python, PythonObject

comptime TBsize = 512
comptime atol = 1.0E-4

def generate_random_arr[
dtype: DType,
size: Int
](
a: UnsafePointer[Scalar[dtype], MutAnyOrigin],
min_value: Scalar[dtype],
max_value: Scalar[dtype]
):
# Generate random values in [0, 1]
seed()
rand[dtype](a, size)

# Scale to [min, max]
var rng = max_value - min_value
for i in range(size):
a[i] = min_value + a[i] * rng


def generate_random_scalar[
dtype: DType,
](
min_value: Scalar[dtype],
max_value: Scalar[dtype]
) -> Scalar[dtype]:
# Generate random values in [0, 1]
seed()
var result = Scalar[dtype]()
rand[dtype](UnsafePointer(to=result), 1)

range = max_value - min_value
return min_value + result * range


def asum_test[
dtype: DType,
Expand Down Expand Up @@ -528,7 +491,7 @@ def nrm2_test[
def rot_test[
dtype: DType,
size: Int
]():
]() where dtype.is_floating_point(): # neeeded for sin and cos
with DeviceContext() as ctx:
# print("[ rot test:", dtype, "]")

Expand All @@ -544,7 +507,7 @@ def rot_test[
ctx.enqueue_copy(d_y, y)

# Generate random angle for sin and cos
var angle = random_float64(0, 2 * 3.14159265359)
var angle = generate_random_scalar[dtype](0, 2 * 3.14159265359)
var c = Scalar[dtype](cos(angle))
var s = Scalar[dtype](sin(angle))

Expand Down
Loading