PyGPUkit API Reference

Core Types

GPUArray

The main array type for GPU computation.

class GPUArray:
    """N-dimensional array stored in GPU memory."""

    # Properties
    shape: tuple[int, ...]  # Array dimensions
    dtype: DataType         # Data type
    ndim: int               # Number of dimensions
    size: int               # Total number of elements
    nbytes: int             # Total bytes
    itemsize: int           # Bytes per element

    # Methods
    def to_numpy(self) -> np.ndarray:
        """Copy array data to NumPy array."""

    def astype(self, dtype: DataType) -> GPUArray:
        """Convert to different data type."""

Example:

import pygpukit as gpk
import numpy as np

a = gpk.from_numpy(np.random.randn(100, 100).astype(np.float32))
print(f"Shape: {a.shape}")       # (100, 100)
print(f"Dtype: {a.dtype}")       # float32
print(f"Size: {a.size}")         # 10000
print(f"Bytes: {a.nbytes}")      # 40000

# Convert to numpy
np_arr = a.to_numpy()

# Convert dtype
b = a.astype(gpk.float16)

DataType

Data type enumeration.

Type	Description	Size
`gpk.float32`	32-bit float	4 bytes
`gpk.float64`	64-bit float	8 bytes
`gpk.float16`	16-bit float	2 bytes
`gpk.bfloat16`	Brain float16	2 bytes
`gpk.int32`	32-bit integer	4 bytes
`gpk.int64`	64-bit integer	8 bytes

Factory Functions

zeros

def zeros(
    shape: tuple[int, ...] | int,
    dtype: str | DataType = "float32",
) -> GPUArray:
    """Create array filled with zeros."""

Example:

a = gpk.zeros((1024, 1024))
b = gpk.zeros((512,), dtype=gpk.float16)
c = gpk.zeros(100)  # 1D array

ones

def ones(
    shape: tuple[int, ...] | int,
    dtype: str | DataType = "float32",
) -> GPUArray:
    """Create array filled with ones."""

Example:

a = gpk.ones((1024, 1024))
b = gpk.ones((512,), dtype="float64")

empty

def empty(
    shape: tuple[int, ...] | int,
    dtype: str | DataType = "float32",
) -> GPUArray:
    """Create uninitialized array (faster than zeros)."""

Example:

# Use when you'll overwrite all values
a = gpk.empty((1024, 1024))

from_numpy

def from_numpy(array: np.ndarray) -> GPUArray:
    """Create GPUArray from NumPy array (copies data to GPU)."""

Example:

np_arr = np.random.randn(100, 100).astype(np.float32)
gpu_arr = gpk.from_numpy(np_arr)

Elementwise Operations

All elementwise operations require arrays of the same shape and dtype.

add

def add(a: GPUArray, b: GPUArray) -> GPUArray:
    """Element-wise addition: c = a + b"""

sub

def sub(a: GPUArray, b: GPUArray) -> GPUArray:
    """Element-wise subtraction: c = a - b"""

mul

def mul(a: GPUArray, b: GPUArray) -> GPUArray:
    """Element-wise multiplication: c = a * b"""

div

def div(a: GPUArray, b: GPUArray) -> GPUArray:
    """Element-wise division: c = a / b"""

Example:

a = gpk.ones((1024, 1024))
b = gpk.ones((1024, 1024)) * 2

c = gpk.add(a, b)   # or a + b
d = gpk.sub(a, b)   # or a - b
e = gpk.mul(a, b)   # or a * b
f = gpk.div(a, b)   # or a / b

Math Operations

exp

def exp(a: GPUArray) -> GPUArray:
    """Element-wise exponential: e^x"""

log

def log(a: GPUArray) -> GPUArray:
    """Element-wise natural logarithm: ln(x)"""

sin

def sin(a: GPUArray) -> GPUArray:
    """Element-wise sine: sin(x)"""

cos

def cos(a: GPUArray) -> GPUArray:
    """Element-wise cosine: cos(x)"""

sqrt

def sqrt(a: GPUArray) -> GPUArray:
    """Element-wise square root: sqrt(x)"""

rsqrt

def rsqrt(a: GPUArray) -> GPUArray:
    """Element-wise reciprocal square root: 1/sqrt(x)"""

abs

def abs(a: GPUArray) -> GPUArray:
    """Element-wise absolute value: |x|"""

neg

def neg(a: GPUArray) -> GPUArray:
    """Element-wise negation: -x"""

Example:

a = gpk.from_numpy(np.array([1.0, 2.0, 3.0], dtype=np.float32))
b = gpk.exp(a)   # [e^1, e^2, e^3]
c = gpk.log(a)   # [0, ln(2), ln(3)]
d = gpk.sin(a)   # [sin(1), sin(2), sin(3)]
e = gpk.cos(a)   # [cos(1), cos(2), cos(3)]
f = gpk.sqrt(a)  # [1, 1.414, 1.732]
g = gpk.rsqrt(a) # [1, 0.707, 0.577]

Comparison Operations

clamp

def clamp(a: GPUArray, min_val: float, max_val: float) -> GPUArray:
    """Clamp values to range [min_val, max_val]."""

where

def where(cond: GPUArray, x: GPUArray, y: GPUArray) -> GPUArray:
    """Element-wise conditional: cond ? x : y"""

Example:

x = gpk.from_numpy(np.array([-2.0, 0.5, 3.0], dtype=np.float32))

# Clamp to [-1, 1]
y = gpk.clamp(x, -1.0, 1.0)  # [-1.0, 0.5, 1.0]

# Conditional selection
cond = gpk.from_numpy(np.array([1.0, 0.0, 1.0], dtype=np.float32))
a = gpk.from_numpy(np.array([1.0, 2.0, 3.0], dtype=np.float32))
b = gpk.from_numpy(np.array([4.0, 5.0, 6.0], dtype=np.float32))
result = gpk.where(cond, a, b)  # [1.0, 5.0, 3.0]

Activation Functions

relu

def relu(a: GPUArray) -> GPUArray:
    """ReLU activation: max(0, x)"""

gelu

def gelu(a: GPUArray) -> GPUArray:
    """GELU activation: x * 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))"""

sigmoid

def sigmoid(a: GPUArray) -> GPUArray:
    """Sigmoid activation: 1 / (1 + exp(-x))"""

tanh

def tanh(a: GPUArray) -> GPUArray:
    """Hyperbolic tangent activation: tanh(x)"""

Example:

x = gpk.from_numpy(np.array([-1.0, 0.0, 1.0, 2.0], dtype=np.float32))
y_relu = gpk.relu(x)     # [0, 0, 1, 2]
y_gelu = gpk.gelu(x)     # [-0.159, 0, 0.841, 1.955]
y_sigmoid = gpk.sigmoid(x)  # [0.269, 0.5, 0.731, 0.881]
y_tanh = gpk.tanh(x)     # [-0.762, 0, 0.762, 0.964]

Matrix Operations

matmul

def matmul(
    a: GPUArray,
    b: GPUArray,
    *,
    use_tf32: bool | None = None,
) -> GPUArray:
    """Matrix multiplication: C = A @ B

    Args:
        a: First matrix [M, K]
        b: Second matrix [K, N]
        use_tf32: TF32 TensorCore mode (None=env var, True=force, False=disable)

    Returns:
        Result matrix [M, N]
    """

Example:

a = gpk.from_numpy(np.random.randn(4096, 4096).astype(np.float32))
b = gpk.from_numpy(np.random.randn(4096, 4096).astype(np.float32))

# Default: uses CUTLASS TF32 TensorCore (~30 TFLOPS)
c = a @ b

# Force TF32 mode
c = gpk.matmul(a, b, use_tf32=True)

# Force FP32 mode (full precision)
c = gpk.matmul(a, b, use_tf32=False)

transpose

def transpose(a: GPUArray) -> GPUArray:
    """Matrix transpose: B = A.T

    Args:
        a: Input matrix [rows, cols]

    Returns:
        Transposed matrix [cols, rows]
    """

Example:

a = gpk.from_numpy(np.random.randn(100, 200).astype(np.float32))
b = gpk.transpose(a)  # shape: (200, 100)

Reduction Operations

Reductions return a scalar GPUArray of shape [1].

sum

def sum(a: GPUArray) -> GPUArray:
    """Sum of all elements."""

mean

def mean(a: GPUArray) -> GPUArray:
    """Mean of all elements."""

max

def max(a: GPUArray) -> GPUArray:
    """Maximum element."""

min

def min(a: GPUArray) -> GPUArray:
    """Minimum element."""

argmax

def argmax(a: GPUArray) -> GPUArray:
    """Index of maximum element."""

sum_axis

def sum_axis(a: GPUArray, axis: int) -> GPUArray:
    """Sum along specified axis.

    Args:
        a: Input array
        axis: Axis to reduce (0 for rows, 1 for columns)

    Returns:
        Reduced array with axis removed
    """

Example:

a = gpk.from_numpy(np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32))

total = gpk.sum(a)      # [10.0]
avg = gpk.mean(a)       # [2.5]
maximum = gpk.max(a)    # [4.0]
minimum = gpk.min(a)    # [1.0]
max_idx = gpk.argmax(a) # [3] (index of 4.0)

# Get scalar value
print(total.to_numpy()[0])  # 10.0

# Sum along axis
mat = gpk.from_numpy(np.array([[1, 2], [3, 4]], dtype=np.float32))
row_sum = gpk.sum_axis(mat, axis=1)  # [3, 7]
col_sum = gpk.sum_axis(mat, axis=0)  # [4, 6]

Neural Network Operations

layernorm

def layernorm(
    input: GPUArray,
    gamma: GPUArray,
    beta: GPUArray,
    eps: float = 1e-5,
) -> GPUArray:
    """Layer normalization.

    Computes: (x - mean) / sqrt(var + eps) * gamma + beta

    Args:
        input: Input tensor [batch, features]
        gamma: Scale parameter [features]
        beta: Bias parameter [features]
        eps: Epsilon for numerical stability

    Returns:
        Normalized tensor [batch, features]
    """

Example:

batch, features = 32, 768
x = gpk.from_numpy(np.random.randn(batch, features).astype(np.float32))
gamma = gpk.ones(features)
beta = gpk.zeros(features)

y = gpk.layernorm(x, gamma, beta, eps=1e-5)

bias_add_inplace

def bias_add_inplace(output: GPUArray, bias: GPUArray) -> None:
    """Add bias to output in-place.

    Computes: output[batch, features] += bias[features]

    Args:
        output: Output tensor [batch, features] (modified in-place)
        bias: Bias tensor [features]
    """

Example:

output = gpk.from_numpy(np.random.randn(32, 768).astype(np.float32))
bias = gpk.from_numpy(np.random.randn(768).astype(np.float32))

gpk.bias_add_inplace(output, bias)  # output modified in-place

linear_bias_gelu

def linear_bias_gelu(
    input: GPUArray,
    weight: GPUArray,
    bias: GPUArray,
) -> GPUArray:
    """Fused Linear + Bias + GELU operation.

    Computes: output = gelu(input @ weight.T + bias)

    Uses CUTLASS epilogue fusion when dimensions are multiples of 16.

    Args:
        input: Input tensor [batch, in_features]
        weight: Weight tensor [out_features, in_features]
        bias: Bias tensor [out_features]

    Returns:
        Output tensor [batch, out_features]
    """

Example:

batch, in_feat, out_feat = 512, 768, 3072

input = gpk.from_numpy(np.random.randn(batch, in_feat).astype(np.float32))
weight = gpk.from_numpy(np.random.randn(out_feat, in_feat).astype(np.float32))
bias = gpk.from_numpy(np.random.randn(out_feat).astype(np.float32))

# Fused operation (single GPU kernel)
output = gpk.linear_bias_gelu(input, weight, bias)

# Equivalent unfused operations (multiple GPU kernels)
# output = gpk.gelu(gpk.matmul(input, gpk.transpose(weight)) + bias)

FP8 Operations (SM120+)

FP8 E4M3 GEMM operations for Blackwell GPUs (RTX 5090, B100, B200).

fp8_fp8_sm120_available

def fp8_fp8_sm120_available() -> bool:
    """Check if FP8 I/O GEMM is available (requires SM120+)."""

fp8_fp8_get_scale_sizes

def fp8_fp8_get_scale_sizes(M: int, N: int, K: int) -> tuple[int, int]:
    """Get required scale factor sizes for blockwise FP8 GEMM.

    Args:
        M: Number of rows in A
        N: Number of columns in B
        K: Inner dimension

    Returns:
        Tuple of (scale_A_size, scale_B_size)
    """

matmul_fp8_fp8_sm120

def matmul_fp8_fp8_sm120(
    a: GPUArray,
    b: GPUArray,
    *,
    out: GPUArray | None = None,
) -> GPUArray:
    """FP8 E4M3 GEMM with unity scaling.

    Args:
        a: FP8 E4M3 matrix [M, K] (stored as uint8)
        b: FP8 E4M3 matrix [K, N] (stored as uint8)
        out: Optional output buffer [M, N]

    Returns:
        FP8 E4M3 result [M, N] (stored as uint8)
    """

matmul_fp8_fp8_blockwise_sm120

def matmul_fp8_fp8_blockwise_sm120(
    a: GPUArray,
    b: GPUArray,
    scale_a: GPUArray,
    scale_b: GPUArray,
    *,
    out: GPUArray | None = None,
) -> GPUArray:
    """FP8 E4M3 GEMM with blockwise scaling.

    For FP8 models (Llama 3.1 FP8, Qwen FP8, etc.) that store
    per-block scale factors alongside quantized weights.

    Args:
        a: FP8 E4M3 matrix [M, K] (stored as uint8)
        b: FP8 E4M3 matrix [K, N] (stored as uint8)
        scale_a: Scale factors for A (size from fp8_fp8_get_scale_sizes)
        scale_b: Scale factors for B (size from fp8_fp8_get_scale_sizes)
        out: Optional output buffer [M, N]

    Returns:
        FP8 E4M3 result [M, N] (stored as uint8)

    Note:
        Minimum matrix size is 128x128x128 due to CUTLASS tile requirements.
    """

Example:

import pygpukit as gpk
import numpy as np

if gpk.fp8_fp8_sm120_available():
    M, N, K = 4096, 4096, 4096

    # Create FP8 data (stored as uint8)
    A = gpk.from_numpy(np.random.randint(0, 255, (M, K), dtype=np.uint8))
    B = gpk.from_numpy(np.random.randint(0, 255, (K, N), dtype=np.uint8))

    # Get scale sizes and create scale factors
    sfa_size, sfb_size = gpk.fp8_fp8_get_scale_sizes(M, N, K)
    scale_A = gpk.from_numpy(np.ones(sfa_size, dtype=np.float32))
    scale_B = gpk.from_numpy(np.ones(sfb_size, dtype=np.float32))

    # Blockwise scaled FP8 GEMM
    C = gpk.matmul_fp8_fp8_blockwise_sm120(A, B, scale_A, scale_B)

Device Information

is_cuda_available

def is_cuda_available() -> bool:
    """Check if CUDA is available."""

get_device_info

def get_device_info() -> DeviceInfo:
    """Get current GPU device information."""

DeviceInfo properties:

name: str - Device name
compute_capability: tuple[int, int] - SM version (e.g., (8, 6))
total_memory: int - Total VRAM in bytes
multiprocessor_count: int - Number of SMs

get_device_capabilities

def get_device_capabilities() -> DeviceCapabilities:
    """Get device capability details."""

Example:

if gpk.is_cuda_available():
    info = gpk.get_device_info()
    print(f"GPU: {info.name}")
    print(f"SM: {info.compute_capability}")
    print(f"VRAM: {info.total_memory / 1e9:.1f} GB")

    caps = gpk.get_device_capabilities()
    print(f"TensorCore: {caps.has_tensor_core}")

JIT Compilation

is_nvrtc_available

def is_nvrtc_available() -> bool:
    """Check if NVRTC (JIT compiler) is available."""

jit

def jit(
    source: str,
    func: str,
    options: list[str] | None = None,
) -> JITKernel:
    """Compile CUDA kernel from source.

    Args:
        source: CUDA C++ source code
        func: Kernel function name
        options: Compiler options (e.g., ["-arch=sm_86"])

    Returns:
        Compiled kernel object
    """

Example:

source = '''
extern "C" __global__
void scale(float* x, float factor, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) x[idx] *= factor;
}
'''

if gpk.is_nvrtc_available():
    kernel = gpk.jit(source, func="scale")

    x = gpk.ones(1024)
    kernel(x, factor=2.0, n=1024)
    # x is now all 2.0

Environment Variables

Variable	Description	Default
`PYGPUKIT_NO_TF32`	Disable TF32 TensorCore for matmul	Not set (TF32 enabled)
`PYGPUKIT_NO_CUTLASS`	Disable CUTLASS, use fallback kernels	Not set

Example:

# Force full FP32 precision (disable TF32)
PYGPUKIT_NO_TF32=1 python my_script.py

FilesExpand file tree

api.md

Latest commit

History

api.md

File metadata and controls

PyGPUkit API Reference

Core Types

GPUArray

DataType

Factory Functions

zeros

ones

empty

from_numpy

Elementwise Operations

add

sub

mul

div

Math Operations

exp

log

sin

cos

sqrt

rsqrt

abs

neg

Comparison Operations

clamp

where

Activation Functions

relu

gelu

sigmoid

tanh

Matrix Operations

matmul

transpose

Reduction Operations

sum

mean

max

min

argmax

sum_axis

Neural Network Operations

layernorm

bias_add_inplace

linear_bias_gelu

FP8 Operations (SM120+)

fp8_fp8_sm120_available

fp8_fp8_get_scale_sizes

matmul_fp8_fp8_sm120

matmul_fp8_fp8_blockwise_sm120

Device Information

is_cuda_available

get_device_info

get_device_capabilities

JIT Compilation

is_nvrtc_available

jit

Environment Variables