From 91339a333eaf9ea828604deffe45dfbe01d7e041 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Tue, 18 Nov 2025 16:42:14 +0200 Subject: [PATCH 01/15] add workload obj, execution and mlir utils, and two workload examples --- examples/workload/example.py | 179 +++++++++++++++++++++++ examples/workload/example_mlir.py | 205 ++++++++++++++++++++++++++ lighthouse/__init__.py | 6 + lighthouse/utils/execution.py | 231 ++++++++++++++++++++++++++++++ lighthouse/utils/mlir.py | 37 +++++ lighthouse/workload.py | 79 ++++++++++ 6 files changed, 737 insertions(+) create mode 100644 examples/workload/example.py create mode 100644 examples/workload/example_mlir.py create mode 100644 lighthouse/utils/execution.py create mode 100644 lighthouse/utils/mlir.py create mode 100644 lighthouse/workload.py diff --git a/examples/workload/example.py b/examples/workload/example.py new file mode 100644 index 0000000..28bbd01 --- /dev/null +++ b/examples/workload/example.py @@ -0,0 +1,179 @@ +""" +Workload example: Element-wise sum of two (M, N) float32 arrays on CPU. +""" + +import numpy as np +from mlir import ir +from mlir.runtime.np_to_memref import get_ranked_memref_descriptor +from mlir.dialects import func, linalg, bufferization +from mlir.dialects import transform +from functools import cached_property +from lighthouse import Workload +from lighthouse.utils.mlir import ( + apply_registered_pass, + canonicalize, + cse, + match, +) +from lighthouse.utils.execution import ( + lower_payload, + execute, + benchmark, +) + + +class ElementwiseSum(Workload): + """ + Computes element-wise sum of (M, N) float32 arrays on CPU. + + We can construct the input arrays and compute the reference solution in + Python with Numpy. + + We use @cached_property to store the inputs and reference solution in the + object so that they are only computed once. 
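+ The __main__ block at the end of this file shows the intended usage: dump the lowered kernel, execute with verification, and benchmark the workload.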
+ """ + + def __init__(self, M, N): + self.M = M + self.N = N + self.dtype = np.float32 + self.context = ir.Context() + self.location = ir.Location.unknown(context=self.context) + + @cached_property + def _input_arrays(self): + print(" * Generating input arrays...") + np.random.seed(2) + A = np.random.rand(self.M, self.N).astype(self.dtype) + B = np.random.rand(self.M, self.N).astype(self.dtype) + C = np.zeros((self.M, self.N), dtype=self.dtype) + return [A, B, C] + + @cached_property + def _reference_solution(self): + print(" * Computing reference solution...") + A, B, _ = self._input_arrays + return A + B + + def get_input_arrays(self, execution_engine): + return [get_ranked_memref_descriptor(a) for a in self._input_arrays] + + def verify(self, execution_engine, verbose: int = 0) -> bool: + C = self._input_arrays[2] + C_ref = self._reference_solution + if verbose > 1: + print("Reference solution:") + print(C_ref) + print("Computed solution:") + print(C) + success = np.allclose(C, C_ref) + if verbose: + if success: + print("PASSED") + else: + print("FAILED Result mismatch!") + return success + + def requirements(self): + return [] + + def get_complexity(self): + nbytes = np.dtype(self.dtype).itemsize + flop_count = self.M * self.N # one addition per element + memory_reads = 2 * self.M * self.N * nbytes # read A and B + memory_writes = self.M * self.N * nbytes # write C + return (flop_count, memory_reads, memory_writes) + + def payload_module(self): + with self.context, self.location: + float32_t = ir.F32Type.get() + shape = (self.M, self.N) + tensor_t = ir.RankedTensorType.get(shape, float32_t) + memref_t = ir.MemRefType.get(shape, float32_t) + mod = ir.Module.create() + with ir.InsertionPoint(mod.body): + args = [memref_t, memref_t, memref_t] + f = func.FuncOp(self.payload_function_name, (tuple(args), ())) + f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() + with ir.InsertionPoint(f.add_entry_block()): + A = f.arguments[0] + B = f.arguments[1] + C = f.arguments[2] + a_tensor = bufferization.ToTensorOp(tensor_t, A, restrict=True) + b_tensor = bufferization.ToTensorOp(tensor_t, B, restrict=True) + c_tensor = bufferization.ToTensorOp( + tensor_t, C, restrict=True, writable=True + ) + add = linalg.add(a_tensor, b_tensor, outs=[c_tensor]) + bufferization.MaterializeInDestinationOp( + None, add, C, restrict=True, writable=True + ) + func.ReturnOp(()) + return mod + + def schedule_module(self, dump_kernel=None, parameters=None): + with self.context, self.location: + schedule_module = ir.Module.create() + schedule_module.operation.attributes["transform.with_named_sequence"] = ( + ir.UnitAttr.get() + ) + with ir.InsertionPoint(schedule_module.body): + named_sequence = transform.NamedSequenceOp( + "__transform_main", + [transform.AnyOpType.get()], + [], + arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}], + ) + with ir.InsertionPoint(named_sequence.body): + anytype = transform.AnyOpType.get() + func = match(named_sequence.bodyTarget, ops={"func.func"}) + mod = transform.get_parent_op( + anytype, + func, + op_name="builtin.module", + deduplicate=True, + ) + mod = apply_registered_pass(mod, "one-shot-bufferize") + mod = apply_registered_pass(mod, "convert-linalg-to-loops") + cse(mod) + canonicalize(mod) + + if dump_kernel == "bufferized": + transform.YieldOp() + return schedule_module + + mod = apply_registered_pass(mod, "convert-scf-to-cf") + mod = apply_registered_pass(mod, "finalize-memref-to-llvm") + mod = apply_registered_pass(mod, "convert-cf-to-llvm") + mod = 
apply_registered_pass(mod, "convert-arith-to-llvm") + mod = apply_registered_pass(mod, "convert-func-to-llvm") + mod = apply_registered_pass(mod, "reconcile-unrealized-casts") + transform.YieldOp() + + return schedule_module + + +if __name__ == "__main__": + wload = ElementwiseSum(400, 400) + + print(" Dump kernel ".center(60, "-")) + lower_payload(wload, dump_kernel="bufferized", dump_schedule=True) + + print(" Execute 1 ".center(60, "-")) + execute(wload, verbose=2) + + print(" Execute 2 ".center(60, "-")) + execute(wload, verbose=1) + + print(" Benchmark ".center(60, "-")) + times = benchmark(wload) + times *= 1e6 # convert to microseconds + # compute statistics + mean = np.mean(times) + min = np.min(times) + max = np.max(times) + std = np.std(times) + print(f"Timings (us): mean={mean:.2f}+/-{std:.2f} min={min:.2f} max={max:.2f}") + flop_count = wload.get_complexity()[0] + gflops = flop_count / (mean * 1e-6) / 1e9 + print(f"Throughput: {gflops:.2f} GFLOPS") diff --git a/examples/workload/example_mlir.py b/examples/workload/example_mlir.py new file mode 100644 index 0000000..a539a3a --- /dev/null +++ b/examples/workload/example_mlir.py @@ -0,0 +1,205 @@ +""" +Workload example: Element-wise sum of two (M, N) float32 arrays on CPU. + +In this example, allocation and deallocation of input arrays is done in MLIR. +""" + +import numpy as np +from mlir import ir +from mlir.runtime.np_to_memref import ( + ranked_memref_to_numpy, + make_nd_memref_descriptor, + as_ctype, +) +from mlir.dialects import func, linalg, arith, memref +import ctypes +from contextlib import contextmanager +from lighthouse.utils import get_packed_arg +from lighthouse.utils.execution import ( + lower_payload, + execute, + benchmark, +) +from example import ElementwiseSum + + +def emit_host_alloc(mod, suffix, element_type, rank=2): + dyn = ir.ShapedType.get_dynamic_size() + memref_dyn_t = ir.MemRefType.get(rank * (dyn,), element_type) + index_t = ir.IndexType.get() + i32_t = ir.IntegerType.get_signless(32) + with ir.InsertionPoint(mod.body): + f = func.FuncOp("host_alloc_" + suffix, (rank * (i32_t,), (memref_dyn_t,))) + f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() + with ir.InsertionPoint(f.add_entry_block()): + dims = [arith.IndexCastOp(index_t, a) for a in list(f.arguments)] + alloc = memref.alloc(memref_dyn_t, dims, []) + func.ReturnOp((alloc,)) + + +def emit_host_dealloc(mod, suffix, element_type, rank=2): + dyn = ir.ShapedType.get_dynamic_size() + memref_dyn_t = ir.MemRefType.get(rank * (dyn,), element_type) + with ir.InsertionPoint(mod.body): + f = func.FuncOp("host_dealloc_" + suffix, ((memref_dyn_t,), ())) + f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() + with ir.InsertionPoint(f.add_entry_block()): + memref.dealloc(f.arguments[0]) + func.ReturnOp(()) + + +def emit_fill_constant(mod, suffix, value, element_type, rank=2): + dyn = ir.ShapedType.get_dynamic_size() + memref_dyn_t = ir.MemRefType.get(rank * (dyn,), element_type) + with ir.InsertionPoint(mod.body): + f = func.FuncOp("host_fill_constant_" + suffix, ((memref_dyn_t,), ())) + f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() + with ir.InsertionPoint(f.add_entry_block()): + const = arith.constant(element_type, value) + linalg.fill(const, outs=[f.arguments[0]]) + func.ReturnOp(()) + + +def emit_fill_random(mod, suffix, element_type, min=0.0, max=1.0, seed=2): + rank = 2 + dyn = ir.ShapedType.get_dynamic_size() + memref_dyn_t = ir.MemRefType.get(rank * (dyn,), element_type) + i32_t = ir.IntegerType.get_signless(32) + f64_t = 
ir.F64Type.get() + with ir.InsertionPoint(mod.body): + f = func.FuncOp("host_fill_random_" + suffix, ((memref_dyn_t,), ())) + f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() + with ir.InsertionPoint(f.add_entry_block()): + min_cst = arith.constant(f64_t, min) + max_cst = arith.constant(f64_t, max) + seed_cst = arith.constant(i32_t, seed) + linalg.fill_rng_2d(min_cst, max_cst, seed_cst, outs=[f.arguments[0]]) + func.ReturnOp(()) + + +class ElementwiseSumMLIRAlloc(ElementwiseSum): + """ + Computes element-wise sum of (M, N) float32 arrays on CPU. + + Extends ElementwiseSum by allocating input arrays in MLIR. + """ + + def __init__(self, M, N): + super().__init__(M, N) + # keep track of allocated memrefs + self.memrefs = {} + + def _allocate_array(self, name, execution_engine): + if name in self.memrefs: + return self.memrefs[name] + alloc_func = execution_engine.lookup("host_alloc_f32") + shape = (self.M, self.N) + mref = make_nd_memref_descriptor(len(shape), as_ctype(self.dtype))() + ptr_mref = ctypes.pointer(ctypes.pointer(mref)) + ptr_dims = [ctypes.pointer(ctypes.c_int32(d)) for d in shape] + alloc_func(get_packed_arg([ptr_mref, *ptr_dims])) + self.memrefs[name] = mref + return mref + + def _allocate_inputs(self, execution_engine): + self._allocate_array("A", execution_engine) + self._allocate_array("B", execution_engine) + self._allocate_array("C", execution_engine) + + def _deallocate_all(self, execution_engine): + for mref in self.memrefs.values(): + dealloc_func = execution_engine.lookup("host_dealloc_f32") + ptr_mref = ctypes.pointer(ctypes.pointer(mref)) + dealloc_func(get_packed_arg([ptr_mref])) + self.memrefs = {} + + @contextmanager + def allocate(self, execution_engine): + try: + self._allocate_inputs(execution_engine) + yield None + finally: + self._deallocate_all(execution_engine) + + def get_input_arrays(self, execution_engine): + A = self._allocate_array("A", execution_engine) + B = self._allocate_array("B", execution_engine) + C = self._allocate_array("C", execution_engine) + + # initialize with MLIR + fill_zero_func = execution_engine.lookup("host_fill_constant_zero_f32") + fill_random_func = execution_engine.lookup("host_fill_random_f32") + fill_zero_func(get_packed_arg([ctypes.pointer(ctypes.pointer(C))])) + fill_random_func(get_packed_arg([ctypes.pointer(ctypes.pointer(A))])) + fill_random_func(get_packed_arg([ctypes.pointer(ctypes.pointer(B))])) + + return [A, B, C] + + def verify(self, execution_engine, verbose: int = 0) -> bool: + # compute reference solution with numpy + A = ranked_memref_to_numpy([self.memrefs["A"]]) + B = ranked_memref_to_numpy([self.memrefs["B"]]) + C = ranked_memref_to_numpy([self.memrefs["C"]]) + C_ref = A + B + if verbose > 1: + print("Reference solution:") + print(C_ref) + print("Computed solution:") + print(C) + success = np.allclose(C, C_ref) + + # Alternatively we could have done the verification in MLIR by emitting + # a check function. + # Here we just call the payload function again. + # self._allocate_array("C_ref", execution_engine) + # func = execution_engine.lookup("payload") + # func(get_packed_arg([ + # ctypes.pointer(ctypes.pointer(self.memrefs["A"])), + # ctypes.pointer(ctypes.pointer(self.memrefs["B"])), + # ctypes.pointer(ctypes.pointer(self.memrefs["C_ref"])), + # ])) + # Check correctness with numpy. 
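+ # Note that comparing two runs of the same payload only checks reproducibility.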
+ # C = ranked_memref_to_numpy([self.memrefs["C"]]) + # C_ref = ranked_memref_to_numpy([self.memrefs["C_ref"]]) + # success = np.allclose(C, C_ref) + + if verbose: + if success: + print("PASSED") + else: + print("FAILED Result mismatch!") + return success + + def payload_module(self): + mod = super().payload_module() + # extend the payload module with de/alloc/fill functions + with self.context, self.location: + float32_t = ir.F32Type.get() + emit_host_alloc(mod, "f32", float32_t) + emit_host_dealloc(mod, "f32", float32_t) + emit_fill_constant(mod, "zero_f32", 0.0, float32_t) + emit_fill_random(mod, "f32", float32_t, min=-1.0, max=1.0) + return mod + + +if __name__ == "__main__": + wload = ElementwiseSumMLIRAlloc(400, 400) + + print(" Dump kernel ".center(60, "-")) + lower_payload(wload, dump_kernel="bufferized", dump_schedule=False) + + print(" Execute ".center(60, "-")) + execute(wload, verbose=2) + + print(" Benchmark ".center(60, "-")) + times = benchmark(wload) + times *= 1e6 # convert to microseconds + # compute statistics + mean = np.mean(times) + min = np.min(times) + max = np.max(times) + std = np.std(times) + print(f"Timings (us): mean={mean:.2f}+/-{std:.2f} min={min:.2f} max={max:.2f}") + flop_count = wload.get_complexity()[0] + gflops = flop_count / (mean * 1e-6) / 1e9 + print(f"Throughput: {gflops:.2f} GFLOPS") diff --git a/lighthouse/__init__.py b/lighthouse/__init__.py index 1ac008e..d05b010 100644 --- a/lighthouse/__init__.py +++ b/lighthouse/__init__.py @@ -1 +1,7 @@ __version__ = "0.1.0a1" + +from .workload import Workload + +__all__ = [ + "Workload", +] diff --git a/lighthouse/utils/execution.py b/lighthouse/utils/execution.py new file mode 100644 index 0000000..e4c04a4 --- /dev/null +++ b/lighthouse/utils/execution.py @@ -0,0 +1,231 @@ +""" +Execution engine utility functions. 
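+Provides helpers to build an execution engine, lower a payload module via its transform schedule, and execute or benchmark a workload.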
+""" + +import numpy as np +import ctypes +import os +from mlir import ir +from mlir.dialects.transform import interpreter as transform_interpreter +from mlir.dialects import func, arith, scf, memref +from mlir.execution_engine import ExecutionEngine +from mlir.runtime.np_to_memref import get_ranked_memref_descriptor +from lighthouse.utils.mlir import get_mlir_library_path +from lighthouse.utils import get_packed_arg +from lighthouse import Workload +from typing import Optional + + +def get_engine(payload_module, requirements=None, opt_level=3) -> ExecutionEngine: + requirements = requirements or [] + context = ir.Context() + location = ir.Location.unknown(context) + required_libs = { + "levelzero": ( + ["libmlir_levelzero_runtime.so"], + "Did you compile LLVM with -DMLIR_ENABLE_LEVELZERO_RUNNER=1?", + ), + "mlir_runner": (["libmlir_runner_utils.so"], ""), + "mlir_c_runner": (["libmlir_c_runner_utils.so"], ""), + } + libs = [] + lib_dir = os.path.join(get_mlir_library_path()) + for r in requirements: + if r not in required_libs: + raise ValueError(f"Unknown execution engine requirement: {r}") + so_files, hint = required_libs[r] + for f in so_files: + so_path = os.path.join(lib_dir, f) + if not os.path.isfile(so_path): + msg = f"Could not find shared library {so_path}" + if hint: + msg += "\n" + hint + raise ValueError(msg) + libs.append(so_path) + with context, location: + execution_engine = ExecutionEngine( + payload_module, opt_level=opt_level, shared_libs=libs + ) + execution_engine.initialize() + return execution_engine + + +def apply_transform_schedule( + payload_module, + schedule_module, + context, + location, + dump_kernel: Optional[str] = None, + dump_schedule: bool = False, +): + if not dump_kernel or dump_kernel != "initial": + with context, location: + # invoke transform interpreter directly + transform_interpreter.apply_named_sequence( + payload_root=payload_module, + transform_root=schedule_module.body.operations[0], + transform_module=schedule_module, + ) + if dump_kernel: + print(payload_module) + if dump_schedule: + print(schedule_module) + + +def lower_payload( + workload, + dump_kernel: Optional[str] = None, + dump_schedule: bool = False, + schedule_parameters: Optional[dict] = None, +) -> ir.Module: + payload_module = workload.payload_module() + schedule_module = workload.schedule_module( + dump_kernel=dump_kernel, parameters=schedule_parameters + ) + apply_transform_schedule( + payload_module, + schedule_module, + workload.context, + workload.location, + dump_kernel=dump_kernel, + dump_schedule=dump_schedule, + ) + return payload_module + + +def execute( + workload, + check_correctness: bool = True, + schedule_parameters: Optional[dict] = None, + verbose: int = 0, +): + # lower payload with schedule + payload_module = lower_payload(workload, schedule_parameters=schedule_parameters) + # get execution engine + engine = get_engine(payload_module, requirements=workload.requirements()) + + with workload.allocate(execution_engine=engine): + # prepare function arguments + inputs = workload.get_input_arrays(execution_engine=engine) + pointers = [ctypes.pointer(ctypes.pointer(m)) for m in inputs] + packed_args = get_packed_arg(pointers) + + # handle to payload function + payload_func = engine.lookup(workload.payload_function_name) + + # call function + payload_func(packed_args) + + if check_correctness: + success = workload.verify(execution_engine=engine, verbose=verbose) + if not success: + raise ValueError("Benchmark verification failed.") + + +def emit_benchmark_function( + 
payload_module: ir.Module, workload: Workload, nruns: int, nwarmup: int +): + """ + Emit a benchmark function that calls payload function and times it. + + Every function call is timed separately. Returns the times (seconds) in a + memref. + """ + # find original payload function + payload_func = None + for op in payload_module.operation.regions[0].blocks[0]: + if ( + isinstance(op, func.FuncOp) + and str(op.name).strip('"') == workload.payload_function_name + ): + payload_func = op + break + assert payload_func is not None, "Could not find payload function" + payload_arguments = payload_func.type.inputs + # emit benchmark function + with workload.context, workload.location: + with ir.InsertionPoint(payload_module.body): + # define rtclock function + f64_t = ir.F64Type.get() + f = func.FuncOp("rtclock", ((), (f64_t,)), visibility="private") + # emit new function + time_memref_t = ir.MemRefType.get((nruns,), f64_t) + args = payload_arguments + [time_memref_t] + f = func.FuncOp("benchmark", (tuple(args), ())) + f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() + with ir.InsertionPoint(f.add_entry_block()): + index_t = ir.IndexType.get() + zero = arith.ConstantOp(index_t, 0) + one = arith.ConstantOp(index_t, 1) + # call payload for warmup runs + nwarmup_cst = arith.ConstantOp(index_t, nwarmup) + for_op = scf.ForOp(zero, nwarmup_cst, one) + with ir.InsertionPoint(for_op.body): + func.CallOp(payload_func, list(f.arguments[: len(payload_arguments)])) + scf.YieldOp(()) + # call payload for benchmark runs, time every call separately + nruns_cst = arith.ConstantOp(index_t, nruns) + for_op = scf.ForOp(zero, nruns_cst, one) + i = for_op.induction_variable + with ir.InsertionPoint(for_op.body): + tic = func.CallOp((f64_t,), "rtclock", ()).result + func.CallOp(payload_func, list(f.arguments[: len(payload_arguments)])) + toc = func.CallOp((f64_t,), "rtclock", ()).result + time = arith.SubFOp(toc, tic) + memref.StoreOp(time, f.arguments[-1], [i]) + scf.YieldOp(()) + func.ReturnOp(()) + + +def benchmark( + workload, + nruns: int = 100, + nwarmup: int = 10, + schedule_parameters: Optional[dict] = None, + check_correctness: bool = True, + verbose: int = 0, +) -> np.ndarray: + # get original payload module + payload_module = workload.payload_module() + + # add benchmark function with timing + emit_benchmark_function(payload_module, workload, nruns, nwarmup) + + # lower + apply_transform_schedule( + payload_module, + workload.schedule_module(parameters=schedule_parameters), + workload.context, + workload.location, + ) + # get execution engine, rtclock requires mlir_c_runner + requirements = workload.requirements() + if "mlir_c_runner" not in requirements: + requirements.append("mlir_c_runner") + engine = get_engine(payload_module, requirements=requirements) + + with workload.allocate(execution_engine=engine): + inputs = workload.get_input_arrays(execution_engine=engine) + pointers = [ctypes.pointer(ctypes.pointer(m)) for m in inputs] + if check_correctness: + # call payload once to verify correctness + # prepare function arguments + packed_args = get_packed_arg(pointers) + + payload_func = engine.lookup(workload.payload_function_name) + payload_func(packed_args) + success = workload.verify(execution_engine=engine, verbose=verbose) + if not success: + raise ValueError("Benchmark verification failed.") + + # allocate buffer for timings and prepare arguments + time_array = np.zeros((nruns,), dtype=np.float64) + time_memref = get_ranked_memref_descriptor(time_array) + time_pointer = 
ctypes.pointer(ctypes.pointer(time_memref)) + packed_args_with_time = get_packed_arg(pointers + [time_pointer]) + + # call benchmark function + benchmark_func = engine.lookup("benchmark") + benchmark_func(packed_args_with_time) + + return time_array diff --git a/lighthouse/utils/mlir.py b/lighthouse/utils/mlir.py new file mode 100644 index 0000000..f32d243 --- /dev/null +++ b/lighthouse/utils/mlir.py @@ -0,0 +1,37 @@ +""" +MLIR utility functions. +""" + +from mlir import ir +from mlir.dialects import transform +from mlir.dialects.transform import structured +import os + + +def apply_registered_pass(*args, **kwargs): + return transform.apply_registered_pass(transform.AnyOpType.get(), *args, **kwargs) + + +def match(*args, **kwargs): + return structured.MatchOp(transform.AnyOpType.get(), *args, **kwargs) + + +def cse(op): + transform.ApplyCommonSubexpressionEliminationOp(op) + + +def canonicalize(op): + with ir.InsertionPoint(transform.ApplyPatternsOp(op).patterns): + transform.ApplyCanonicalizationPatternsOp() + + +def get_mlir_library_path(): + pkg_path = ir.__file__ + if "python_packages" in pkg_path: + # looks like a local mlir install + path = pkg_path.split("python_packages")[0] + os.sep + "lib" + else: + # maybe installed in python path + path = os.path.split(pkg_path)[0] + os.sep + "_mlir_libs" + assert os.path.isdir(path) + return path diff --git a/lighthouse/workload.py b/lighthouse/workload.py new file mode 100644 index 0000000..83e2918 --- /dev/null +++ b/lighthouse/workload.py @@ -0,0 +1,79 @@ +""" +Abstract base class for workloads. + +Defines the expected interface for generic workload execution methods. +""" + +from mlir import ir +from abc import ABC, abstractmethod +from contextlib import contextmanager +from typing import Optional + + +class Workload(ABC): + """ + Abstract base class for workloads. + + A workload is defined by a fixed payload function and problem size. + Different realizations of the workload can be obtained by altering the + lowering schedule. + + The MLIR payload function should take input arrays as memrefs and return + nothing. + """ + + payload_function_name: str = "payload" + + @abstractmethod + def requirements(self) -> list[str]: + """Return a list of requirements for the execution engine.""" + pass + + @abstractmethod + def payload_module(self) -> ir.Module: + """Generate the MLIR module containing the payload function.""" + pass + + @abstractmethod + def schedule_module( + self, + dump_kernel: Optional[str] = None, + parameters: Optional[dict] = None, + ) -> ir.Module: + """Generate the MLIR module containing the transform schedule.""" + pass + + @abstractmethod + def get_input_arrays(self, execution_engine) -> list: + """ + Return the input arrays for the payload function as memrefs. + + Allocation and initialization of the input arrays should be done here. + """ + pass + + @contextmanager + def allocate(self, execution_engine): + """ + Allocate any necessary memory for the workload. + + Override this method if the workload requires memory management.""" + try: + yield None + finally: + pass + + @abstractmethod + def verify(self, execution_engine, verbose: int = 0) -> bool: + """Verify the correctness of the computation.""" + pass + + @abstractmethod + def get_complexity(self) -> list: + """ + Return the computational complexity of the workload. + + Returns a tuple (flop_count, memory_reads, memory_writes). Memory + reads/writes are in bytes. 
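+ For example, the element-wise sum of two (M, N) float32 arrays costs M*N flops, reads 2*M*N*4 bytes and writes M*N*4 bytes.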
+ """ + pass From 01c8007d63b330d16d60abced5b5d0bfc4fe484f Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Tue, 2 Dec 2025 18:41:04 +0200 Subject: [PATCH 02/15] clean up context and other fixes --- examples/workload/example.py | 149 +++++++++++++++--------------- examples/workload/example_mlir.py | 54 +++++------ lighthouse/utils/execution.py | 95 +++++++++---------- lighthouse/utils/mlir.py | 10 +- lighthouse/workload.py | 2 +- 5 files changed, 148 insertions(+), 162 deletions(-) diff --git a/examples/workload/example.py b/examples/workload/example.py index 28bbd01..403f27c 100644 --- a/examples/workload/example.py +++ b/examples/workload/example.py @@ -12,7 +12,6 @@ from lighthouse.utils.mlir import ( apply_registered_pass, canonicalize, - cse, match, ) from lighthouse.utils.execution import ( @@ -37,8 +36,6 @@ def __init__(self, M, N): self.M = M self.N = N self.dtype = np.float32 - self.context = ir.Context() - self.location = ir.Location.unknown(context=self.context) @cached_property def _input_arrays(self): @@ -58,7 +55,7 @@ def _reference_solution(self): def get_input_arrays(self, execution_engine): return [get_ranked_memref_descriptor(a) for a in self._input_arrays] - def verify(self, execution_engine, verbose: int = 0) -> bool: + def check_correctness(self, execution_engine, verbose: int = 0) -> bool: C = self._input_arrays[2] C_ref = self._reference_solution if verbose > 1: @@ -85,95 +82,95 @@ def get_complexity(self): return (flop_count, memory_reads, memory_writes) def payload_module(self): - with self.context, self.location: + mod = ir.Module.create() + + with ir.InsertionPoint(mod.body): float32_t = ir.F32Type.get() shape = (self.M, self.N) tensor_t = ir.RankedTensorType.get(shape, float32_t) memref_t = ir.MemRefType.get(shape, float32_t) - mod = ir.Module.create() - with ir.InsertionPoint(mod.body): - args = [memref_t, memref_t, memref_t] - f = func.FuncOp(self.payload_function_name, (tuple(args), ())) - f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() - with ir.InsertionPoint(f.add_entry_block()): - A = f.arguments[0] - B = f.arguments[1] - C = f.arguments[2] - a_tensor = bufferization.ToTensorOp(tensor_t, A, restrict=True) - b_tensor = bufferization.ToTensorOp(tensor_t, B, restrict=True) - c_tensor = bufferization.ToTensorOp( + fargs = [memref_t, memref_t, memref_t] + + @func.func(*fargs, name=self.payload_function_name) + def payload(*args): + A, B, C = args + a_tensor = bufferization.to_tensor(tensor_t, A, restrict=True) + b_tensor = bufferization.to_tensor(tensor_t, B, restrict=True) + c_tensor = bufferization.to_tensor( tensor_t, C, restrict=True, writable=True ) add = linalg.add(a_tensor, b_tensor, outs=[c_tensor]) - bufferization.MaterializeInDestinationOp( + bufferization.materialize_in_destination( None, add, C, restrict=True, writable=True ) - func.ReturnOp(()) + + payload.func_op.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() + return mod def schedule_module(self, dump_kernel=None, parameters=None): - with self.context, self.location: - schedule_module = ir.Module.create() - schedule_module.operation.attributes["transform.with_named_sequence"] = ( - ir.UnitAttr.get() + schedule_module = ir.Module.create() + schedule_module.operation.attributes["transform.with_named_sequence"] = ( + ir.UnitAttr.get() + ) + with ir.InsertionPoint(schedule_module.body): + named_sequence = transform.named_sequence( + "__transform_main", + [transform.AnyOpType.get()], + [], + arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}], ) - with 
ir.InsertionPoint(schedule_module.body): - named_sequence = transform.NamedSequenceOp( - "__transform_main", - [transform.AnyOpType.get()], - [], - arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}], + with ir.InsertionPoint(named_sequence.body): + anytype = transform.AnyOpType.get() + func = match(named_sequence.bodyTarget, ops={"func.func"}) + mod = transform.get_parent_op( + anytype, + func, + op_name="builtin.module", + deduplicate=True, ) - with ir.InsertionPoint(named_sequence.body): - anytype = transform.AnyOpType.get() - func = match(named_sequence.bodyTarget, ops={"func.func"}) - mod = transform.get_parent_op( - anytype, - func, - op_name="builtin.module", - deduplicate=True, - ) - mod = apply_registered_pass(mod, "one-shot-bufferize") - mod = apply_registered_pass(mod, "convert-linalg-to-loops") - cse(mod) - canonicalize(mod) - - if dump_kernel == "bufferized": - transform.YieldOp() - return schedule_module - - mod = apply_registered_pass(mod, "convert-scf-to-cf") - mod = apply_registered_pass(mod, "finalize-memref-to-llvm") - mod = apply_registered_pass(mod, "convert-cf-to-llvm") - mod = apply_registered_pass(mod, "convert-arith-to-llvm") - mod = apply_registered_pass(mod, "convert-func-to-llvm") - mod = apply_registered_pass(mod, "reconcile-unrealized-casts") + mod = apply_registered_pass(mod, "one-shot-bufferize") + mod = apply_registered_pass(mod, "convert-linalg-to-loops") + transform.apply_cse(mod) + canonicalize(mod) + + if dump_kernel == "bufferized": transform.YieldOp() + return schedule_module + + mod = apply_registered_pass(mod, "convert-scf-to-cf") + mod = apply_registered_pass(mod, "finalize-memref-to-llvm") + mod = apply_registered_pass(mod, "convert-cf-to-llvm") + mod = apply_registered_pass(mod, "convert-arith-to-llvm") + mod = apply_registered_pass(mod, "convert-func-to-llvm") + mod = apply_registered_pass(mod, "reconcile-unrealized-casts") + transform.YieldOp() return schedule_module if __name__ == "__main__": - wload = ElementwiseSum(400, 400) - - print(" Dump kernel ".center(60, "-")) - lower_payload(wload, dump_kernel="bufferized", dump_schedule=True) - - print(" Execute 1 ".center(60, "-")) - execute(wload, verbose=2) - - print(" Execute 2 ".center(60, "-")) - execute(wload, verbose=1) - - print(" Benchmark ".center(60, "-")) - times = benchmark(wload) - times *= 1e6 # convert to microseconds - # compute statistics - mean = np.mean(times) - min = np.min(times) - max = np.max(times) - std = np.std(times) - print(f"Timings (us): mean={mean:.2f}+/-{std:.2f} min={min:.2f} max={max:.2f}") - flop_count = wload.get_complexity()[0] - gflops = flop_count / (mean * 1e-6) / 1e9 - print(f"Throughput: {gflops:.2f} GFLOPS") + with ir.Context(), ir.Location.unknown(): + wload = ElementwiseSum(400, 400) + + print(" Dump kernel ".center(60, "-")) + lower_payload(wload, dump_kernel="bufferized", dump_schedule=True) + + print(" Execute 1 ".center(60, "-")) + execute(wload, verbose=2) + + print(" Execute 2 ".center(60, "-")) + execute(wload, verbose=1) + + print(" Benchmark ".center(60, "-")) + times = benchmark(wload) + times *= 1e6 # convert to microseconds + # compute statistics + mean = np.mean(times) + min = np.min(times) + max = np.max(times) + std = np.std(times) + print(f"Timings (us): mean={mean:.2f}+/-{std:.2f} min={min:.2f} max={max:.2f}") + flop_count = wload.get_complexity()[0] + gflops = flop_count / (mean * 1e-6) / 1e9 + print(f"Throughput: {gflops:.2f} GFLOPS") diff --git a/examples/workload/example_mlir.py b/examples/workload/example_mlir.py index 
a539a3a..a992454 100644 --- a/examples/workload/example_mlir.py +++ b/examples/workload/example_mlir.py @@ -135,7 +135,7 @@ def get_input_arrays(self, execution_engine): return [A, B, C] - def verify(self, execution_engine, verbose: int = 0) -> bool: + def check_correctness(self, execution_engine, verbose: int = 0) -> bool: # compute reference solution with numpy A = ranked_memref_to_numpy([self.memrefs["A"]]) B = ranked_memref_to_numpy([self.memrefs["B"]]) @@ -173,33 +173,33 @@ def verify(self, execution_engine, verbose: int = 0) -> bool: def payload_module(self): mod = super().payload_module() # extend the payload module with de/alloc/fill functions - with self.context, self.location: - float32_t = ir.F32Type.get() - emit_host_alloc(mod, "f32", float32_t) - emit_host_dealloc(mod, "f32", float32_t) - emit_fill_constant(mod, "zero_f32", 0.0, float32_t) - emit_fill_random(mod, "f32", float32_t, min=-1.0, max=1.0) + float32_t = ir.F32Type.get() + emit_host_alloc(mod, "f32", float32_t) + emit_host_dealloc(mod, "f32", float32_t) + emit_fill_constant(mod, "zero_f32", 0.0, float32_t) + emit_fill_random(mod, "f32", float32_t, min=-1.0, max=1.0) return mod if __name__ == "__main__": - wload = ElementwiseSumMLIRAlloc(400, 400) - - print(" Dump kernel ".center(60, "-")) - lower_payload(wload, dump_kernel="bufferized", dump_schedule=False) - - print(" Execute ".center(60, "-")) - execute(wload, verbose=2) - - print(" Benchmark ".center(60, "-")) - times = benchmark(wload) - times *= 1e6 # convert to microseconds - # compute statistics - mean = np.mean(times) - min = np.min(times) - max = np.max(times) - std = np.std(times) - print(f"Timings (us): mean={mean:.2f}+/-{std:.2f} min={min:.2f} max={max:.2f}") - flop_count = wload.get_complexity()[0] - gflops = flop_count / (mean * 1e-6) / 1e9 - print(f"Throughput: {gflops:.2f} GFLOPS") + with ir.Context(), ir.Location.unknown(): + wload = ElementwiseSumMLIRAlloc(400, 400) + + print(" Dump kernel ".center(60, "-")) + lower_payload(wload, dump_kernel="bufferized", dump_schedule=False) + + print(" Execute ".center(60, "-")) + execute(wload, verbose=2) + + print(" Benchmark ".center(60, "-")) + times = benchmark(wload) + times *= 1e6 # convert to microseconds + # compute statistics + mean = np.mean(times) + min = np.min(times) + max = np.max(times) + std = np.std(times) + print(f"Timings (us): mean={mean:.2f}+/-{std:.2f} min={min:.2f} max={max:.2f}") + flop_count = wload.get_complexity()[0] + gflops = flop_count / (mean * 1e-6) / 1e9 + print(f"Throughput: {gflops:.2f} GFLOPS") diff --git a/lighthouse/utils/execution.py b/lighthouse/utils/execution.py index e4c04a4..b34a125 100644 --- a/lighthouse/utils/execution.py +++ b/lighthouse/utils/execution.py @@ -6,7 +6,6 @@ import ctypes import os from mlir import ir -from mlir.dialects.transform import interpreter as transform_interpreter from mlir.dialects import func, arith, scf, memref from mlir.execution_engine import ExecutionEngine from mlir.runtime.np_to_memref import get_ranked_memref_descriptor @@ -53,19 +52,13 @@ def get_engine(payload_module, requirements=None, opt_level=3) -> ExecutionEngin def apply_transform_schedule( payload_module, schedule_module, - context, - location, dump_kernel: Optional[str] = None, dump_schedule: bool = False, ): if not dump_kernel or dump_kernel != "initial": - with context, location: - # invoke transform interpreter directly - transform_interpreter.apply_named_sequence( - payload_root=payload_module, - transform_root=schedule_module.body.operations[0], - 
transform_module=schedule_module, - ) + # apply schedule on payload module + named_seq = schedule_module.body.operations[0] + named_seq.apply(payload_module) if dump_kernel: print(payload_module) if dump_schedule: @@ -85,8 +78,6 @@ def lower_payload( apply_transform_schedule( payload_module, schedule_module, - workload.context, - workload.location, dump_kernel=dump_kernel, dump_schedule=dump_schedule, ) @@ -117,13 +108,18 @@ def execute( payload_func(packed_args) if check_correctness: - success = workload.verify(execution_engine=engine, verbose=verbose) + success = workload.check_correctness( + execution_engine=engine, verbose=verbose + ) if not success: raise ValueError("Benchmark verification failed.") def emit_benchmark_function( - payload_module: ir.Module, workload: Workload, nruns: int, nwarmup: int + payload_module: ir.Module, + workload: Workload, + nruns: int, + nwarmup: int, ): """ Emit a benchmark function that calls payload function and times it. @@ -136,49 +132,46 @@ def emit_benchmark_function( for op in payload_module.operation.regions[0].blocks[0]: if ( isinstance(op, func.FuncOp) - and str(op.name).strip('"') == workload.payload_function_name + and op.name.value == workload.payload_function_name ): payload_func = op break assert payload_func is not None, "Could not find payload function" payload_arguments = payload_func.type.inputs - # emit benchmark function - with workload.context, workload.location: - with ir.InsertionPoint(payload_module.body): - # define rtclock function - f64_t = ir.F64Type.get() - f = func.FuncOp("rtclock", ((), (f64_t,)), visibility="private") - # emit new function - time_memref_t = ir.MemRefType.get((nruns,), f64_t) - args = payload_arguments + [time_memref_t] - f = func.FuncOp("benchmark", (tuple(args), ())) - f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() - with ir.InsertionPoint(f.add_entry_block()): + + # emit benchmark function that calls payload and times it + with ir.InsertionPoint(payload_module.body): + # define rtclock function + f64_t = ir.F64Type.get() + func.FuncOp("rtclock", ((), (f64_t,)), visibility="private") + # emit benchmark function + time_memref_t = ir.MemRefType.get((nruns,), f64_t) + args = payload_arguments + [time_memref_t] + + @func.func(*args) + def benchmark(*args): index_t = ir.IndexType.get() - zero = arith.ConstantOp(index_t, 0) - one = arith.ConstantOp(index_t, 1) - # call payload for warmup runs - nwarmup_cst = arith.ConstantOp(index_t, nwarmup) - for_op = scf.ForOp(zero, nwarmup_cst, one) - with ir.InsertionPoint(for_op.body): - func.CallOp(payload_func, list(f.arguments[: len(payload_arguments)])) - scf.YieldOp(()) - # call payload for benchmark runs, time every call separately - nruns_cst = arith.ConstantOp(index_t, nruns) - for_op = scf.ForOp(zero, nruns_cst, one) - i = for_op.induction_variable - with ir.InsertionPoint(for_op.body): - tic = func.CallOp((f64_t,), "rtclock", ()).result - func.CallOp(payload_func, list(f.arguments[: len(payload_arguments)])) - toc = func.CallOp((f64_t,), "rtclock", ()).result - time = arith.SubFOp(toc, tic) - memref.StoreOp(time, f.arguments[-1], [i]) - scf.YieldOp(()) - func.ReturnOp(()) + zero = arith.constant(index_t, 0) + one = arith.constant(index_t, 1) + nwarmup_cst = arith.constant(index_t, nwarmup) + for i in scf.for_(zero, nwarmup_cst, one): + # FIXME(upstream): func.call is broken for this use case? 
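+ # As a workaround, build the call with func.CallOp, which takes the FuncOp directly.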
+ func.CallOp(payload_func, list(args[: len(payload_arguments)])) + scf.yield_(()) + nruns_cst = arith.constant(index_t, nruns) + for i in scf.for_(zero, nruns_cst, one): + tic = func.call((f64_t,), "rtclock", ()) + func.CallOp(payload_func, list(args[: len(payload_arguments)])) + toc = func.call((f64_t,), "rtclock", ()) + time = arith.subf(toc, tic) + memref.store(time, args[-1], [i]) + scf.yield_(()) + + benchmark.func_op.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() def benchmark( - workload, + workload: Workload, nruns: int = 100, nwarmup: int = 10, schedule_parameters: Optional[dict] = None, @@ -195,8 +188,6 @@ def benchmark( apply_transform_schedule( payload_module, workload.schedule_module(parameters=schedule_parameters), - workload.context, - workload.location, ) # get execution engine, rtclock requires mlir_c_runner requirements = workload.requirements() @@ -214,7 +205,9 @@ def benchmark( payload_func = engine.lookup(workload.payload_function_name) payload_func(packed_args) - success = workload.verify(execution_engine=engine, verbose=verbose) + success = workload.check_correctness( + execution_engine=engine, verbose=verbose + ) if not success: raise ValueError("Benchmark verification failed.") diff --git a/lighthouse/utils/mlir.py b/lighthouse/utils/mlir.py index f32d243..bf6e248 100644 --- a/lighthouse/utils/mlir.py +++ b/lighthouse/utils/mlir.py @@ -13,16 +13,12 @@ def apply_registered_pass(*args, **kwargs): def match(*args, **kwargs): - return structured.MatchOp(transform.AnyOpType.get(), *args, **kwargs) - - -def cse(op): - transform.ApplyCommonSubexpressionEliminationOp(op) + return structured.structured_match(transform.AnyOpType.get(), *args, **kwargs) def canonicalize(op): - with ir.InsertionPoint(transform.ApplyPatternsOp(op).patterns): - transform.ApplyCanonicalizationPatternsOp() + with ir.InsertionPoint(transform.apply_patterns(op).patterns): + transform.apply_patterns_canonicalization() def get_mlir_library_path(): diff --git a/lighthouse/workload.py b/lighthouse/workload.py index 83e2918..26fe8be 100644 --- a/lighthouse/workload.py +++ b/lighthouse/workload.py @@ -64,7 +64,7 @@ def allocate(self, execution_engine): pass @abstractmethod - def verify(self, execution_engine, verbose: int = 0) -> bool: + def check_correctness(self, execution_engine, verbose: int = 0) -> bool: """Verify the correctness of the computation.""" pass From 83b837a9f97cc1e7f31b9d227f98e3d0215eac6a Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Tue, 2 Dec 2025 19:05:18 +0200 Subject: [PATCH 03/15] move execution.py -> runner.py --- examples/workload/example.py | 2 +- examples/workload/example_mlir.py | 2 +- lighthouse/utils/{execution.py => runner.py} | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) rename lighthouse/utils/{execution.py => runner.py} (99%) diff --git a/examples/workload/example.py b/examples/workload/example.py index 403f27c..1baa7d9 100644 --- a/examples/workload/example.py +++ b/examples/workload/example.py @@ -14,7 +14,7 @@ canonicalize, match, ) -from lighthouse.utils.execution import ( +from lighthouse.utils.runner import ( lower_payload, execute, benchmark, diff --git a/examples/workload/example_mlir.py b/examples/workload/example_mlir.py index a992454..8f89c01 100644 --- a/examples/workload/example_mlir.py +++ b/examples/workload/example_mlir.py @@ -15,7 +15,7 @@ import ctypes from contextlib import contextmanager from lighthouse.utils import get_packed_arg -from lighthouse.utils.execution import ( +from lighthouse.utils.runner import ( lower_payload, 
execute, benchmark, diff --git a/lighthouse/utils/execution.py b/lighthouse/utils/runner.py similarity index 99% rename from lighthouse/utils/execution.py rename to lighthouse/utils/runner.py index b34a125..0895a9b 100644 --- a/lighthouse/utils/execution.py +++ b/lighthouse/utils/runner.py @@ -1,5 +1,5 @@ """ -Execution engine utility functions. +Utility functions for running workloads. """ import numpy as np @@ -85,7 +85,7 @@ def lower_payload( def execute( - workload, + workload: Workload, check_correctness: bool = True, schedule_parameters: Optional[dict] = None, verbose: int = 0, From af7a49b5f9094c536a0b0147611ec0417469ff02 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Tue, 2 Dec 2025 21:16:27 +0200 Subject: [PATCH 04/15] workload: allocate_inputs ctx manager returns the input memrefs --- examples/workload/example.py | 12 +++++++- examples/workload/example_mlir.py | 46 +++++++++++++++---------------- lighthouse/utils/runner.py | 6 ++-- lighthouse/workload.py | 18 ++++++------ 4 files changed, 43 insertions(+), 39 deletions(-) diff --git a/examples/workload/example.py b/examples/workload/example.py index 1baa7d9..21963da 100644 --- a/examples/workload/example.py +++ b/examples/workload/example.py @@ -7,6 +7,8 @@ from mlir.runtime.np_to_memref import get_ranked_memref_descriptor from mlir.dialects import func, linalg, bufferization from mlir.dialects import transform +from mlir.execution_engine import ExecutionEngine +from contextlib import contextmanager from functools import cached_property from lighthouse import Workload from lighthouse.utils.mlir import ( @@ -52,9 +54,17 @@ def _reference_solution(self): A, B, _ = self._input_arrays return A + B - def get_input_arrays(self, execution_engine): + def _get_input_arrays(self): return [get_ranked_memref_descriptor(a) for a in self._input_arrays] + @contextmanager + def allocate_inputs(self, execution_engine: ExecutionEngine): + try: + yield self._get_input_arrays() + finally: + # cached numpy arrays are deallocated automatically + pass + def check_correctness(self, execution_engine, verbose: int = 0) -> bool: C = self._input_arrays[2] C_ref = self._reference_solution diff --git a/examples/workload/example_mlir.py b/examples/workload/example_mlir.py index 8f89c01..5d1d6ed 100644 --- a/examples/workload/example_mlir.py +++ b/examples/workload/example_mlir.py @@ -14,7 +14,11 @@ from mlir.dialects import func, linalg, arith, memref import ctypes from contextlib import contextmanager -from lighthouse.utils import get_packed_arg +from lighthouse.utils import ( + get_packed_arg, + memrefs_to_packed_args, + memref_to_ctype, +) from lighthouse.utils.runner import ( lower_payload, execute, @@ -93,34 +97,21 @@ def _allocate_array(self, name, execution_engine): if name in self.memrefs: return self.memrefs[name] alloc_func = execution_engine.lookup("host_alloc_f32") + # construct a memref descriptor for the result memref shape = (self.M, self.N) mref = make_nd_memref_descriptor(len(shape), as_ctype(self.dtype))() - ptr_mref = ctypes.pointer(ctypes.pointer(mref)) + ptr_mref = memref_to_ctype(mref) ptr_dims = [ctypes.pointer(ctypes.c_int32(d)) for d in shape] alloc_func(get_packed_arg([ptr_mref, *ptr_dims])) self.memrefs[name] = mref return mref - def _allocate_inputs(self, execution_engine): - self._allocate_array("A", execution_engine) - self._allocate_array("B", execution_engine) - self._allocate_array("C", execution_engine) - def _deallocate_all(self, execution_engine): for mref in self.memrefs.values(): dealloc_func = 
execution_engine.lookup("host_dealloc_f32") - ptr_mref = ctypes.pointer(ctypes.pointer(mref)) - dealloc_func(get_packed_arg([ptr_mref])) + dealloc_func(memrefs_to_packed_args([mref])) self.memrefs = {} - @contextmanager - def allocate(self, execution_engine): - try: - self._allocate_inputs(execution_engine) - yield None - finally: - self._deallocate_all(execution_engine) - def get_input_arrays(self, execution_engine): A = self._allocate_array("A", execution_engine) B = self._allocate_array("B", execution_engine) @@ -129,12 +120,19 @@ def get_input_arrays(self, execution_engine): # initialize with MLIR fill_zero_func = execution_engine.lookup("host_fill_constant_zero_f32") fill_random_func = execution_engine.lookup("host_fill_random_f32") - fill_zero_func(get_packed_arg([ctypes.pointer(ctypes.pointer(C))])) - fill_random_func(get_packed_arg([ctypes.pointer(ctypes.pointer(A))])) - fill_random_func(get_packed_arg([ctypes.pointer(ctypes.pointer(B))])) + fill_zero_func(memrefs_to_packed_args([C])) + fill_random_func(memrefs_to_packed_args([A])) + fill_random_func(memrefs_to_packed_args([B])) return [A, B, C] + @contextmanager + def allocate_inputs(self, execution_engine): + try: + yield self.get_input_arrays(execution_engine) + finally: + self._deallocate_all(execution_engine) + def check_correctness(self, execution_engine, verbose: int = 0) -> bool: # compute reference solution with numpy A = ranked_memref_to_numpy([self.memrefs["A"]]) @@ -153,10 +151,10 @@ def check_correctness(self, execution_engine, verbose: int = 0) -> bool: # Here we just call the payload function again. # self._allocate_array("C_ref", execution_engine) # func = execution_engine.lookup("payload") - # func(get_packed_arg([ - # ctypes.pointer(ctypes.pointer(self.memrefs["A"])), - # ctypes.pointer(ctypes.pointer(self.memrefs["B"])), - # ctypes.pointer(ctypes.pointer(self.memrefs["C_ref"])), + # func(memrefs_to_packed_args([ + # self.memrefs["A"], + # self.memrefs["B"], + # self.memrefs["C_ref"], # ])) # Check correctness with numpy. # C = ranked_memref_to_numpy([self.memrefs["C"]]) diff --git a/lighthouse/utils/runner.py b/lighthouse/utils/runner.py index 0895a9b..5046b3a 100644 --- a/lighthouse/utils/runner.py +++ b/lighthouse/utils/runner.py @@ -95,9 +95,8 @@ def execute( # get execution engine engine = get_engine(payload_module, requirements=workload.requirements()) - with workload.allocate(execution_engine=engine): + with workload.allocate_inputs(execution_engine=engine) as inputs: # prepare function arguments - inputs = workload.get_input_arrays(execution_engine=engine) pointers = [ctypes.pointer(ctypes.pointer(m)) for m in inputs] packed_args = get_packed_arg(pointers) @@ -195,8 +194,7 @@ def benchmark( requirements.append("mlir_c_runner") engine = get_engine(payload_module, requirements=requirements) - with workload.allocate(execution_engine=engine): - inputs = workload.get_input_arrays(execution_engine=engine) + with workload.allocate_inputs(execution_engine=engine) as inputs: pointers = [ctypes.pointer(ctypes.pointer(m)) for m in inputs] if check_correctness: # call payload once to verify correctness diff --git a/lighthouse/workload.py b/lighthouse/workload.py index 26fe8be..b7f0aa1 100644 --- a/lighthouse/workload.py +++ b/lighthouse/workload.py @@ -44,23 +44,21 @@ def schedule_module( pass @abstractmethod - def get_input_arrays(self, execution_engine) -> list: + @contextmanager + def allocate_inputs(self, execution_engine): """ - Return the input arrays for the payload function as memrefs. 
+ Context manager that allocates and returns payload input buffers. - Allocation and initialization of the input arrays should be done here. - """ - pass + Returns the payload input buffers as memrefs that can be directly + passed to the compiled payload function. - @contextmanager - def allocate(self, execution_engine): + On exit, frees any manually allocated memory (if any). """ - Allocate any necessary memory for the workload. - - Override this method if the workload requires memory management.""" try: + # Yield payload function input memrefs here. yield None finally: + # Manually deallocate memory here (if needed). pass @abstractmethod From f026282b2fa965a7a540112c3dfe2fa80edcaaac Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Tue, 2 Dec 2025 21:26:26 +0200 Subject: [PATCH 05/15] define helper functions with func.func decorator --- examples/workload/example_mlir.py | 69 ++++++++++++++++--------------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/examples/workload/example_mlir.py b/examples/workload/example_mlir.py index 5d1d6ed..428e61e 100644 --- a/examples/workload/example_mlir.py +++ b/examples/workload/example_mlir.py @@ -27,58 +27,60 @@ from example import ElementwiseSum -def emit_host_alloc(mod, suffix, element_type, rank=2): +def emit_host_alloc(suffix: str, element_type: ir.Type, rank: int = 2): dyn = ir.ShapedType.get_dynamic_size() memref_dyn_t = ir.MemRefType.get(rank * (dyn,), element_type) index_t = ir.IndexType.get() i32_t = ir.IntegerType.get_signless(32) - with ir.InsertionPoint(mod.body): - f = func.FuncOp("host_alloc_" + suffix, (rank * (i32_t,), (memref_dyn_t,))) - f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() - with ir.InsertionPoint(f.add_entry_block()): - dims = [arith.IndexCastOp(index_t, a) for a in list(f.arguments)] + inputs = rank * (i32_t,) + + @func.func(*inputs, name="host_alloc_" + suffix) + def alloc_func(*shape): + dims = [arith.index_cast(index_t, a) for a in shape] alloc = memref.alloc(memref_dyn_t, dims, []) - func.ReturnOp((alloc,)) + return alloc + + alloc_func.func_op.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() -def emit_host_dealloc(mod, suffix, element_type, rank=2): +def emit_host_dealloc(suffix: str, element_type: ir.Type, rank: int = 2): dyn = ir.ShapedType.get_dynamic_size() memref_dyn_t = ir.MemRefType.get(rank * (dyn,), element_type) - with ir.InsertionPoint(mod.body): - f = func.FuncOp("host_dealloc_" + suffix, ((memref_dyn_t,), ())) - f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() - with ir.InsertionPoint(f.add_entry_block()): - memref.dealloc(f.arguments[0]) - func.ReturnOp(()) + + @func.func(memref_dyn_t, name="host_dealloc_" + suffix) + def dealloc_func(buffer): + memref.dealloc(buffer) + + dealloc_func.func_op.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() -def emit_fill_constant(mod, suffix, value, element_type, rank=2): +def emit_fill_constant(suffix, value, element_type, rank=2): dyn = ir.ShapedType.get_dynamic_size() memref_dyn_t = ir.MemRefType.get(rank * (dyn,), element_type) - with ir.InsertionPoint(mod.body): - f = func.FuncOp("host_fill_constant_" + suffix, ((memref_dyn_t,), ())) - f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() - with ir.InsertionPoint(f.add_entry_block()): + + @func.func(memref_dyn_t, name="host_fill_constant_" + suffix) + def init_func(buffer): const = arith.constant(element_type, value) - linalg.fill(const, outs=[f.arguments[0]]) - func.ReturnOp(()) + linalg.fill(const, outs=[buffer]) + + 
init_func.func_op.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() -def emit_fill_random(mod, suffix, element_type, min=0.0, max=1.0, seed=2): +def emit_fill_random(suffix, element_type, min=0.0, max=1.0, seed=2): rank = 2 dyn = ir.ShapedType.get_dynamic_size() memref_dyn_t = ir.MemRefType.get(rank * (dyn,), element_type) i32_t = ir.IntegerType.get_signless(32) f64_t = ir.F64Type.get() - with ir.InsertionPoint(mod.body): - f = func.FuncOp("host_fill_random_" + suffix, ((memref_dyn_t,), ())) - f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() - with ir.InsertionPoint(f.add_entry_block()): + + @func.func(memref_dyn_t, name="host_fill_random_" + suffix) + def init_func(buffer): min_cst = arith.constant(f64_t, min) max_cst = arith.constant(f64_t, max) seed_cst = arith.constant(i32_t, seed) - linalg.fill_rng_2d(min_cst, max_cst, seed_cst, outs=[f.arguments[0]]) - func.ReturnOp(()) + linalg.fill_rng_2d(min_cst, max_cst, seed_cst, outs=[buffer]) + + init_func.func_op.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() class ElementwiseSumMLIRAlloc(ElementwiseSum): @@ -171,11 +173,12 @@ def check_correctness(self, execution_engine, verbose: int = 0) -> bool: def payload_module(self): mod = super().payload_module() # extend the payload module with de/alloc/fill functions - float32_t = ir.F32Type.get() - emit_host_alloc(mod, "f32", float32_t) - emit_host_dealloc(mod, "f32", float32_t) - emit_fill_constant(mod, "zero_f32", 0.0, float32_t) - emit_fill_random(mod, "f32", float32_t, min=-1.0, max=1.0) + with ir.InsertionPoint(mod.body): + float32_t = ir.F32Type.get() + emit_host_alloc("f32", float32_t) + emit_host_dealloc("f32", float32_t) + emit_fill_constant("zero_f32", 0.0, float32_t) + emit_fill_random("f32", float32_t, min=-1.0, max=1.0) return mod From ebcc6bbd32e5d70a2edfb699e7176834eadbb323 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Tue, 2 Dec 2025 21:47:02 +0200 Subject: [PATCH 06/15] remove apply_transform_schedule function --- lighthouse/utils/runner.py | 49 ++++++++++++-------------------------- 1 file changed, 15 insertions(+), 34 deletions(-) diff --git a/lighthouse/utils/runner.py b/lighthouse/utils/runner.py index 5046b3a..2561766 100644 --- a/lighthouse/utils/runner.py +++ b/lighthouse/utils/runner.py @@ -3,14 +3,13 @@ """ import numpy as np -import ctypes import os from mlir import ir from mlir.dialects import func, arith, scf, memref from mlir.execution_engine import ExecutionEngine from mlir.runtime.np_to_memref import get_ranked_memref_descriptor from lighthouse.utils.mlir import get_mlir_library_path -from lighthouse.utils import get_packed_arg +from lighthouse.utils import memrefs_to_packed_args from lighthouse import Workload from typing import Optional @@ -49,22 +48,6 @@ def get_engine(payload_module, requirements=None, opt_level=3) -> ExecutionEngin return execution_engine -def apply_transform_schedule( - payload_module, - schedule_module, - dump_kernel: Optional[str] = None, - dump_schedule: bool = False, -): - if not dump_kernel or dump_kernel != "initial": - # apply schedule on payload module - named_seq = schedule_module.body.operations[0] - named_seq.apply(payload_module) - if dump_kernel: - print(payload_module) - if dump_schedule: - print(schedule_module) - - def lower_payload( workload, dump_kernel: Optional[str] = None, @@ -75,12 +58,14 @@ def lower_payload( schedule_module = workload.schedule_module( dump_kernel=dump_kernel, parameters=schedule_parameters ) - apply_transform_schedule( - payload_module, - schedule_module, - 
dump_kernel=dump_kernel, - dump_schedule=dump_schedule, - ) + if not dump_kernel or dump_kernel != "initial": + # apply schedule on payload module + named_seq = schedule_module.body.operations[0] + named_seq.apply(payload_module) + if dump_kernel: + print(payload_module) + if dump_schedule: + print(schedule_module) return payload_module @@ -97,8 +82,7 @@ def execute( with workload.allocate_inputs(execution_engine=engine) as inputs: # prepare function arguments - pointers = [ctypes.pointer(ctypes.pointer(m)) for m in inputs] - packed_args = get_packed_arg(pointers) + packed_args = memrefs_to_packed_args(inputs) # handle to payload function payload_func = engine.lookup(workload.payload_function_name) @@ -184,10 +168,9 @@ def benchmark( emit_benchmark_function(payload_module, workload, nruns, nwarmup) # lower - apply_transform_schedule( - payload_module, - workload.schedule_module(parameters=schedule_parameters), - ) + schedule_module = workload.schedule_module(parameters=schedule_parameters) + schedule_module.body.operations[0].apply(payload_module) + # get execution engine, rtclock requires mlir_c_runner requirements = workload.requirements() if "mlir_c_runner" not in requirements: @@ -195,11 +178,10 @@ def benchmark( engine = get_engine(payload_module, requirements=requirements) with workload.allocate_inputs(execution_engine=engine) as inputs: - pointers = [ctypes.pointer(ctypes.pointer(m)) for m in inputs] if check_correctness: # call payload once to verify correctness # prepare function arguments - packed_args = get_packed_arg(pointers) + packed_args = memrefs_to_packed_args(inputs) payload_func = engine.lookup(workload.payload_function_name) payload_func(packed_args) @@ -212,8 +194,7 @@ def benchmark( # allocate buffer for timings and prepare arguments time_array = np.zeros((nruns,), dtype=np.float64) time_memref = get_ranked_memref_descriptor(time_array) - time_pointer = ctypes.pointer(ctypes.pointer(time_memref)) - packed_args_with_time = get_packed_arg(pointers + [time_pointer]) + packed_args_with_time = memrefs_to_packed_args(inputs + [time_memref]) # call benchmark function benchmark_func = engine.lookup("benchmark") From da9eb18c49e9f8b34a8860c1a4f9f023fc936f6c Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Wed, 3 Dec 2025 16:16:29 +0200 Subject: [PATCH 07/15] lower_payload function is a member of Workload --- examples/workload/example.py | 7 +++--- examples/workload/example_mlir.py | 3 +-- lighthouse/utils/runner.py | 23 +----------------- lighthouse/workload.py | 40 +++++++++++++++++++++++++++---- 4 files changed, 41 insertions(+), 32 deletions(-) diff --git a/examples/workload/example.py b/examples/workload/example.py index 21963da..d1f71a6 100644 --- a/examples/workload/example.py +++ b/examples/workload/example.py @@ -17,7 +17,6 @@ match, ) from lighthouse.utils.runner import ( - lower_payload, execute, benchmark, ) @@ -118,7 +117,7 @@ def payload(*args): return mod - def schedule_module(self, dump_kernel=None, parameters=None): + def schedule_module(self, stop_at_stage=None, parameters=None): schedule_module = ir.Module.create() schedule_module.operation.attributes["transform.with_named_sequence"] = ( ir.UnitAttr.get() @@ -144,7 +143,7 @@ def schedule_module(self, dump_kernel=None, parameters=None): transform.apply_cse(mod) canonicalize(mod) - if dump_kernel == "bufferized": + if stop_at_stage == "bufferized": transform.YieldOp() return schedule_module @@ -164,7 +163,7 @@ def schedule_module(self, dump_kernel=None, parameters=None): wload = ElementwiseSum(400, 400) 
print(" Dump kernel ".center(60, "-")) - lower_payload(wload, dump_kernel="bufferized", dump_schedule=True) + wload.lower_payload(dump_payload="bufferized", dump_schedule=True) print(" Execute 1 ".center(60, "-")) execute(wload, verbose=2) diff --git a/examples/workload/example_mlir.py b/examples/workload/example_mlir.py index 428e61e..bc65724 100644 --- a/examples/workload/example_mlir.py +++ b/examples/workload/example_mlir.py @@ -20,7 +20,6 @@ memref_to_ctype, ) from lighthouse.utils.runner import ( - lower_payload, execute, benchmark, ) @@ -187,7 +186,7 @@ def payload_module(self): wload = ElementwiseSumMLIRAlloc(400, 400) print(" Dump kernel ".center(60, "-")) - lower_payload(wload, dump_kernel="bufferized", dump_schedule=False) + wload.lower_payload(dump_payload="bufferized", dump_schedule=False) print(" Execute ".center(60, "-")) execute(wload, verbose=2) diff --git a/lighthouse/utils/runner.py b/lighthouse/utils/runner.py index 2561766..8879d6a 100644 --- a/lighthouse/utils/runner.py +++ b/lighthouse/utils/runner.py @@ -48,27 +48,6 @@ def get_engine(payload_module, requirements=None, opt_level=3) -> ExecutionEngin return execution_engine -def lower_payload( - workload, - dump_kernel: Optional[str] = None, - dump_schedule: bool = False, - schedule_parameters: Optional[dict] = None, -) -> ir.Module: - payload_module = workload.payload_module() - schedule_module = workload.schedule_module( - dump_kernel=dump_kernel, parameters=schedule_parameters - ) - if not dump_kernel or dump_kernel != "initial": - # apply schedule on payload module - named_seq = schedule_module.body.operations[0] - named_seq.apply(payload_module) - if dump_kernel: - print(payload_module) - if dump_schedule: - print(schedule_module) - return payload_module - - def execute( workload: Workload, check_correctness: bool = True, @@ -76,7 +55,7 @@ def execute( verbose: int = 0, ): # lower payload with schedule - payload_module = lower_payload(workload, schedule_parameters=schedule_parameters) + payload_module = workload.lower_payload(schedule_parameters=schedule_parameters) # get execution engine engine = get_engine(payload_module, requirements=workload.requirements()) diff --git a/lighthouse/workload.py b/lighthouse/workload.py index b7f0aa1..dc96831 100644 --- a/lighthouse/workload.py +++ b/lighthouse/workload.py @@ -16,7 +16,7 @@ class Workload(ABC): A workload is defined by a fixed payload function and problem size. Different realizations of the workload can be obtained by altering the - lowering schedule. + lowering schedule parameters. The MLIR payload function should take input arrays as memrefs and return nothing. @@ -37,12 +37,44 @@ def payload_module(self) -> ir.Module: @abstractmethod def schedule_module( self, - dump_kernel: Optional[str] = None, + stop_at_stage: Optional[str] = None, parameters: Optional[dict] = None, ) -> ir.Module: - """Generate the MLIR module containing the transform schedule.""" + """ + Generate the MLIR module containing the transform schedule. + + The `stop_at_stage` argument can be used to interrupt lowering at + a desired IR level for debugging purposes. + """ pass + def lower_payload( + self, + dump_payload: Optional[str] = None, + dump_schedule: bool = False, + schedule_parameters: Optional[dict] = None, + ) -> ir.Module: + """ + Apply transform schedule to the payload module. + + Optionally dumps the payload IR and/or transform schedule to stdout. + + Returns the lowered payload module. 
+ """ + payload_module = self.payload_module() + schedule_module = self.schedule_module( + stop_at_stage=dump_payload, parameters=schedule_parameters + ) + if not dump_payload or dump_payload != "initial": + # apply schedule on payload module + named_seq = schedule_module.body.operations[0] + named_seq.apply(payload_module) + if dump_payload: + print(payload_module) + if dump_schedule: + print(schedule_module) + return payload_module + @abstractmethod @contextmanager def allocate_inputs(self, execution_engine): @@ -67,7 +99,7 @@ def check_correctness(self, execution_engine, verbose: int = 0) -> bool: pass @abstractmethod - def get_complexity(self) -> list: + def get_complexity(self) -> tuple[int, int, int]: """ Return the computational complexity of the workload. From 802ded0784ec8eb3ff77e9f393ef8bc28214cbf2 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Wed, 3 Dec 2025 16:47:40 +0200 Subject: [PATCH 08/15] typing and mlir utils import --- examples/workload/example.py | 26 ++++++++++++++++---------- examples/workload/example_mlir.py | 31 +++++++++++++++++++++---------- lighthouse/utils/__init__.py | 3 +++ lighthouse/utils/mlir.py | 5 +++-- lighthouse/utils/runner.py | 4 +++- lighthouse/workload.py | 10 +++++++--- 6 files changed, 53 insertions(+), 26 deletions(-) diff --git a/examples/workload/example.py b/examples/workload/example.py index d1f71a6..03a0cd6 100644 --- a/examples/workload/example.py +++ b/examples/workload/example.py @@ -10,13 +10,15 @@ from mlir.execution_engine import ExecutionEngine from contextlib import contextmanager from functools import cached_property +import ctypes +from typing import Optional from lighthouse import Workload from lighthouse.utils.mlir import ( apply_registered_pass, canonicalize, match, ) -from lighthouse.utils.runner import ( +from lighthouse.utils import ( execute, benchmark, ) @@ -33,13 +35,13 @@ class ElementwiseSum(Workload): object so that they are only computed once. 
""" - def __init__(self, M, N): + def __init__(self, M: int, N: int): self.M = M self.N = N self.dtype = np.float32 @cached_property - def _input_arrays(self): + def _input_arrays(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: print(" * Generating input arrays...") np.random.seed(2) A = np.random.rand(self.M, self.N).astype(self.dtype) @@ -48,12 +50,12 @@ def _input_arrays(self): return [A, B, C] @cached_property - def _reference_solution(self): + def _reference_solution(self) -> np.ndarray: print(" * Computing reference solution...") A, B, _ = self._input_arrays return A + B - def _get_input_arrays(self): + def _get_input_arrays(self) -> list[ctypes.Structure]: return [get_ranked_memref_descriptor(a) for a in self._input_arrays] @contextmanager @@ -64,7 +66,9 @@ def allocate_inputs(self, execution_engine: ExecutionEngine): # cached numpy arrays are deallocated automatically pass - def check_correctness(self, execution_engine, verbose: int = 0) -> bool: + def check_correctness( + self, execution_engine: ExecutionEngine, verbose: int = 0 + ) -> bool: C = self._input_arrays[2] C_ref = self._reference_solution if verbose > 1: @@ -80,17 +84,17 @@ def check_correctness(self, execution_engine, verbose: int = 0) -> bool: print("FAILED Result mismatch!") return success - def requirements(self): + def requirements(self) -> list[str]: return [] - def get_complexity(self): + def get_complexity(self) -> tuple[int, int, int]: nbytes = np.dtype(self.dtype).itemsize flop_count = self.M * self.N # one addition per element memory_reads = 2 * self.M * self.N * nbytes # read A and B memory_writes = self.M * self.N * nbytes # write C return (flop_count, memory_reads, memory_writes) - def payload_module(self): + def payload_module(self) -> ir.Module: mod = ir.Module.create() with ir.InsertionPoint(mod.body): @@ -117,7 +121,9 @@ def payload(*args): return mod - def schedule_module(self, stop_at_stage=None, parameters=None): + def schedule_module( + self, stop_at_stage: Optional[str] = None, parameters: Optional[dict] = None + ) -> ir.Module: schedule_module = ir.Module.create() schedule_module.operation.attributes["transform.with_named_sequence"] = ( ir.UnitAttr.get() diff --git a/examples/workload/example_mlir.py b/examples/workload/example_mlir.py index bc65724..dc48b1c 100644 --- a/examples/workload/example_mlir.py +++ b/examples/workload/example_mlir.py @@ -12,14 +12,13 @@ as_ctype, ) from mlir.dialects import func, linalg, arith, memref +from mlir.execution_engine import ExecutionEngine import ctypes from contextlib import contextmanager from lighthouse.utils import ( get_packed_arg, memrefs_to_packed_args, memref_to_ctype, -) -from lighthouse.utils.runner import ( execute, benchmark, ) @@ -53,7 +52,7 @@ def dealloc_func(buffer): dealloc_func.func_op.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() -def emit_fill_constant(suffix, value, element_type, rank=2): +def emit_fill_constant(suffix: str, value: float, element_type: ir.Type, rank: int = 2): dyn = ir.ShapedType.get_dynamic_size() memref_dyn_t = ir.MemRefType.get(rank * (dyn,), element_type) @@ -65,7 +64,13 @@ def init_func(buffer): init_func.func_op.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() -def emit_fill_random(suffix, element_type, min=0.0, max=1.0, seed=2): +def emit_fill_random( + suffix: str, + element_type: ir.Type, + min: float = 0.0, + max: float = 1.0, + seed: int = 2, +): rank = 2 dyn = ir.ShapedType.get_dynamic_size() memref_dyn_t = ir.MemRefType.get(rank * (dyn,), element_type) @@ -89,12 +94,14 @@ class 
ElementwiseSumMLIRAlloc(ElementwiseSum): Extends ElementwiseSum by allocating input arrays in MLIR. """ - def __init__(self, M, N): + def __init__(self, M: int, N: int): super().__init__(M, N) # keep track of allocated memrefs self.memrefs = {} - def _allocate_array(self, name, execution_engine): + def _allocate_array( + self, name: str, execution_engine: ExecutionEngine + ) -> ctypes.Structure: if name in self.memrefs: return self.memrefs[name] alloc_func = execution_engine.lookup("host_alloc_f32") @@ -107,13 +114,15 @@ def _allocate_array(self, name, execution_engine): self.memrefs[name] = mref return mref - def _deallocate_all(self, execution_engine): + def _deallocate_all(self, execution_engine: ExecutionEngine): for mref in self.memrefs.values(): dealloc_func = execution_engine.lookup("host_dealloc_f32") dealloc_func(memrefs_to_packed_args([mref])) self.memrefs = {} - def get_input_arrays(self, execution_engine): + def get_input_arrays( + self, execution_engine: ExecutionEngine + ) -> list[ctypes.Structure]: A = self._allocate_array("A", execution_engine) B = self._allocate_array("B", execution_engine) C = self._allocate_array("C", execution_engine) @@ -128,13 +137,15 @@ def get_input_arrays(self, execution_engine): return [A, B, C] @contextmanager - def allocate_inputs(self, execution_engine): + def allocate_inputs(self, execution_engine: ExecutionEngine): try: yield self.get_input_arrays(execution_engine) finally: self._deallocate_all(execution_engine) - def check_correctness(self, execution_engine, verbose: int = 0) -> bool: + def check_correctness( + self, execution_engine: ExecutionEngine, verbose: int = 0 + ) -> bool: # compute reference solution with numpy A = ranked_memref_to_numpy([self.memrefs["A"]]) B = ranked_memref_to_numpy([self.memrefs["B"]]) diff --git a/lighthouse/utils/__init__.py b/lighthouse/utils/__init__.py index 474b748..738326b 100644 --- a/lighthouse/utils/__init__.py +++ b/lighthouse/utils/__init__.py @@ -8,8 +8,11 @@ torch_to_packed_args, mlir_type_to_torch_dtype, ) +from .runner import execute, benchmark __all__ = [ + "benchmark", + "execute", "get_packed_arg", "memref_to_ctype", "memrefs_to_packed_args", diff --git a/lighthouse/utils/mlir.py b/lighthouse/utils/mlir.py index bf6e248..e900b6a 100644 --- a/lighthouse/utils/mlir.py +++ b/lighthouse/utils/mlir.py @@ -22,12 +22,13 @@ def canonicalize(op): def get_mlir_library_path(): + """Return MLIR shared library path.""" pkg_path = ir.__file__ if "python_packages" in pkg_path: # looks like a local mlir install - path = pkg_path.split("python_packages")[0] + os.sep + "lib" + path = os.path.join(pkg_path.split("python_packages")[0], "lib") else: # maybe installed in python path - path = os.path.split(pkg_path)[0] + os.sep + "_mlir_libs" + path = os.path.join(os.path.split(pkg_path)[0], "_mlir_libs") assert os.path.isdir(path) return path diff --git a/lighthouse/utils/runner.py b/lighthouse/utils/runner.py index 8879d6a..699db2e 100644 --- a/lighthouse/utils/runner.py +++ b/lighthouse/utils/runner.py @@ -14,7 +14,9 @@ from typing import Optional -def get_engine(payload_module, requirements=None, opt_level=3) -> ExecutionEngine: +def get_engine( + payload_module: ir.Module, requirements: list[str] = None, opt_level: int = 3 +) -> ExecutionEngine: requirements = requirements or [] context = ir.Context() location = ir.Location.unknown(context) diff --git a/lighthouse/workload.py b/lighthouse/workload.py index dc96831..dbf6ca0 100644 --- a/lighthouse/workload.py +++ b/lighthouse/workload.py @@ -5,6 +5,7 @@ """ from 
mlir import ir +from mlir.execution_engine import ExecutionEngine from abc import ABC, abstractmethod from contextlib import contextmanager from typing import Optional @@ -57,7 +58,8 @@ def lower_payload( """ Apply transform schedule to the payload module. - Optionally dumps the payload IR and/or transform schedule to stdout. + Optionally dumps the payload IR at the desired level and/or the + transform schedule to stdout. Returns the lowered payload module. """ @@ -77,7 +79,7 @@ def lower_payload( @abstractmethod @contextmanager - def allocate_inputs(self, execution_engine): + def allocate_inputs(self, execution_engine: ExecutionEngine): """ Context manager that allocates and returns payload input buffers. @@ -94,7 +96,9 @@ def allocate_inputs(self, execution_engine): pass @abstractmethod - def check_correctness(self, execution_engine, verbose: int = 0) -> bool: + def check_correctness( + self, execution_engine: ExecutionEngine, verbose: int = 0 + ) -> bool: """Verify the correctness of the computation.""" pass From 84637482d5f789f9f475c149afdec6cb9b25d305 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Wed, 3 Dec 2025 21:35:50 +0200 Subject: [PATCH 09/15] rename workload requirements to shared_libs --- examples/workload/example.py | 2 +- examples/xegpu_matmul/matmul.py | 3 +++ lighthouse/utils/runner.py | 43 ++++++++++----------------------- lighthouse/workload.py | 4 +-- 4 files changed, 19 insertions(+), 33 deletions(-) diff --git a/examples/workload/example.py b/examples/workload/example.py index 03a0cd6..af800c4 100644 --- a/examples/workload/example.py +++ b/examples/workload/example.py @@ -84,7 +84,7 @@ def check_correctness( print("FAILED Result mismatch!") return success - def requirements(self) -> list[str]: + def shared_libs(self) -> list[str]: return [] def get_complexity(self) -> tuple[int, int, int]: diff --git a/examples/xegpu_matmul/matmul.py b/examples/xegpu_matmul/matmul.py index 32b397f..65a89d2 100644 --- a/examples/xegpu_matmul/matmul.py +++ b/examples/xegpu_matmul/matmul.py @@ -220,6 +220,9 @@ def schedule_module( params=parameters, ) + def shared_libs() -> list[str]: + return ["libmlir_levelzero_runtime.so"] + def parse_cli(): parser = argparse.ArgumentParser( diff --git a/lighthouse/utils/runner.py b/lighthouse/utils/runner.py index 699db2e..202aa2b 100644 --- a/lighthouse/utils/runner.py +++ b/lighthouse/utils/runner.py @@ -15,33 +15,15 @@ def get_engine( - payload_module: ir.Module, requirements: list[str] = None, opt_level: int = 3 + payload_module: ir.Module, shared_libs: list[str] = None, opt_level: int = 3 ) -> ExecutionEngine: - requirements = requirements or [] - context = ir.Context() - location = ir.Location.unknown(context) - required_libs = { - "levelzero": ( - ["libmlir_levelzero_runtime.so"], - "Did you compile LLVM with -DMLIR_ENABLE_LEVELZERO_RUNNER=1?", - ), - "mlir_runner": (["libmlir_runner_utils.so"], ""), - "mlir_c_runner": (["libmlir_c_runner_utils.so"], ""), - } + lib_dir = get_mlir_library_path() libs = [] - lib_dir = os.path.join(get_mlir_library_path()) - for r in requirements: - if r not in required_libs: - raise ValueError(f"Unknown execution engine requirement: {r}") - so_files, hint = required_libs[r] - for f in so_files: - so_path = os.path.join(lib_dir, f) - if not os.path.isfile(so_path): - msg = f"Could not find shared library {so_path}" - if hint: - msg += "\n" + hint - raise ValueError(msg) - libs.append(so_path) + for so_file in shared_libs or []: + so_path = os.path.join(lib_dir, so_file) + if not os.path.isfile(so_path): + 
raise ValueError(f"Could not find shared library {so_path}")
+        libs.append(so_path)
     with context, location:
         execution_engine = ExecutionEngine(
             payload_module, opt_level=opt_level, shared_libs=libs
@@ -59,7 +41,7 @@ def execute(
     # lower payload with schedule
     payload_module = workload.lower_payload(schedule_parameters=schedule_parameters)
     # get execution engine
-    engine = get_engine(payload_module, requirements=workload.requirements())
+    engine = get_engine(payload_module, shared_libs=workload.shared_libs())
 
     with workload.allocate_inputs(execution_engine=engine) as inputs:
         # prepare function arguments
@@ -153,10 +135,11 @@ def benchmark(
     schedule_module.body.operations[0].apply(payload_module)
 
     # get execution engine, rtclock requires mlir_c_runner
-    requirements = workload.requirements()
-    if "mlir_c_runner" not in requirements:
-        requirements.append("mlir_c_runner")
-    engine = get_engine(payload_module, requirements=requirements)
+    libs = workload.shared_libs()
+    c_runner_lib = "libmlir_c_runner_utils.so"
+    if c_runner_lib not in libs:
+        libs.append(c_runner_lib)
+    engine = get_engine(payload_module, shared_libs=libs)
 
     with workload.allocate_inputs(execution_engine=engine) as inputs:
         if check_correctness:
diff --git a/lighthouse/workload.py b/lighthouse/workload.py
index dbf6ca0..db13ff7 100644
--- a/lighthouse/workload.py
+++ b/lighthouse/workload.py
@@ -26,8 +26,8 @@ class Workload(ABC):
     payload_function_name: str = "payload"
 
     @abstractmethod
-    def requirements(self) -> list[str]:
-        """Return a list of requirements for the execution engine."""
+    def shared_libs(self) -> list[str]:
+        """Return a list of shared libraries required by the execution engine."""
         pass
 
     @abstractmethod

From 9214caa07a59c9003ed9cce1c928240d5589ecf8 Mon Sep 17 00:00:00 2001
From: Tuomas Karna
Date: Wed, 3 Dec 2025 21:37:29 +0200
Subject: [PATCH 10/15] get_engine: remove context

---
 lighthouse/utils/runner.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/lighthouse/utils/runner.py b/lighthouse/utils/runner.py
index 202aa2b..f203f14 100644
--- a/lighthouse/utils/runner.py
+++ b/lighthouse/utils/runner.py
@@ -24,11 +24,10 @@ def get_engine(
         if not os.path.isfile(so_path):
             raise ValueError(f"Could not find shared library {so_path}")
         libs.append(so_path)
-    with context, location:
-        execution_engine = ExecutionEngine(
-            payload_module, opt_level=opt_level, shared_libs=libs
-        )
-        execution_engine.initialize()
+    execution_engine = ExecutionEngine(
+        payload_module, opt_level=opt_level, shared_libs=libs
+    )
+    execution_engine.initialize()
     return execution_engine


From bf78876f722b4be8f47e4b3ace76a4f14b3f3b7a Mon Sep 17 00:00:00 2001
From: Tuomas Karna
Date: Thu, 4 Dec 2025 16:32:55 +0200
Subject: [PATCH 11/15] annotate examples for CI

---
 examples/workload/example.py      | 4 ++++
 examples/workload/example_mlir.py | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/examples/workload/example.py b/examples/workload/example.py
index af800c4..1c3ffbe 100644
--- a/examples/workload/example.py
+++ b/examples/workload/example.py
@@ -1,3 +1,7 @@
+# RUN: %PYTHON %s | FileCheck %s
+# CHECK: func.func @payload
+# CHECK: PASSED
+# CHECK: Throughput:
 """
 Workload example: Element-wise sum of two (M, N) float32 arrays on CPU.
""" diff --git a/examples/workload/example_mlir.py b/examples/workload/example_mlir.py index dc48b1c..a366863 100644 --- a/examples/workload/example_mlir.py +++ b/examples/workload/example_mlir.py @@ -1,3 +1,7 @@ +# RUN: %PYTHON %s | FileCheck %s +# CHECK: func.func @payload +# CHECK: PASSED +# CHECK: Throughput: """ Workload example: Element-wise sum of two (M, N) float32 arrays on CPU. From 513da56fb3311bc360f4272b9b22e53b2e0a6bf5 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Mon, 8 Dec 2025 19:22:06 +0200 Subject: [PATCH 12/15] move workload specific things to lighthouse/workload --- examples/workload/example.py | 4 ++-- examples/workload/example_mlir.py | 4 +++- lighthouse/__init__.py | 6 ------ lighthouse/utils/__init__.py | 3 --- lighthouse/{utils => workload}/runner.py | 2 +- lighthouse/{ => workload}/workload.py | 0 6 files changed, 6 insertions(+), 13 deletions(-) rename lighthouse/{utils => workload}/runner.py (99%) rename lighthouse/{ => workload}/workload.py (100%) diff --git a/examples/workload/example.py b/examples/workload/example.py index 1c3ffbe..23f0879 100644 --- a/examples/workload/example.py +++ b/examples/workload/example.py @@ -16,13 +16,13 @@ from functools import cached_property import ctypes from typing import Optional -from lighthouse import Workload from lighthouse.utils.mlir import ( apply_registered_pass, canonicalize, match, ) -from lighthouse.utils import ( +from lighthouse.workload import ( + Workload, execute, benchmark, ) diff --git a/examples/workload/example_mlir.py b/examples/workload/example_mlir.py index a366863..7d3211a 100644 --- a/examples/workload/example_mlir.py +++ b/examples/workload/example_mlir.py @@ -23,10 +23,12 @@ get_packed_arg, memrefs_to_packed_args, memref_to_ctype, +) +from example import ElementwiseSum +from lighthouse.workload import ( execute, benchmark, ) -from example import ElementwiseSum def emit_host_alloc(suffix: str, element_type: ir.Type, rank: int = 2): diff --git a/lighthouse/__init__.py b/lighthouse/__init__.py index d05b010..1ac008e 100644 --- a/lighthouse/__init__.py +++ b/lighthouse/__init__.py @@ -1,7 +1 @@ __version__ = "0.1.0a1" - -from .workload import Workload - -__all__ = [ - "Workload", -] diff --git a/lighthouse/utils/__init__.py b/lighthouse/utils/__init__.py index 738326b..474b748 100644 --- a/lighthouse/utils/__init__.py +++ b/lighthouse/utils/__init__.py @@ -8,11 +8,8 @@ torch_to_packed_args, mlir_type_to_torch_dtype, ) -from .runner import execute, benchmark __all__ = [ - "benchmark", - "execute", "get_packed_arg", "memref_to_ctype", "memrefs_to_packed_args", diff --git a/lighthouse/utils/runner.py b/lighthouse/workload/runner.py similarity index 99% rename from lighthouse/utils/runner.py rename to lighthouse/workload/runner.py index f203f14..4f2e9d9 100644 --- a/lighthouse/utils/runner.py +++ b/lighthouse/workload/runner.py @@ -10,7 +10,7 @@ from mlir.runtime.np_to_memref import get_ranked_memref_descriptor from lighthouse.utils.mlir import get_mlir_library_path from lighthouse.utils import memrefs_to_packed_args -from lighthouse import Workload +from lighthouse.workload import Workload from typing import Optional diff --git a/lighthouse/workload.py b/lighthouse/workload/workload.py similarity index 100% rename from lighthouse/workload.py rename to lighthouse/workload/workload.py From 9fa76d9a155388200973835dfb7cd015e26d9b0d Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Mon, 8 Dec 2025 19:39:35 +0200 Subject: [PATCH 13/15] nit comments --- examples/workload/example.py | 3 +-- 
lighthouse/workload/runner.py | 11 +++++------ lighthouse/workload/workload.py | 7 +------ 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/examples/workload/example.py b/examples/workload/example.py index 23f0879..3137dad 100644 --- a/examples/workload/example.py +++ b/examples/workload/example.py @@ -109,8 +109,7 @@ def payload_module(self) -> ir.Module: fargs = [memref_t, memref_t, memref_t] @func.func(*fargs, name=self.payload_function_name) - def payload(*args): - A, B, C = args + def payload(A, B, C): a_tensor = bufferization.to_tensor(tensor_t, A, restrict=True) b_tensor = bufferization.to_tensor(tensor_t, B, restrict=True) c_tensor = bufferization.to_tensor( diff --git a/lighthouse/workload/runner.py b/lighthouse/workload/runner.py index 4f2e9d9..ae5e07e 100644 --- a/lighthouse/workload/runner.py +++ b/lighthouse/workload/runner.py @@ -62,7 +62,7 @@ def execute( def emit_benchmark_function( payload_module: ir.Module, - workload: Workload, + payload_function_name: str, nruns: int, nwarmup: int, ): @@ -75,10 +75,7 @@ def emit_benchmark_function( # find original payload function payload_func = None for op in payload_module.operation.regions[0].blocks[0]: - if ( - isinstance(op, func.FuncOp) - and op.name.value == workload.payload_function_name - ): + if isinstance(op, func.FuncOp) and op.name.value == payload_function_name: payload_func = op break assert payload_func is not None, "Could not find payload function" @@ -127,7 +124,9 @@ def benchmark( payload_module = workload.payload_module() # add benchmark function with timing - emit_benchmark_function(payload_module, workload, nruns, nwarmup) + emit_benchmark_function( + payload_module, workload.payload_function_name, nruns, nwarmup + ) # lower schedule_module = workload.schedule_module(parameters=schedule_parameters) diff --git a/lighthouse/workload/workload.py b/lighthouse/workload/workload.py index db13ff7..cc2c4f5 100644 --- a/lighthouse/workload/workload.py +++ b/lighthouse/workload/workload.py @@ -88,12 +88,7 @@ def allocate_inputs(self, execution_engine: ExecutionEngine): On exit, frees any manually allocated memory (if any). """ - try: - # Yield payload function input memrefs here. - yield None - finally: - # Manually deallocate memory here (if needed). 
- pass + pass @abstractmethod def check_correctness( From 4454b097992e39bb8990433270f34a9df4537ad9 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Mon, 8 Dec 2025 19:43:04 +0200 Subject: [PATCH 14/15] add missing init file --- lighthouse/workload/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 lighthouse/workload/__init__.py diff --git a/lighthouse/workload/__init__.py b/lighthouse/workload/__init__.py new file mode 100644 index 0000000..4738604 --- /dev/null +++ b/lighthouse/workload/__init__.py @@ -0,0 +1,4 @@ +from .workload import Workload +from .runner import execute, benchmark + +__all__ = ["Workload", "benchmark", "execute"] From 27424a9ed85929b79938a782dd3e8e581266c948 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Mon, 8 Dec 2025 20:54:29 +0200 Subject: [PATCH 15/15] revert change to xegpu example --- examples/xegpu_matmul/matmul.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/xegpu_matmul/matmul.py b/examples/xegpu_matmul/matmul.py index 65a89d2..32b397f 100644 --- a/examples/xegpu_matmul/matmul.py +++ b/examples/xegpu_matmul/matmul.py @@ -220,9 +220,6 @@ def schedule_module( params=parameters, ) - def shared_libs() -> list[str]: - return ["libmlir_levelzero_runtime.so"] - def parse_cli(): parser = argparse.ArgumentParser(
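
Taken together, the series leaves the following user-facing flow. The sketch
below is illustrative rather than part of the patches: it assumes the
ElementwiseSum workload from examples/workload/example.py, and the
nruns/nwarmup keyword names are inferred from benchmark() in
lighthouse/workload/runner.py.

    from lighthouse.workload import execute, benchmark
    from example import ElementwiseSum  # sample workload from this series

    wload = ElementwiseSum(400, 400)

    # Dump the bufferized payload IR and the transform schedule without running.
    wload.lower_payload(dump_payload="bufferized", dump_schedule=True)

    # Compile via the transform schedule, run once, and verify against numpy.
    execute(wload, verbose=2)

    # Time the payload with the emitted benchmark loop; benchmark() appends
    # libmlir_c_runner_utils.so to the workload's shared_libs() for rtclock.
    benchmark(wload, nruns=100, nwarmup=10)  # argument values are illustrative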