From 91339a333eaf9ea828604deffe45dfbe01d7e041 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Tue, 18 Nov 2025 16:42:14 +0200 Subject: [PATCH 01/15] add workload obj, execution and mlir utils, and two workload examples --- examples/workload/example.py | 179 +++++++++++++++++++++++ examples/workload/example_mlir.py | 205 ++++++++++++++++++++++++++ lighthouse/__init__.py | 6 + lighthouse/utils/execution.py | 231 ++++++++++++++++++++++++++++++ lighthouse/utils/mlir.py | 37 +++++ lighthouse/workload.py | 79 ++++++++++ 6 files changed, 737 insertions(+) create mode 100644 examples/workload/example.py create mode 100644 examples/workload/example_mlir.py create mode 100644 lighthouse/utils/execution.py create mode 100644 lighthouse/utils/mlir.py create mode 100644 lighthouse/workload.py diff --git a/examples/workload/example.py b/examples/workload/example.py new file mode 100644 index 0000000..28bbd01 --- /dev/null +++ b/examples/workload/example.py @@ -0,0 +1,179 @@ +""" +Workload example: Element-wise sum of two (M, N) float32 arrays on CPU. +""" + +import numpy as np +from mlir import ir +from mlir.runtime.np_to_memref import get_ranked_memref_descriptor +from mlir.dialects import func, linalg, bufferization +from mlir.dialects import transform +from functools import cached_property +from lighthouse import Workload +from lighthouse.utils.mlir import ( + apply_registered_pass, + canonicalize, + cse, + match, +) +from lighthouse.utils.execution import ( + lower_payload, + execute, + benchmark, +) + + +class ElementwiseSum(Workload): + """ + Computes element-wise sum of (M, N) float32 arrays on CPU. + + We can construct the input arrays and compute the reference solution in + Python with Numpy. + + We use @cached_property to store the inputs and reference solution in the + object so that they are only computed once. 
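+ The __main__ block at the end of this file shows the intended usage: dump the lowered kernel, execute with verification, and benchmark the workload.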
+ """ + + def __init__(self, M, N): + self.M = M + self.N = N + self.dtype = np.float32 + self.context = ir.Context() + self.location = ir.Location.unknown(context=self.context) + + @cached_property + def _input_arrays(self): + print(" * Generating input arrays...") + np.random.seed(2) + A = np.random.rand(self.M, self.N).astype(self.dtype) + B = np.random.rand(self.M, self.N).astype(self.dtype) + C = np.zeros((self.M, self.N), dtype=self.dtype) + return [A, B, C] + + @cached_property + def _reference_solution(self): + print(" * Computing reference solution...") + A, B, _ = self._input_arrays + return A + B + + def get_input_arrays(self, execution_engine): + return [get_ranked_memref_descriptor(a) for a in self._input_arrays] + + def verify(self, execution_engine, verbose: int = 0) -> bool: + C = self._input_arrays[2] + C_ref = self._reference_solution + if verbose > 1: + print("Reference solution:") + print(C_ref) + print("Computed solution:") + print(C) + success = np.allclose(C, C_ref) + if verbose: + if success: + print("PASSED") + else: + print("FAILED Result mismatch!") + return success + + def requirements(self): + return [] + + def get_complexity(self): + nbytes = np.dtype(self.dtype).itemsize + flop_count = self.M * self.N # one addition per element + memory_reads = 2 * self.M * self.N * nbytes # read A and B + memory_writes = self.M * self.N * nbytes # write C + return (flop_count, memory_reads, memory_writes) + + def payload_module(self): + with self.context, self.location: + float32_t = ir.F32Type.get() + shape = (self.M, self.N) + tensor_t = ir.RankedTensorType.get(shape, float32_t) + memref_t = ir.MemRefType.get(shape, float32_t) + mod = ir.Module.create() + with ir.InsertionPoint(mod.body): + args = [memref_t, memref_t, memref_t] + f = func.FuncOp(self.payload_function_name, (tuple(args), ())) + f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() + with ir.InsertionPoint(f.add_entry_block()): + A = f.arguments[0] + B = f.arguments[1] + C = f.arguments[2] + a_tensor = bufferization.ToTensorOp(tensor_t, A, restrict=True) + b_tensor = bufferization.ToTensorOp(tensor_t, B, restrict=True) + c_tensor = bufferization.ToTensorOp( + tensor_t, C, restrict=True, writable=True + ) + add = linalg.add(a_tensor, b_tensor, outs=[c_tensor]) + bufferization.MaterializeInDestinationOp( + None, add, C, restrict=True, writable=True + ) + func.ReturnOp(()) + return mod + + def schedule_module(self, dump_kernel=None, parameters=None): + with self.context, self.location: + schedule_module = ir.Module.create() + schedule_module.operation.attributes["transform.with_named_sequence"] = ( + ir.UnitAttr.get() + ) + with ir.InsertionPoint(schedule_module.body): + named_sequence = transform.NamedSequenceOp( + "__transform_main", + [transform.AnyOpType.get()], + [], + arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}], + ) + with ir.InsertionPoint(named_sequence.body): + anytype = transform.AnyOpType.get() + func = match(named_sequence.bodyTarget, ops={"func.func"}) + mod = transform.get_parent_op( + anytype, + func, + op_name="builtin.module", + deduplicate=True, + ) + mod = apply_registered_pass(mod, "one-shot-bufferize") + mod = apply_registered_pass(mod, "convert-linalg-to-loops") + cse(mod) + canonicalize(mod) + + if dump_kernel == "bufferized": + transform.YieldOp() + return schedule_module + + mod = apply_registered_pass(mod, "convert-scf-to-cf") + mod = apply_registered_pass(mod, "finalize-memref-to-llvm") + mod = apply_registered_pass(mod, "convert-cf-to-llvm") + mod = 
apply_registered_pass(mod, "convert-arith-to-llvm") + mod = apply_registered_pass(mod, "convert-func-to-llvm") + mod = apply_registered_pass(mod, "reconcile-unrealized-casts") + transform.YieldOp() + + return schedule_module + + +if __name__ == "__main__": + wload = ElementwiseSum(400, 400) + + print(" Dump kernel ".center(60, "-")) + lower_payload(wload, dump_kernel="bufferized", dump_schedule=True) + + print(" Execute 1 ".center(60, "-")) + execute(wload, verbose=2) + + print(" Execute 2 ".center(60, "-")) + execute(wload, verbose=1) + + print(" Benchmark ".center(60, "-")) + times = benchmark(wload) + times *= 1e6 # convert to microseconds + # compute statistics + mean = np.mean(times) + min = np.min(times) + max = np.max(times) + std = np.std(times) + print(f"Timings (us): mean={mean:.2f}+/-{std:.2f} min={min:.2f} max={max:.2f}") + flop_count = wload.get_complexity()[0] + gflops = flop_count / (mean * 1e-6) / 1e9 + print(f"Throughput: {gflops:.2f} GFLOPS") diff --git a/examples/workload/example_mlir.py b/examples/workload/example_mlir.py new file mode 100644 index 0000000..a539a3a --- /dev/null +++ b/examples/workload/example_mlir.py @@ -0,0 +1,205 @@ +""" +Workload example: Element-wise sum of two (M, N) float32 arrays on CPU. + +In this example, allocation and deallocation of input arrays is done in MLIR. +""" + +import numpy as np +from mlir import ir +from mlir.runtime.np_to_memref import ( + ranked_memref_to_numpy, + make_nd_memref_descriptor, + as_ctype, +) +from mlir.dialects import func, linalg, arith, memref +import ctypes +from contextlib import contextmanager +from lighthouse.utils import get_packed_arg +from lighthouse.utils.execution import ( + lower_payload, + execute, + benchmark, +) +from example import ElementwiseSum + + +def emit_host_alloc(mod, suffix, element_type, rank=2): + dyn = ir.ShapedType.get_dynamic_size() + memref_dyn_t = ir.MemRefType.get(rank * (dyn,), element_type) + index_t = ir.IndexType.get() + i32_t = ir.IntegerType.get_signless(32) + with ir.InsertionPoint(mod.body): + f = func.FuncOp("host_alloc_" + suffix, (rank * (i32_t,), (memref_dyn_t,))) + f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() + with ir.InsertionPoint(f.add_entry_block()): + dims = [arith.IndexCastOp(index_t, a) for a in list(f.arguments)] + alloc = memref.alloc(memref_dyn_t, dims, []) + func.ReturnOp((alloc,)) + + +def emit_host_dealloc(mod, suffix, element_type, rank=2): + dyn = ir.ShapedType.get_dynamic_size() + memref_dyn_t = ir.MemRefType.get(rank * (dyn,), element_type) + with ir.InsertionPoint(mod.body): + f = func.FuncOp("host_dealloc_" + suffix, ((memref_dyn_t,), ())) + f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() + with ir.InsertionPoint(f.add_entry_block()): + memref.dealloc(f.arguments[0]) + func.ReturnOp(()) + + +def emit_fill_constant(mod, suffix, value, element_type, rank=2): + dyn = ir.ShapedType.get_dynamic_size() + memref_dyn_t = ir.MemRefType.get(rank * (dyn,), element_type) + with ir.InsertionPoint(mod.body): + f = func.FuncOp("host_fill_constant_" + suffix, ((memref_dyn_t,), ())) + f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() + with ir.InsertionPoint(f.add_entry_block()): + const = arith.constant(element_type, value) + linalg.fill(const, outs=[f.arguments[0]]) + func.ReturnOp(()) + + +def emit_fill_random(mod, suffix, element_type, min=0.0, max=1.0, seed=2): + rank = 2 + dyn = ir.ShapedType.get_dynamic_size() + memref_dyn_t = ir.MemRefType.get(rank * (dyn,), element_type) + i32_t = ir.IntegerType.get_signless(32) + f64_t = 
ir.F64Type.get() + with ir.InsertionPoint(mod.body): + f = func.FuncOp("host_fill_random_" + suffix, ((memref_dyn_t,), ())) + f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() + with ir.InsertionPoint(f.add_entry_block()): + min_cst = arith.constant(f64_t, min) + max_cst = arith.constant(f64_t, max) + seed_cst = arith.constant(i32_t, seed) + linalg.fill_rng_2d(min_cst, max_cst, seed_cst, outs=[f.arguments[0]]) + func.ReturnOp(()) + + +class ElementwiseSumMLIRAlloc(ElementwiseSum): + """ + Computes element-wise sum of (M, N) float32 arrays on CPU. + + Extends ElementwiseSum by allocating input arrays in MLIR. + """ + + def __init__(self, M, N): + super().__init__(M, N) + # keep track of allocated memrefs + self.memrefs = {} + + def _allocate_array(self, name, execution_engine): + if name in self.memrefs: + return self.memrefs[name] + alloc_func = execution_engine.lookup("host_alloc_f32") + shape = (self.M, self.N) + mref = make_nd_memref_descriptor(len(shape), as_ctype(self.dtype))() + ptr_mref = ctypes.pointer(ctypes.pointer(mref)) + ptr_dims = [ctypes.pointer(ctypes.c_int32(d)) for d in shape] + alloc_func(get_packed_arg([ptr_mref, *ptr_dims])) + self.memrefs[name] = mref + return mref + + def _allocate_inputs(self, execution_engine): + self._allocate_array("A", execution_engine) + self._allocate_array("B", execution_engine) + self._allocate_array("C", execution_engine) + + def _deallocate_all(self, execution_engine): + for mref in self.memrefs.values(): + dealloc_func = execution_engine.lookup("host_dealloc_f32") + ptr_mref = ctypes.pointer(ctypes.pointer(mref)) + dealloc_func(get_packed_arg([ptr_mref])) + self.memrefs = {} + + @contextmanager + def allocate(self, execution_engine): + try: + self._allocate_inputs(execution_engine) + yield None + finally: + self._deallocate_all(execution_engine) + + def get_input_arrays(self, execution_engine): + A = self._allocate_array("A", execution_engine) + B = self._allocate_array("B", execution_engine) + C = self._allocate_array("C", execution_engine) + + # initialize with MLIR + fill_zero_func = execution_engine.lookup("host_fill_constant_zero_f32") + fill_random_func = execution_engine.lookup("host_fill_random_f32") + fill_zero_func(get_packed_arg([ctypes.pointer(ctypes.pointer(C))])) + fill_random_func(get_packed_arg([ctypes.pointer(ctypes.pointer(A))])) + fill_random_func(get_packed_arg([ctypes.pointer(ctypes.pointer(B))])) + + return [A, B, C] + + def verify(self, execution_engine, verbose: int = 0) -> bool: + # compute reference solution with numpy + A = ranked_memref_to_numpy([self.memrefs["A"]]) + B = ranked_memref_to_numpy([self.memrefs["B"]]) + C = ranked_memref_to_numpy([self.memrefs["C"]]) + C_ref = A + B + if verbose > 1: + print("Reference solution:") + print(C_ref) + print("Computed solution:") + print(C) + success = np.allclose(C, C_ref) + + # Alternatively we could have done the verification in MLIR by emitting + # a check function. + # Here we just call the payload function again. + # self._allocate_array("C_ref", execution_engine) + # func = execution_engine.lookup("payload") + # func(get_packed_arg([ + # ctypes.pointer(ctypes.pointer(self.memrefs["A"])), + # ctypes.pointer(ctypes.pointer(self.memrefs["B"])), + # ctypes.pointer(ctypes.pointer(self.memrefs["C_ref"])), + # ])) + # Check correctness with numpy. 
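+ # Note that comparing two runs of the same payload only checks reproducibility.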
+ # C = ranked_memref_to_numpy([self.memrefs["C"]]) + # C_ref = ranked_memref_to_numpy([self.memrefs["C_ref"]]) + # success = np.allclose(C, C_ref) + + if verbose: + if success: + print("PASSED") + else: + print("FAILED Result mismatch!") + return success + + def payload_module(self): + mod = super().payload_module() + # extend the payload module with de/alloc/fill functions + with self.context, self.location: + float32_t = ir.F32Type.get() + emit_host_alloc(mod, "f32", float32_t) + emit_host_dealloc(mod, "f32", float32_t) + emit_fill_constant(mod, "zero_f32", 0.0, float32_t) + emit_fill_random(mod, "f32", float32_t, min=-1.0, max=1.0) + return mod + + +if __name__ == "__main__": + wload = ElementwiseSumMLIRAlloc(400, 400) + + print(" Dump kernel ".center(60, "-")) + lower_payload(wload, dump_kernel="bufferized", dump_schedule=False) + + print(" Execute ".center(60, "-")) + execute(wload, verbose=2) + + print(" Benchmark ".center(60, "-")) + times = benchmark(wload) + times *= 1e6 # convert to microseconds + # compute statistics + mean = np.mean(times) + min = np.min(times) + max = np.max(times) + std = np.std(times) + print(f"Timings (us): mean={mean:.2f}+/-{std:.2f} min={min:.2f} max={max:.2f}") + flop_count = wload.get_complexity()[0] + gflops = flop_count / (mean * 1e-6) / 1e9 + print(f"Throughput: {gflops:.2f} GFLOPS") diff --git a/lighthouse/__init__.py b/lighthouse/__init__.py index 1ac008e..d05b010 100644 --- a/lighthouse/__init__.py +++ b/lighthouse/__init__.py @@ -1 +1,7 @@ __version__ = "0.1.0a1" + +from .workload import Workload + +__all__ = [ + "Workload", +] diff --git a/lighthouse/utils/execution.py b/lighthouse/utils/execution.py new file mode 100644 index 0000000..e4c04a4 --- /dev/null +++ b/lighthouse/utils/execution.py @@ -0,0 +1,231 @@ +""" +Execution engine utility functions. 
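+Provides helpers to build an execution engine, lower a payload module via its transform schedule, and execute or benchmark a workload.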
+""" + +import numpy as np +import ctypes +import os +from mlir import ir +from mlir.dialects.transform import interpreter as transform_interpreter +from mlir.dialects import func, arith, scf, memref +from mlir.execution_engine import ExecutionEngine +from mlir.runtime.np_to_memref import get_ranked_memref_descriptor +from lighthouse.utils.mlir import get_mlir_library_path +from lighthouse.utils import get_packed_arg +from lighthouse import Workload +from typing import Optional + + +def get_engine(payload_module, requirements=None, opt_level=3) -> ExecutionEngine: + requirements = requirements or [] + context = ir.Context() + location = ir.Location.unknown(context) + required_libs = { + "levelzero": ( + ["libmlir_levelzero_runtime.so"], + "Did you compile LLVM with -DMLIR_ENABLE_LEVELZERO_RUNNER=1?", + ), + "mlir_runner": (["libmlir_runner_utils.so"], ""), + "mlir_c_runner": (["libmlir_c_runner_utils.so"], ""), + } + libs = [] + lib_dir = os.path.join(get_mlir_library_path()) + for r in requirements: + if r not in required_libs: + raise ValueError(f"Unknown execution engine requirement: {r}") + so_files, hint = required_libs[r] + for f in so_files: + so_path = os.path.join(lib_dir, f) + if not os.path.isfile(so_path): + msg = f"Could not find shared library {so_path}" + if hint: + msg += "\n" + hint + raise ValueError(msg) + libs.append(so_path) + with context, location: + execution_engine = ExecutionEngine( + payload_module, opt_level=opt_level, shared_libs=libs + ) + execution_engine.initialize() + return execution_engine + + +def apply_transform_schedule( + payload_module, + schedule_module, + context, + location, + dump_kernel: Optional[str] = None, + dump_schedule: bool = False, +): + if not dump_kernel or dump_kernel != "initial": + with context, location: + # invoke transform interpreter directly + transform_interpreter.apply_named_sequence( + payload_root=payload_module, + transform_root=schedule_module.body.operations[0], + transform_module=schedule_module, + ) + if dump_kernel: + print(payload_module) + if dump_schedule: + print(schedule_module) + + +def lower_payload( + workload, + dump_kernel: Optional[str] = None, + dump_schedule: bool = False, + schedule_parameters: Optional[dict] = None, +) -> ir.Module: + payload_module = workload.payload_module() + schedule_module = workload.schedule_module( + dump_kernel=dump_kernel, parameters=schedule_parameters + ) + apply_transform_schedule( + payload_module, + schedule_module, + workload.context, + workload.location, + dump_kernel=dump_kernel, + dump_schedule=dump_schedule, + ) + return payload_module + + +def execute( + workload, + check_correctness: bool = True, + schedule_parameters: Optional[dict] = None, + verbose: int = 0, +): + # lower payload with schedule + payload_module = lower_payload(workload, schedule_parameters=schedule_parameters) + # get execution engine + engine = get_engine(payload_module, requirements=workload.requirements()) + + with workload.allocate(execution_engine=engine): + # prepare function arguments + inputs = workload.get_input_arrays(execution_engine=engine) + pointers = [ctypes.pointer(ctypes.pointer(m)) for m in inputs] + packed_args = get_packed_arg(pointers) + + # handle to payload function + payload_func = engine.lookup(workload.payload_function_name) + + # call function + payload_func(packed_args) + + if check_correctness: + success = workload.verify(execution_engine=engine, verbose=verbose) + if not success: + raise ValueError("Benchmark verification failed.") + + +def emit_benchmark_function( + 
payload_module: ir.Module, workload: Workload, nruns: int, nwarmup: int +): + """ + Emit a benchmark function that calls payload function and times it. + + Every function call is timed separately. Returns the times (seconds) in a + memref. + """ + # find original payload function + payload_func = None + for op in payload_module.operation.regions[0].blocks[0]: + if ( + isinstance(op, func.FuncOp) + and str(op.name).strip('"') == workload.payload_function_name + ): + payload_func = op + break + assert payload_func is not None, "Could not find payload function" + payload_arguments = payload_func.type.inputs + # emit benchmark function + with workload.context, workload.location: + with ir.InsertionPoint(payload_module.body): + # define rtclock function + f64_t = ir.F64Type.get() + f = func.FuncOp("rtclock", ((), (f64_t,)), visibility="private") + # emit new function + time_memref_t = ir.MemRefType.get((nruns,), f64_t) + args = payload_arguments + [time_memref_t] + f = func.FuncOp("benchmark", (tuple(args), ())) + f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() + with ir.InsertionPoint(f.add_entry_block()): + index_t = ir.IndexType.get() + zero = arith.ConstantOp(index_t, 0) + one = arith.ConstantOp(index_t, 1) + # call payload for warmup runs + nwarmup_cst = arith.ConstantOp(index_t, nwarmup) + for_op = scf.ForOp(zero, nwarmup_cst, one) + with ir.InsertionPoint(for_op.body): + func.CallOp(payload_func, list(f.arguments[: len(payload_arguments)])) + scf.YieldOp(()) + # call payload for benchmark runs, time every call separately + nruns_cst = arith.ConstantOp(index_t, nruns) + for_op = scf.ForOp(zero, nruns_cst, one) + i = for_op.induction_variable + with ir.InsertionPoint(for_op.body): + tic = func.CallOp((f64_t,), "rtclock", ()).result + func.CallOp(payload_func, list(f.arguments[: len(payload_arguments)])) + toc = func.CallOp((f64_t,), "rtclock", ()).result + time = arith.SubFOp(toc, tic) + memref.StoreOp(time, f.arguments[-1], [i]) + scf.YieldOp(()) + func.ReturnOp(()) + + +def benchmark( + workload, + nruns: int = 100, + nwarmup: int = 10, + schedule_parameters: Optional[dict] = None, + check_correctness: bool = True, + verbose: int = 0, +) -> np.ndarray: + # get original payload module + payload_module = workload.payload_module() + + # add benchmark function with timing + emit_benchmark_function(payload_module, workload, nruns, nwarmup) + + # lower + apply_transform_schedule( + payload_module, + workload.schedule_module(parameters=schedule_parameters), + workload.context, + workload.location, + ) + # get execution engine, rtclock requires mlir_c_runner + requirements = workload.requirements() + if "mlir_c_runner" not in requirements: + requirements.append("mlir_c_runner") + engine = get_engine(payload_module, requirements=requirements) + + with workload.allocate(execution_engine=engine): + inputs = workload.get_input_arrays(execution_engine=engine) + pointers = [ctypes.pointer(ctypes.pointer(m)) for m in inputs] + if check_correctness: + # call payload once to verify correctness + # prepare function arguments + packed_args = get_packed_arg(pointers) + + payload_func = engine.lookup(workload.payload_function_name) + payload_func(packed_args) + success = workload.verify(execution_engine=engine, verbose=verbose) + if not success: + raise ValueError("Benchmark verification failed.") + + # allocate buffer for timings and prepare arguments + time_array = np.zeros((nruns,), dtype=np.float64) + time_memref = get_ranked_memref_descriptor(time_array) + time_pointer = 
ctypes.pointer(ctypes.pointer(time_memref)) + packed_args_with_time = get_packed_arg(pointers + [time_pointer]) + + # call benchmark function + benchmark_func = engine.lookup("benchmark") + benchmark_func(packed_args_with_time) + + return time_array diff --git a/lighthouse/utils/mlir.py b/lighthouse/utils/mlir.py new file mode 100644 index 0000000..f32d243 --- /dev/null +++ b/lighthouse/utils/mlir.py @@ -0,0 +1,37 @@ +""" +MLIR utility functions. +""" + +from mlir import ir +from mlir.dialects import transform +from mlir.dialects.transform import structured +import os + + +def apply_registered_pass(*args, **kwargs): + return transform.apply_registered_pass(transform.AnyOpType.get(), *args, **kwargs) + + +def match(*args, **kwargs): + return structured.MatchOp(transform.AnyOpType.get(), *args, **kwargs) + + +def cse(op): + transform.ApplyCommonSubexpressionEliminationOp(op) + + +def canonicalize(op): + with ir.InsertionPoint(transform.ApplyPatternsOp(op).patterns): + transform.ApplyCanonicalizationPatternsOp() + + +def get_mlir_library_path(): + pkg_path = ir.__file__ + if "python_packages" in pkg_path: + # looks like a local mlir install + path = pkg_path.split("python_packages")[0] + os.sep + "lib" + else: + # maybe installed in python path + path = os.path.split(pkg_path)[0] + os.sep + "_mlir_libs" + assert os.path.isdir(path) + return path diff --git a/lighthouse/workload.py b/lighthouse/workload.py new file mode 100644 index 0000000..83e2918 --- /dev/null +++ b/lighthouse/workload.py @@ -0,0 +1,79 @@ +""" +Abstract base class for workloads. + +Defines the expected interface for generic workload execution methods. +""" + +from mlir import ir +from abc import ABC, abstractmethod +from contextlib import contextmanager +from typing import Optional + + +class Workload(ABC): + """ + Abstract base class for workloads. + + A workload is defined by a fixed payload function and problem size. + Different realizations of the workload can be obtained by altering the + lowering schedule. + + The MLIR payload function should take input arrays as memrefs and return + nothing. + """ + + payload_function_name: str = "payload" + + @abstractmethod + def requirements(self) -> list[str]: + """Return a list of requirements for the execution engine.""" + pass + + @abstractmethod + def payload_module(self) -> ir.Module: + """Generate the MLIR module containing the payload function.""" + pass + + @abstractmethod + def schedule_module( + self, + dump_kernel: Optional[str] = None, + parameters: Optional[dict] = None, + ) -> ir.Module: + """Generate the MLIR module containing the transform schedule.""" + pass + + @abstractmethod + def get_input_arrays(self, execution_engine) -> list: + """ + Return the input arrays for the payload function as memrefs. + + Allocation and initialization of the input arrays should be done here. + """ + pass + + @contextmanager + def allocate(self, execution_engine): + """ + Allocate any necessary memory for the workload. + + Override this method if the workload requires memory management.""" + try: + yield None + finally: + pass + + @abstractmethod + def verify(self, execution_engine, verbose: int = 0) -> bool: + """Verify the correctness of the computation.""" + pass + + @abstractmethod + def get_complexity(self) -> list: + """ + Return the computational complexity of the workload. + + Returns a tuple (flop_count, memory_reads, memory_writes). Memory + reads/writes are in bytes. 
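+ For example, the element-wise sum of two (M, N) float32 arrays costs M*N flops, reads 2*M*N*4 bytes and writes M*N*4 bytes.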
+ """ + pass From 01c8007d63b330d16d60abced5b5d0bfc4fe484f Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Tue, 2 Dec 2025 18:41:04 +0200 Subject: [PATCH 02/15] clean up context and other fixes --- examples/workload/example.py | 149 +++++++++++++++--------------- examples/workload/example_mlir.py | 54 +++++------ lighthouse/utils/execution.py | 95 +++++++++---------- lighthouse/utils/mlir.py | 10 +- lighthouse/workload.py | 2 +- 5 files changed, 148 insertions(+), 162 deletions(-) diff --git a/examples/workload/example.py b/examples/workload/example.py index 28bbd01..403f27c 100644 --- a/examples/workload/example.py +++ b/examples/workload/example.py @@ -12,7 +12,6 @@ from lighthouse.utils.mlir import ( apply_registered_pass, canonicalize, - cse, match, ) from lighthouse.utils.execution import ( @@ -37,8 +36,6 @@ def __init__(self, M, N): self.M = M self.N = N self.dtype = np.float32 - self.context = ir.Context() - self.location = ir.Location.unknown(context=self.context) @cached_property def _input_arrays(self): @@ -58,7 +55,7 @@ def _reference_solution(self): def get_input_arrays(self, execution_engine): return [get_ranked_memref_descriptor(a) for a in self._input_arrays] - def verify(self, execution_engine, verbose: int = 0) -> bool: + def check_correctness(self, execution_engine, verbose: int = 0) -> bool: C = self._input_arrays[2] C_ref = self._reference_solution if verbose > 1: @@ -85,95 +82,95 @@ def get_complexity(self): return (flop_count, memory_reads, memory_writes) def payload_module(self): - with self.context, self.location: + mod = ir.Module.create() + + with ir.InsertionPoint(mod.body): float32_t = ir.F32Type.get() shape = (self.M, self.N) tensor_t = ir.RankedTensorType.get(shape, float32_t) memref_t = ir.MemRefType.get(shape, float32_t) - mod = ir.Module.create() - with ir.InsertionPoint(mod.body): - args = [memref_t, memref_t, memref_t] - f = func.FuncOp(self.payload_function_name, (tuple(args), ())) - f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() - with ir.InsertionPoint(f.add_entry_block()): - A = f.arguments[0] - B = f.arguments[1] - C = f.arguments[2] - a_tensor = bufferization.ToTensorOp(tensor_t, A, restrict=True) - b_tensor = bufferization.ToTensorOp(tensor_t, B, restrict=True) - c_tensor = bufferization.ToTensorOp( + fargs = [memref_t, memref_t, memref_t] + + @func.func(*fargs, name=self.payload_function_name) + def payload(*args): + A, B, C = args + a_tensor = bufferization.to_tensor(tensor_t, A, restrict=True) + b_tensor = bufferization.to_tensor(tensor_t, B, restrict=True) + c_tensor = bufferization.to_tensor( tensor_t, C, restrict=True, writable=True ) add = linalg.add(a_tensor, b_tensor, outs=[c_tensor]) - bufferization.MaterializeInDestinationOp( + bufferization.materialize_in_destination( None, add, C, restrict=True, writable=True ) - func.ReturnOp(()) + + payload.func_op.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() + return mod def schedule_module(self, dump_kernel=None, parameters=None): - with self.context, self.location: - schedule_module = ir.Module.create() - schedule_module.operation.attributes["transform.with_named_sequence"] = ( - ir.UnitAttr.get() + schedule_module = ir.Module.create() + schedule_module.operation.attributes["transform.with_named_sequence"] = ( + ir.UnitAttr.get() + ) + with ir.InsertionPoint(schedule_module.body): + named_sequence = transform.named_sequence( + "__transform_main", + [transform.AnyOpType.get()], + [], + arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}], ) - with 
ir.InsertionPoint(schedule_module.body): - named_sequence = transform.NamedSequenceOp( - "__transform_main", - [transform.AnyOpType.get()], - [], - arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}], + with ir.InsertionPoint(named_sequence.body): + anytype = transform.AnyOpType.get() + func = match(named_sequence.bodyTarget, ops={"func.func"}) + mod = transform.get_parent_op( + anytype, + func, + op_name="builtin.module", + deduplicate=True, ) - with ir.InsertionPoint(named_sequence.body): - anytype = transform.AnyOpType.get() - func = match(named_sequence.bodyTarget, ops={"func.func"}) - mod = transform.get_parent_op( - anytype, - func, - op_name="builtin.module", - deduplicate=True, - ) - mod = apply_registered_pass(mod, "one-shot-bufferize") - mod = apply_registered_pass(mod, "convert-linalg-to-loops") - cse(mod) - canonicalize(mod) - - if dump_kernel == "bufferized": - transform.YieldOp() - return schedule_module - - mod = apply_registered_pass(mod, "convert-scf-to-cf") - mod = apply_registered_pass(mod, "finalize-memref-to-llvm") - mod = apply_registered_pass(mod, "convert-cf-to-llvm") - mod = apply_registered_pass(mod, "convert-arith-to-llvm") - mod = apply_registered_pass(mod, "convert-func-to-llvm") - mod = apply_registered_pass(mod, "reconcile-unrealized-casts") + mod = apply_registered_pass(mod, "one-shot-bufferize") + mod = apply_registered_pass(mod, "convert-linalg-to-loops") + transform.apply_cse(mod) + canonicalize(mod) + + if dump_kernel == "bufferized": transform.YieldOp() + return schedule_module + + mod = apply_registered_pass(mod, "convert-scf-to-cf") + mod = apply_registered_pass(mod, "finalize-memref-to-llvm") + mod = apply_registered_pass(mod, "convert-cf-to-llvm") + mod = apply_registered_pass(mod, "convert-arith-to-llvm") + mod = apply_registered_pass(mod, "convert-func-to-llvm") + mod = apply_registered_pass(mod, "reconcile-unrealized-casts") + transform.YieldOp() return schedule_module if __name__ == "__main__": - wload = ElementwiseSum(400, 400) - - print(" Dump kernel ".center(60, "-")) - lower_payload(wload, dump_kernel="bufferized", dump_schedule=True) - - print(" Execute 1 ".center(60, "-")) - execute(wload, verbose=2) - - print(" Execute 2 ".center(60, "-")) - execute(wload, verbose=1) - - print(" Benchmark ".center(60, "-")) - times = benchmark(wload) - times *= 1e6 # convert to microseconds - # compute statistics - mean = np.mean(times) - min = np.min(times) - max = np.max(times) - std = np.std(times) - print(f"Timings (us): mean={mean:.2f}+/-{std:.2f} min={min:.2f} max={max:.2f}") - flop_count = wload.get_complexity()[0] - gflops = flop_count / (mean * 1e-6) / 1e9 - print(f"Throughput: {gflops:.2f} GFLOPS") + with ir.Context(), ir.Location.unknown(): + wload = ElementwiseSum(400, 400) + + print(" Dump kernel ".center(60, "-")) + lower_payload(wload, dump_kernel="bufferized", dump_schedule=True) + + print(" Execute 1 ".center(60, "-")) + execute(wload, verbose=2) + + print(" Execute 2 ".center(60, "-")) + execute(wload, verbose=1) + + print(" Benchmark ".center(60, "-")) + times = benchmark(wload) + times *= 1e6 # convert to microseconds + # compute statistics + mean = np.mean(times) + min = np.min(times) + max = np.max(times) + std = np.std(times) + print(f"Timings (us): mean={mean:.2f}+/-{std:.2f} min={min:.2f} max={max:.2f}") + flop_count = wload.get_complexity()[0] + gflops = flop_count / (mean * 1e-6) / 1e9 + print(f"Throughput: {gflops:.2f} GFLOPS") diff --git a/examples/workload/example_mlir.py b/examples/workload/example_mlir.py index 
a539a3a..a992454 100644 --- a/examples/workload/example_mlir.py +++ b/examples/workload/example_mlir.py @@ -135,7 +135,7 @@ def get_input_arrays(self, execution_engine): return [A, B, C] - def verify(self, execution_engine, verbose: int = 0) -> bool: + def check_correctness(self, execution_engine, verbose: int = 0) -> bool: # compute reference solution with numpy A = ranked_memref_to_numpy([self.memrefs["A"]]) B = ranked_memref_to_numpy([self.memrefs["B"]]) @@ -173,33 +173,33 @@ def verify(self, execution_engine, verbose: int = 0) -> bool: def payload_module(self): mod = super().payload_module() # extend the payload module with de/alloc/fill functions - with self.context, self.location: - float32_t = ir.F32Type.get() - emit_host_alloc(mod, "f32", float32_t) - emit_host_dealloc(mod, "f32", float32_t) - emit_fill_constant(mod, "zero_f32", 0.0, float32_t) - emit_fill_random(mod, "f32", float32_t, min=-1.0, max=1.0) + float32_t = ir.F32Type.get() + emit_host_alloc(mod, "f32", float32_t) + emit_host_dealloc(mod, "f32", float32_t) + emit_fill_constant(mod, "zero_f32", 0.0, float32_t) + emit_fill_random(mod, "f32", float32_t, min=-1.0, max=1.0) return mod if __name__ == "__main__": - wload = ElementwiseSumMLIRAlloc(400, 400) - - print(" Dump kernel ".center(60, "-")) - lower_payload(wload, dump_kernel="bufferized", dump_schedule=False) - - print(" Execute ".center(60, "-")) - execute(wload, verbose=2) - - print(" Benchmark ".center(60, "-")) - times = benchmark(wload) - times *= 1e6 # convert to microseconds - # compute statistics - mean = np.mean(times) - min = np.min(times) - max = np.max(times) - std = np.std(times) - print(f"Timings (us): mean={mean:.2f}+/-{std:.2f} min={min:.2f} max={max:.2f}") - flop_count = wload.get_complexity()[0] - gflops = flop_count / (mean * 1e-6) / 1e9 - print(f"Throughput: {gflops:.2f} GFLOPS") + with ir.Context(), ir.Location.unknown(): + wload = ElementwiseSumMLIRAlloc(400, 400) + + print(" Dump kernel ".center(60, "-")) + lower_payload(wload, dump_kernel="bufferized", dump_schedule=False) + + print(" Execute ".center(60, "-")) + execute(wload, verbose=2) + + print(" Benchmark ".center(60, "-")) + times = benchmark(wload) + times *= 1e6 # convert to microseconds + # compute statistics + mean = np.mean(times) + min = np.min(times) + max = np.max(times) + std = np.std(times) + print(f"Timings (us): mean={mean:.2f}+/-{std:.2f} min={min:.2f} max={max:.2f}") + flop_count = wload.get_complexity()[0] + gflops = flop_count / (mean * 1e-6) / 1e9 + print(f"Throughput: {gflops:.2f} GFLOPS") diff --git a/lighthouse/utils/execution.py b/lighthouse/utils/execution.py index e4c04a4..b34a125 100644 --- a/lighthouse/utils/execution.py +++ b/lighthouse/utils/execution.py @@ -6,7 +6,6 @@ import ctypes import os from mlir import ir -from mlir.dialects.transform import interpreter as transform_interpreter from mlir.dialects import func, arith, scf, memref from mlir.execution_engine import ExecutionEngine from mlir.runtime.np_to_memref import get_ranked_memref_descriptor @@ -53,19 +52,13 @@ def get_engine(payload_module, requirements=None, opt_level=3) -> ExecutionEngin def apply_transform_schedule( payload_module, schedule_module, - context, - location, dump_kernel: Optional[str] = None, dump_schedule: bool = False, ): if not dump_kernel or dump_kernel != "initial": - with context, location: - # invoke transform interpreter directly - transform_interpreter.apply_named_sequence( - payload_root=payload_module, - transform_root=schedule_module.body.operations[0], - 
transform_module=schedule_module, - ) + # apply schedule on payload module + named_seq = schedule_module.body.operations[0] + named_seq.apply(payload_module) if dump_kernel: print(payload_module) if dump_schedule: @@ -85,8 +78,6 @@ def lower_payload( apply_transform_schedule( payload_module, schedule_module, - workload.context, - workload.location, dump_kernel=dump_kernel, dump_schedule=dump_schedule, ) @@ -117,13 +108,18 @@ def execute( payload_func(packed_args) if check_correctness: - success = workload.verify(execution_engine=engine, verbose=verbose) + success = workload.check_correctness( + execution_engine=engine, verbose=verbose + ) if not success: raise ValueError("Benchmark verification failed.") def emit_benchmark_function( - payload_module: ir.Module, workload: Workload, nruns: int, nwarmup: int + payload_module: ir.Module, + workload: Workload, + nruns: int, + nwarmup: int, ): """ Emit a benchmark function that calls payload function and times it. @@ -136,49 +132,46 @@ def emit_benchmark_function( for op in payload_module.operation.regions[0].blocks[0]: if ( isinstance(op, func.FuncOp) - and str(op.name).strip('"') == workload.payload_function_name + and op.name.value == workload.payload_function_name ): payload_func = op break assert payload_func is not None, "Could not find payload function" payload_arguments = payload_func.type.inputs - # emit benchmark function - with workload.context, workload.location: - with ir.InsertionPoint(payload_module.body): - # define rtclock function - f64_t = ir.F64Type.get() - f = func.FuncOp("rtclock", ((), (f64_t,)), visibility="private") - # emit new function - time_memref_t = ir.MemRefType.get((nruns,), f64_t) - args = payload_arguments + [time_memref_t] - f = func.FuncOp("benchmark", (tuple(args), ())) - f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() - with ir.InsertionPoint(f.add_entry_block()): + + # emit benchmark function that calls payload and times it + with ir.InsertionPoint(payload_module.body): + # define rtclock function + f64_t = ir.F64Type.get() + func.FuncOp("rtclock", ((), (f64_t,)), visibility="private") + # emit benchmark function + time_memref_t = ir.MemRefType.get((nruns,), f64_t) + args = payload_arguments + [time_memref_t] + + @func.func(*args) + def benchmark(*args): index_t = ir.IndexType.get() - zero = arith.ConstantOp(index_t, 0) - one = arith.ConstantOp(index_t, 1) - # call payload for warmup runs - nwarmup_cst = arith.ConstantOp(index_t, nwarmup) - for_op = scf.ForOp(zero, nwarmup_cst, one) - with ir.InsertionPoint(for_op.body): - func.CallOp(payload_func, list(f.arguments[: len(payload_arguments)])) - scf.YieldOp(()) - # call payload for benchmark runs, time every call separately - nruns_cst = arith.ConstantOp(index_t, nruns) - for_op = scf.ForOp(zero, nruns_cst, one) - i = for_op.induction_variable - with ir.InsertionPoint(for_op.body): - tic = func.CallOp((f64_t,), "rtclock", ()).result - func.CallOp(payload_func, list(f.arguments[: len(payload_arguments)])) - toc = func.CallOp((f64_t,), "rtclock", ()).result - time = arith.SubFOp(toc, tic) - memref.StoreOp(time, f.arguments[-1], [i]) - scf.YieldOp(()) - func.ReturnOp(()) + zero = arith.constant(index_t, 0) + one = arith.constant(index_t, 1) + nwarmup_cst = arith.constant(index_t, nwarmup) + for i in scf.for_(zero, nwarmup_cst, one): + # FIXME(upstream): func.call is broken for this use case? 
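+ # As a workaround, build the call with func.CallOp, which takes the FuncOp directly.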
+ func.CallOp(payload_func, list(args[: len(payload_arguments)])) + scf.yield_(()) + nruns_cst = arith.constant(index_t, nruns) + for i in scf.for_(zero, nruns_cst, one): + tic = func.call((f64_t,), "rtclock", ()) + func.CallOp(payload_func, list(args[: len(payload_arguments)])) + toc = func.call((f64_t,), "rtclock", ()) + time = arith.subf(toc, tic) + memref.store(time, args[-1], [i]) + scf.yield_(()) + + benchmark.func_op.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() def benchmark( - workload, + workload: Workload, nruns: int = 100, nwarmup: int = 10, schedule_parameters: Optional[dict] = None, @@ -195,8 +188,6 @@ def benchmark( apply_transform_schedule( payload_module, workload.schedule_module(parameters=schedule_parameters), - workload.context, - workload.location, ) # get execution engine, rtclock requires mlir_c_runner requirements = workload.requirements() @@ -214,7 +205,9 @@ def benchmark( payload_func = engine.lookup(workload.payload_function_name) payload_func(packed_args) - success = workload.verify(execution_engine=engine, verbose=verbose) + success = workload.check_correctness( + execution_engine=engine, verbose=verbose + ) if not success: raise ValueError("Benchmark verification failed.") diff --git a/lighthouse/utils/mlir.py b/lighthouse/utils/mlir.py index f32d243..bf6e248 100644 --- a/lighthouse/utils/mlir.py +++ b/lighthouse/utils/mlir.py @@ -13,16 +13,12 @@ def apply_registered_pass(*args, **kwargs): def match(*args, **kwargs): - return structured.MatchOp(transform.AnyOpType.get(), *args, **kwargs) - - -def cse(op): - transform.ApplyCommonSubexpressionEliminationOp(op) + return structured.structured_match(transform.AnyOpType.get(), *args, **kwargs) def canonicalize(op): - with ir.InsertionPoint(transform.ApplyPatternsOp(op).patterns): - transform.ApplyCanonicalizationPatternsOp() + with ir.InsertionPoint(transform.apply_patterns(op).patterns): + transform.apply_patterns_canonicalization() def get_mlir_library_path(): diff --git a/lighthouse/workload.py b/lighthouse/workload.py index 83e2918..26fe8be 100644 --- a/lighthouse/workload.py +++ b/lighthouse/workload.py @@ -64,7 +64,7 @@ def allocate(self, execution_engine): pass @abstractmethod - def verify(self, execution_engine, verbose: int = 0) -> bool: + def check_correctness(self, execution_engine, verbose: int = 0) -> bool: """Verify the correctness of the computation.""" pass From 83b837a9f97cc1e7f31b9d227f98e3d0215eac6a Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Tue, 2 Dec 2025 19:05:18 +0200 Subject: [PATCH 03/15] move execution.py -> runner.py --- examples/workload/example.py | 2 +- examples/workload/example_mlir.py | 2 +- lighthouse/utils/{execution.py => runner.py} | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) rename lighthouse/utils/{execution.py => runner.py} (99%) diff --git a/examples/workload/example.py b/examples/workload/example.py index 403f27c..1baa7d9 100644 --- a/examples/workload/example.py +++ b/examples/workload/example.py @@ -14,7 +14,7 @@ canonicalize, match, ) -from lighthouse.utils.execution import ( +from lighthouse.utils.runner import ( lower_payload, execute, benchmark, diff --git a/examples/workload/example_mlir.py b/examples/workload/example_mlir.py index a992454..8f89c01 100644 --- a/examples/workload/example_mlir.py +++ b/examples/workload/example_mlir.py @@ -15,7 +15,7 @@ import ctypes from contextlib import contextmanager from lighthouse.utils import get_packed_arg -from lighthouse.utils.execution import ( +from lighthouse.utils.runner import ( lower_payload, 
execute, benchmark, diff --git a/lighthouse/utils/execution.py b/lighthouse/utils/runner.py similarity index 99% rename from lighthouse/utils/execution.py rename to lighthouse/utils/runner.py index b34a125..0895a9b 100644 --- a/lighthouse/utils/execution.py +++ b/lighthouse/utils/runner.py @@ -1,5 +1,5 @@ """ -Execution engine utility functions. +Utility functions for running workloads. """ import numpy as np @@ -85,7 +85,7 @@ def lower_payload( def execute( - workload, + workload: Workload, check_correctness: bool = True, schedule_parameters: Optional[dict] = None, verbose: int = 0, From af7a49b5f9094c536a0b0147611ec0417469ff02 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Tue, 2 Dec 2025 21:16:27 +0200 Subject: [PATCH 04/15] workload: allocate_inputs ctx manager returns the input memrefs --- examples/workload/example.py | 12 +++++++- examples/workload/example_mlir.py | 46 +++++++++++++++---------------- lighthouse/utils/runner.py | 6 ++-- lighthouse/workload.py | 18 ++++++------ 4 files changed, 43 insertions(+), 39 deletions(-) diff --git a/examples/workload/example.py b/examples/workload/example.py index 1baa7d9..21963da 100644 --- a/examples/workload/example.py +++ b/examples/workload/example.py @@ -7,6 +7,8 @@ from mlir.runtime.np_to_memref import get_ranked_memref_descriptor from mlir.dialects import func, linalg, bufferization from mlir.dialects import transform +from mlir.execution_engine import ExecutionEngine +from contextlib import contextmanager from functools import cached_property from lighthouse import Workload from lighthouse.utils.mlir import ( @@ -52,9 +54,17 @@ def _reference_solution(self): A, B, _ = self._input_arrays return A + B - def get_input_arrays(self, execution_engine): + def _get_input_arrays(self): return [get_ranked_memref_descriptor(a) for a in self._input_arrays] + @contextmanager + def allocate_inputs(self, execution_engine: ExecutionEngine): + try: + yield self._get_input_arrays() + finally: + # cached numpy arrays are deallocated automatically + pass + def check_correctness(self, execution_engine, verbose: int = 0) -> bool: C = self._input_arrays[2] C_ref = self._reference_solution diff --git a/examples/workload/example_mlir.py b/examples/workload/example_mlir.py index 8f89c01..5d1d6ed 100644 --- a/examples/workload/example_mlir.py +++ b/examples/workload/example_mlir.py @@ -14,7 +14,11 @@ from mlir.dialects import func, linalg, arith, memref import ctypes from contextlib import contextmanager -from lighthouse.utils import get_packed_arg +from lighthouse.utils import ( + get_packed_arg, + memrefs_to_packed_args, + memref_to_ctype, +) from lighthouse.utils.runner import ( lower_payload, execute, @@ -93,34 +97,21 @@ def _allocate_array(self, name, execution_engine): if name in self.memrefs: return self.memrefs[name] alloc_func = execution_engine.lookup("host_alloc_f32") + # construct a memref descriptor for the result memref shape = (self.M, self.N) mref = make_nd_memref_descriptor(len(shape), as_ctype(self.dtype))() - ptr_mref = ctypes.pointer(ctypes.pointer(mref)) + ptr_mref = memref_to_ctype(mref) ptr_dims = [ctypes.pointer(ctypes.c_int32(d)) for d in shape] alloc_func(get_packed_arg([ptr_mref, *ptr_dims])) self.memrefs[name] = mref return mref - def _allocate_inputs(self, execution_engine): - self._allocate_array("A", execution_engine) - self._allocate_array("B", execution_engine) - self._allocate_array("C", execution_engine) - def _deallocate_all(self, execution_engine): for mref in self.memrefs.values(): dealloc_func = 
execution_engine.lookup("host_dealloc_f32") - ptr_mref = ctypes.pointer(ctypes.pointer(mref)) - dealloc_func(get_packed_arg([ptr_mref])) + dealloc_func(memrefs_to_packed_args([mref])) self.memrefs = {} - @contextmanager - def allocate(self, execution_engine): - try: - self._allocate_inputs(execution_engine) - yield None - finally: - self._deallocate_all(execution_engine) - def get_input_arrays(self, execution_engine): A = self._allocate_array("A", execution_engine) B = self._allocate_array("B", execution_engine) @@ -129,12 +120,19 @@ def get_input_arrays(self, execution_engine): # initialize with MLIR fill_zero_func = execution_engine.lookup("host_fill_constant_zero_f32") fill_random_func = execution_engine.lookup("host_fill_random_f32") - fill_zero_func(get_packed_arg([ctypes.pointer(ctypes.pointer(C))])) - fill_random_func(get_packed_arg([ctypes.pointer(ctypes.pointer(A))])) - fill_random_func(get_packed_arg([ctypes.pointer(ctypes.pointer(B))])) + fill_zero_func(memrefs_to_packed_args([C])) + fill_random_func(memrefs_to_packed_args([A])) + fill_random_func(memrefs_to_packed_args([B])) return [A, B, C] + @contextmanager + def allocate_inputs(self, execution_engine): + try: + yield self.get_input_arrays(execution_engine) + finally: + self._deallocate_all(execution_engine) + def check_correctness(self, execution_engine, verbose: int = 0) -> bool: # compute reference solution with numpy A = ranked_memref_to_numpy([self.memrefs["A"]]) @@ -153,10 +151,10 @@ def check_correctness(self, execution_engine, verbose: int = 0) -> bool: # Here we just call the payload function again. # self._allocate_array("C_ref", execution_engine) # func = execution_engine.lookup("payload") - # func(get_packed_arg([ - # ctypes.pointer(ctypes.pointer(self.memrefs["A"])), - # ctypes.pointer(ctypes.pointer(self.memrefs["B"])), - # ctypes.pointer(ctypes.pointer(self.memrefs["C_ref"])), + # func(memrefs_to_packed_args([ + # self.memrefs["A"], + # self.memrefs["B"], + # self.memrefs["C_ref"], # ])) # Check correctness with numpy. # C = ranked_memref_to_numpy([self.memrefs["C"]]) diff --git a/lighthouse/utils/runner.py b/lighthouse/utils/runner.py index 0895a9b..5046b3a 100644 --- a/lighthouse/utils/runner.py +++ b/lighthouse/utils/runner.py @@ -95,9 +95,8 @@ def execute( # get execution engine engine = get_engine(payload_module, requirements=workload.requirements()) - with workload.allocate(execution_engine=engine): + with workload.allocate_inputs(execution_engine=engine) as inputs: # prepare function arguments - inputs = workload.get_input_arrays(execution_engine=engine) pointers = [ctypes.pointer(ctypes.pointer(m)) for m in inputs] packed_args = get_packed_arg(pointers) @@ -195,8 +194,7 @@ def benchmark( requirements.append("mlir_c_runner") engine = get_engine(payload_module, requirements=requirements) - with workload.allocate(execution_engine=engine): - inputs = workload.get_input_arrays(execution_engine=engine) + with workload.allocate_inputs(execution_engine=engine) as inputs: pointers = [ctypes.pointer(ctypes.pointer(m)) for m in inputs] if check_correctness: # call payload once to verify correctness diff --git a/lighthouse/workload.py b/lighthouse/workload.py index 26fe8be..b7f0aa1 100644 --- a/lighthouse/workload.py +++ b/lighthouse/workload.py @@ -44,23 +44,21 @@ def schedule_module( pass @abstractmethod - def get_input_arrays(self, execution_engine) -> list: + @contextmanager + def allocate_inputs(self, execution_engine): """ - Return the input arrays for the payload function as memrefs. 
+ Context manager that allocates and returns payload input buffers. - Allocation and initialization of the input arrays should be done here. - """ - pass + Returns the payload input buffers as memrefs that can be directly + passed to the compiled payload function. - @contextmanager - def allocate(self, execution_engine): + On exit, frees any manually allocated memory (if any). """ - Allocate any necessary memory for the workload. - - Override this method if the workload requires memory management.""" try: + # Yield payload function input memrefs here. yield None finally: + # Manually deallocate memory here (if needed). pass @abstractmethod From f026282b2fa965a7a540112c3dfe2fa80edcaaac Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Tue, 2 Dec 2025 21:26:26 +0200 Subject: [PATCH 05/15] define helper functions with func.func decorator --- examples/workload/example_mlir.py | 69 ++++++++++++++++--------------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/examples/workload/example_mlir.py b/examples/workload/example_mlir.py index 5d1d6ed..428e61e 100644 --- a/examples/workload/example_mlir.py +++ b/examples/workload/example_mlir.py @@ -27,58 +27,60 @@ from example import ElementwiseSum -def emit_host_alloc(mod, suffix, element_type, rank=2): +def emit_host_alloc(suffix: str, element_type: ir.Type, rank: int = 2): dyn = ir.ShapedType.get_dynamic_size() memref_dyn_t = ir.MemRefType.get(rank * (dyn,), element_type) index_t = ir.IndexType.get() i32_t = ir.IntegerType.get_signless(32) - with ir.InsertionPoint(mod.body): - f = func.FuncOp("host_alloc_" + suffix, (rank * (i32_t,), (memref_dyn_t,))) - f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() - with ir.InsertionPoint(f.add_entry_block()): - dims = [arith.IndexCastOp(index_t, a) for a in list(f.arguments)] + inputs = rank * (i32_t,) + + @func.func(*inputs, name="host_alloc_" + suffix) + def alloc_func(*shape): + dims = [arith.index_cast(index_t, a) for a in shape] alloc = memref.alloc(memref_dyn_t, dims, []) - func.ReturnOp((alloc,)) + return alloc + + alloc_func.func_op.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() -def emit_host_dealloc(mod, suffix, element_type, rank=2): +def emit_host_dealloc(suffix: str, element_type: ir.Type, rank: int = 2): dyn = ir.ShapedType.get_dynamic_size() memref_dyn_t = ir.MemRefType.get(rank * (dyn,), element_type) - with ir.InsertionPoint(mod.body): - f = func.FuncOp("host_dealloc_" + suffix, ((memref_dyn_t,), ())) - f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() - with ir.InsertionPoint(f.add_entry_block()): - memref.dealloc(f.arguments[0]) - func.ReturnOp(()) + + @func.func(memref_dyn_t, name="host_dealloc_" + suffix) + def dealloc_func(buffer): + memref.dealloc(buffer) + + dealloc_func.func_op.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() -def emit_fill_constant(mod, suffix, value, element_type, rank=2): +def emit_fill_constant(suffix, value, element_type, rank=2): dyn = ir.ShapedType.get_dynamic_size() memref_dyn_t = ir.MemRefType.get(rank * (dyn,), element_type) - with ir.InsertionPoint(mod.body): - f = func.FuncOp("host_fill_constant_" + suffix, ((memref_dyn_t,), ())) - f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() - with ir.InsertionPoint(f.add_entry_block()): + + @func.func(memref_dyn_t, name="host_fill_constant_" + suffix) + def init_func(buffer): const = arith.constant(element_type, value) - linalg.fill(const, outs=[f.arguments[0]]) - func.ReturnOp(()) + linalg.fill(const, outs=[buffer]) + + 
init_func.func_op.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() -def emit_fill_random(mod, suffix, element_type, min=0.0, max=1.0, seed=2): +def emit_fill_random(suffix, element_type, min=0.0, max=1.0, seed=2): rank = 2 dyn = ir.ShapedType.get_dynamic_size() memref_dyn_t = ir.MemRefType.get(rank * (dyn,), element_type) i32_t = ir.IntegerType.get_signless(32) f64_t = ir.F64Type.get() - with ir.InsertionPoint(mod.body): - f = func.FuncOp("host_fill_random_" + suffix, ((memref_dyn_t,), ())) - f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() - with ir.InsertionPoint(f.add_entry_block()): + + @func.func(memref_dyn_t, name="host_fill_random_" + suffix) + def init_func(buffer): min_cst = arith.constant(f64_t, min) max_cst = arith.constant(f64_t, max) seed_cst = arith.constant(i32_t, seed) - linalg.fill_rng_2d(min_cst, max_cst, seed_cst, outs=[f.arguments[0]]) - func.ReturnOp(()) + linalg.fill_rng_2d(min_cst, max_cst, seed_cst, outs=[buffer]) + + init_func.func_op.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() class ElementwiseSumMLIRAlloc(ElementwiseSum): @@ -171,11 +173,12 @@ def check_correctness(self, execution_engine, verbose: int = 0) -> bool: def payload_module(self): mod = super().payload_module() # extend the payload module with de/alloc/fill functions - float32_t = ir.F32Type.get() - emit_host_alloc(mod, "f32", float32_t) - emit_host_dealloc(mod, "f32", float32_t) - emit_fill_constant(mod, "zero_f32", 0.0, float32_t) - emit_fill_random(mod, "f32", float32_t, min=-1.0, max=1.0) + with ir.InsertionPoint(mod.body): + float32_t = ir.F32Type.get() + emit_host_alloc("f32", float32_t) + emit_host_dealloc("f32", float32_t) + emit_fill_constant("zero_f32", 0.0, float32_t) + emit_fill_random("f32", float32_t, min=-1.0, max=1.0) return mod From ebcc6bbd32e5d70a2edfb699e7176834eadbb323 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Tue, 2 Dec 2025 21:47:02 +0200 Subject: [PATCH 06/15] remove apply_transform_schedule function --- lighthouse/utils/runner.py | 49 ++++++++++++-------------------------- 1 file changed, 15 insertions(+), 34 deletions(-) diff --git a/lighthouse/utils/runner.py b/lighthouse/utils/runner.py index 5046b3a..2561766 100644 --- a/lighthouse/utils/runner.py +++ b/lighthouse/utils/runner.py @@ -3,14 +3,13 @@ """ import numpy as np -import ctypes import os from mlir import ir from mlir.dialects import func, arith, scf, memref from mlir.execution_engine import ExecutionEngine from mlir.runtime.np_to_memref import get_ranked_memref_descriptor from lighthouse.utils.mlir import get_mlir_library_path -from lighthouse.utils import get_packed_arg +from lighthouse.utils import memrefs_to_packed_args from lighthouse import Workload from typing import Optional @@ -49,22 +48,6 @@ def get_engine(payload_module, requirements=None, opt_level=3) -> ExecutionEngin return execution_engine -def apply_transform_schedule( - payload_module, - schedule_module, - dump_kernel: Optional[str] = None, - dump_schedule: bool = False, -): - if not dump_kernel or dump_kernel != "initial": - # apply schedule on payload module - named_seq = schedule_module.body.operations[0] - named_seq.apply(payload_module) - if dump_kernel: - print(payload_module) - if dump_schedule: - print(schedule_module) - - def lower_payload( workload, dump_kernel: Optional[str] = None, @@ -75,12 +58,14 @@ def lower_payload( schedule_module = workload.schedule_module( dump_kernel=dump_kernel, parameters=schedule_parameters ) - apply_transform_schedule( - payload_module, - schedule_module, - 
dump_kernel=dump_kernel, - dump_schedule=dump_schedule, - ) + if not dump_kernel or dump_kernel != "initial": + # apply schedule on payload module + named_seq = schedule_module.body.operations[0] + named_seq.apply(payload_module) + if dump_kernel: + print(payload_module) + if dump_schedule: + print(schedule_module) return payload_module @@ -97,8 +82,7 @@ def execute( with workload.allocate_inputs(execution_engine=engine) as inputs: # prepare function arguments - pointers = [ctypes.pointer(ctypes.pointer(m)) for m in inputs] - packed_args = get_packed_arg(pointers) + packed_args = memrefs_to_packed_args(inputs) # handle to payload function payload_func = engine.lookup(workload.payload_function_name) @@ -184,10 +168,9 @@ def benchmark( emit_benchmark_function(payload_module, workload, nruns, nwarmup) # lower - apply_transform_schedule( - payload_module, - workload.schedule_module(parameters=schedule_parameters), - ) + schedule_module = workload.schedule_module(parameters=schedule_parameters) + schedule_module.body.operations[0].apply(payload_module) + # get execution engine, rtclock requires mlir_c_runner requirements = workload.requirements() if "mlir_c_runner" not in requirements: @@ -195,11 +178,10 @@ def benchmark( engine = get_engine(payload_module, requirements=requirements) with workload.allocate_inputs(execution_engine=engine) as inputs: - pointers = [ctypes.pointer(ctypes.pointer(m)) for m in inputs] if check_correctness: # call payload once to verify correctness # prepare function arguments - packed_args = get_packed_arg(pointers) + packed_args = memrefs_to_packed_args(inputs) payload_func = engine.lookup(workload.payload_function_name) payload_func(packed_args) @@ -212,8 +194,7 @@ def benchmark( # allocate buffer for timings and prepare arguments time_array = np.zeros((nruns,), dtype=np.float64) time_memref = get_ranked_memref_descriptor(time_array) - time_pointer = ctypes.pointer(ctypes.pointer(time_memref)) - packed_args_with_time = get_packed_arg(pointers + [time_pointer]) + packed_args_with_time = memrefs_to_packed_args(inputs + [time_memref]) # call benchmark function benchmark_func = engine.lookup("benchmark") From da9eb18c49e9f8b34a8860c1a4f9f023fc936f6c Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Wed, 3 Dec 2025 16:16:29 +0200 Subject: [PATCH 07/15] lower_payload function is a member of Workload --- examples/workload/example.py | 7 +++--- examples/workload/example_mlir.py | 3 +-- lighthouse/utils/runner.py | 23 +----------------- lighthouse/workload.py | 40 +++++++++++++++++++++++++++---- 4 files changed, 41 insertions(+), 32 deletions(-) diff --git a/examples/workload/example.py b/examples/workload/example.py index 21963da..d1f71a6 100644 --- a/examples/workload/example.py +++ b/examples/workload/example.py @@ -17,7 +17,6 @@ match, ) from lighthouse.utils.runner import ( - lower_payload, execute, benchmark, ) @@ -118,7 +117,7 @@ def payload(*args): return mod - def schedule_module(self, dump_kernel=None, parameters=None): + def schedule_module(self, stop_at_stage=None, parameters=None): schedule_module = ir.Module.create() schedule_module.operation.attributes["transform.with_named_sequence"] = ( ir.UnitAttr.get() @@ -144,7 +143,7 @@ def schedule_module(self, dump_kernel=None, parameters=None): transform.apply_cse(mod) canonicalize(mod) - if dump_kernel == "bufferized": + if stop_at_stage == "bufferized": transform.YieldOp() return schedule_module @@ -164,7 +163,7 @@ def schedule_module(self, dump_kernel=None, parameters=None): wload = ElementwiseSum(400, 400) 
print(" Dump kernel ".center(60, "-")) - lower_payload(wload, dump_kernel="bufferized", dump_schedule=True) + wload.lower_payload(dump_payload="bufferized", dump_schedule=True) print(" Execute 1 ".center(60, "-")) execute(wload, verbose=2) diff --git a/examples/workload/example_mlir.py b/examples/workload/example_mlir.py index 428e61e..bc65724 100644 --- a/examples/workload/example_mlir.py +++ b/examples/workload/example_mlir.py @@ -20,7 +20,6 @@ memref_to_ctype, ) from lighthouse.utils.runner import ( - lower_payload, execute, benchmark, ) @@ -187,7 +186,7 @@ def payload_module(self): wload = ElementwiseSumMLIRAlloc(400, 400) print(" Dump kernel ".center(60, "-")) - lower_payload(wload, dump_kernel="bufferized", dump_schedule=False) + wload.lower_payload(dump_payload="bufferized", dump_schedule=False) print(" Execute ".center(60, "-")) execute(wload, verbose=2) diff --git a/lighthouse/utils/runner.py b/lighthouse/utils/runner.py index 2561766..8879d6a 100644 --- a/lighthouse/utils/runner.py +++ b/lighthouse/utils/runner.py @@ -48,27 +48,6 @@ def get_engine(payload_module, requirements=None, opt_level=3) -> ExecutionEngin return execution_engine -def lower_payload( - workload, - dump_kernel: Optional[str] = None, - dump_schedule: bool = False, - schedule_parameters: Optional[dict] = None, -) -> ir.Module: - payload_module = workload.payload_module() - schedule_module = workload.schedule_module( - dump_kernel=dump_kernel, parameters=schedule_parameters - ) - if not dump_kernel or dump_kernel != "initial": - # apply schedule on payload module - named_seq = schedule_module.body.operations[0] - named_seq.apply(payload_module) - if dump_kernel: - print(payload_module) - if dump_schedule: - print(schedule_module) - return payload_module - - def execute( workload: Workload, check_correctness: bool = True, @@ -76,7 +55,7 @@ def execute( verbose: int = 0, ): # lower payload with schedule - payload_module = lower_payload(workload, schedule_parameters=schedule_parameters) + payload_module = workload.lower_payload(schedule_parameters=schedule_parameters) # get execution engine engine = get_engine(payload_module, requirements=workload.requirements()) diff --git a/lighthouse/workload.py b/lighthouse/workload.py index b7f0aa1..dc96831 100644 --- a/lighthouse/workload.py +++ b/lighthouse/workload.py @@ -16,7 +16,7 @@ class Workload(ABC): A workload is defined by a fixed payload function and problem size. Different realizations of the workload can be obtained by altering the - lowering schedule. + lowering schedule parameters. The MLIR payload function should take input arrays as memrefs and return nothing. @@ -37,12 +37,44 @@ def payload_module(self) -> ir.Module: @abstractmethod def schedule_module( self, - dump_kernel: Optional[str] = None, + stop_at_stage: Optional[str] = None, parameters: Optional[dict] = None, ) -> ir.Module: - """Generate the MLIR module containing the transform schedule.""" + """ + Generate the MLIR module containing the transform schedule. + + The `stop_at_stage` argument can be used to interrupt lowering at + a desired IR level for debugging purposes. + """ pass + def lower_payload( + self, + dump_payload: Optional[str] = None, + dump_schedule: bool = False, + schedule_parameters: Optional[dict] = None, + ) -> ir.Module: + """ + Apply transform schedule to the payload module. + + Optionally dumps the payload IR and/or transform schedule to stdout. + + Returns the lowered payload module. 
+ """ + payload_module = self.payload_module() + schedule_module = self.schedule_module( + stop_at_stage=dump_payload, parameters=schedule_parameters + ) + if not dump_payload or dump_payload != "initial": + # apply schedule on payload module + named_seq = schedule_module.body.operations[0] + named_seq.apply(payload_module) + if dump_payload: + print(payload_module) + if dump_schedule: + print(schedule_module) + return payload_module + @abstractmethod @contextmanager def allocate_inputs(self, execution_engine): @@ -67,7 +99,7 @@ def check_correctness(self, execution_engine, verbose: int = 0) -> bool: pass @abstractmethod - def get_complexity(self) -> list: + def get_complexity(self) -> tuple[int, int, int]: """ Return the computational complexity of the workload. From 802ded0784ec8eb3ff77e9f393ef8bc28214cbf2 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Wed, 3 Dec 2025 16:47:40 +0200 Subject: [PATCH 08/15] typing and mlir utils import --- examples/workload/example.py | 26 ++++++++++++++++---------- examples/workload/example_mlir.py | 31 +++++++++++++++++++++---------- lighthouse/utils/__init__.py | 3 +++ lighthouse/utils/mlir.py | 5 +++-- lighthouse/utils/runner.py | 4 +++- lighthouse/workload.py | 10 +++++++--- 6 files changed, 53 insertions(+), 26 deletions(-) diff --git a/examples/workload/example.py b/examples/workload/example.py index d1f71a6..03a0cd6 100644 --- a/examples/workload/example.py +++ b/examples/workload/example.py @@ -10,13 +10,15 @@ from mlir.execution_engine import ExecutionEngine from contextlib import contextmanager from functools import cached_property +import ctypes +from typing import Optional from lighthouse import Workload from lighthouse.utils.mlir import ( apply_registered_pass, canonicalize, match, ) -from lighthouse.utils.runner import ( +from lighthouse.utils import ( execute, benchmark, ) @@ -33,13 +35,13 @@ class ElementwiseSum(Workload): object so that they are only computed once. 
""" - def __init__(self, M, N): + def __init__(self, M: int, N: int): self.M = M self.N = N self.dtype = np.float32 @cached_property - def _input_arrays(self): + def _input_arrays(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: print(" * Generating input arrays...") np.random.seed(2) A = np.random.rand(self.M, self.N).astype(self.dtype) @@ -48,12 +50,12 @@ def _input_arrays(self): return [A, B, C] @cached_property - def _reference_solution(self): + def _reference_solution(self) -> np.ndarray: print(" * Computing reference solution...") A, B, _ = self._input_arrays return A + B - def _get_input_arrays(self): + def _get_input_arrays(self) -> list[ctypes.Structure]: return [get_ranked_memref_descriptor(a) for a in self._input_arrays] @contextmanager @@ -64,7 +66,9 @@ def allocate_inputs(self, execution_engine: ExecutionEngine): # cached numpy arrays are deallocated automatically pass - def check_correctness(self, execution_engine, verbose: int = 0) -> bool: + def check_correctness( + self, execution_engine: ExecutionEngine, verbose: int = 0 + ) -> bool: C = self._input_arrays[2] C_ref = self._reference_solution if verbose > 1: @@ -80,17 +84,17 @@ def check_correctness(self, execution_engine, verbose: int = 0) -> bool: print("FAILED Result mismatch!") return success - def requirements(self): + def requirements(self) -> list[str]: return [] - def get_complexity(self): + def get_complexity(self) -> tuple[int, int, int]: nbytes = np.dtype(self.dtype).itemsize flop_count = self.M * self.N # one addition per element memory_reads = 2 * self.M * self.N * nbytes # read A and B memory_writes = self.M * self.N * nbytes # write C return (flop_count, memory_reads, memory_writes) - def payload_module(self): + def payload_module(self) -> ir.Module: mod = ir.Module.create() with ir.InsertionPoint(mod.body): @@ -117,7 +121,9 @@ def payload(*args): return mod - def schedule_module(self, stop_at_stage=None, parameters=None): + def schedule_module( + self, stop_at_stage: Optional[str] = None, parameters: Optional[dict] = None + ) -> ir.Module: schedule_module = ir.Module.create() schedule_module.operation.attributes["transform.with_named_sequence"] = ( ir.UnitAttr.get() diff --git a/examples/workload/example_mlir.py b/examples/workload/example_mlir.py index bc65724..dc48b1c 100644 --- a/examples/workload/example_mlir.py +++ b/examples/workload/example_mlir.py @@ -12,14 +12,13 @@ as_ctype, ) from mlir.dialects import func, linalg, arith, memref +from mlir.execution_engine import ExecutionEngine import ctypes from contextlib import contextmanager from lighthouse.utils import ( get_packed_arg, memrefs_to_packed_args, memref_to_ctype, -) -from lighthouse.utils.runner import ( execute, benchmark, ) @@ -53,7 +52,7 @@ def dealloc_func(buffer): dealloc_func.func_op.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() -def emit_fill_constant(suffix, value, element_type, rank=2): +def emit_fill_constant(suffix: str, value: float, element_type: ir.Type, rank: int = 2): dyn = ir.ShapedType.get_dynamic_size() memref_dyn_t = ir.MemRefType.get(rank * (dyn,), element_type) @@ -65,7 +64,13 @@ def init_func(buffer): init_func.func_op.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() -def emit_fill_random(suffix, element_type, min=0.0, max=1.0, seed=2): +def emit_fill_random( + suffix: str, + element_type: ir.Type, + min: float = 0.0, + max: float = 1.0, + seed: int = 2, +): rank = 2 dyn = ir.ShapedType.get_dynamic_size() memref_dyn_t = ir.MemRefType.get(rank * (dyn,), element_type) @@ -89,12 +94,14 @@ class 
ElementwiseSumMLIRAlloc(ElementwiseSum): Extends ElementwiseSum by allocating input arrays in MLIR. """ - def __init__(self, M, N): + def __init__(self, M: int, N: int): super().__init__(M, N) # keep track of allocated memrefs self.memrefs = {} - def _allocate_array(self, name, execution_engine): + def _allocate_array( + self, name: str, execution_engine: ExecutionEngine + ) -> ctypes.Structure: if name in self.memrefs: return self.memrefs[name] alloc_func = execution_engine.lookup("host_alloc_f32") @@ -107,13 +114,15 @@ def _allocate_array(self, name, execution_engine): self.memrefs[name] = mref return mref - def _deallocate_all(self, execution_engine): + def _deallocate_all(self, execution_engine: ExecutionEngine): for mref in self.memrefs.values(): dealloc_func = execution_engine.lookup("host_dealloc_f32") dealloc_func(memrefs_to_packed_args([mref])) self.memrefs = {} - def get_input_arrays(self, execution_engine): + def get_input_arrays( + self, execution_engine: ExecutionEngine + ) -> list[ctypes.Structure]: A = self._allocate_array("A", execution_engine) B = self._allocate_array("B", execution_engine) C = self._allocate_array("C", execution_engine) @@ -128,13 +137,15 @@ def get_input_arrays(self, execution_engine): return [A, B, C] @contextmanager - def allocate_inputs(self, execution_engine): + def allocate_inputs(self, execution_engine: ExecutionEngine): try: yield self.get_input_arrays(execution_engine) finally: self._deallocate_all(execution_engine) - def check_correctness(self, execution_engine, verbose: int = 0) -> bool: + def check_correctness( + self, execution_engine: ExecutionEngine, verbose: int = 0 + ) -> bool: # compute reference solution with numpy A = ranked_memref_to_numpy([self.memrefs["A"]]) B = ranked_memref_to_numpy([self.memrefs["B"]]) diff --git a/lighthouse/utils/__init__.py b/lighthouse/utils/__init__.py index 474b748..738326b 100644 --- a/lighthouse/utils/__init__.py +++ b/lighthouse/utils/__init__.py @@ -8,8 +8,11 @@ torch_to_packed_args, mlir_type_to_torch_dtype, ) +from .runner import execute, benchmark __all__ = [ + "benchmark", + "execute", "get_packed_arg", "memref_to_ctype", "memrefs_to_packed_args", diff --git a/lighthouse/utils/mlir.py b/lighthouse/utils/mlir.py index bf6e248..e900b6a 100644 --- a/lighthouse/utils/mlir.py +++ b/lighthouse/utils/mlir.py @@ -22,12 +22,13 @@ def canonicalize(op): def get_mlir_library_path(): + """Return MLIR shared library path.""" pkg_path = ir.__file__ if "python_packages" in pkg_path: # looks like a local mlir install - path = pkg_path.split("python_packages")[0] + os.sep + "lib" + path = os.path.join(pkg_path.split("python_packages")[0], "lib") else: # maybe installed in python path - path = os.path.split(pkg_path)[0] + os.sep + "_mlir_libs" + path = os.path.join(os.path.split(pkg_path)[0], "_mlir_libs") assert os.path.isdir(path) return path diff --git a/lighthouse/utils/runner.py b/lighthouse/utils/runner.py index 8879d6a..699db2e 100644 --- a/lighthouse/utils/runner.py +++ b/lighthouse/utils/runner.py @@ -14,7 +14,9 @@ from typing import Optional -def get_engine(payload_module, requirements=None, opt_level=3) -> ExecutionEngine: +def get_engine( + payload_module: ir.Module, requirements: list[str] = None, opt_level: int = 3 +) -> ExecutionEngine: requirements = requirements or [] context = ir.Context() location = ir.Location.unknown(context) diff --git a/lighthouse/workload.py b/lighthouse/workload.py index dc96831..dbf6ca0 100644 --- a/lighthouse/workload.py +++ b/lighthouse/workload.py @@ -5,6 +5,7 @@ """ from 
mlir import ir +from mlir.execution_engine import ExecutionEngine from abc import ABC, abstractmethod from contextlib import contextmanager from typing import Optional @@ -57,7 +58,8 @@ def lower_payload( """ Apply transform schedule to the payload module. - Optionally dumps the payload IR and/or transform schedule to stdout. + Optionally dumps the payload IR at the desired level and/or the + transform schedule to stdout. Returns the lowered payload module. """ @@ -77,7 +79,7 @@ def lower_payload( @abstractmethod @contextmanager - def allocate_inputs(self, execution_engine): + def allocate_inputs(self, execution_engine: ExecutionEngine): """ Context manager that allocates and returns payload input buffers. @@ -94,7 +96,9 @@ def allocate_inputs(self, execution_engine): pass @abstractmethod - def check_correctness(self, execution_engine, verbose: int = 0) -> bool: + def check_correctness( + self, execution_engine: ExecutionEngine, verbose: int = 0 + ) -> bool: """Verify the correctness of the computation.""" pass From 84637482d5f789f9f475c149afdec6cb9b25d305 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Wed, 3 Dec 2025 21:35:50 +0200 Subject: [PATCH 09/15] rename workload requirements to shared_libs --- examples/workload/example.py | 2 +- examples/xegpu_matmul/matmul.py | 3 +++ lighthouse/utils/runner.py | 43 ++++++++++----------------------- lighthouse/workload.py | 4 +-- 4 files changed, 19 insertions(+), 33 deletions(-) diff --git a/examples/workload/example.py b/examples/workload/example.py index 03a0cd6..af800c4 100644 --- a/examples/workload/example.py +++ b/examples/workload/example.py @@ -84,7 +84,7 @@ def check_correctness( print("FAILED Result mismatch!") return success - def requirements(self) -> list[str]: + def shared_libs(self) -> list[str]: return [] def get_complexity(self) -> tuple[int, int, int]: diff --git a/examples/xegpu_matmul/matmul.py b/examples/xegpu_matmul/matmul.py index 32b397f..65a89d2 100644 --- a/examples/xegpu_matmul/matmul.py +++ b/examples/xegpu_matmul/matmul.py @@ -220,6 +220,9 @@ def schedule_module( params=parameters, ) + def shared_libs() -> list[str]: + return ["libmlir_levelzero_runtime.so"] + def parse_cli(): parser = argparse.ArgumentParser( diff --git a/lighthouse/utils/runner.py b/lighthouse/utils/runner.py index 699db2e..202aa2b 100644 --- a/lighthouse/utils/runner.py +++ b/lighthouse/utils/runner.py @@ -15,33 +15,15 @@ def get_engine( - payload_module: ir.Module, requirements: list[str] = None, opt_level: int = 3 + payload_module: ir.Module, shared_libs: list[str] = None, opt_level: int = 3 ) -> ExecutionEngine: - requirements = requirements or [] - context = ir.Context() - location = ir.Location.unknown(context) - required_libs = { - "levelzero": ( - ["libmlir_levelzero_runtime.so"], - "Did you compile LLVM with -DMLIR_ENABLE_LEVELZERO_RUNNER=1?", - ), - "mlir_runner": (["libmlir_runner_utils.so"], ""), - "mlir_c_runner": (["libmlir_c_runner_utils.so"], ""), - } + lib_dir = get_mlir_library_path() libs = [] - lib_dir = os.path.join(get_mlir_library_path()) - for r in requirements: - if r not in required_libs: - raise ValueError(f"Unknown execution engine requirement: {r}") - so_files, hint = required_libs[r] - for f in so_files: - so_path = os.path.join(lib_dir, f) - if not os.path.isfile(so_path): - msg = f"Could not find shared library {so_path}" - if hint: - msg += "\n" + hint - raise ValueError(msg) - libs.append(so_path) + for so_file in shared_libs or []: + so_path = os.path.join(lib_dir, so_file) + if not os.path.isfile(so_path): + 
raise ValueError(f"Could not find shared library {so_path}")
+        libs.append(so_path)
     with context, location:
         execution_engine = ExecutionEngine(
             payload_module, opt_level=opt_level, shared_libs=libs
@@ -59,7 +41,7 @@ def execute(
     # lower payload with schedule
     payload_module = workload.lower_payload(schedule_parameters=schedule_parameters)
     # get execution engine
-    engine = get_engine(payload_module, requirements=workload.requirements())
+    engine = get_engine(payload_module, shared_libs=workload.shared_libs())
 
     with workload.allocate_inputs(execution_engine=engine) as inputs:
         # prepare function arguments
@@ -153,10 +135,11 @@ def benchmark(
     schedule_module.body.operations[0].apply(payload_module)
 
     # get execution engine, rtclock requires mlir_c_runner
-    requirements = workload.requirements()
-    if "mlir_c_runner" not in requirements:
-        requirements.append("mlir_c_runner")
-    engine = get_engine(payload_module, requirements=requirements)
+    libs = workload.shared_libs()
+    c_runner_lib = "libmlir_c_runner_utils.so"
+    if c_runner_lib not in libs:
+        libs.append(c_runner_lib)
+    engine = get_engine(payload_module, shared_libs=libs)
 
     with workload.allocate_inputs(execution_engine=engine) as inputs:
         if check_correctness:
diff --git a/lighthouse/workload.py b/lighthouse/workload.py
index dbf6ca0..db13ff7 100644
--- a/lighthouse/workload.py
+++ b/lighthouse/workload.py
@@ -26,8 +26,8 @@ class Workload(ABC):
     payload_function_name: str = "payload"
 
     @abstractmethod
-    def requirements(self) -> list[str]:
-        """Return a list of requirements for the execution engine."""
+    def shared_libs(self) -> list[str]:
+        """Return a list of shared libraries required by the execution engine."""
         pass
 
     @abstractmethod

From 9214caa07a59c9003ed9cce1c928240d5589ecf8 Mon Sep 17 00:00:00 2001
From: Tuomas Karna
Date: Wed, 3 Dec 2025 21:37:29 +0200
Subject: [PATCH 10/15] get_engine: remove context

---
 lighthouse/utils/runner.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/lighthouse/utils/runner.py b/lighthouse/utils/runner.py
index 202aa2b..f203f14 100644
--- a/lighthouse/utils/runner.py
+++ b/lighthouse/utils/runner.py
@@ -24,11 +24,10 @@ def get_engine(
         if not os.path.isfile(so_path):
             raise ValueError(f"Could not find shared library {so_path}")
         libs.append(so_path)
-    with context, location:
-        execution_engine = ExecutionEngine(
-            payload_module, opt_level=opt_level, shared_libs=libs
-        )
-        execution_engine.initialize()
+    execution_engine = ExecutionEngine(
+        payload_module, opt_level=opt_level, shared_libs=libs
+    )
+    execution_engine.initialize()
     return execution_engine


From bf78876f722b4be8f47e4b3ace76a4f14b3f3b7a Mon Sep 17 00:00:00 2001
From: Tuomas Karna
Date: Thu, 4 Dec 2025 16:32:55 +0200
Subject: [PATCH 11/15] annotate examples for CI

---
 examples/workload/example.py      | 4 ++++
 examples/workload/example_mlir.py | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/examples/workload/example.py b/examples/workload/example.py
index af800c4..1c3ffbe 100644
--- a/examples/workload/example.py
+++ b/examples/workload/example.py
@@ -1,3 +1,7 @@
+# RUN: %PYTHON %s | FileCheck %s
+# CHECK: func.func @payload
+# CHECK: PASSED
+# CHECK: Throughput:
 """
 Workload example: Element-wise sum of two (M, N) float32 arrays on CPU.
""" diff --git a/examples/workload/example_mlir.py b/examples/workload/example_mlir.py index dc48b1c..a366863 100644 --- a/examples/workload/example_mlir.py +++ b/examples/workload/example_mlir.py @@ -1,3 +1,7 @@ +# RUN: %PYTHON %s | FileCheck %s +# CHECK: func.func @payload +# CHECK: PASSED +# CHECK: Throughput: """ Workload example: Element-wise sum of two (M, N) float32 arrays on CPU. From 513da56fb3311bc360f4272b9b22e53b2e0a6bf5 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Mon, 8 Dec 2025 19:22:06 +0200 Subject: [PATCH 12/15] move workload specific things to lighthouse/workload --- examples/workload/example.py | 4 ++-- examples/workload/example_mlir.py | 4 +++- lighthouse/__init__.py | 6 ------ lighthouse/utils/__init__.py | 3 --- lighthouse/{utils => workload}/runner.py | 2 +- lighthouse/{ => workload}/workload.py | 0 6 files changed, 6 insertions(+), 13 deletions(-) rename lighthouse/{utils => workload}/runner.py (99%) rename lighthouse/{ => workload}/workload.py (100%) diff --git a/examples/workload/example.py b/examples/workload/example.py index 1c3ffbe..23f0879 100644 --- a/examples/workload/example.py +++ b/examples/workload/example.py @@ -16,13 +16,13 @@ from functools import cached_property import ctypes from typing import Optional -from lighthouse import Workload from lighthouse.utils.mlir import ( apply_registered_pass, canonicalize, match, ) -from lighthouse.utils import ( +from lighthouse.workload import ( + Workload, execute, benchmark, ) diff --git a/examples/workload/example_mlir.py b/examples/workload/example_mlir.py index a366863..7d3211a 100644 --- a/examples/workload/example_mlir.py +++ b/examples/workload/example_mlir.py @@ -23,10 +23,12 @@ get_packed_arg, memrefs_to_packed_args, memref_to_ctype, +) +from example import ElementwiseSum +from lighthouse.workload import ( execute, benchmark, ) -from example import ElementwiseSum def emit_host_alloc(suffix: str, element_type: ir.Type, rank: int = 2): diff --git a/lighthouse/__init__.py b/lighthouse/__init__.py index d05b010..1ac008e 100644 --- a/lighthouse/__init__.py +++ b/lighthouse/__init__.py @@ -1,7 +1 @@ __version__ = "0.1.0a1" - -from .workload import Workload - -__all__ = [ - "Workload", -] diff --git a/lighthouse/utils/__init__.py b/lighthouse/utils/__init__.py index 738326b..474b748 100644 --- a/lighthouse/utils/__init__.py +++ b/lighthouse/utils/__init__.py @@ -8,11 +8,8 @@ torch_to_packed_args, mlir_type_to_torch_dtype, ) -from .runner import execute, benchmark __all__ = [ - "benchmark", - "execute", "get_packed_arg", "memref_to_ctype", "memrefs_to_packed_args", diff --git a/lighthouse/utils/runner.py b/lighthouse/workload/runner.py similarity index 99% rename from lighthouse/utils/runner.py rename to lighthouse/workload/runner.py index f203f14..4f2e9d9 100644 --- a/lighthouse/utils/runner.py +++ b/lighthouse/workload/runner.py @@ -10,7 +10,7 @@ from mlir.runtime.np_to_memref import get_ranked_memref_descriptor from lighthouse.utils.mlir import get_mlir_library_path from lighthouse.utils import memrefs_to_packed_args -from lighthouse import Workload +from lighthouse.workload import Workload from typing import Optional diff --git a/lighthouse/workload.py b/lighthouse/workload/workload.py similarity index 100% rename from lighthouse/workload.py rename to lighthouse/workload/workload.py From 9fa76d9a155388200973835dfb7cd015e26d9b0d Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Mon, 8 Dec 2025 19:39:35 +0200 Subject: [PATCH 13/15] nit comments --- examples/workload/example.py | 3 +-- 
lighthouse/workload/runner.py | 11 +++++------ lighthouse/workload/workload.py | 7 +------ 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/examples/workload/example.py b/examples/workload/example.py index 23f0879..3137dad 100644 --- a/examples/workload/example.py +++ b/examples/workload/example.py @@ -109,8 +109,7 @@ def payload_module(self) -> ir.Module: fargs = [memref_t, memref_t, memref_t] @func.func(*fargs, name=self.payload_function_name) - def payload(*args): - A, B, C = args + def payload(A, B, C): a_tensor = bufferization.to_tensor(tensor_t, A, restrict=True) b_tensor = bufferization.to_tensor(tensor_t, B, restrict=True) c_tensor = bufferization.to_tensor( diff --git a/lighthouse/workload/runner.py b/lighthouse/workload/runner.py index 4f2e9d9..ae5e07e 100644 --- a/lighthouse/workload/runner.py +++ b/lighthouse/workload/runner.py @@ -62,7 +62,7 @@ def execute( def emit_benchmark_function( payload_module: ir.Module, - workload: Workload, + payload_function_name: str, nruns: int, nwarmup: int, ): @@ -75,10 +75,7 @@ def emit_benchmark_function( # find original payload function payload_func = None for op in payload_module.operation.regions[0].blocks[0]: - if ( - isinstance(op, func.FuncOp) - and op.name.value == workload.payload_function_name - ): + if isinstance(op, func.FuncOp) and op.name.value == payload_function_name: payload_func = op break assert payload_func is not None, "Could not find payload function" @@ -127,7 +124,9 @@ def benchmark( payload_module = workload.payload_module() # add benchmark function with timing - emit_benchmark_function(payload_module, workload, nruns, nwarmup) + emit_benchmark_function( + payload_module, workload.payload_function_name, nruns, nwarmup + ) # lower schedule_module = workload.schedule_module(parameters=schedule_parameters) diff --git a/lighthouse/workload/workload.py b/lighthouse/workload/workload.py index db13ff7..cc2c4f5 100644 --- a/lighthouse/workload/workload.py +++ b/lighthouse/workload/workload.py @@ -88,12 +88,7 @@ def allocate_inputs(self, execution_engine: ExecutionEngine): On exit, frees any manually allocated memory (if any). """ - try: - # Yield payload function input memrefs here. - yield None - finally: - # Manually deallocate memory here (if needed). 
- pass + pass @abstractmethod def check_correctness( From 4454b097992e39bb8990433270f34a9df4537ad9 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Mon, 8 Dec 2025 19:43:04 +0200 Subject: [PATCH 14/15] add missing init file --- lighthouse/workload/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 lighthouse/workload/__init__.py diff --git a/lighthouse/workload/__init__.py b/lighthouse/workload/__init__.py new file mode 100644 index 0000000..4738604 --- /dev/null +++ b/lighthouse/workload/__init__.py @@ -0,0 +1,4 @@ +from .workload import Workload +from .runner import execute, benchmark + +__all__ = ["Workload", "benchmark", "execute"] From 27424a9ed85929b79938a782dd3e8e581266c948 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Mon, 8 Dec 2025 20:54:29 +0200 Subject: [PATCH 15/15] revert change to xegpu example --- examples/xegpu_matmul/matmul.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/xegpu_matmul/matmul.py b/examples/xegpu_matmul/matmul.py index 65a89d2..32b397f 100644 --- a/examples/xegpu_matmul/matmul.py +++ b/examples/xegpu_matmul/matmul.py @@ -220,9 +220,6 @@ def schedule_module( params=parameters, ) - def shared_libs() -> list[str]: - return ["libmlir_levelzero_runtime.so"] - def parse_cli(): parser = argparse.ArgumentParser(
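
Taken together, the series leaves the following user-facing flow. The sketch
below is illustrative rather than part of the patches: it assumes the
ElementwiseSum workload from examples/workload/example.py, and the
nruns/nwarmup keyword names are inferred from benchmark() in
lighthouse/workload/runner.py.

    from lighthouse.workload import execute, benchmark
    from example import ElementwiseSum  # sample workload from this series

    wload = ElementwiseSum(400, 400)

    # Dump the bufferized payload IR and the transform schedule without running.
    wload.lower_payload(dump_payload="bufferized", dump_schedule=True)

    # Compile via the transform schedule, run once, and verify against numpy.
    execute(wload, verbose=2)

    # Time the payload with the emitted benchmark loop; benchmark() appends
    # libmlir_c_runner_utils.so to the workload's shared_libs() for rtclock.
    benchmark(wload, nruns=100, nwarmup=10)  # argument values are illustrative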