From 3468a2625f397257cf22edb04208273c8ce6c324 Mon Sep 17 00:00:00 2001
From: Astha
Date: Wed, 26 Nov 2025 13:55:40 -0500
Subject: [PATCH 1/3] Restructure Tile Engine's benchmarking process

This change restructures the benchmark classes into three files: a base
class shared by all GEMM benchmarks, and derived classes for Universal
GEMM, multi-D GEMM, and GEMM preshuffle. Common functions have been
relocated into a shared commons directory. A derived class only needs to
redefine the constructor, which significantly reduces code duplication.
---
 tile_engine/ops/commons/__init__.py           |   2 +
 tile_engine/ops/commons/benchmark_utils.py    | 285 ++++++++
 tile_engine/ops/gemm/gemm_benchmark.py        | 331 +++++++++
 .../gemm_multi_d/gemm_multi_d_benchmark.py    | 602 +---------------
 .../gemm_preshuffle_benchmark.py              | 602 +---------------
 .../ops/gemm/gemm_universal/CMakeLists.txt    |   2 +-
 .../ops/gemm/gemm_universal/gemm_benchmark.py | 678 ------------------
 ...hmark.hpp => gemm_universal_benchmark.hpp} |   2 +-
 .../gemm_universal_benchmark.py               | 146 ++++
 ...pp => gemm_universal_benchmark_single.cpp} |  61 +-
 ...m_common.hpp => gemm_universal_common.hpp} |   0
 ...ofiler.hpp => gemm_universal_profiler.hpp} |   2 +-
 12 files changed, 837 insertions(+), 1876 deletions(-)
 create mode 100644 tile_engine/ops/commons/__init__.py
 create mode 100644 tile_engine/ops/commons/benchmark_utils.py
 create mode 100644 tile_engine/ops/gemm/gemm_benchmark.py
 delete mode 100644 tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py
 rename tile_engine/ops/gemm/gemm_universal/{gemm_benchmark.hpp => gemm_universal_benchmark.hpp} (99%)
 create mode 100755 tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py
 rename tile_engine/ops/gemm/gemm_universal/{gemm_benchmark_single.cpp => gemm_universal_benchmark_single.cpp} (55%)
 rename tile_engine/ops/gemm/gemm_universal/{gemm_common.hpp => gemm_universal_common.hpp} (100%)
 rename tile_engine/ops/gemm/gemm_universal/{gemm_profiler.hpp => gemm_universal_profiler.hpp} (99%)

diff --git a/tile_engine/ops/commons/__init__.py b/tile_engine/ops/commons/__init__.py
new file mode 100644
index 00000000000..1df48571843
--- /dev/null
+++ b/tile_engine/ops/commons/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
diff --git a/tile_engine/ops/commons/benchmark_utils.py b/tile_engine/ops/commons/benchmark_utils.py
new file mode 100644
index 00000000000..0c158fa48cd
--- /dev/null
+++ b/tile_engine/ops/commons/benchmark_utils.py
@@ -0,0 +1,285 @@
+#!/usr/bin/env python3
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT + +import sys +import json +import subprocess +import argparse +import csv +import time +from pathlib import Path +from typing import List, Dict, Tuple, Optional + + +def run_kernel(build_dir: Path, kernel_path: Path, params: Dict[str, str], verbose: bool = False) -> Optional[Dict]: + """Run a single kernel with given parameters and save output to individual JSON file""" + # Create results directory + results_dir = build_dir / "results" + results_dir.mkdir(exist_ok=True) + + # Generate unique JSON filename for this kernel + json_file = results_dir / f"{kernel_path.stem}.json" + + cmd = [str(kernel_path)] + + # Add parameters + for key, value in params.items(): + cmd.append(f"-{key}={value}") + + # Add JSON output flag for clean JSON output + cmd.append("-json_output=true") + + if verbose: + print(f"Running: {' '.join(cmd)}") + + try: + result = subprocess.run(cmd, capture_output=True, text=True, timeout=60) + + if result.returncode != 0: + print(f"Error running {kernel_path.name}: {result.stderr}") + return None + + # Save raw output to individual JSON file + output = result.stdout.strip() + if output: + with open(json_file, "w") as f: + f.write(output) + + # Parse the JSON file + return parse_json_file(json_file, verbose=verbose) + else: + print(f"No output from {kernel_path.name}") + return None + + except subprocess.TimeoutExpired: + print(f"Timeout running {kernel_path.name}") + return None + except Exception as e: + print(f"Error running {kernel_path.name}: {e}") + return None + +def parse_json_file(json_file: Path, verbose: bool = False) -> Optional[Dict]: + """Parse JSON data from individual kernel output file""" + try: + with open(json_file, "r") as f: + content = f.read().strip() + + # Parse the JSON directly since executables produce clean JSON + data = json.loads(content) + + # Return the complete JSON data as-is, just add some convenience fields + result = data.copy() + if "perf_result" in data: + perf = data["perf_result"] + # Add convenience fields for backward compatibility + result["time_ms"] = perf.get("latency(ms)", 0) + result["tflops"] = perf.get("tflops(TFlops)", 0) + result["bandwidth_gb_s"] = perf.get("bandwidth(GB/s)", 0) + + return result + + except json.JSONDecodeError as e: + if verbose: + print(f"Failed to parse JSON from {json_file}: {e}") + return None + except Exception as e: + if verbose: + print(f"Error reading JSON file {json_file}: {e}") + return None + +def find_best_kernel( + results: List[Dict], metric: str = "tflops" +) -> Optional[Dict]: + """Find the best performing kernel based on metric""" + if not results: + return None + + if metric == "tflops": + return max(results, key=lambda x: x.get("tflops", 0)) + elif metric == "time_ms": + return min(results, key=lambda x: x.get("time_ms", float("inf"))) + elif metric == "bandwidth_gb_s": + return max(results, key=lambda x: x.get("bandwidth_gb_s", 0)) + else: + raise ValueError(f"Unknown metric: {metric}") + + +def export_csv(results: List[Dict], filename: str, verbose: bool = False): + """Export all results to CSV""" + if not results: + print("No results to export") + return + + # Get all unique keys from results + all_keys = set() + for result in results: + all_keys.update(result.keys()) + + # Sort keys for consistent output + fieldnames = sorted(all_keys) + + with open(filename, "w", newline="") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(results) + + print(f"Results exported to {filename}") + +def 
export_best_kernels( best_kernels: Dict, filename: str, verbose: bool = False): + """Export best kernel selections to file""" + with open(filename, "w") as f: + f.write("# Best kernel selections\n") + f.write( + "# Format: problem_size -> kernel_name (TFLOPS, bandwidth, latency)\n\n" + ) + + for key, kernel in sorted(best_kernels.items()): + f.write( + f"{key}: {kernel['name']} ({kernel['tflops']:.2f} TFLOPS, {kernel['bandwidth_gb_s']:.2f} GB/s, {kernel['time_ms']:.2f}ms)\n" + ) + + print(f"Best kernels exported to {filename}") + +def export_json(results: List[Dict], filename: str, best_kernels: Dict = None, verbose: bool = False): + """Export all results and best kernels to JSON with comprehensive metadata""" + from datetime import datetime + + # Calculate comprehensive summary statistics for all metrics + successful_results = [r for r in results if r.get("tflops", 0) > 0] + + tflops_values = [r.get("tflops", 0) for r in successful_results] + bandwidth_values = [r.get("bandwidth_gb_s", 0) for r in successful_results] + latency_values = [ + r.get("time_ms", 0) for r in successful_results if r.get("time_ms", 0) > 0 + ] + + # Performance breakdown by kernel type + pipeline_stats = {} + scheduler_stats = {} + data_type_stats = {} + + for result in successful_results: + # Get config info from the new structure + config = result.get("config", {}) + + # Pipeline statistics + pipeline = config.get("pipeline", "unknown") + if pipeline not in pipeline_stats: + pipeline_stats[pipeline] = { + "count": 0, + "avg_tflops": 0, + "best_tflops": 0, + } + pipeline_stats[pipeline]["count"] += 1 + pipeline_stats[pipeline]["best_tflops"] = max( + pipeline_stats[pipeline]["best_tflops"], result.get("tflops", 0) + ) + + # Scheduler statistics + scheduler = config.get("scheduler", "unknown") + if scheduler not in scheduler_stats: + scheduler_stats[scheduler] = { + "count": 0, + "avg_tflops": 0, + "best_tflops": 0, + } + scheduler_stats[scheduler]["count"] += 1 + scheduler_stats[scheduler]["best_tflops"] = max( + scheduler_stats[scheduler]["best_tflops"], result.get("tflops", 0) + ) + + # Data type statistics + data_type = config.get("data_type", "unknown") + if data_type not in data_type_stats: + data_type_stats[data_type] = { + "count": 0, + "avg_tflops": 0, + "best_tflops": 0, + } + data_type_stats[data_type]["count"] += 1 + data_type_stats[data_type]["best_tflops"] = max( + data_type_stats[data_type]["best_tflops"], result.get("tflops", 0) + ) + + # Calculate averages for breakdown stats + for stats_dict, field_name in [ + (pipeline_stats, "pipeline"), + (scheduler_stats, "scheduler"), + (data_type_stats, "data_type"), + ]: + for key in stats_dict: + relevant_results = [ + r + for r in successful_results + if r.get("config", {}).get(field_name, "unknown") == key + ] + if relevant_results: + stats_dict[key]["avg_tflops"] = sum( + r.get("tflops", 0) for r in relevant_results + ) / len(relevant_results) + + output_data = { + "benchmark_metadata": { + "timestamp": datetime.now().isoformat(), + "total_kernels_tested": len(results), + "unique_kernels": len( + set(r.get("name", "unknown") for r in results) + ), + "successful_runs": len(successful_results), + "failed_runs": len(results) - len(successful_results), + }, + "performance_summary": { + "tflops_stats": { + "best": max(tflops_values, default=0), + "average": sum(tflops_values) / len(tflops_values) + if tflops_values + else 0, + "min": min(tflops_values, default=0), + "median": sorted(tflops_values)[len(tflops_values) // 2] + if tflops_values + else 0, + }, + 
"bandwidth_stats": { + "best_gb_s": max(bandwidth_values, default=0), + "average_gb_s": sum(bandwidth_values) / len(bandwidth_values) + if bandwidth_values + else 0, + "min_gb_s": min(bandwidth_values, default=0), + "median_gb_s": sorted(bandwidth_values)[len(bandwidth_values) // 2] + if bandwidth_values + else 0, + }, + "latency_stats": { + "best_ms": min(latency_values, default=0), + "average_ms": sum(latency_values) / len(latency_values) + if latency_values + else 0, + "max_ms": max(latency_values, default=0), + "median_ms": sorted(latency_values)[len(latency_values) // 2] + if latency_values + else 0, + }, + "kernel_type_breakdown": { + "by_pipeline": pipeline_stats, + "by_scheduler": scheduler_stats, + "by_data_type": data_type_stats, + }, + "total_problem_configurations": len(best_kernels) + if best_kernels + else 0, + }, + "kernel_results": results, + "best_kernels_by_problem": best_kernels or {}, + } + + with open(filename, "w") as f: + json.dump(output_data, f, indent=2) + + print(f"JSON results exported to {filename}") + print(f" - Total kernels: {len(results)}") + print(f" - Successful runs: {len(successful_results)}") + print(f" - Best TFLOPS: {max(tflops_values, default=0):.2f}") + print(f" - Best bandwidth: {max(bandwidth_values, default=0):.2f} GB/s") + print(f" - Best latency: {min(latency_values, default=0):.2f}ms") + diff --git a/tile_engine/ops/gemm/gemm_benchmark.py b/tile_engine/ops/gemm/gemm_benchmark.py new file mode 100644 index 00000000000..3a7afc74e88 --- /dev/null +++ b/tile_engine/ops/gemm/gemm_benchmark.py @@ -0,0 +1,331 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +import os +import sys +import json +import subprocess +import argparse +import csv +import time +import importlib.util +from pathlib import Path +from typing import List, Dict, Tuple, Optional + +# TODO: explore modularizing tile engine to avoid accessing imports like this +def _import_benchmark_utils(): + """Import benchmark utilities from commons directory.""" + current_dir = os.path.dirname(os.path.abspath(__file__)) + parent_dir = os.path.dirname(current_dir) + + # Load the module dynamically + spec = importlib.util.spec_from_file_location( + "benchmark_utils", + os.path.join(parent_dir, "commons", "benchmark_utils.py"), + ) + benchmark_utils = importlib.util.module_from_spec(spec) + spec.loader.exec_module(benchmark_utils) + + return benchmark_utils + +benchmark_utils = _import_benchmark_utils() + +class GemmBenchmark: + def __init__(self, build_dir: str, verbose: bool = False, name: str = "benchmark_gemm_"): + self.build_dir = Path(build_dir) + self.verbose = verbose + self.results = [] + self.name = name + + def discover_kernels(self) -> List[Path]: + """Find all benchmark_gemm_* executables in the build directory""" + bin_dir = self.build_dir / "bin" + if not bin_dir.exists(): + print(f"Error: Binary directory {bin_dir} does not exist") + return [] + + glob_name = f"{self.name}*" + kernels = list(bin_dir.glob(glob_name)) + if self.verbose: + print(f"Found {len(kernels)} kernel executables") + for k in kernels: + print(f" - {k.name}") + return kernels + + def extract_kernel_info(self, kernel_path: Path) -> Dict[str, str]: + """Extract comprehensive kernel information from filename""" + name = kernel_path.stem + if name.startswith(self.name): + args = name[len(self.name):] + else: + args = name + + # Initialize with basic info + info = { + "executable": str(kernel_path), + "name": name, + "data_type": "unknown", 
+ "layout": "unknown", + "pipeline": "unknown", + "scheduler": "unknown", + "epilogue": "unknown", + } + + # Parse the kernel name pattern: + # benchmark_gemm_fp16_rcr_mem_default_intrawave_False_False_False_False_False_256x256x32_2x2x1_4x64x16 + parts = args.split("_") + + if len(parts) >= 5: + info["data_type"] = parts[0] + info["layout"] = parts[1] + info["pipeline"] = parts[2] + info["epilogue"] = parts[3] + info["scheduler"] = parts[4] + + # Extract detailed configuration from the end of the name + config_info = self.parse_detailed_config(name) + info.update(config_info) + + # Generate config ID + info["config_id"] = self.generate_config_id(info) + + return info + + def parse_detailed_config(self, kernel_name: str) -> Dict: + """Parse detailed configuration from kernel name""" + config = { + "tile_sizes": {"tile_m": 0, "tile_n": 0, "tile_k": 0}, + "warp_config": {"warp_m": 0, "warp_n": 0, "warp_k": 0}, + "warp_tile": {"warp_tile_m": 0, "warp_tile_n": 0, "warp_tile_k": 0}, + "optimization_flags": { + "pad_m": False, + "pad_n": False, + "pad_k": False, + "persistent": False, + }, + } + + # Split by underscore and look for patterns + parts = kernel_name.split("_") + + # Look for boolean flags (sequence of True/False values) + bool_sequence = [] + for i, part in enumerate(parts): + if part in ["True", "False"]: + bool_sequence.append(part == "True") + # Continue collecting consecutive boolean values + j = i + 1 + while j < len(parts) and parts[j] in ["True", "False"]: + bool_sequence.append(parts[j] == "True") + j += 1 + break + + # Assign boolean flags if we found them + # Order: pad_m, pad_n, pad_k, persistent (4 flags total) + if len(bool_sequence) >= 4: + config["optimization_flags"]["pad_m"] = bool_sequence[0] + config["optimization_flags"]["pad_n"] = bool_sequence[1] + config["optimization_flags"]["pad_k"] = bool_sequence[2] + config["optimization_flags"]["persistent"] = bool_sequence[3] + + # Look for tile size patterns (e.g., 256x256x32_2x2x1_4x64x16) + # The pattern is: tile_sizes_warp_config_warp_tile + dimension_groups = [] + for part in parts: + if "x" in part and len(part.split("x")) == 3: + try: + dims = [int(x) for x in part.split("x")] + if all(d > 0 for d in dims): + dimension_groups.append(dims) + except ValueError: + continue + + # Assign dimensions based on order and magnitude + if len(dimension_groups) >= 3: + # Sort by magnitude to identify: largest=tile_sizes, smallest=warp_config, middle=warp_tile + sorted_groups = sorted(dimension_groups, key=lambda x: max(x), reverse=True) + + # Largest dimensions = tile sizes + config["tile_sizes"]["tile_m"] = sorted_groups[0][0] + config["tile_sizes"]["tile_n"] = sorted_groups[0][1] + config["tile_sizes"]["tile_k"] = sorted_groups[0][2] + + # Smallest dimensions = warp config + config["warp_config"]["warp_m"] = sorted_groups[2][0] + config["warp_config"]["warp_n"] = sorted_groups[2][1] + config["warp_config"]["warp_k"] = sorted_groups[2][2] + + # Middle dimensions = warp tile + config["warp_tile"]["warp_tile_m"] = sorted_groups[1][0] + config["warp_tile"]["warp_tile_n"] = sorted_groups[1][1] + config["warp_tile"]["warp_tile_k"] = sorted_groups[1][2] + elif len(dimension_groups) == 2: + # If only 2 groups, assign based on magnitude + sorted_groups = sorted(dimension_groups, key=lambda x: max(x), reverse=True) + + # Larger = tile sizes + config["tile_sizes"]["tile_m"] = sorted_groups[0][0] + config["tile_sizes"]["tile_n"] = sorted_groups[0][1] + config["tile_sizes"]["tile_k"] = sorted_groups[0][2] + + # Smaller = warp config + 
config["warp_config"]["warp_m"] = sorted_groups[1][0] + config["warp_config"]["warp_n"] = sorted_groups[1][1] + config["warp_config"]["warp_k"] = sorted_groups[1][2] + elif len(dimension_groups) == 1: + # Only one group - assume it's tile sizes + config["tile_sizes"]["tile_m"] = dimension_groups[0][0] + config["tile_sizes"]["tile_n"] = dimension_groups[0][1] + config["tile_sizes"]["tile_k"] = dimension_groups[0][2] + + return config + + def generate_config_id(self, info: Dict) -> str: + """Generate a compact config ID from kernel info""" + # Create a compact identifier + parts = [ + info.get("data_type", "unk"), + info.get("layout", "unk"), + info.get("pipeline", "unk"), + info.get("scheduler", "unk"), + ] + + # Add tile configuration if available + tile_sizes = info.get("tile_sizes", {}) + if tile_sizes.get("tile_m", 0) > 0: + tile_str = ( + f"{tile_sizes['tile_m']}x{tile_sizes['tile_n']}x{tile_sizes['tile_k']}" + ) + parts.append(tile_str) + + # Add warp config if available + warp_config = info.get("warp_config", {}) + if warp_config.get("warp_m", 0) > 0: + warp_str = f"w{warp_config['warp_m']}x{warp_config['warp_n']}x{warp_config['warp_k']}" + parts.append(warp_str) + + # Add warp tile if available + warp_tile = info.get("warp_tile", {}) + if warp_tile.get("warp_tile_m", 0) > 0: + warp_tile_str = f"wt{warp_tile['warp_tile_m']}x{warp_tile['warp_tile_n']}x{warp_tile['warp_tile_k']}" + parts.append(warp_tile_str) + + return "_".join(parts) + + def benchmark_problem_size( + self, + kernels: List[Path], + m: int, + n: int, + k: int, + split_k: int = 1, + verify: int = 0, + warmup: int = 50, + repeat: int = 100, + flush_cache: bool = True, + rotating_count: int = 1000, + ) -> List[Dict]: + """Benchmark all kernels for a specific problem size""" + results = [] + + params = { + "m": m, + "n": n, + "k": k, + "split_k": split_k, + "verify": verify, + "warmup": warmup, + "repeat": repeat, + "flush_cache": str(flush_cache).lower(), + "rotating_count": rotating_count, + } + + print(f"\nBenchmarking M={m}, N={n}, K={k}, split_k={split_k}") + + for kernel_path in kernels: + kernel_info = self.extract_kernel_info(kernel_path) + result = benchmark_utils.run_kernel(self.build_dir, kernel_path, params, verbose=self.verbose) + if result: + # Create new structured result format + structured_result = { + "name": kernel_info["name"], # Add name field for compatibility + "config_id": kernel_info["config_id"], + "problem": result.get("problem", {}), + "perf_result": result.get("perf_result", {}), + "config": { + "data_type": kernel_info["data_type"], + "layout": kernel_info["layout"], + "pipeline": kernel_info["pipeline"], + "scheduler": kernel_info["scheduler"], + "epilogue": kernel_info["epilogue"], + "tile_sizes": kernel_info.get("tile_sizes", {}), + "warp_config": kernel_info.get("warp_config", {}), + "warp_tile": kernel_info.get("warp_tile", {}), + "optimization_flags": kernel_info.get("optimization_flags", {}), + }, + "executable": kernel_info["executable"], + # Keep backward compatibility fields + "time_ms": result.get("time_ms", 0), + "tflops": result.get("tflops", 0), + "bandwidth_gb_s": result.get("bandwidth_gb_s", 0), + } + + results.append(structured_result) + + if self.verbose: + print( + f" {kernel_info['config_id']}: {structured_result['tflops']:.2f} TFLOPS, {structured_result['bandwidth_gb_s']:.2f} GB/s, {structured_result['time_ms']:.2f}ms" + ) + + return results + + def benchmark_sweep( + self, + problem_sizes: List[Tuple[int, int, int]], + split_k_values: List[int] = [1], + verify: bool = 
False, + warmup: int = 50, + repeat: int = 100, + flush_cache: bool = True, + rotating_count: int = 1000, + ) -> Dict: + """Run comprehensive benchmark sweep""" + kernels = self.discover_kernels() + if not kernels: + print("No kernels found!") + return {} + + all_results = [] + best_kernels = {} + + for m, n, k in problem_sizes: + for split_k in split_k_values: + results = self.benchmark_problem_size( + kernels, + m, + n, + k, + split_k, + verify=2 if verify else 0, + warmup=warmup, + repeat=repeat, + flush_cache=flush_cache, + rotating_count=rotating_count, + ) + + all_results.extend(results) + + # Find best kernel for this configuration + best = benchmark_utils.find_best_kernel(results) + if best: + key = f"m{m}_n{n}_k{k}_splitk{split_k}" + best_kernels[key] = best + print( + f"Best for {key}: {best['name']} ({best['tflops']:.2f} TFLOPS, {best['bandwidth_gb_s']:.2f} GB/s, {best['time_ms']:.2f}ms)" + ) + + self.results = all_results + return best_kernels + + diff --git a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py index faf04a7de0d..ae796687075 100644 --- a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py +++ b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py @@ -1,587 +1,53 @@ # Copyright (c) Advanced Micro Devices, Inc., or its affiliates. # SPDX-License-Identifier: MIT +import os import sys import json import subprocess import argparse import csv import time +import importlib.util from pathlib import Path from typing import List, Dict, Tuple, Optional +def _import_gemm_benchmark(): + """Import validation utilities from commons directory.""" + current_dir = os.path.dirname(os.path.abspath(__file__)) + parent_dir = os.path.dirname(current_dir) -class GemmMultiDBenchmark: - def __init__(self, build_dir: str, verbose: bool = False): - self.build_dir = Path(build_dir) - self.verbose = verbose - self.results = [] - - def discover_kernels(self) -> List[Path]: - """Find all benchmark_gemm_multi_d_* executables in the build directory""" - bin_dir = self.build_dir / "bin" - if not bin_dir.exists(): - print(f"Error: Binary directory {bin_dir} does not exist") - return [] - - kernels = list(bin_dir.glob("benchmark_gemm_multi_d_*")) - if self.verbose: - print(f"Found {len(kernels)} kernel executables") - for k in kernels: - print(f" - {k.name}") - return kernels - - def extract_kernel_info(self, kernel_path: Path) -> Dict[str, str]: - """Extract comprehensive kernel information from filename""" - name = kernel_path.stem - - # Initialize with basic info - info = { - "executable": str(kernel_path), - "name": name, - "data_type": "unknown", - "layout": "unknown", - "pipeline": "unknown", - "scheduler": "unknown", - "epilogue": "unknown", - } - - # Parse the kernel name pattern: - # benchmark_gemm_multi_d_fp16_rcr_mem_default_intrawave_False_False_False_False_False_256x256x32_2x2x1_4x64x16 - parts = name.split("_") - - if len(parts) >= 5: - # Extract data type (3rd part after benchmark_gemm_) - info["data_type"] = parts[4] if len(parts) > 4 else "unknown" - - # Extract layout (4th part) - info["layout"] = parts[5] if len(parts) > 5 else "unknown" - - # Extract pipeline (5th part) - info["pipeline"] = parts[6] if len(parts) > 6 else "unknown" - - # Extract epilogue (6th part) - info["epilogue"] = parts[7] if len(parts) > 7 else "unknown" - - # Extract scheduler (7th part) - info["scheduler"] = parts[8] if len(parts) > 8 else "unknown" - - # Extract detailed configuration from the end of the name - 
config_info = self.parse_detailed_config(name) - info.update(config_info) - - # Generate config ID - info["config_id"] = self.generate_config_id(info) - - return info - - def parse_detailed_config(self, kernel_name: str) -> Dict: - """Parse detailed configuration from kernel name""" - config = { - "tile_sizes": {"tile_m": 0, "tile_n": 0, "tile_k": 0}, - "warp_config": {"warp_m": 0, "warp_n": 0, "warp_k": 0}, - "warp_tile": {"warp_tile_m": 0, "warp_tile_n": 0, "warp_tile_k": 0}, - "optimization_flags": { - "pad_m": False, - "pad_n": False, - "pad_k": False, - "persistent": False, - }, - } - - # Split by underscore and look for patterns - parts = kernel_name.split("_") - - # Look for boolean flags (sequence of True/False values) - bool_sequence = [] - for i, part in enumerate(parts): - if part in ["True", "False"]: - bool_sequence.append(part == "True") - # Continue collecting consecutive boolean values - j = i + 1 - while j < len(parts) and parts[j] in ["True", "False"]: - bool_sequence.append(parts[j] == "True") - j += 1 - break - - # Assign boolean flags if we found them - # Order: pad_m, pad_n, pad_k, persistent (4 flags total) - if len(bool_sequence) >= 4: - config["optimization_flags"]["pad_m"] = bool_sequence[0] - config["optimization_flags"]["pad_n"] = bool_sequence[1] - config["optimization_flags"]["pad_k"] = bool_sequence[2] - config["optimization_flags"]["persistent"] = bool_sequence[3] - - # Look for tile size patterns (e.g., 256x256x32_2x2x1_4x64x16) - # The pattern is: tile_sizes_warp_config_warp_tile - dimension_groups = [] - for part in parts: - if "x" in part and len(part.split("x")) == 3: - try: - dims = [int(x) for x in part.split("x")] - if all(d > 0 for d in dims): - dimension_groups.append(dims) - except ValueError: - continue - - # Assign dimensions based on order and magnitude - if len(dimension_groups) >= 3: - # Sort by magnitude to identify: largest=tile_sizes, smallest=warp_config, middle=warp_tile - sorted_groups = sorted(dimension_groups, key=max, reverse=True) - - # Largest dimensions = tile sizes - config["tile_sizes"]["tile_m"] = sorted_groups[0][0] - config["tile_sizes"]["tile_n"] = sorted_groups[0][1] - config["tile_sizes"]["tile_k"] = sorted_groups[0][2] - - # Smallest dimensions = warp config - config["warp_config"]["warp_m"] = sorted_groups[2][0] - config["warp_config"]["warp_n"] = sorted_groups[2][1] - config["warp_config"]["warp_k"] = sorted_groups[2][2] - - # Middle dimensions = warp tile - config["warp_tile"]["warp_tile_m"] = sorted_groups[1][0] - config["warp_tile"]["warp_tile_n"] = sorted_groups[1][1] - config["warp_tile"]["warp_tile_k"] = sorted_groups[1][2] - elif len(dimension_groups) == 2: - # If only 2 groups, assign based on magnitude - sorted_groups = sorted(dimension_groups, key=max, reverse=True) - - # Larger = tile sizes - config["tile_sizes"]["tile_m"] = sorted_groups[0][0] - config["tile_sizes"]["tile_n"] = sorted_groups[0][1] - config["tile_sizes"]["tile_k"] = sorted_groups[0][2] - - # Smaller = warp config - config["warp_config"]["warp_m"] = sorted_groups[1][0] - config["warp_config"]["warp_n"] = sorted_groups[1][1] - config["warp_config"]["warp_k"] = sorted_groups[1][2] - elif len(dimension_groups) == 1: - # Only one group - assume it's tile sizes - config["tile_sizes"]["tile_m"] = dimension_groups[0][0] - config["tile_sizes"]["tile_n"] = dimension_groups[0][1] - config["tile_sizes"]["tile_k"] = dimension_groups[0][2] - - return config - - def generate_config_id(self, info: Dict) -> str: - """Generate a compact config ID from kernel 
info""" - # Create a compact identifier - parts = [ - info.get("data_type", "unk"), - info.get("layout", "unk"), - info.get("pipeline", "unk"), - info.get("scheduler", "unk"), - ] - - # Add tile configuration if available - tile_sizes = info.get("tile_sizes", {}) - if tile_sizes.get("tile_m", 0) > 0: - tile_str = ( - f"{tile_sizes['tile_m']}x{tile_sizes['tile_n']}x{tile_sizes['tile_k']}" - ) - parts.append(tile_str) - - # Add warp config if available - warp_config = info.get("warp_config", {}) - if warp_config.get("warp_m", 0) > 0: - warp_str = f"w{warp_config['warp_m']}x{warp_config['warp_n']}x{warp_config['warp_k']}" - parts.append(warp_str) - - # Add warp tile if available - warp_tile = info.get("warp_tile", {}) - if warp_tile.get("warp_tile_m", 0) > 0: - warp_tile_str = f"wt{warp_tile['warp_tile_m']}x{warp_tile['warp_tile_n']}x{warp_tile['warp_tile_k']}" - parts.append(warp_tile_str) - - return "_".join(parts) - - def run_kernel(self, kernel_path: Path, params: Dict[str, str]) -> Optional[Dict]: - """Run a single kernel with given parameters and save output to individual JSON file""" - # Create results directory - results_dir = self.build_dir / "results" - results_dir.mkdir(exist_ok=True) - - # Generate unique JSON filename for this kernel - json_file = results_dir / f"{kernel_path.stem}.json" - - cmd = [str(kernel_path)] - - # Add parameters - for key, value in params.items(): - cmd.append(f"-{key}={value}") - - # Add JSON output flag for clean JSON output - cmd.append("-json_output=true") - - if self.verbose: - print(f"Running: {' '.join(cmd)}") - - try: - result = subprocess.run(cmd, capture_output=True, text=True, timeout=60) - - if result.returncode != 0: - print(f"Error running {kernel_path.name}: {result.stderr}") - return None - - # Save raw output to individual JSON file - output = result.stdout.strip() - if output: - with open(json_file, "w") as f: - f.write(output) - - # Parse the JSON file - return self.parse_json_file(json_file) - else: - print(f"No output from {kernel_path.name}") - return None - - except subprocess.TimeoutExpired: - print(f"Timeout running {kernel_path.name}") - return None - except Exception as e: - print(f"Error running {kernel_path.name}: {e}") - return None - - def parse_json_file(self, json_file: Path) -> Optional[Dict]: - """Parse JSON data from individual kernel output file""" - try: - with open(json_file, "r") as f: - content = f.read().strip() - - # Parse the JSON directly since executables produce clean JSON - data = json.loads(content) - - # Return the complete JSON data as-is, just add some convenience fields - result = data.copy() - if "perf_result" in data: - perf = data["perf_result"] - # Add convenience fields for backward compatibility - result["time_ms"] = perf.get("latency(ms)", 0) - result["tflops"] = perf.get("tflops(TFlops)", 0) - result["bandwidth_gb_s"] = perf.get("bandwidth(GB/s)", 0) - - return result - - except json.JSONDecodeError as e: - if self.verbose: - print(f"Failed to parse JSON from {json_file}: {e}") - return None - except Exception as e: - if self.verbose: - print(f"Error reading JSON file {json_file}: {e}") - return None - - def benchmark_problem_size( - self, - kernels: List[Path], - m: int, - n: int, - k: int, - split_k: int = 1, - verify: int = 0, - warmup: int = 50, - repeat: int = 100, - flush_cache: bool = True, - rotating_count: int = 1000, - ) -> List[Dict]: - """Benchmark all kernels for a specific problem size""" - results = [] - - params = { - "m": m, - "n": n, - "k": k, - "split_k": split_k, - "verify": 
verify, - "warmup": warmup, - "repeat": repeat, - "flush_cache": str(flush_cache).lower(), - "rotating_count": rotating_count, - } - - print(f"\nBenchmarking M={m}, N={n}, K={k}, split_k={split_k}") - - for kernel_path in kernels: - kernel_info = self.extract_kernel_info(kernel_path) - result = self.run_kernel(kernel_path, params) - - if result: - # Create new structured result format - structured_result = { - "name": kernel_info["name"], # Add name field for compatibility - "config_id": kernel_info["config_id"], - "problem": result.get("problem", {}), - "perf_result": result.get("perf_result", {}), - "config": { - "data_type": kernel_info["data_type"], - "layout": kernel_info["layout"], - "pipeline": kernel_info["pipeline"], - "scheduler": kernel_info["scheduler"], - "epilogue": kernel_info["epilogue"], - "tile_sizes": kernel_info.get("tile_sizes", {}), - "warp_config": kernel_info.get("warp_config", {}), - "warp_tile": kernel_info.get("warp_tile", {}), - "optimization_flags": kernel_info.get("optimization_flags", {}), - }, - "executable": kernel_info["executable"], - # Keep backward compatibility fields - "time_ms": result.get("time_ms", 0), - "tflops": result.get("tflops", 0), - "bandwidth_gb_s": result.get("bandwidth_gb_s", 0), - } - - results.append(structured_result) - - if self.verbose: - print( - f" {kernel_info['config_id']}: {structured_result['tflops']:.2f} TFLOPS, {structured_result['bandwidth_gb_s']:.2f} GB/s, {structured_result['time_ms']:.2f}ms" - ) - - return results - - def find_best_kernel( - self, results: List[Dict], metric: str = "tflops" - ) -> Optional[Dict]: - """Find the best performing kernel based on metric""" - if not results: - return None - - if metric == "tflops": - return max(results, key=lambda x: x.get("tflops", 0)) - elif metric == "time_ms": - return min(results, key=lambda x: x.get("time_ms", float("inf"))) - elif metric == "bandwidth_gb_s": - return max(results, key=lambda x: x.get("bandwidth_gb_s", 0)) - else: - raise ValueError(f"Unknown metric: {metric}") - - def benchmark_sweep( - self, - problem_sizes: List[Tuple[int, int, int]], - split_k_values: List[int] = [1], - verify: bool = False, - warmup: int = 50, - repeat: int = 100, - flush_cache: bool = True, - rotating_count: int = 1000, - ) -> Dict: - """Run comprehensive benchmark sweep""" - kernels = self.discover_kernels() - if not kernels: - print("No kernels found!") - return {} - - all_results = [] - best_kernels = {} - - for m, n, k in problem_sizes: - for split_k in split_k_values: - results = self.benchmark_problem_size( - kernels, - m, - n, - k, - split_k, - verify=2 if verify else 0, - warmup=warmup, - repeat=repeat, - flush_cache=flush_cache, - rotating_count=rotating_count, - ) - - all_results.extend(results) - - # Find best kernel for this configuration - best = self.find_best_kernel(results) - if best: - key = f"m{m}_n{n}_k{k}_splitk{split_k}" - best_kernels[key] = best - print( - f"Best for {key}: {best['name']} ({best['tflops']:.2f} TFLOPS, {best['bandwidth_gb_s']:.2f} GB/s, {best['time_ms']:.2f}ms)" - ) - - self.results = all_results - return best_kernels - - def export_csv(self, filename: str): - """Export all results to CSV""" - if not self.results: - print("No results to export") - return - - # Get all unique keys from results - all_keys = set() - for result in self.results: - all_keys.update(result.keys()) - - # Sort keys for consistent output - fieldnames = sorted(all_keys) - - with open(filename, "w", newline="") as csvfile: - writer = csv.DictWriter(csvfile, 
fieldnames=fieldnames) - writer.writeheader() - writer.writerows(self.results) - - print(f"Results exported to {filename}") - - def export_best_kernels(self, best_kernels: Dict, filename: str): - """Export best kernel selections to file""" - with open(filename, "w") as f: - f.write("# Best kernel selections\n") - f.write( - "# Format: problem_size -> kernel_name (TFLOPS, bandwidth, latency)\n\n" - ) - - for key, kernel in sorted(best_kernels.items()): - f.write( - f"{key}: {kernel['name']} ({kernel['tflops']:.2f} TFLOPS, {kernel['bandwidth_gb_s']:.2f} GB/s, {kernel['time_ms']:.2f}ms)\n" - ) - - print(f"Best kernels exported to {filename}") - - def export_json(self, filename: str, best_kernels: Dict = None): - """Export all results and best kernels to JSON with comprehensive metadata""" - from datetime import datetime - - # Calculate comprehensive summary statistics for all metrics - successful_results = [r for r in self.results if r.get("tflops", 0) > 0] - - tflops_values = [r.get("tflops", 0) for r in successful_results] - bandwidth_values = [r.get("bandwidth_gb_s", 0) for r in successful_results] - latency_values = [ - r.get("time_ms", 0) for r in successful_results if r.get("time_ms", 0) > 0 - ] - - # Performance breakdown by kernel type - pipeline_stats = {} - scheduler_stats = {} - data_type_stats = {} - - for result in successful_results: - # Get config info from the new structure - config = result.get("config", {}) - - # Pipeline statistics - pipeline = config.get("pipeline", "unknown") - if pipeline not in pipeline_stats: - pipeline_stats[pipeline] = { - "count": 0, - "avg_tflops": 0, - "best_tflops": 0, - } - pipeline_stats[pipeline]["count"] += 1 - pipeline_stats[pipeline]["best_tflops"] = max( - pipeline_stats[pipeline]["best_tflops"], result.get("tflops", 0) - ) - - # Scheduler statistics - scheduler = config.get("scheduler", "unknown") - if scheduler not in scheduler_stats: - scheduler_stats[scheduler] = { - "count": 0, - "avg_tflops": 0, - "best_tflops": 0, - } - scheduler_stats[scheduler]["count"] += 1 - scheduler_stats[scheduler]["best_tflops"] = max( - scheduler_stats[scheduler]["best_tflops"], result.get("tflops", 0) - ) + # Load the module dynamically + spec = importlib.util.spec_from_file_location( + "gemm_benchmark", + os.path.join(parent_dir, "gemm_benchmark.py"), + ) + gemm_benchmark_module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(gemm_benchmark_module) - # Data type statistics - data_type = config.get("data_type", "unknown") - if data_type not in data_type_stats: - data_type_stats[data_type] = { - "count": 0, - "avg_tflops": 0, - "best_tflops": 0, - } - data_type_stats[data_type]["count"] += 1 - data_type_stats[data_type]["best_tflops"] = max( - data_type_stats[data_type]["best_tflops"], result.get("tflops", 0) - ) + return gemm_benchmark_module.GemmBenchmark - # Calculate averages for breakdown stats - for stats_dict, field_name in [ - (pipeline_stats, "pipeline"), - (scheduler_stats, "scheduler"), - (data_type_stats, "data_type"), - ]: - for key in stats_dict: - relevant_results = [ - r - for r in successful_results - if r.get("config", {}).get(field_name, "unknown") == key - ] - if relevant_results: - stats_dict[key]["avg_tflops"] = sum( - r.get("tflops", 0) for r in relevant_results - ) / len(relevant_results) +def _import_benchmark_utils(): + """Import benchmark utilities from commons directory.""" + current_dir = os.path.dirname(os.path.abspath(__file__)) + parent_dir = os.path.dirname(os.path.dirname(current_dir)) - output_data = { - 
"benchmark_metadata": { - "timestamp": datetime.now().isoformat(), - "total_kernels_tested": len(self.results), - "unique_kernels": len( - set(r.get("name", "unknown") for r in self.results) - ), - "successful_runs": len(successful_results), - "failed_runs": len(self.results) - len(successful_results), - }, - "performance_summary": { - "tflops_stats": { - "best": max(tflops_values, default=0), - "average": sum(tflops_values) / len(tflops_values) - if tflops_values - else 0, - "min": min(tflops_values, default=0), - "median": sorted(tflops_values)[len(tflops_values) // 2] - if tflops_values - else 0, - }, - "bandwidth_stats": { - "best_gb_s": max(bandwidth_values, default=0), - "average_gb_s": sum(bandwidth_values) / len(bandwidth_values) - if bandwidth_values - else 0, - "min_gb_s": min(bandwidth_values, default=0), - "median_gb_s": sorted(bandwidth_values)[len(bandwidth_values) // 2] - if bandwidth_values - else 0, - }, - "latency_stats": { - "best_ms": min(latency_values, default=0), - "average_ms": sum(latency_values) / len(latency_values) - if latency_values - else 0, - "max_ms": max(latency_values, default=0), - "median_ms": sorted(latency_values)[len(latency_values) // 2] - if latency_values - else 0, - }, - "kernel_type_breakdown": { - "by_pipeline": pipeline_stats, - "by_scheduler": scheduler_stats, - "by_data_type": data_type_stats, - }, - "total_problem_configurations": len(best_kernels) - if best_kernels - else 0, - }, - "kernel_results": self.results, - "best_kernels_by_problem": best_kernels or {}, - } + # Load the module dynamically + spec = importlib.util.spec_from_file_location( + "benchmark_utils", + os.path.join(parent_dir, "commons", "benchmark_utils.py"), + ) + benchmark_utils = importlib.util.module_from_spec(spec) + spec.loader.exec_module(benchmark_utils) - with open(filename, "w") as f: - json.dump(output_data, f, indent=2) + return benchmark_utils - print(f"JSON results exported to {filename}") - print(f" - Total kernels: {len(self.results)}") - print(f" - Successful runs: {len(successful_results)}") - print(f" - Best TFLOPS: {max(tflops_values, default=0):.2f}") - print(f" - Best bandwidth: {max(bandwidth_values, default=0):.2f} GB/s") - print(f" - Best latency: {min(latency_values, default=0):.2f}ms") +GemmBenchmark = _import_gemm_benchmark() +benchmark_utils = _import_benchmark_utils() +class GemmMultiDBenchmark(GemmBenchmark): + def __init__(self, build_dir: str, verbose: bool = False): + super().__init__(build_dir, verbose, name="benchmark_gemm_multi_d_") def main(): parser = argparse.ArgumentParser( @@ -668,12 +134,12 @@ def main(): print(f"\nBenchmark completed in {elapsed_time:.2f} seconds") # Export results - benchmark.export_csv(args.csv) - benchmark.export_best_kernels(best_kernels, args.best) + benchmark_utils.export_csv(benchmark.results, args.csv) + benchmark_utils.export_best_kernels(best_kernels, args.best) # Export JSON if requested if args.json: - benchmark.export_json(args.json, best_kernels) + benchmark_utils.export_json(benchmark.results, args.json, best_kernels) return 0 diff --git a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py index 53ae6336faf..1ea33834d7d 100644 --- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py +++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py @@ -1,588 +1,54 @@ # Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
# SPDX-License-Identifier: MIT +import os import sys import json import subprocess import argparse import csv import time +import importlib.util from pathlib import Path from typing import List, Dict, Tuple, Optional -class GemmPreshuffleBenchmark: - def __init__(self, build_dir: str, verbose: bool = False): - self.build_dir = Path(build_dir) - self.verbose = verbose - self.results = [] - - def discover_kernels(self) -> List[Path]: - """Find all benchmark_gemm_preshuffle* executables in the build directory""" - bin_dir = self.build_dir / "bin" - if not bin_dir.exists(): - print(f"Error: Binary directory {bin_dir} does not exist") - return [] - - kernels = list(bin_dir.glob("benchmark_gemm_preshuffle*")) - if self.verbose: - print(f"Found {len(kernels)} kernel executables") - for k in kernels: - print(f" - {k.name}") - return kernels - - def extract_kernel_info(self, kernel_path: Path) -> Dict[str, str]: - """Extract comprehensive kernel information from filename""" - name = kernel_path.stem - - # Initialize with basic info - info = { - "executable": str(kernel_path), - "name": name, - "data_type": "unknown", - "layout": "unknown", - "pipeline": "unknown", - "scheduler": "unknown", - "epilogue": "unknown", - } - - # Parse the kernel name pattern: - # benchmark_gemm_preshuffle_fp16_rcr_mem_default_intrawave_False_False_False_False_False_256x256x32_2x2x1_4x64x16 - parts = name.split("_") - - if len(parts) >= 4: - # Extract data type (4rd part after benchmark_gemm_preshuffle_) - info["data_type"] = parts[3] if len(parts) > 2 else "unknown" - - # Extract layout (5th part) - info["layout"] = parts[4] if len(parts) > 3 else "unknown" - - # Extract pipeline (6th part) - info["pipeline"] = parts[5] if len(parts) > 4 else "unknown" - - # Extract epilogue (7th part) - info["epilogue"] = parts[6] if len(parts) > 5 else "unknown" - - # Extract scheduler (8th part) - info["scheduler"] = parts[7] if len(parts) > 6 else "unknown" - - # Extract detailed configuration from the end of the name - config_info = self.parse_detailed_config(name) - info.update(config_info) - - # Generate config ID - info["config_id"] = self.generate_config_id(info) - - return info - - def parse_detailed_config(self, kernel_name: str) -> Dict: - """Parse detailed configuration from kernel name""" - config = { - "tile_sizes": {"tile_m": 0, "tile_n": 0, "tile_k": 0}, - "warp_config": {"warp_m": 0, "warp_n": 0, "warp_k": 0}, - "warp_tile": {"warp_tile_m": 0, "warp_tile_n": 0, "warp_tile_k": 0}, - "optimization_flags": { - "pad_m": False, - "pad_n": False, - "pad_k": False, - "persistent": False, - }, - } - - # Split by underscore and look for patterns - parts = kernel_name.split("_") - - # Look for boolean flags (sequence of True/False values) - bool_sequence = [] - for i, part in enumerate(parts): - if part in ["True", "False"]: - bool_sequence.append(part == "True") - # Continue collecting consecutive boolean values - j = i + 1 - while j < len(parts) and parts[j] in ["True", "False"]: - bool_sequence.append(parts[j] == "True") - j += 1 - break - - # Assign boolean flags if we found them - # Order: pad_m, pad_n, pad_k, persistent (4 flags total) - if len(bool_sequence) >= 4: - config["optimization_flags"]["pad_m"] = bool_sequence[0] - config["optimization_flags"]["pad_n"] = bool_sequence[1] - config["optimization_flags"]["pad_k"] = bool_sequence[2] - config["optimization_flags"]["persistent"] = bool_sequence[3] - - # Look for tile size patterns (e.g., 256x256x32_2x2x1_4x64x16) - # The pattern is: tile_sizes_warp_config_warp_tile - 
dimension_groups = [] - for part in parts: - if "x" in part and len(part.split("x")) == 3: - try: - dims = [int(x) for x in part.split("x")] - if all(d > 0 for d in dims): - dimension_groups.append(dims) - except ValueError: - continue - - # Assign dimensions based on order and magnitude - if len(dimension_groups) >= 3: - # Sort by magnitude to identify: largest=tile_sizes, smallest=warp_config, middle=warp_tile - sorted_groups = sorted(dimension_groups, key=lambda x: max(x), reverse=True) - - # Largest dimensions = tile sizes - config["tile_sizes"]["tile_m"] = sorted_groups[0][0] - config["tile_sizes"]["tile_n"] = sorted_groups[0][1] - config["tile_sizes"]["tile_k"] = sorted_groups[0][2] - - # Smallest dimensions = warp config - config["warp_config"]["warp_m"] = sorted_groups[2][0] - config["warp_config"]["warp_n"] = sorted_groups[2][1] - config["warp_config"]["warp_k"] = sorted_groups[2][2] - - # Middle dimensions = warp tile - config["warp_tile"]["warp_tile_m"] = sorted_groups[1][0] - config["warp_tile"]["warp_tile_n"] = sorted_groups[1][1] - config["warp_tile"]["warp_tile_k"] = sorted_groups[1][2] - elif len(dimension_groups) == 2: - # If only 2 groups, assign based on magnitude - sorted_groups = sorted(dimension_groups, key=lambda x: max(x), reverse=True) - - # Larger = tile sizes - config["tile_sizes"]["tile_m"] = sorted_groups[0][0] - config["tile_sizes"]["tile_n"] = sorted_groups[0][1] - config["tile_sizes"]["tile_k"] = sorted_groups[0][2] - - # Smaller = warp config - config["warp_config"]["warp_m"] = sorted_groups[1][0] - config["warp_config"]["warp_n"] = sorted_groups[1][1] - config["warp_config"]["warp_k"] = sorted_groups[1][2] - elif len(dimension_groups) == 1: - # Only one group - assume it's tile sizes - config["tile_sizes"]["tile_m"] = dimension_groups[0][0] - config["tile_sizes"]["tile_n"] = dimension_groups[0][1] - config["tile_sizes"]["tile_k"] = dimension_groups[0][2] - - return config - - def generate_config_id(self, info: Dict) -> str: - """Generate a compact config ID from kernel info""" - # Create a compact identifier - parts = [ - info.get("data_type", "unk"), - info.get("layout", "unk"), - info.get("pipeline", "unk"), - info.get("scheduler", "unk"), - ] - - # Add tile configuration if available - tile_sizes = info.get("tile_sizes", {}) - if tile_sizes.get("tile_m", 0) > 0: - tile_str = ( - f"{tile_sizes['tile_m']}x{tile_sizes['tile_n']}x{tile_sizes['tile_k']}" - ) - parts.append(tile_str) - - # Add warp config if available - warp_config = info.get("warp_config", {}) - if warp_config.get("warp_m", 0) > 0: - warp_str = f"w{warp_config['warp_m']}x{warp_config['warp_n']}x{warp_config['warp_k']}" - parts.append(warp_str) - - # Add warp tile if available - warp_tile = info.get("warp_tile", {}) - if warp_tile.get("warp_tile_m", 0) > 0: - warp_tile_str = f"wt{warp_tile['warp_tile_m']}x{warp_tile['warp_tile_n']}x{warp_tile['warp_tile_k']}" - parts.append(warp_tile_str) - - return "_".join(parts) - - def run_kernel(self, kernel_path: Path, params: Dict[str, str]) -> Optional[Dict]: - """Run a single kernel with given parameters and save output to individual JSON file""" - # Create results directory - results_dir = self.build_dir / "results" - results_dir.mkdir(exist_ok=True) - - # Generate unique JSON filename for this kernel - json_file = results_dir / f"{kernel_path.stem}.json" - - cmd = [str(kernel_path)] - - # Add parameters - for key, value in params.items(): - cmd.append(f"-{key}={value}") - - # Add JSON output flag for clean JSON output - 
cmd.append("-json_output=true") - - if self.verbose: - print(f"Running: {' '.join(cmd)}") - - try: - result = subprocess.run(cmd, capture_output=True, text=True, timeout=60) - - if result.returncode != 0: - print(f"Error running {kernel_path.name}: {result.stderr}") - return None - - # Save raw output to individual JSON file - output = result.stdout.strip() - - if output: - with open(json_file, "w") as f: - f.write(output) +def _import_gemm_benchmark(): + """Import validation utilities from commons directory.""" + current_dir = os.path.dirname(os.path.abspath(__file__)) + parent_dir = os.path.dirname(current_dir) - # Parse the JSON file - return self.parse_json_file(json_file) - else: - print(f"No output from {kernel_path.name}") - return None - - except subprocess.TimeoutExpired: - print(f"Timeout running {kernel_path.name}") - return None - except Exception as e: - print(f"Error running {kernel_path.name}: {e}") - return None - - def parse_json_file(self, json_file: Path) -> Optional[Dict]: - """Parse JSON data from individual kernel output file""" - try: - with open(json_file, "r") as f: - content = f.read().strip() - - # Parse the JSON directly since executables produce clean JSON - data = json.loads(content) - - # Return the complete JSON data as-is, just add some convenience fields - result = data.copy() - if "perf_result" in data: - perf = data["perf_result"] - # Add convenience fields for backward compatibility - result["time_ms"] = perf.get("latency(ms)", 0) - result["tflops"] = perf.get("tflops(TFlops)", 0) - result["bandwidth_gb_s"] = perf.get("bandwidth(GB/s)", 0) - - return result - - except json.JSONDecodeError as e: - if self.verbose: - print(f"Failed to parse JSON from {json_file}: {e}") - return None - except Exception as e: - if self.verbose: - print(f"Error reading JSON file {json_file}: {e}") - return None - - def benchmark_problem_size( - self, - kernels: List[Path], - m: int, - n: int, - k: int, - split_k: int = 1, - verify: int = 0, - warmup: int = 50, - repeat: int = 100, - flush_cache: bool = True, - rotating_count: int = 1000, - ) -> List[Dict]: - """Benchmark all kernels for a specific problem size""" - results = [] - - params = { - "m": m, - "n": n, - "k": k, - "split_k": split_k, - "verify": verify, - "warmup": warmup, - "repeat": repeat, - "flush_cache": str(flush_cache).lower(), - "rotating_count": rotating_count, - } - - print(f"\nBenchmarking M={m}, N={n}, K={k}, split_k={split_k}") - - for kernel_path in kernels: - kernel_info = self.extract_kernel_info(kernel_path) - result = self.run_kernel(kernel_path, params) - - if result: - # Create new structured result format - structured_result = { - "name": kernel_info["name"], # Add name field for compatibility - "config_id": kernel_info["config_id"], - "problem": result.get("problem", {}), - "perf_result": result.get("perf_result", {}), - "config": { - "data_type": kernel_info["data_type"], - "layout": kernel_info["layout"], - "pipeline": kernel_info["pipeline"], - "scheduler": kernel_info["scheduler"], - "epilogue": kernel_info["epilogue"], - "tile_sizes": kernel_info.get("tile_sizes", {}), - "warp_config": kernel_info.get("warp_config", {}), - "warp_tile": kernel_info.get("warp_tile", {}), - "optimization_flags": kernel_info.get("optimization_flags", {}), - }, - "executable": kernel_info["executable"], - # Keep backward compatibility fields - "time_ms": result.get("time_ms", 0), - "tflops": result.get("tflops", 0), - "bandwidth_gb_s": result.get("bandwidth_gb_s", 0), - } - - results.append(structured_result) 
- - if self.verbose: - print( - f" {kernel_info['config_id']}: {structured_result['tflops']:.2f} TFLOPS, {structured_result['bandwidth_gb_s']:.2f} GB/s, {structured_result['time_ms']:.2f}ms" - ) - - return results - - def find_best_kernel( - self, results: List[Dict], metric: str = "tflops" - ) -> Optional[Dict]: - """Find the best performing kernel based on metric""" - if not results: - return None - - if metric == "tflops": - return max(results, key=lambda x: x.get("tflops", 0)) - elif metric == "time_ms": - return min(results, key=lambda x: x.get("time_ms", float("inf"))) - elif metric == "bandwidth_gb_s": - return max(results, key=lambda x: x.get("bandwidth_gb_s", 0)) - else: - raise ValueError(f"Unknown metric: {metric}") - - def benchmark_sweep( - self, - problem_sizes: List[Tuple[int, int, int]], - split_k_values: List[int] = [1], - verify: bool = False, - warmup: int = 50, - repeat: int = 100, - flush_cache: bool = True, - rotating_count: int = 1000, - ) -> Dict: - """Run comprehensive benchmark sweep""" - kernels = self.discover_kernels() - if not kernels: - print("No kernels found!") - return {} - - all_results = [] - best_kernels = {} - - for m, n, k in problem_sizes: - for split_k in split_k_values: - results = self.benchmark_problem_size( - kernels, - m, - n, - k, - split_k, - verify=2 if verify else 0, - warmup=warmup, - repeat=repeat, - flush_cache=flush_cache, - rotating_count=rotating_count, - ) - - all_results.extend(results) - - # Find best kernel for this configuration - best = self.find_best_kernel(results) - if best: - key = f"m{m}_n{n}_k{k}_splitk{split_k}" - best_kernels[key] = best - print( - f"Best for {key}: {best['name']} ({best['tflops']:.2f} TFLOPS, {best['bandwidth_gb_s']:.2f} GB/s, {best['time_ms']:.2f}ms)" - ) - - self.results = all_results - return best_kernels - - def export_csv(self, filename: str): - """Export all results to CSV""" - if not self.results: - print("No results to export") - return - - # Get all unique keys from results - all_keys = set() - for result in self.results: - all_keys.update(result.keys()) - - # Sort keys for consistent output - fieldnames = sorted(all_keys) - - with open(filename, "w", newline="") as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(self.results) - - print(f"Results exported to {filename}") - - def export_best_kernels(self, best_kernels: Dict, filename: str): - """Export best kernel selections to file""" - with open(filename, "w") as f: - f.write("# Best kernel selections\n") - f.write( - "# Format: problem_size -> kernel_name (TFLOPS, bandwidth, latency)\n\n" - ) - - for key, kernel in sorted(best_kernels.items()): - f.write( - f"{key}: {kernel['name']} ({kernel['tflops']:.2f} TFLOPS, {kernel['bandwidth_gb_s']:.2f} GB/s, {kernel['time_ms']:.2f}ms)\n" - ) - - print(f"Best kernels exported to {filename}") - - def export_json(self, filename: str, best_kernels: Dict = None): - """Export all results and best kernels to JSON with comprehensive metadata""" - from datetime import datetime - - # Calculate comprehensive summary statistics for all metrics - successful_results = [r for r in self.results if r.get("tflops", 0) > 0] - - tflops_values = [r.get("tflops", 0) for r in successful_results] - bandwidth_values = [r.get("bandwidth_gb_s", 0) for r in successful_results] - latency_values = [ - r.get("time_ms", 0) for r in successful_results if r.get("time_ms", 0) > 0 - ] - - # Performance breakdown by kernel type - pipeline_stats = {} - scheduler_stats = {} - 
data_type_stats = {} - - for result in successful_results: - # Get config info from the new structure - config = result.get("config", {}) - - # Pipeline statistics - pipeline = config.get("pipeline", "unknown") - if pipeline not in pipeline_stats: - pipeline_stats[pipeline] = { - "count": 0, - "avg_tflops": 0, - "best_tflops": 0, - } - pipeline_stats[pipeline]["count"] += 1 - pipeline_stats[pipeline]["best_tflops"] = max( - pipeline_stats[pipeline]["best_tflops"], result.get("tflops", 0) - ) - - # Scheduler statistics - scheduler = config.get("scheduler", "unknown") - if scheduler not in scheduler_stats: - scheduler_stats[scheduler] = { - "count": 0, - "avg_tflops": 0, - "best_tflops": 0, - } - scheduler_stats[scheduler]["count"] += 1 - scheduler_stats[scheduler]["best_tflops"] = max( - scheduler_stats[scheduler]["best_tflops"], result.get("tflops", 0) - ) + # Load the module dynamically + spec = importlib.util.spec_from_file_location( + "gemm_benchmark", + os.path.join(parent_dir, "gemm_benchmark.py"), + ) + gemm_benchmark_module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(gemm_benchmark_module) - # Data type statistics - data_type = config.get("data_type", "unknown") - if data_type not in data_type_stats: - data_type_stats[data_type] = { - "count": 0, - "avg_tflops": 0, - "best_tflops": 0, - } - data_type_stats[data_type]["count"] += 1 - data_type_stats[data_type]["best_tflops"] = max( - data_type_stats[data_type]["best_tflops"], result.get("tflops", 0) - ) + return gemm_benchmark_module.GemmBenchmark - # Calculate averages for breakdown stats - for stats_dict, field_name in [ - (pipeline_stats, "pipeline"), - (scheduler_stats, "scheduler"), - (data_type_stats, "data_type"), - ]: - for key in stats_dict: - relevant_results = [ - r - for r in successful_results - if r.get("config", {}).get(field_name, "unknown") == key - ] - if relevant_results: - stats_dict[key]["avg_tflops"] = sum( - r.get("tflops", 0) for r in relevant_results - ) / len(relevant_results) +def _import_benchmark_utils(): + """Import benchmark utilities from commons directory.""" + current_dir = os.path.dirname(os.path.abspath(__file__)) + parent_dir = os.path.dirname(os.path.dirname(current_dir)) - output_data = { - "benchmark_metadata": { - "timestamp": datetime.now().isoformat(), - "total_kernels_tested": len(self.results), - "unique_kernels": len( - set(r.get("name", "unknown") for r in self.results) - ), - "successful_runs": len(successful_results), - "failed_runs": len(self.results) - len(successful_results), - }, - "performance_summary": { - "tflops_stats": { - "best": max(tflops_values, default=0), - "average": sum(tflops_values) / len(tflops_values) - if tflops_values - else 0, - "min": min(tflops_values, default=0), - "median": sorted(tflops_values)[len(tflops_values) // 2] - if tflops_values - else 0, - }, - "bandwidth_stats": { - "best_gb_s": max(bandwidth_values, default=0), - "average_gb_s": sum(bandwidth_values) / len(bandwidth_values) - if bandwidth_values - else 0, - "min_gb_s": min(bandwidth_values, default=0), - "median_gb_s": sorted(bandwidth_values)[len(bandwidth_values) // 2] - if bandwidth_values - else 0, - }, - "latency_stats": { - "best_ms": min(latency_values, default=0), - "average_ms": sum(latency_values) / len(latency_values) - if latency_values - else 0, - "max_ms": max(latency_values, default=0), - "median_ms": sorted(latency_values)[len(latency_values) // 2] - if latency_values - else 0, - }, - "kernel_type_breakdown": { - "by_pipeline": pipeline_stats, - 
"by_scheduler": scheduler_stats, - "by_data_type": data_type_stats, - }, - "total_problem_configurations": len(best_kernels) - if best_kernels - else 0, - }, - "kernel_results": self.results, - "best_kernels_by_problem": best_kernels or {}, - } + # Load the module dynamically + spec = importlib.util.spec_from_file_location( + "benchmark_utils", + os.path.join(parent_dir, "commons", "benchmark_utils.py"), + ) + benchmark_utils = importlib.util.module_from_spec(spec) + spec.loader.exec_module(benchmark_utils) - with open(filename, "w") as f: - json.dump(output_data, f, indent=2) + return benchmark_utils - print(f"JSON results exported to {filename}") - print(f" - Total kernels: {len(self.results)}") - print(f" - Successful runs: {len(successful_results)}") - print(f" - Best TFLOPS: {max(tflops_values, default=0):.2f}") - print(f" - Best bandwidth: {max(bandwidth_values, default=0):.2f} GB/s") - print(f" - Best latency: {min(latency_values, default=0):.2f}ms") +GemmBenchmark = _import_gemm_benchmark() +benchmark_utils = _import_benchmark_utils() +class GemmPreshuffleBenchmark(GemmBenchmark): + def __init__(self, build_dir: str, verbose: bool = False): + super().__init__(build_dir, verbose, name="benchmark_gemm_preshuffle_") def main(): parser = argparse.ArgumentParser( @@ -669,12 +135,12 @@ def main(): print(f"\nBenchmark completed in {elapsed_time:.2f} seconds") # Export results - benchmark.export_csv(args.csv) - benchmark.export_best_kernels(best_kernels, args.best) + benchmark_utils.export_csv(benchmark.results, args.csv) + benchmark_utils.export_best_kernels(best_kernels, args.best) # Export JSON if requested if args.json: - benchmark.export_json(args.json, best_kernels) + benchmark_utils.export_json(benchmark.results, args.json, best_kernels) return 0 diff --git a/tile_engine/ops/gemm/gemm_universal/CMakeLists.txt b/tile_engine/ops/gemm/gemm_universal/CMakeLists.txt index 7505fcd6d04..7f8048b5945 100644 --- a/tile_engine/ops/gemm/gemm_universal/CMakeLists.txt +++ b/tile_engine/ops/gemm/gemm_universal/CMakeLists.txt @@ -68,7 +68,7 @@ function(create_individual_gemm_universal_target datatype layout trait tile_conf # Create the executable add_executable(${target_name} EXCLUDE_FROM_ALL - ${GEMM_UNIVERSAL_SOURCE_DIR}/gemm_benchmark_single.cpp + ${GEMM_UNIVERSAL_SOURCE_DIR}/gemm_universal_benchmark_single.cpp ${instance_header} ) diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py b/tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py deleted file mode 100644 index b7424c6d1da..00000000000 --- a/tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py +++ /dev/null @@ -1,678 +0,0 @@ -# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-# SPDX-License-Identifier: MIT - -import sys -import json -import subprocess -import argparse -import csv -import time -from pathlib import Path -from typing import List, Dict, Tuple, Optional - - -class GemmBenchmark: - def __init__(self, build_dir: str, verbose: bool = False): - self.build_dir = Path(build_dir) - self.verbose = verbose - self.results = [] - - def discover_kernels(self) -> List[Path]: - """Find all benchmark_gemm_* executables in the build directory""" - bin_dir = self.build_dir / "bin" - if not bin_dir.exists(): - print(f"Error: Binary directory {bin_dir} does not exist") - return [] - - kernels = list(bin_dir.glob("benchmark_gemm_*")) - if self.verbose: - print(f"Found {len(kernels)} kernel executables") - for k in kernels: - print(f" - {k.name}") - return kernels - - def extract_kernel_info(self, kernel_path: Path) -> Dict[str, str]: - """Extract comprehensive kernel information from filename""" - name = kernel_path.stem - - # Initialize with basic info - info = { - "executable": str(kernel_path), - "name": name, - "data_type": "unknown", - "layout": "unknown", - "pipeline": "unknown", - "scheduler": "unknown", - "epilogue": "unknown", - } - - # Parse the kernel name pattern: - # benchmark_gemm_fp16_rcr_mem_default_intrawave_False_False_False_False_False_256x256x32_2x2x1_4x64x16 - parts = name.split("_") - - if len(parts) >= 3: - # Extract data type (3rd part after benchmark_gemm_) - info["data_type"] = parts[2] if len(parts) > 2 else "unknown" - - # Extract layout (4th part) - info["layout"] = parts[3] if len(parts) > 3 else "unknown" - - # Extract pipeline (5th part) - info["pipeline"] = parts[4] if len(parts) > 4 else "unknown" - - # Extract epilogue (6th part) - info["epilogue"] = parts[5] if len(parts) > 5 else "unknown" - - # Extract scheduler (7th part) - info["scheduler"] = parts[6] if len(parts) > 6 else "unknown" - - # Extract detailed configuration from the end of the name - config_info = self.parse_detailed_config(name) - info.update(config_info) - - # Generate config ID - info["config_id"] = self.generate_config_id(info) - - return info - - def parse_detailed_config(self, kernel_name: str) -> Dict: - """Parse detailed configuration from kernel name""" - config = { - "tile_sizes": {"tile_m": 0, "tile_n": 0, "tile_k": 0}, - "warp_config": {"warp_m": 0, "warp_n": 0, "warp_k": 0}, - "warp_tile": {"warp_tile_m": 0, "warp_tile_n": 0, "warp_tile_k": 0}, - "optimization_flags": { - "pad_m": False, - "pad_n": False, - "pad_k": False, - "persistent": False, - }, - } - - # Split by underscore and look for patterns - parts = kernel_name.split("_") - - # Look for boolean flags (sequence of True/False values) - bool_sequence = [] - for i, part in enumerate(parts): - if part in ["True", "False"]: - bool_sequence.append(part == "True") - # Continue collecting consecutive boolean values - j = i + 1 - while j < len(parts) and parts[j] in ["True", "False"]: - bool_sequence.append(parts[j] == "True") - j += 1 - break - - # Assign boolean flags if we found them - # Order: pad_m, pad_n, pad_k, persistent (4 flags total) - if len(bool_sequence) >= 4: - config["optimization_flags"]["pad_m"] = bool_sequence[0] - config["optimization_flags"]["pad_n"] = bool_sequence[1] - config["optimization_flags"]["pad_k"] = bool_sequence[2] - config["optimization_flags"]["persistent"] = bool_sequence[3] - - # Look for tile size patterns (e.g., 256x256x32_2x2x1_4x64x16) - # The pattern is: tile_sizes_warp_config_warp_tile - dimension_groups = [] - for part in parts: - if "x" in part and 
len(part.split("x")) == 3: - try: - dims = [int(x) for x in part.split("x")] - if all(d > 0 for d in dims): - dimension_groups.append(dims) - except ValueError: - continue - - # Assign dimensions based on order and magnitude - if len(dimension_groups) >= 3: - # Sort by magnitude to identify: largest=tile_sizes, smallest=warp_config, middle=warp_tile - sorted_groups = sorted(dimension_groups, key=lambda x: max(x), reverse=True) - - # Largest dimensions = tile sizes - config["tile_sizes"]["tile_m"] = sorted_groups[0][0] - config["tile_sizes"]["tile_n"] = sorted_groups[0][1] - config["tile_sizes"]["tile_k"] = sorted_groups[0][2] - - # Smallest dimensions = warp config - config["warp_config"]["warp_m"] = sorted_groups[2][0] - config["warp_config"]["warp_n"] = sorted_groups[2][1] - config["warp_config"]["warp_k"] = sorted_groups[2][2] - - # Middle dimensions = warp tile - config["warp_tile"]["warp_tile_m"] = sorted_groups[1][0] - config["warp_tile"]["warp_tile_n"] = sorted_groups[1][1] - config["warp_tile"]["warp_tile_k"] = sorted_groups[1][2] - elif len(dimension_groups) == 2: - # If only 2 groups, assign based on magnitude - sorted_groups = sorted(dimension_groups, key=lambda x: max(x), reverse=True) - - # Larger = tile sizes - config["tile_sizes"]["tile_m"] = sorted_groups[0][0] - config["tile_sizes"]["tile_n"] = sorted_groups[0][1] - config["tile_sizes"]["tile_k"] = sorted_groups[0][2] - - # Smaller = warp config - config["warp_config"]["warp_m"] = sorted_groups[1][0] - config["warp_config"]["warp_n"] = sorted_groups[1][1] - config["warp_config"]["warp_k"] = sorted_groups[1][2] - elif len(dimension_groups) == 1: - # Only one group - assume it's tile sizes - config["tile_sizes"]["tile_m"] = dimension_groups[0][0] - config["tile_sizes"]["tile_n"] = dimension_groups[0][1] - config["tile_sizes"]["tile_k"] = dimension_groups[0][2] - - return config - - def generate_config_id(self, info: Dict) -> str: - """Generate a compact config ID from kernel info""" - # Create a compact identifier - parts = [ - info.get("data_type", "unk"), - info.get("layout", "unk"), - info.get("pipeline", "unk"), - info.get("scheduler", "unk"), - ] - - # Add tile configuration if available - tile_sizes = info.get("tile_sizes", {}) - if tile_sizes.get("tile_m", 0) > 0: - tile_str = ( - f"{tile_sizes['tile_m']}x{tile_sizes['tile_n']}x{tile_sizes['tile_k']}" - ) - parts.append(tile_str) - - # Add warp config if available - warp_config = info.get("warp_config", {}) - if warp_config.get("warp_m", 0) > 0: - warp_str = f"w{warp_config['warp_m']}x{warp_config['warp_n']}x{warp_config['warp_k']}" - parts.append(warp_str) - - # Add warp tile if available - warp_tile = info.get("warp_tile", {}) - if warp_tile.get("warp_tile_m", 0) > 0: - warp_tile_str = f"wt{warp_tile['warp_tile_m']}x{warp_tile['warp_tile_n']}x{warp_tile['warp_tile_k']}" - parts.append(warp_tile_str) - - return "_".join(parts) - - def run_kernel(self, kernel_path: Path, params: Dict[str, str]) -> Optional[Dict]: - """Run a single kernel with given parameters and save output to individual JSON file""" - # Create results directory - results_dir = self.build_dir / "results" - results_dir.mkdir(exist_ok=True) - - # Generate unique JSON filename for this kernel - json_file = results_dir / f"{kernel_path.stem}.json" - - cmd = [str(kernel_path)] - - # Add parameters - for key, value in params.items(): - cmd.append(f"-{key}={value}") - - # Add JSON output flag for clean JSON output - cmd.append("-json_output=true") - - if self.verbose: - print(f"Running: {' '.join(cmd)}") - 
- try: - result = subprocess.run(cmd, capture_output=True, text=True, timeout=60) - - if result.returncode != 0: - print(f"Error running {kernel_path.name}: {result.stderr}") - return None - - # Save raw output to individual JSON file - output = result.stdout.strip() - if output: - with open(json_file, "w") as f: - f.write(output) - - # Parse the JSON file - return self.parse_json_file(json_file) - else: - print(f"No output from {kernel_path.name}") - return None - - except subprocess.TimeoutExpired: - print(f"Timeout running {kernel_path.name}") - return None - except Exception as e: - print(f"Error running {kernel_path.name}: {e}") - return None - - def parse_json_file(self, json_file: Path) -> Optional[Dict]: - """Parse JSON data from individual kernel output file""" - try: - with open(json_file, "r") as f: - content = f.read().strip() - - # Parse the JSON directly since executables produce clean JSON - data = json.loads(content) - - # Return the complete JSON data as-is, just add some convenience fields - result = data.copy() - if "perf_result" in data: - perf = data["perf_result"] - # Add convenience fields for backward compatibility - result["time_ms"] = perf.get("latency(ms)", 0) - result["tflops"] = perf.get("tflops(TFlops)", 0) - result["bandwidth_gb_s"] = perf.get("bandwidth(GB/s)", 0) - - return result - - except json.JSONDecodeError as e: - if self.verbose: - print(f"Failed to parse JSON from {json_file}: {e}") - return None - except Exception as e: - if self.verbose: - print(f"Error reading JSON file {json_file}: {e}") - return None - - def benchmark_problem_size( - self, - kernels: List[Path], - m: int, - n: int, - k: int, - split_k: int = 1, - verify: int = 0, - warmup: int = 50, - repeat: int = 100, - flush_cache: bool = True, - rotating_count: int = 1000, - ) -> List[Dict]: - """Benchmark all kernels for a specific problem size""" - results = [] - - params = { - "m": m, - "n": n, - "k": k, - "split_k": split_k, - "verify": verify, - "warmup": warmup, - "repeat": repeat, - "flush_cache": str(flush_cache).lower(), - "rotating_count": rotating_count, - } - - print(f"\nBenchmarking M={m}, N={n}, K={k}, split_k={split_k}") - - for kernel_path in kernels: - kernel_info = self.extract_kernel_info(kernel_path) - result = self.run_kernel(kernel_path, params) - - if result: - # Create new structured result format - structured_result = { - "name": kernel_info["name"], # Add name field for compatibility - "config_id": kernel_info["config_id"], - "problem": result.get("problem", {}), - "perf_result": result.get("perf_result", {}), - "config": { - "data_type": kernel_info["data_type"], - "layout": kernel_info["layout"], - "pipeline": kernel_info["pipeline"], - "scheduler": kernel_info["scheduler"], - "epilogue": kernel_info["epilogue"], - "tile_sizes": kernel_info.get("tile_sizes", {}), - "warp_config": kernel_info.get("warp_config", {}), - "warp_tile": kernel_info.get("warp_tile", {}), - "optimization_flags": kernel_info.get("optimization_flags", {}), - }, - "executable": kernel_info["executable"], - # Keep backward compatibility fields - "time_ms": result.get("time_ms", 0), - "tflops": result.get("tflops", 0), - "bandwidth_gb_s": result.get("bandwidth_gb_s", 0), - } - - results.append(structured_result) - - if self.verbose: - print( - f" {kernel_info['config_id']}: {structured_result['tflops']:.2f} TFLOPS, {structured_result['bandwidth_gb_s']:.2f} GB/s, {structured_result['time_ms']:.2f}ms" - ) - - return results - - def find_best_kernel( - self, results: List[Dict], metric: str = 
"tflops" - ) -> Optional[Dict]: - """Find the best performing kernel based on metric""" - if not results: - return None - - if metric == "tflops": - return max(results, key=lambda x: x.get("tflops", 0)) - elif metric == "time_ms": - return min(results, key=lambda x: x.get("time_ms", float("inf"))) - elif metric == "bandwidth_gb_s": - return max(results, key=lambda x: x.get("bandwidth_gb_s", 0)) - else: - raise ValueError(f"Unknown metric: {metric}") - - def benchmark_sweep( - self, - problem_sizes: List[Tuple[int, int, int]], - split_k_values: List[int] = [1], - verify: bool = False, - warmup: int = 50, - repeat: int = 100, - flush_cache: bool = True, - rotating_count: int = 1000, - ) -> Dict: - """Run comprehensive benchmark sweep""" - kernels = self.discover_kernels() - if not kernels: - print("No kernels found!") - return {} - - all_results = [] - best_kernels = {} - - for m, n, k in problem_sizes: - for split_k in split_k_values: - results = self.benchmark_problem_size( - kernels, - m, - n, - k, - split_k, - verify=2 if verify else 0, - warmup=warmup, - repeat=repeat, - flush_cache=flush_cache, - rotating_count=rotating_count, - ) - - all_results.extend(results) - - # Find best kernel for this configuration - best = self.find_best_kernel(results) - if best: - key = f"m{m}_n{n}_k{k}_splitk{split_k}" - best_kernels[key] = best - print( - f"Best for {key}: {best['name']} ({best['tflops']:.2f} TFLOPS, {best['bandwidth_gb_s']:.2f} GB/s, {best['time_ms']:.2f}ms)" - ) - - self.results = all_results - return best_kernels - - def export_csv(self, filename: str): - """Export all results to CSV""" - if not self.results: - print("No results to export") - return - - # Get all unique keys from results - all_keys = set() - for result in self.results: - all_keys.update(result.keys()) - - # Sort keys for consistent output - fieldnames = sorted(all_keys) - - with open(filename, "w", newline="") as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(self.results) - - print(f"Results exported to {filename}") - - def export_best_kernels(self, best_kernels: Dict, filename: str): - """Export best kernel selections to file""" - with open(filename, "w") as f: - f.write("# Best kernel selections\n") - f.write( - "# Format: problem_size -> kernel_name (TFLOPS, bandwidth, latency)\n\n" - ) - - for key, kernel in sorted(best_kernels.items()): - f.write( - f"{key}: {kernel['name']} ({kernel['tflops']:.2f} TFLOPS, {kernel['bandwidth_gb_s']:.2f} GB/s, {kernel['time_ms']:.2f}ms)\n" - ) - - print(f"Best kernels exported to {filename}") - - def export_json(self, filename: str, best_kernels: Dict = None): - """Export all results and best kernels to JSON with comprehensive metadata""" - from datetime import datetime - - # Calculate comprehensive summary statistics for all metrics - successful_results = [r for r in self.results if r.get("tflops", 0) > 0] - - tflops_values = [r.get("tflops", 0) for r in successful_results] - bandwidth_values = [r.get("bandwidth_gb_s", 0) for r in successful_results] - latency_values = [ - r.get("time_ms", 0) for r in successful_results if r.get("time_ms", 0) > 0 - ] - - # Performance breakdown by kernel type - pipeline_stats = {} - scheduler_stats = {} - data_type_stats = {} - - for result in successful_results: - # Get config info from the new structure - config = result.get("config", {}) - - # Pipeline statistics - pipeline = config.get("pipeline", "unknown") - if pipeline not in pipeline_stats: - pipeline_stats[pipeline] = { - 
"count": 0, - "avg_tflops": 0, - "best_tflops": 0, - } - pipeline_stats[pipeline]["count"] += 1 - pipeline_stats[pipeline]["best_tflops"] = max( - pipeline_stats[pipeline]["best_tflops"], result.get("tflops", 0) - ) - - # Scheduler statistics - scheduler = config.get("scheduler", "unknown") - if scheduler not in scheduler_stats: - scheduler_stats[scheduler] = { - "count": 0, - "avg_tflops": 0, - "best_tflops": 0, - } - scheduler_stats[scheduler]["count"] += 1 - scheduler_stats[scheduler]["best_tflops"] = max( - scheduler_stats[scheduler]["best_tflops"], result.get("tflops", 0) - ) - - # Data type statistics - data_type = config.get("data_type", "unknown") - if data_type not in data_type_stats: - data_type_stats[data_type] = { - "count": 0, - "avg_tflops": 0, - "best_tflops": 0, - } - data_type_stats[data_type]["count"] += 1 - data_type_stats[data_type]["best_tflops"] = max( - data_type_stats[data_type]["best_tflops"], result.get("tflops", 0) - ) - - # Calculate averages for breakdown stats - for stats_dict, field_name in [ - (pipeline_stats, "pipeline"), - (scheduler_stats, "scheduler"), - (data_type_stats, "data_type"), - ]: - for key in stats_dict: - relevant_results = [ - r - for r in successful_results - if r.get("config", {}).get(field_name, "unknown") == key - ] - if relevant_results: - stats_dict[key]["avg_tflops"] = sum( - r.get("tflops", 0) for r in relevant_results - ) / len(relevant_results) - - output_data = { - "benchmark_metadata": { - "timestamp": datetime.now().isoformat(), - "total_kernels_tested": len(self.results), - "unique_kernels": len( - set(r.get("name", "unknown") for r in self.results) - ), - "successful_runs": len(successful_results), - "failed_runs": len(self.results) - len(successful_results), - }, - "performance_summary": { - "tflops_stats": { - "best": max(tflops_values, default=0), - "average": sum(tflops_values) / len(tflops_values) - if tflops_values - else 0, - "min": min(tflops_values, default=0), - "median": sorted(tflops_values)[len(tflops_values) // 2] - if tflops_values - else 0, - }, - "bandwidth_stats": { - "best_gb_s": max(bandwidth_values, default=0), - "average_gb_s": sum(bandwidth_values) / len(bandwidth_values) - if bandwidth_values - else 0, - "min_gb_s": min(bandwidth_values, default=0), - "median_gb_s": sorted(bandwidth_values)[len(bandwidth_values) // 2] - if bandwidth_values - else 0, - }, - "latency_stats": { - "best_ms": min(latency_values, default=0), - "average_ms": sum(latency_values) / len(latency_values) - if latency_values - else 0, - "max_ms": max(latency_values, default=0), - "median_ms": sorted(latency_values)[len(latency_values) // 2] - if latency_values - else 0, - }, - "kernel_type_breakdown": { - "by_pipeline": pipeline_stats, - "by_scheduler": scheduler_stats, - "by_data_type": data_type_stats, - }, - "total_problem_configurations": len(best_kernels) - if best_kernels - else 0, - }, - "kernel_results": self.results, - "best_kernels_by_problem": best_kernels or {}, - } - - with open(filename, "w") as f: - json.dump(output_data, f, indent=2) - - print(f"JSON results exported to {filename}") - print(f" - Total kernels: {len(self.results)}") - print(f" - Successful runs: {len(successful_results)}") - print(f" - Best TFLOPS: {max(tflops_values, default=0):.2f}") - print(f" - Best bandwidth: {max(bandwidth_values, default=0):.2f} GB/s") - print(f" - Best latency: {min(latency_values, default=0):.2f}ms") - - -def main(): - parser = argparse.ArgumentParser(description="GEMM Kernel Benchmarking Tool") - parser.add_argument( - 
"build_dir", help="Build directory containing kernel executables" - ) - parser.add_argument( - "--problem-sizes", - nargs="+", - default=["1024,1024,1024", "2048,2048,2048", "4096,4096,4096"], - help="Problem sizes as M,N,K tuples", - ) - parser.add_argument( - "--split-k", nargs="+", type=int, default=[1], help="Split-K values to test" - ) - parser.add_argument("--verify", action="store_true", help="Enable verification") - parser.add_argument( - "--csv", default="gemm_benchmark_results.csv", help="CSV output filename" - ) - parser.add_argument( - "--best", default="best_kernels.txt", help="Best kernels output filename" - ) - parser.add_argument("--verbose", action="store_true", help="Verbose output") - parser.add_argument( - "--warmup", - type=int, - default=50, - help="Number of warmup iterations (default: 50)", - ) - parser.add_argument( - "--repeat", - type=int, - default=100, - help="Number of benchmark iterations (default: 100)", - ) - parser.add_argument( - "--flush-cache", - action="store_true", - default=True, - help="Enable cache flushing (default: True)", - ) - parser.add_argument( - "--rotating-count", - type=int, - default=1000, - help="Number of iterations to rotate cache (default: 1000)", - ) - parser.add_argument("--json", help="JSON output filename (optional)") - - args = parser.parse_args() - - # Parse problem sizes - problem_sizes = [] - for size_str in args.problem_sizes: - try: - m, n, k = map(int, size_str.split(",")) - problem_sizes.append((m, n, k)) - except ValueError: - print(f"Invalid problem size: {size_str}") - return 1 - - # Create benchmark instance - benchmark = GemmBenchmark(args.build_dir, verbose=args.verbose) - - # Run benchmark sweep - print("Starting GEMM kernel benchmark sweep...") - start_time = time.time() - - best_kernels = benchmark.benchmark_sweep( - problem_sizes=problem_sizes, - split_k_values=args.split_k, - verify=args.verify, - warmup=args.warmup, - repeat=args.repeat, - flush_cache=args.flush_cache, - rotating_count=args.rotating_count, - ) - - elapsed_time = time.time() - start_time - print(f"\nBenchmark completed in {elapsed_time:.2f} seconds") - - # Export results - benchmark.export_csv(args.csv) - benchmark.export_best_kernels(best_kernels, args.best) - - # Export JSON if requested - if args.json: - benchmark.export_json(args.json, best_kernels) - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_benchmark.hpp b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.hpp similarity index 99% rename from tile_engine/ops/gemm/gemm_universal/gemm_benchmark.hpp rename to tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.hpp index 7c8df32ad89..c7f4f470b0f 100644 --- a/tile_engine/ops/gemm/gemm_universal/gemm_benchmark.hpp +++ b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.hpp @@ -11,7 +11,7 @@ #include "ck_tile/core.hpp" #include "ck_tile/host.hpp" -#include "gemm_common.hpp" +#include "gemm_universal_common.hpp" // Data types and Layouts are defined by the generated kernel headers // No hardcoded type definitions here to avoid conflicts diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py new file mode 100755 index 00000000000..88ed4465af3 --- /dev/null +++ b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+# SPDX-License-Identifier: MIT + +import os +import sys +import json +import subprocess +import argparse +import csv +import time +import importlib.util +from pathlib import Path +from typing import List, Dict, Tuple, Optional + +def _import_gemm_benchmark(): + """Import validation utilities from commons directory.""" + current_dir = os.path.dirname(os.path.abspath(__file__)) + parent_dir = os.path.dirname(current_dir) + + # Load the module dynamically + spec = importlib.util.spec_from_file_location( + "gemm_benchmark", + os.path.join(parent_dir, "gemm_benchmark.py"), + ) + gemm_benchmark_module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(gemm_benchmark_module) + + return gemm_benchmark_module.GemmBenchmark + +def _import_benchmark_utils(): + """Import benchmark utilities from commons directory.""" + current_dir = os.path.dirname(os.path.abspath(__file__)) + parent_dir = os.path.dirname(os.path.dirname(current_dir)) + + # Load the module dynamically + spec = importlib.util.spec_from_file_location( + "benchmark_utils", + os.path.join(parent_dir, "commons", "benchmark_utils.py"), + ) + benchmark_utils = importlib.util.module_from_spec(spec) + spec.loader.exec_module(benchmark_utils) + + return benchmark_utils + +GemmBenchmark = _import_gemm_benchmark() +benchmark_utils = _import_benchmark_utils() + +class GemmUniversalBenchmark(GemmBenchmark): + def __init__(self, build_dir: str, verbose: bool = False): + super().__init__(build_dir, verbose, name="benchmark_gemm_") + + +def main(): + parser = argparse.ArgumentParser(description="GEMM Kernel Benchmarking Tool") + parser.add_argument( + "build_dir", help="Build directory containing kernel executables" + ) + parser.add_argument( + "--problem-sizes", + nargs="+", + default=["1024,1024,1024", "2048,2048,2048", "4096,4096,4096"], + help="Problem sizes as M,N,K tuples", + ) + parser.add_argument( + "--split-k", nargs="+", type=int, default=[1], help="Split-K values to test" + ) + parser.add_argument("--verify", action="store_true", help="Enable verification") + parser.add_argument( + "--csv", default="gemm_benchmark_results.csv", help="CSV output filename" + ) + parser.add_argument( + "--best", default="best_kernels.txt", help="Best kernels output filename" + ) + parser.add_argument("--verbose", action="store_true", help="Verbose output") + parser.add_argument( + "--warmup", + type=int, + default=50, + help="Number of warmup iterations (default: 50)", + ) + parser.add_argument( + "--repeat", + type=int, + default=100, + help="Number of benchmark iterations (default: 100)", + ) + parser.add_argument( + "--flush-cache", + action="store_true", + default=True, + help="Enable cache flushing (default: True)", + ) + parser.add_argument( + "--rotating-count", + type=int, + default=1000, + help="Number of iterations to rotate cache (default: 1000)", + ) + parser.add_argument("--json", help="JSON output filename (optional)") + + args = parser.parse_args() + + # Parse problem sizes + problem_sizes = [] + for size_str in args.problem_sizes: + try: + m, n, k = map(int, size_str.split(",")) + problem_sizes.append((m, n, k)) + except ValueError: + print(f"Invalid problem size: {size_str}") + return 1 + + # Create benchmark instance + benchmark = GemmUniversalBenchmark(args.build_dir, verbose=args.verbose) + + # Run benchmark sweep + print("Starting GEMM kernel benchmark sweep...") + start_time = time.time() + + best_kernels = benchmark.benchmark_sweep( + problem_sizes=problem_sizes, + split_k_values=args.split_k, + verify=args.verify, + 
warmup=args.warmup, + repeat=args.repeat, + flush_cache=args.flush_cache, + rotating_count=args.rotating_count, + ) + + elapsed_time = time.time() - start_time + print(f"\nBenchmark completed in {elapsed_time:.2f} seconds") + + # Export results + benchmark_utils.export_csv(benchmark.results, args.csv) + benchmark_utils.export_best_kernels(best_kernels, args.best) + + # Export JSON if requested + if args.json: + benchmark_utils.export_json(benchmark.results, args.json, best_kernels) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_benchmark_single.cpp b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark_single.cpp similarity index 55% rename from tile_engine/ops/gemm/gemm_universal/gemm_benchmark_single.cpp rename to tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark_single.cpp index 6323c066a1a..613a42ff80b 100644 --- a/tile_engine/ops/gemm/gemm_universal/gemm_benchmark_single.cpp +++ b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark_single.cpp @@ -11,70 +11,13 @@ #include "ck_tile/core.hpp" #include "ck_tile/host.hpp" -#include "gemm_profiler.hpp" -#include "gemm_common.hpp" +#include "gemm_universal_profiler.hpp" +#include "gemm_universal_common.hpp" // The kernel header is included via the compile command line with -include flag // It defines SelectedKernel struct and KERNEL_NAME // DataTypeTraits are now defined in gemm_common.hpp -// Create argument parser -inline auto create_args(int argc, char* argv[]) -{ - ck_tile::ArgParser arg_parser; - arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.") - .insert("n", "4096", "The value for n dimension. Default is 4096.") - .insert("k", "2048", "The value for k dimension. Default is 2048.") - .insert("stride_a", "0", "The stride value for tensor A. Default is 0.") - .insert("stride_b", "0", "The stride value for tensor B. Default is 0.") - .insert("stride_c", "0", "The stride value for tensor C. Default is 0.") - .insert("split_k", "1", "The split value for k dimension. Default is 1.") - .insert("verify", - "2", - "The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 " - "for validation on GPU. Default is 2, GPU validation.") - .insert("log", - "false", - "Whether output kernel instance information or not. Possible values are true or " - "false. Default is false") - .insert( - "warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.") - .insert( - "repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.") - .insert("timer", - "true", - "Whether if the timer is gpu timer or not. Possible values are false or true. " - "Default is true.") - .insert("init", - "0", - "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 " - "for constant(1). Default is 0, random.") - .insert("flush_cache", - "true", - "To flush cache, possible values are true or false. " - "Default is false.") - .insert("rotating_count", "1000", "number of iterations to rotate the cache. default is 5.") - .insert("metric", - "0", - "Metric with which to measure kernel performance. Set to 0 for latency, 1 for " - "tflops, or 2 for bandwidth. Default is 0, latency.") - .insert("csv_filename", - "", - "The filename of benchmark result. Default is empty (no CSV output).") - .insert("structured_sparsity", - "false", - "Whether use sparsity kernel or not. Possible values are true or false. 
Default is " - "false") - .insert("json_output", - "false", - "Whether to output results in JSON format only. Possible values are true or false. " - "Default is " - "false"); - - bool result = arg_parser.parse(argc, argv); - return std::make_tuple(result, arg_parser); -} - void benchmark_single(const ck_tile::ArgParser& arg_parser) { // Use DataTypeTraits to get the actual type names from the generated header diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_common.hpp b/tile_engine/ops/gemm/gemm_universal/gemm_universal_common.hpp similarity index 100% rename from tile_engine/ops/gemm/gemm_universal/gemm_common.hpp rename to tile_engine/ops/gemm/gemm_universal/gemm_universal_common.hpp diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_profiler.hpp b/tile_engine/ops/gemm/gemm_universal/gemm_universal_profiler.hpp similarity index 99% rename from tile_engine/ops/gemm/gemm_universal/gemm_profiler.hpp rename to tile_engine/ops/gemm/gemm_universal/gemm_universal_profiler.hpp index 3c6bbc34d3d..9b728c52d6a 100644 --- a/tile_engine/ops/gemm/gemm_universal/gemm_profiler.hpp +++ b/tile_engine/ops/gemm/gemm_universal/gemm_universal_profiler.hpp @@ -9,7 +9,7 @@ #include "ck_tile/host/device_prop.hpp" #include "ck_tile/ops/gemm.hpp" -#include "gemm_benchmark.hpp" +#include "gemm_universal_benchmark.hpp" class GemmProfiler { From fd5da76c8a960fb31e29463eafecf90d02601eda Mon Sep 17 00:00:00 2001 From: Astha Date: Thu, 8 Jan 2026 04:39:21 -0500 Subject: [PATCH 2/3] Restructure Tile Engine's profiling process This change restructures the profiling process in Tile Engine into a base class for the Profiling and Problem structs. With this all files needed for Tile Engine will have a base struct and files in the gemm/ directory that can be extended for each GEMM variant. Only the Problem and Profiler structs along with the reference functions need to be defined. Profiling functions that are common to each operation have been moved into a common utility file. 
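
To illustrate the intended layering, the self-contained sketch below shows the
shape of the split: a common Problem struct plus a templated Profiler base that
owns the shared bookkeeping, with a variant supplying only its extra fields and
its benchmark() override. All names in the sketch (ProblemBase, MultiDProblem,
ProfilerBase, MultiDProfiler) are placeholders for illustration only; they are
not the types added by this patch, and the real GemmProfiler base takes
template parameters that are omitted here.

    // Hypothetical sketch of the base/derived split; names are placeholders.
    #include <string>
    #include <utility>
    #include <vector>

    // Fields shared by every GEMM variant.
    struct ProblemBase
    {
        int m_ = 0, n_ = 0, k_ = 0;
        int split_k_ = 1;
    };

    // A variant adds only what it needs on top of the base.
    struct MultiDProblem : ProblemBase
    {
        int stride_d0_ = 0;
        std::string dtype_d0_;
    };

    // Shared profiler machinery: result bookkeeping lives in the base,
    // each variant overrides benchmark() for its own kernel launches.
    template <typename Problem>
    class ProfilerBase
    {
    public:
        virtual ~ProfilerBase() = default;
        virtual void benchmark(const Problem& problem) = 0;

    protected:
        void record(std::string name, double latency_ms)
        {
            results_.emplace_back(std::move(name), latency_ms);
        }
        std::vector<std::pair<std::string, double>> results_;
    };

    class MultiDProfiler : public ProfilerBase<MultiDProblem>
    {
    public:
        void benchmark(const MultiDProblem& problem) override
        {
            // A real profiler would run the generated kernels here; this
            // stand-in only records a dummy measurement for the problem size.
            record("dummy_kernel_m" + std::to_string(problem.m_), 0.0);
        }
    };

    int main()
    {
        MultiDProblem problem;
        problem.m_ = problem.n_ = problem.k_ = 1024;
        MultiDProfiler{}.benchmark(problem);
        return 0;
    }

In the patch itself this is what allows GemmMultiDProfiler to derive from the
shared GemmProfiler base and keep only its benchmark() overload, while the
common pieces such as process_result() and select_best_instance() appear to
move up into the new gemm_profiler.hpp.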
--- tile_engine/CMakeLists.txt | 1 + .../ops/{commons => common}/__init__.py | 0 .../{commons => common}/benchmark_utils.py | 0 tile_engine/ops/common/utils.hpp | 254 ++++++++++++++++++ tile_engine/ops/gemm/gemm_benchmark.hpp | 108 ++++++++ tile_engine/ops/gemm/gemm_benchmark.py | 2 +- .../gemm_multi_d/gemm_multi_d_benchmark.hpp | 170 +----------- .../gemm_multi_d/gemm_multi_d_benchmark.py | 2 +- .../gemm_multi_d_benchmark_single.cpp | 91 ++----- .../gemm/gemm_multi_d/gemm_multi_d_common.hpp | 100 ------- .../gemm_multi_d/gemm_multi_d_profiler.hpp | 180 ++----------- .../gemm_preshuffle_benchmark.hpp | 186 +------------ .../gemm_preshuffle_benchmark.py | 2 +- .../gemm_preshuffle_common.hpp | 66 ----- .../gemm_preshuffle_profiler.hpp | 165 +----------- tile_engine/ops/gemm/gemm_profiler.hpp | 200 ++++++++++++++ .../gemm_universal_benchmark.hpp | 178 +----------- .../gemm_universal_benchmark.py | 2 +- .../gemm_universal_benchmark_single.cpp | 12 +- .../gemm_universal/gemm_universal_common.hpp | 100 ------- .../gemm_universal_profiler.hpp | 166 +----------- 21 files changed, 637 insertions(+), 1348 deletions(-) rename tile_engine/ops/{commons => common}/__init__.py (100%) rename tile_engine/ops/{commons => common}/benchmark_utils.py (100%) create mode 100644 tile_engine/ops/common/utils.hpp create mode 100644 tile_engine/ops/gemm/gemm_benchmark.hpp delete mode 100644 tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_common.hpp create mode 100644 tile_engine/ops/gemm/gemm_profiler.hpp delete mode 100644 tile_engine/ops/gemm/gemm_universal/gemm_universal_common.hpp diff --git a/tile_engine/CMakeLists.txt b/tile_engine/CMakeLists.txt index b9dc3201282..0bb885bc35f 100644 --- a/tile_engine/CMakeLists.txt +++ b/tile_engine/CMakeLists.txt @@ -3,6 +3,7 @@ include_directories(BEFORE ${CMAKE_CURRENT_LIST_DIR}/include + ${CMAKE_CURRENT_LIST_DIR}/ops ) add_subdirectory(ops/gemm) diff --git a/tile_engine/ops/commons/__init__.py b/tile_engine/ops/common/__init__.py similarity index 100% rename from tile_engine/ops/commons/__init__.py rename to tile_engine/ops/common/__init__.py diff --git a/tile_engine/ops/commons/benchmark_utils.py b/tile_engine/ops/common/benchmark_utils.py similarity index 100% rename from tile_engine/ops/commons/benchmark_utils.py rename to tile_engine/ops/common/benchmark_utils.py diff --git a/tile_engine/ops/common/utils.hpp b/tile_engine/ops/common/utils.hpp new file mode 100644 index 00000000000..20994578e64 --- /dev/null +++ b/tile_engine/ops/common/utils.hpp @@ -0,0 +1,254 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" + +// Helper function to determine if a layout is row-major +template +constexpr auto is_row_major(Layout) +{ + return ck_tile::bool_constant>{}; +} + +// Structure to hold kernel traits for dispatcher +struct KernelTraits +{ + std::string pipeline; // compv3, compv4, mem + std::string scheduler; // intrawave, interwave + std::string epilogue; // cshuffle, default + bool pad_m; + bool pad_n; + bool pad_k; + bool persistent; + + // Constructor with defaults + KernelTraits() + : pipeline("compv3"), + scheduler("intrawave"), + epilogue("cshuffle"), + pad_m(false), + pad_n(false), + pad_k(false), + persistent(false) + { + } +}; + + +// Create argument parser +inline auto create_args(int argc, char* argv[]) +{ + ck_tile::ArgParser arg_parser; + arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.") + .insert("n", "4096", "The value for n dimension. Default is 4096.") + .insert("k", "2048", "The value for k dimension. Default is 2048.") + .insert("stride_a", "0", "The stride value for tensor A. Default is 0.") + .insert("stride_b", "0", "The stride value for tensor B. Default is 0.") + .insert("stride_ds", "0", "The stride value for tensor Ds . Default is 0.") + .insert("stride_c", "0", "The stride value for tensor C. Default is 0.") + .insert("split_k", "1", "The split value for k dimension. Default is 1.") + .insert("verify", + "2", + "The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 " + "for validation on GPU. Default is 2, GPU validation.") + .insert("log", + "false", + "Whether output kernel instance information or not. Possible values are true or " + "false. Default is false") + .insert( + "warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.") + .insert( + "repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.") + .insert("timer", + "true", + "Whether if the timer is gpu timer or not. Possible values are false or true. " + "Default is true.") + .insert("init", + "0", + "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 " + "for constant(1). Default is 0, random.") + .insert("flush_cache", + "true", + "To flush cache, possible values are true or false. " + "Default is false.") + .insert("rotating_count", "1000", "number of iterations to rotate the cache. default is 5.") + .insert("metric", + "0", + "Metric with which to measure kernel performance. Set to 0 for latency, 1 for " + "tflops, or 2 for bandwidth. Default is 0, latency.") + .insert("csv_filename", + "", + "The filename of benchmark result. Default is empty (no CSV output).") + .insert("structured_sparsity", + "false", + "Whether use sparsity kernel or not. Possible values are true or false. Default is " + "false") + .insert("json_output", + "false", + "Whether to output results in JSON format only. Possible values are true or false. 
" + "Default is " + "false"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} + +enum class Metric +{ + LATENCY = 0, + TFLOPS = 1, + BANDWIDTH = 2 +}; + +inline constexpr auto get_metric_name(Metric m) +{ + switch(m) + { + case Metric::LATENCY: return "latency"; + case Metric::TFLOPS: return "tflops"; + case Metric::BANDWIDTH: return "bandwidth"; + default: throw std::invalid_argument("Unsupported metric type"); + } +} + +struct PerformanceResult +{ + double latency_; + double tflops_; + double bandwidth_; + + static bool compare(const PerformanceResult& a, const PerformanceResult& b, Metric m) + { + switch(m) + { + case Metric::LATENCY: return a.latency_ < b.latency_; + case Metric::TFLOPS: return a.tflops_ > b.tflops_; + case Metric::BANDWIDTH: return a.bandwidth_ > b.bandwidth_; + default: throw std::invalid_argument("Unsupported metric type"); + } + } + + friend std::ostream& operator<<(std::ostream& os, const PerformanceResult& result) + { + os << "{\n" + << " \"latency(ms)\": " << std::fixed << std::setprecision(2) << result.latency_ + << ",\n" + << " \"tflops(TFlops)\": " << result.tflops_ << ",\n" + << " \"bandwidth(GB/s)\": " << result.bandwidth_ << "\n" + << "}"; + return os; + } +}; + +template +struct KernelInstance +{ + std::string name_; + Problem problem_; + PerformanceResult perf_result_; + + static bool compare(const KernelInstance& a, const KernelInstance& b, Metric m) + { + return PerformanceResult::compare(a.perf_result_, b.perf_result_, m); + } + + friend std::ostream& operator<<(std::ostream& os, const KernelInstance& obj) + { + os << "{\n" + << " \"name\": \"" << obj.name_ << "\",\n" + << " \"problem\": " << obj.problem_ << ",\n" + << " \"perf_result\": " << obj.perf_result_ << "\n" + << "}"; + return os; + } +}; + +struct Setting +{ + int n_warmup_; + int n_repeat_; + bool is_gpu_timer_; + int verify_; + int init_method_; + bool log_; + std::string csv_filename_; + bool flush_cache_; + int rotating_count_; + bool json_output_; +}; + +inline std::string get_rocm_version() +{ + std::ifstream version_file("/opt/rocm/.info/version"); + if(version_file.is_open()) + { + std::string version; + std::getline(version_file, version); + return version; + } + return "Unknown"; +} + +template +auto calculate_rtol_atol(const ck_tile::index_t K, + const ck_tile::index_t kbatch, + const float max_accumulated_value) +{ + using ComputeType = + std::conditional_t; + // Calculate thresholds + const auto rtol = ck_tile::get_relative_threshold( + ck_tile::integer_divide_ceil(K, kbatch)); + const auto atol = ck_tile::get_absolute_threshold( + max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch)); + // Calculate error due to split_k accumulation + const auto rtol_split_k = + ck_tile::get_relative_threshold(kbatch); + const auto atol_split_k = ck_tile::get_absolute_threshold( + max_accumulated_value, kbatch); + // Use higher threshold + return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); +} + +template +auto calculate_rtol_atol(const ck_tile::index_t K, + const ck_tile::index_t kbatch, + const float max_accumulated_value) +{ + using ComputeTypeAB = + std::conditional_t; + + using ComputeType = + std::conditional_t; + + // Calculate thresholds + const auto rtol = ck_tile::get_relative_threshold( + ck_tile::integer_divide_ceil(K, kbatch)); + + const auto atol = ck_tile::get_absolute_threshold( + max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch)); + + // Calculate error 
due to split_k accumulation + const auto rtol_split_k = + ck_tile::get_relative_threshold(kbatch); + + const auto atol_split_k = ck_tile::get_absolute_threshold( + max_accumulated_value, kbatch); + + // Use higher threshold + return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); +} diff --git a/tile_engine/ops/gemm/gemm_benchmark.hpp b/tile_engine/ops/gemm/gemm_benchmark.hpp new file mode 100644 index 00000000000..6ff09186a43 --- /dev/null +++ b/tile_engine/ops/gemm/gemm_benchmark.hpp @@ -0,0 +1,108 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host.hpp" +#include "common/utils.hpp" + +// Data types and Layouts are defined by the generated kernel headers +// No hardcoded type definitions here to avoid conflicts +struct GemmProblem +{ + int split_k_; + int m_, n_, k_; + int stride_a_, stride_b_, stride_c_; + + std::string dtype_a_, dtype_b_, dtype_acc_, dtype_c_; + std::string layout_a_, layout_b_, layout_c_; + + bool structured_sparsity_; + + friend std::ostream& operator<<(std::ostream& os, const GemmProblem& problem) + { + os << "{\n" + << " \"split_k\":" << problem.split_k_ << ",\n" + << " \"m\":" << problem.m_ << ",\n" + << " \"n\":" << problem.n_ << ",\n" + << " \"k\":" << problem.k_ << ",\n" + << " \"stride_a\":" << problem.stride_a_ << ",\n" + << " \"stride_b\":" << problem.stride_b_ << ",\n" + << " \"stride_c\":" << problem.stride_c_ << ",\n" + << " \"dtype_a\":\"" << problem.dtype_a_ << "\",\n" + << " \"dtype_b\":\"" << problem.dtype_b_ << "\",\n" + << " \"dtype_acc\":\"" << problem.dtype_acc_ << "\",\n" + << " \"dtype_c\":\"" << problem.dtype_c_ << "\",\n" + << " \"layout_a\":\"" << problem.layout_a_ << "\",\n" + << " \"layout_b\":\"" << problem.layout_b_ << "\",\n" + << " \"layout_c\":\"" << problem.layout_c_ << "\",\n" + << " \"structured_sparsity\":" << (problem.structured_sparsity_ ? 
"true" : "false") + << "\n" + << "}"; + return os; + } +}; + +// Detect Problem::DsDataType, default to void when absent +template +struct get_DsDataType { using type = void; }; + +template +struct get_DsDataType> { + using type = typename T::DsDataType; +}; + +// Detect Problem::D0DataType, default to void when absent +template +struct get_D0DataType { using type = void; }; + +template +struct get_D0DataType> { + using type = typename T::D0DataType; +}; + +/// @brief Function to compare the results of the device and host computations +template +bool compare(std::string instanceName, + ck_tile::index_t K, + ck_tile::index_t kbatch, + ck_tile::HostTensor& c_m_n_dev_result, + ck_tile::HostTensor& c_m_n_host_result) +{ + using DDataType = typename get_D0DataType::type; + const float max_accumulated_value = + *std::max_element(c_m_n_host_result.mData.begin(), c_m_n_host_result.mData.end()); + //const auto rtol_atol = calculate_rtol_atol( + //K, kbatch, max_accumulated_value); + auto rtol_atol = [&] { + if constexpr (std::is_void_v) + { + return calculate_rtol_atol( + K, kbatch, max_accumulated_value); + } + else + { + return calculate_rtol_atol( + K, kbatch, max_accumulated_value); + } + }(); + bool pass = ck_tile::check_err(c_m_n_dev_result, + c_m_n_host_result, + "Error: Incorrect results!", + rtol_atol.at(ck_tile::number<0>{}), + rtol_atol.at(ck_tile::number<1>{})); + + std::cout << "For " << instanceName << " Relative error threshold is " + << rtol_atol.at(ck_tile::number<0>{}) << " Absolute error threshold is " + << rtol_atol.at(ck_tile::number<1>{}) << std::endl; + std::cout << "The verification result is:" << (pass ? "correct" : "fail") << std::endl; + + return pass; +} diff --git a/tile_engine/ops/gemm/gemm_benchmark.py b/tile_engine/ops/gemm/gemm_benchmark.py index 3a7afc74e88..c229b142338 100644 --- a/tile_engine/ops/gemm/gemm_benchmark.py +++ b/tile_engine/ops/gemm/gemm_benchmark.py @@ -22,7 +22,7 @@ def _import_benchmark_utils(): # Load the module dynamically spec = importlib.util.spec_from_file_location( "benchmark_utils", - os.path.join(parent_dir, "commons", "benchmark_utils.py"), + os.path.join(parent_dir, "common", "benchmark_utils.py"), ) benchmark_utils = importlib.util.module_from_spec(spec) spec.loader.exec_module(benchmark_utils) diff --git a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.hpp b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.hpp index f8c196e32af..23efbfc1682 100644 --- a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.hpp +++ b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.hpp @@ -11,37 +11,15 @@ #include "ck_tile/core.hpp" #include "ck_tile/host.hpp" -#include "gemm_multi_d_common.hpp" +#include "gemm/gemm_benchmark.hpp" // Data types and Layouts are defined by the generated kernel headers // No hardcoded type definitions here to avoid conflicts - -enum class Metric -{ - LATENCY = 0, - TFLOPS = 1, - BANDWIDTH = 2 -}; - -inline constexpr auto get_metric_name(Metric m) +struct GemmMultiDProblem : GemmProblem { - switch(m) - { - case Metric::LATENCY: return "latency"; - case Metric::TFLOPS: return "tflops"; - case Metric::BANDWIDTH: return "bandwidth"; - default: throw std::invalid_argument("Unsupported metric type"); - } -} - -struct GemmMultiDProblem -{ - int split_k_; - int m_, n_, k_; - int stride_a_, stride_b_, stride_d0_, stride_d1_, stride_c_; - - std::string dtype_a_, dtype_b_, dtype_d0_, dtype_d1_, dtype_acc_, dtype_c_; - std::string layout_a_, layout_b_, layout_d0_, layout_d1_, layout_c_; + int stride_d0_, 
stride_d1_; + std::string dtype_d0_, dtype_d1_; + std::string layout_d0_, layout_d1_; friend std::ostream& operator<<(std::ostream& os, const GemmMultiDProblem& problem) { @@ -71,144 +49,6 @@ struct GemmMultiDProblem } }; -struct PerformanceResult -{ - double latency_; - double tflops_; - double bandwidth_; - - static bool compare(const PerformanceResult& a, const PerformanceResult& b, Metric m) - { - switch(m) - { - case Metric::LATENCY: return a.latency_ < b.latency_; - case Metric::TFLOPS: return a.tflops_ > b.tflops_; - case Metric::BANDWIDTH: return a.bandwidth_ > b.bandwidth_; - default: throw std::invalid_argument("Unsupported metric type"); - } - } - - friend std::ostream& operator<<(std::ostream& os, const PerformanceResult& result) - { - os << "{\n" - << " \"latency(ms)\": " << std::fixed << std::setprecision(2) << result.latency_ - << ",\n" - << " \"tflops(TFlops)\": " << result.tflops_ << ",\n" - << " \"bandwidth(GB/s)\": " << result.bandwidth_ << "\n" - << "}"; - return os; - } -}; - -struct KernelInstance -{ - std::string name_; - GemmMultiDProblem problem_; - PerformanceResult perf_result_; - - static bool compare(const KernelInstance& a, const KernelInstance& b, Metric m) - { - return PerformanceResult::compare(a.perf_result_, b.perf_result_, m); - } - - friend std::ostream& operator<<(std::ostream& os, const KernelInstance& obj) - { - os << "{\n" - << " \"name\": \"" << obj.name_ << "\",\n" - << " \"problem\": " << obj.problem_ << ",\n" - << " \"perf_result\": " << obj.perf_result_ << "\n" - << "}"; - return os; - } -}; - -struct Setting -{ - int n_warmup_; - int n_repeat_; - bool is_gpu_timer_; - int verify_; - int init_method_; - bool log_; - std::string csv_filename_; - bool flush_cache_; - int rotating_count_; - bool json_output_; -}; - -inline std::string get_rocm_version() -{ - std::ifstream version_file("/opt/rocm/.info/version"); - if(version_file.is_open()) - { - std::string version; - std::getline(version_file, version); - return version; - } - return "Unknown"; -} - -template -auto calculate_rtol_atol(const ck_tile::index_t K, - const ck_tile::index_t kbatch, - const float max_accumulated_value) -{ - using ComputeTypeAB = - std::conditional_t; - - using ComputeType = - std::conditional_t; - - // Calculate thresholds - const auto rtol = ck_tile::get_relative_threshold( - ck_tile::integer_divide_ceil(K, kbatch)); - - const auto atol = ck_tile::get_absolute_threshold( - max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch)); - - // Calculate error due to split_k accumulation - const auto rtol_split_k = - ck_tile::get_relative_threshold(kbatch); - - const auto atol_split_k = ck_tile::get_absolute_threshold( - max_accumulated_value, kbatch); - - // Use higher threshold - return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); -} - -/// @brief Function to compare the results of the device and host computations -bool compare(std::string instanceName, - ck_tile::index_t K, - ck_tile::index_t kbatch, - ck_tile::HostTensor& c_m_n_dev_result, - ck_tile::HostTensor& c_m_n_host_result) -{ - const float max_accumulated_value = - *std::max_element(c_m_n_host_result.mData.begin(), c_m_n_host_result.mData.end()); - - const auto rtol_atol = - calculate_rtol_atol( - K, kbatch, max_accumulated_value); - - bool pass = ck_tile::check_err(c_m_n_dev_result, - c_m_n_host_result, - "Error: Incorrect results!", - rtol_atol.at(ck_tile::number<0>{}), - rtol_atol.at(ck_tile::number<1>{})); - - std::cout << "For " << instanceName << " Relative 
error threshold is " - << rtol_atol.at(ck_tile::number<0>{}) << " Absolute error threshold is " - << rtol_atol.at(ck_tile::number<1>{}) << std::endl; - std::cout << "The verification result is:" << (pass ? "correct" : "fail") << std::endl; - - return pass; -} - /// @brief Function to get the kernel output with reference implementation on CPU/GPU void gemm_multi_d_host_reference(int verify, ck_tile::HostTensor& a_m_k, diff --git a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py index ae796687075..2e313c3fed6 100644 --- a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py +++ b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py @@ -35,7 +35,7 @@ def _import_benchmark_utils(): # Load the module dynamically spec = importlib.util.spec_from_file_location( "benchmark_utils", - os.path.join(parent_dir, "commons", "benchmark_utils.py"), + os.path.join(parent_dir, "common", "benchmark_utils.py"), ) benchmark_utils = importlib.util.module_from_spec(spec) spec.loader.exec_module(benchmark_utils) diff --git a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark_single.cpp b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark_single.cpp index 41d2f736e1f..e72ceb0d76a 100644 --- a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark_single.cpp +++ b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark_single.cpp @@ -12,80 +12,20 @@ #include "ck_tile/core.hpp" #include "ck_tile/host.hpp" #include "gemm_multi_d_profiler.hpp" -#include "gemm_multi_d_common.hpp" // The kernel header is included via the compile command line with -include flag // It defines SelectedKernel struct and KERNEL_NAME -// DataTypeTraits are now defined in gemm_multi_d_common.hpp - -// Create argument parser -inline auto create_args(int argc, char* argv[]) -{ - ck_tile::ArgParser arg_parser; - arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.") - .insert("n", "4096", "The value for n dimension. Default is 4096.") - .insert("k", "2048", "The value for k dimension. Default is 2048.") - .insert("stride_a", "0", "The stride value for tensor A. Default is 0.") - .insert("stride_b", "0", "The stride value for tensor B. Default is 0.") - .insert("stride_ds", "0", "The stride value for tensor Ds . Default is 0.") - .insert("stride_c", "0", "The stride value for tensor C. Default is 0.") - .insert("split_k", "1", "The split value for k dimension. Default is 1.") - .insert("verify", - "1", - "for validation on GPU. Default is 1, validation on CPU, as validation on GPU is " - "not supported.") - .insert("log", - "false", - "Whether output kernel instance information or not. Possible values are true or " - "false. Default is false") - .insert( - "warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.") - .insert( - "repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.") - .insert("timer", - "true", - "Whether if the timer is gpu timer or not. Possible values are false or true. " - "Default is true.") - .insert("init", - "0", - "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 " - "for constant(1). Default is 0, random.") - .insert("flush_cache", - "true", - "To flush cache, possible values are true or false. " - "Default is false.") - .insert("rotating_count", "1000", "number of iterations to rotate the cache. default is 5.") - .insert("metric", - "0", - "Metric with which to measure kernel performance. 
Set to 0 for latency, 1 for " - "tflops, or 2 for bandwidth. Default is 0, latency.") - .insert("csv_filename", - "", - "The filename of benchmark result. Default is empty (no CSV output).") - .insert("structured_sparsity", - "false", - "Whether use sparsity kernel or not. Possible values are true or false. Default is " - "false") - .insert("json_output", - "false", - "Whether to output results in JSON format only. Possible values are true or false. " - "Default is " - "false"); - - bool result = arg_parser.parse(argc, argv); - return std::make_tuple(result, arg_parser); -} void benchmark_single(const ck_tile::ArgParser& arg_parser) { // Use DataTypeTraits to get the actual type names from the generated header // The generated header defines ADataType, BDataType, AccDataType, CDataType - std::string dtype_a = DataTypeTraits::name; - std::string dtype_b = DataTypeTraits::name; - std::string dtype_acc = DataTypeTraits::name; - std::string dtype_c = DataTypeTraits::name; - std::string dtype_d0 = DataTypeTraits::name; - std::string dtype_d1 = DataTypeTraits::name; + std::string dtype_a = ck_tile::DataTypeTraits::name; + std::string dtype_b = ck_tile::DataTypeTraits::name; + std::string dtype_acc = ck_tile::DataTypeTraits::name; + std::string dtype_c = ck_tile::DataTypeTraits::name; + std::string dtype_d0 = ck_tile::DataTypeTraits::name; + std::string dtype_d1 = ck_tile::DataTypeTraits::name; // Layout names from the layout types std::string layout_a = ALayout::name; @@ -95,26 +35,29 @@ void benchmark_single(const ck_tile::ArgParser& arg_parser) std::string layout_d1 = D1Layout::name; // Create GemmMultiDProblem struct - GemmMultiDProblem gemm_multi_d_problem{arg_parser.get_int("split_k"), + GemmMultiDProblem gemm_multi_d_problem{ + GemmProblem{ + arg_parser.get_int("split_k"), arg_parser.get_int("m"), arg_parser.get_int("n"), arg_parser.get_int("k"), arg_parser.get_int("stride_a"), arg_parser.get_int("stride_b"), - arg_parser.get_int("stride_ds"), - arg_parser.get_int("stride_ds"), arg_parser.get_int("stride_c"), dtype_a, dtype_b, - dtype_d0, - dtype_d1, dtype_acc, dtype_c, layout_a, layout_b, - layout_d0, - layout_d1, - layout_c}; + layout_c, + arg_parser.get_bool("structured_sparsity")}, + arg_parser.get_int("stride_ds"), + arg_parser.get_int("stride_ds"), + dtype_d0, + dtype_d1, + layout_d0, + layout_d1}; // Create Setting struct Setting setting{arg_parser.get_int("warmup"), diff --git a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_common.hpp b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_common.hpp deleted file mode 100644 index 899221547f6..00000000000 --- a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_common.hpp +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT - -#pragma once - -#include -#include "ck_tile/core.hpp" -#include "ck_tile/host.hpp" -#include "ck_tile/core/numeric/integer.hpp" -#include "ck_tile/core/numeric/pk_int4.hpp" - -//[TODO] This can be moved to commons -// DataTypeTraits for all supported types -template -struct DataTypeTraits; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp32"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp64"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp16"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "bf16"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp8"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "bf8"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "int8"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "int32"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "pk_int4_t"; -}; - -// Helper function to determine if a layout is row-major -template -constexpr auto is_row_major(Layout) -{ - return ck_tile::bool_constant>{}; -} - -// Structure to hold kernel traits for dispatcher -struct KernelTraits -{ - std::string pipeline; // compv3, compv4, mem - std::string scheduler; // intrawave, interwave - std::string epilogue; // cshuffle, default - bool pad_m; - bool pad_n; - bool pad_k; - bool persistent; - - // Constructor with defaults - KernelTraits() - : pipeline("compv3"), - scheduler("intrawave"), - epilogue("cshuffle"), - pad_m(false), - pad_n(false), - pad_k(false), - persistent(false) - { - } -}; diff --git a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_profiler.hpp b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_profiler.hpp index 3a2cdc71fe6..583dfd85a7f 100644 --- a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_profiler.hpp +++ b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_profiler.hpp @@ -6,44 +6,36 @@ #include #include #include +#include +#include +#include +#include + #include "ck_tile/host/device_prop.hpp" #include "ck_tile/ops/gemm.hpp" +#include "gemm/gemm_profiler.hpp" #include "gemm_multi_d_benchmark.hpp" -class GemmMultiDProfiler +class GemmMultiDProfiler: public GemmProfiler> { - public: - static GemmMultiDProfiler& instance(Setting setting) - { - static GemmMultiDProfiler instance{setting}; - return instance; - } - - // Overload for single kernel benchmarking - void benchmark(GemmMultiDProblem& gemm_multi_d_problem, - std::function&, - const ck_tile::stream_config&)> kernel_func) - { - // Create a vector with a single callable that returns both name and time - std::vector( - ck_tile::GemmMultiDHostArgs&, const ck_tile::stream_config&)>> - callables; +public: + using BaseGemm = GemmProfiler>; + using BaseGemm::benchmark; - callables.push_back([kernel_func](ck_tile::GemmMultiDHostArgs& args, - const ck_tile::stream_config& stream) { - float time = kernel_func(args, stream); - return std::make_tuple(std::string(KERNEL_NAME), time); - }); + GemmMultiDProfiler(Setting setting) + : GemmProfiler>(setting) {} - benchmark(gemm_multi_d_problem, callables); - } void benchmark( GemmMultiDProblem& gemm_multi_d_problem, std::vector( ck_tile::GemmMultiDHostArgs&, const ck_tile::stream_config&)>>& - callables) + callables) override { const ALayout layout_a = ALayout{}; const BLayout layout_b = BLayout{}; @@ -166,142 +158,4 @@ 
class GemmMultiDProfiler } } - void process_result(const GemmMultiDProblem& gemm_multi_d_problem, - ck_tile::DeviceMem& c_m_n_dev_buf, - ck_tile::HostTensor& c_m_n_host_result, - ck_tile::HostTensor& c_m_n_dev_result, - const std::tuple& kernel_run_result) - { - auto [name, avg_time] = kernel_run_result; - - KernelInstance kernel_instance{name, gemm_multi_d_problem, {-1.0f, -1.0f, -1.0f}}; - - // compute performance metric - std::size_t flop = std::size_t(2) * gemm_multi_d_problem.m_ * gemm_multi_d_problem.n_ * - gemm_multi_d_problem.k_; - std::size_t num_byte = - sizeof(ADataType) * gemm_multi_d_problem.m_ * gemm_multi_d_problem.k_ + - sizeof(BDataType) * gemm_multi_d_problem.n_ * gemm_multi_d_problem.k_ + - sizeof(CDataType) * gemm_multi_d_problem.m_ * gemm_multi_d_problem.n_; - - // Dth Dimension Updates - ck_tile::static_for<0, DsDataType::size(), 1>{}([&](auto i) { - num_byte += sizeof(ck_tile::remove_cvref_t>) * - gemm_multi_d_problem.m_ * gemm_multi_d_problem.n_; - flop += sizeof(ck_tile::remove_cvref_t>) * - gemm_multi_d_problem.m_ * gemm_multi_d_problem.n_; - }); - - // update - kernel_instance.perf_result_.latency_ = avg_time; - kernel_instance.perf_result_.tflops_ = static_cast(flop) / 1.E9 / avg_time; - kernel_instance.perf_result_.bandwidth_ = num_byte / 1.E6 / avg_time; - - if(setting_.log_ > 0 && !setting_.json_output_) - { - std::cout << kernel_instance << std::endl; - } - - // verify result - c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); - bool verified_correct = - !setting_.verify_ || compare(name, - gemm_multi_d_problem.k_, - 1, // Multi d currently supports only k_batch = 1 - c_m_n_dev_result, - c_m_n_host_result); - - if(verified_correct) - { - kernel_instances_.emplace_back(kernel_instance); - } - else - { - std::cout << "Verification failed, skip kernel: " << name << std::endl; - } - - // clear tensor - c_m_n_dev_buf.SetZero(); - c_m_n_dev_result.SetZero(); - } - - KernelInstance select_best_instance(Metric metric) - { - if(kernel_instances_.empty()) - throw std::runtime_error("Empty instances"); - - auto kernel_instance = *std::max_element(kernel_instances_.begin(), - kernel_instances_.end(), - [metric](const auto& a, const auto& b) { - return PerformanceResult::compare( - b.perf_result_, a.perf_result_, metric); - }); - - if(setting_.json_output_) - { - // Output clean JSON only - std::cout << kernel_instance << std::endl; - } - else - { - std::cout << "**********************************" << std::endl; - std::cout << "According to given metrics: " << get_metric_name(metric) << "\n" - << "Current kernel performance is: " << kernel_instance << std::endl; - std::cout << "**********************************" << std::endl; - } - - if(!setting_.csv_filename_.empty()) - { - std::ofstream file(setting_.csv_filename_ + ".csv", std::ios::app); - - if(!file.is_open()) - { - std::cerr << "Warning: Failed to open CSV file for writing." 
<< std::endl; - } - else - { - if(file.tellp() == 0) - { - file << "rocm_version,device_name," - << "split_k,m,n,k,stride_a,stride_b,stride_c," - << "dtype_a,dtype_b,dtype_acc,dtype_c," << "layout_a,layout_b,layout_c," - << "structured_sparsity," << "name," - << "latency(ms),tflops(TFlops),bandwidth(GB/s),metric\n"; - } - - const auto& problem = kernel_instance.problem_; - const auto& name = kernel_instance.name_; - const auto& perf = kernel_instance.perf_result_; - - file << get_rocm_version() << "," << ck_tile::get_device_name() << "," - << problem.split_k_ << "," << problem.m_ << "," << problem.n_ << "," - << problem.k_ << "," << problem.stride_a_ << "," << problem.stride_b_ << "," - << problem.stride_c_ << "," << problem.dtype_a_ << "," << problem.dtype_b_ - << "," << problem.dtype_acc_ << "," << problem.dtype_c_ << "," - << problem.layout_a_ << "," << problem.layout_b_ << "," << problem.layout_c_ - << "," << name << "," << std::fixed << std::setprecision(4) << perf.latency_ - << "," << std::fixed << std::setprecision(4) << perf.tflops_ << "," - << std::fixed << std::setprecision(4) << perf.bandwidth_ << "," - << get_metric_name(metric) << "\n"; - - if(!file) - { - std::cerr << "Warning: Error occurred while writing to CSV file." << std::endl; - } - } - } - - return kernel_instance; - } - - GemmMultiDProfiler(const GemmMultiDProfiler&) = delete; - GemmMultiDProfiler& operator=(const GemmMultiDProfiler&) = delete; - - private: - ~GemmMultiDProfiler() { kernel_instances_.clear(); } - GemmMultiDProfiler(Setting setting) : setting_(setting) {} - - Setting setting_; - - std::vector kernel_instances_; }; diff --git a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.hpp b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.hpp index 748fe581d35..48f8a46ecbb 100644 --- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.hpp +++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.hpp @@ -6,191 +6,7 @@ #include "ck_tile/core.hpp" #include "ck_tile/host.hpp" #include "gemm_preshuffle_common.hpp" - -//[TODO] Move parts of this File to commons -enum class Metric -{ - LATENCY = 0, - TFLOPS = 1, - BANDWIDTH = 2 -}; - -inline constexpr auto get_metric_name(Metric m) -{ - switch(m) - { - case Metric::LATENCY: return "latency"; - case Metric::TFLOPS: return "tflops"; - case Metric::BANDWIDTH: return "bandwidth"; - default: throw std::invalid_argument("Unsupported metric type"); - } -} - -struct KernelConfig -{ - std::tuple tile_dims; - std::tuple warp_dims; - std::tuple warp_tile_dims; - bool permuteN; -}; - -struct GemmProblem -{ - int split_k_; - int m_, n_, k_; - int stride_a_, stride_b_, stride_c_; - - std::string dtype_a_, dtype_b_, dtype_acc_, dtype_c_; - std::string layout_a_, layout_b_, layout_c_; - - bool structured_sparsity_; - - friend std::ostream& operator<<(std::ostream& os, const GemmProblem& problem) - { - os << "{\n" - << " \"split_k\":" << problem.split_k_ << ",\n" - << " \"m\":" << problem.m_ << ",\n" - << " \"n\":" << problem.n_ << ",\n" - << " \"k\":" << problem.k_ << ",\n" - << " \"stride_a\":" << problem.stride_a_ << ",\n" - << " \"stride_b\":" << problem.stride_b_ << ",\n" - << " \"stride_c\":" << problem.stride_c_ << ",\n" - << " \"dtype_a\":\"" << problem.dtype_a_ << "\",\n" - << " \"dtype_b\":\"" << problem.dtype_b_ << "\",\n" - << " \"dtype_acc\":\"" << problem.dtype_acc_ << "\",\n" - << " \"dtype_c\":\"" << problem.dtype_c_ << "\",\n" - << " \"layout_a\":\"" << problem.layout_a_ << "\",\n" - << " \"layout_b\":\"" << 
problem.layout_b_ << "\",\n" - << " \"layout_c\":\"" << problem.layout_c_ << "\",\n" - << " \"structured_sparsity\":" << (problem.structured_sparsity_ ? "true" : "false") - << "\n" - << "}"; - return os; - } -}; - -struct PerformanceResult -{ - double latency_; - double tflops_; - double bandwidth_; - - static bool compare(const PerformanceResult& a, const PerformanceResult& b, Metric m) - { - switch(m) - { - case Metric::LATENCY: return a.latency_ < b.latency_; - case Metric::TFLOPS: return a.tflops_ > b.tflops_; - case Metric::BANDWIDTH: return a.bandwidth_ > b.bandwidth_; - default: throw std::invalid_argument("Unsupported metric type"); - } - } - - friend std::ostream& operator<<(std::ostream& os, const PerformanceResult& result) - { - os << "{\n" - << " \"latency(ms)\": " << std::fixed << std::setprecision(2) << result.latency_ - << ",\n" - << " \"tflops(TFlops)\": " << result.tflops_ << ",\n" - << " \"bandwidth(GB/s)\": " << result.bandwidth_ << "\n" - << "}"; - return os; - } -}; - -struct KernelInstance -{ - std::string name_; - GemmProblem problem_; - PerformanceResult perf_result_; - - static bool compare(const KernelInstance& a, const KernelInstance& b, Metric m) - { - return PerformanceResult::compare(a.perf_result_, b.perf_result_, m); - } - - friend std::ostream& operator<<(std::ostream& os, const KernelInstance& obj) - { - os << "{\n" - << " \"name\": \"" << obj.name_ << "\",\n" - << " \"problem\": " << obj.problem_ << ",\n" - << " \"perf_result\": " << obj.perf_result_ << "\n" - << "}"; - return os; - } -}; - -struct Setting -{ - int n_warmup_; - int n_repeat_; - bool is_gpu_timer_; - int verify_; - int init_method_; - bool log_; - std::string csv_filename_; - bool flush_cache_; - int rotating_count_; - bool json_output_; -}; - -inline std::string get_rocm_version() -{ - std::ifstream version_file("/opt/rocm/.info/version"); - if(version_file.is_open()) - { - std::string version; - std::getline(version_file, version); - return version; - } - return "Unknown"; -} - -template -auto calculate_rtol_atol(const ck_tile::index_t K, - const ck_tile::index_t kbatch, - const float max_accumulated_value) -{ - using ComputeType = - std::conditional_t; - // Calculate thresholds - const auto rtol = ck_tile::get_relative_threshold( - ck_tile::integer_divide_ceil(K, kbatch)); - const auto atol = ck_tile::get_absolute_threshold( - max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch)); - // Calculate error due to split_k accumulation - const auto rtol_split_k = - ck_tile::get_relative_threshold(kbatch); - const auto atol_split_k = ck_tile::get_absolute_threshold( - max_accumulated_value, kbatch); - // Use higher threshold - return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); -} - -/// @brief Function to compare the results of the device and host computations -bool compare(std::string instanceName, - ck_tile::index_t K, - ck_tile::index_t kbatch, - ck_tile::HostTensor& c_m_n_dev_result, - ck_tile::HostTensor& c_m_n_ref) -{ - const float max_accumulated_value = - *std::max_element(c_m_n_ref.mData.begin(), c_m_n_ref.mData.end()); - const auto rtol_atol = calculate_rtol_atol( - K, kbatch, max_accumulated_value); - bool pass = ck_tile::check_err(c_m_n_dev_result, - c_m_n_ref, - "Error: Incorrect results!", - rtol_atol.at(ck_tile::number<0>{}), - rtol_atol.at(ck_tile::number<1>{})); - - std::cout << "For " << instanceName << " Relative error threshold is " - << rtol_atol.at(ck_tile::number<0>{}) << " Absolute error threshold is " - << 
rtol_atol.at(ck_tile::number<1>{}) << std::endl; - std::cout << "The verification result is:" << (pass ? "correct" : "fail") << std::endl; - - return pass; -} +#include "gemm/gemm_benchmark.hpp" /// @brief Function to get the kernel output with reference implementation on CPU/GPU void gemm_host_reference(int verify, diff --git a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py index 1ea33834d7d..935de186d66 100644 --- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py +++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py @@ -36,7 +36,7 @@ def _import_benchmark_utils(): # Load the module dynamically spec = importlib.util.spec_from_file_location( "benchmark_utils", - os.path.join(parent_dir, "commons", "benchmark_utils.py"), + os.path.join(parent_dir, "common", "benchmark_utils.py"), ) benchmark_utils = importlib.util.module_from_spec(spec) spec.loader.exec_module(benchmark_utils) diff --git a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_common.hpp b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_common.hpp index 1b2cfe37350..7d3164f9d48 100644 --- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_common.hpp +++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_common.hpp @@ -9,72 +9,6 @@ #include "ck_tile/core/numeric/integer.hpp" #include "ck_tile/core/numeric/pk_int4.hpp" -//[TODO] This can be moved to commons -// DataTypeTraits for all supported types -template -struct DataTypeTraits; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp32"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp64"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp16"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "bf16"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp8"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "bf8"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "int8"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "int32"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "pk_int4_t"; -}; - -// Helper function to determine if a layout is row-major -template -constexpr auto is_row_major(Layout) -{ - return ck_tile::bool_constant>{}; -} - // Structure to hold kernel traits for dispatcher struct KernelTraits { diff --git a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_profiler.hpp b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_profiler.hpp index 739bd7e677a..4cf980dbf73 100644 --- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_profiler.hpp +++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_profiler.hpp @@ -5,41 +5,27 @@ #include "ck_tile/host/device_prop.hpp" #include "ck_tile/ops/gemm.hpp" +#include "gemm/gemm_profiler.hpp" #include "gemm_preshuffle_benchmark.hpp" -class GemmProfiler +class GemmPreshuffleProfiler : public GemmProfiler { - public: - static GemmProfiler& instance(Setting setting) - { - static GemmProfiler instance{setting}; - return instance; - } +public: + using BaseGemm = GemmProfiler; + using BaseGemm::benchmark; - // Overload for single kernel benchmarking - void benchmark(GemmProblem& gemm_problem, - std::function - kernel_func, - KernelConfig& config) - { - // Create a vector with a single 
callable that returns both name and time - std::vector(ck_tile::GemmHostArgs&, - const ck_tile::stream_config&)>> - callables; + GemmPreshuffleProfiler(Setting setting) + : GemmProfiler(setting) {} - callables.push_back( - [kernel_func](ck_tile::GemmHostArgs& args, const ck_tile::stream_config& stream) { - float time = kernel_func(args, stream); - return std::make_tuple(std::string(KERNEL_NAME), time); - }); - - benchmark(gemm_problem, callables, config); - } void benchmark(GemmProblem& gemm_problem, std::vector( ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>>& callables, - KernelConfig& config) + KernelConfig& config) override { const ALayout layout_a = ALayout{}; const BLayout layout_b = BLayout{}; @@ -159,131 +145,4 @@ class GemmProfiler } } - void process_result(const GemmProblem& gemm_problem, - ck_tile::DeviceMem& c_m_n_dev_buf, - ck_tile::HostTensor& c_m_n_ref, - ck_tile::HostTensor& c_m_n_dev_result, - const std::tuple& kernel_run_result) - { - auto [name, avg_time] = kernel_run_result; - - KernelInstance kernel_instance{name, gemm_problem, {-1.0f, -1.0f, -1.0f}}; - - // compute performance metric - std::size_t flop = std::size_t(2) * gemm_problem.m_ * gemm_problem.n_ * gemm_problem.k_; - std::size_t num_byte = sizeof(ADataType) * gemm_problem.m_ * gemm_problem.k_ + - sizeof(BDataType) * gemm_problem.n_ * gemm_problem.k_ + - sizeof(CDataType) * gemm_problem.m_ * gemm_problem.n_; - - // update - kernel_instance.perf_result_.latency_ = avg_time; - kernel_instance.perf_result_.tflops_ = static_cast(flop) / 1.E9 / avg_time; - kernel_instance.perf_result_.bandwidth_ = num_byte / 1.E6 / avg_time; - - if(setting_.log_ > 0 && !setting_.json_output_) - { - std::cout << kernel_instance << std::endl; - } - - // verify result - c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); - - bool verified_correct = - !setting_.verify_ || - compare(name, gemm_problem.k_, gemm_problem.split_k_, c_m_n_dev_result, c_m_n_ref); - - if(verified_correct) - { - kernel_instances_.emplace_back(kernel_instance); - } - else - { - std::cout << "Verification failed, skip kernel: " << name << std::endl; - } - - // clear tensor - c_m_n_dev_buf.SetZero(); - c_m_n_dev_result.SetZero(); - } - - KernelInstance select_best_instance(Metric metric) - { - if(kernel_instances_.empty()) - throw std::runtime_error("Empty instances"); - - auto kernel_instance = *std::max_element(kernel_instances_.begin(), - kernel_instances_.end(), - [metric](const auto& a, const auto& b) { - return PerformanceResult::compare( - b.perf_result_, a.perf_result_, metric); - }); - - if(setting_.json_output_) - { - // Output clean JSON only - std::cout << kernel_instance << std::endl; - } - else - { - std::cout << "**********************************" << std::endl; - std::cout << "According to given metrics: " << get_metric_name(metric) << "\n" - << "Current kernel performance is: " << kernel_instance << std::endl; - std::cout << "**********************************" << std::endl; - } - - if(!setting_.csv_filename_.empty()) - { - std::ofstream file(setting_.csv_filename_ + ".csv", std::ios::app); - - if(!file.is_open()) - { - std::cerr << "Warning: Failed to open CSV file for writing." 
<< std::endl; - } - else - { - if(file.tellp() == 0) - { - file << "rocm_version,device_name," - << "split_k,m,n,k,stride_a,stride_b,stride_c," - << "dtype_a,dtype_b,dtype_acc,dtype_c," << "layout_a,layout_b,layout_c," - << "structured_sparsity," << "name," - << "latency(ms),tflops(TFlops),bandwidth(GB/s),metric\n"; - } - - const auto& problem = kernel_instance.problem_; - const auto& name = kernel_instance.name_; - const auto& perf = kernel_instance.perf_result_; - - file << get_rocm_version() << "," << ck_tile::get_device_name() << "," - << problem.split_k_ << "," << problem.m_ << "," << problem.n_ << "," - << problem.k_ << "," << problem.stride_a_ << "," << problem.stride_b_ << "," - << problem.stride_c_ << "," << problem.dtype_a_ << "," << problem.dtype_b_ - << "," << problem.dtype_acc_ << "," << problem.dtype_c_ << "," - << problem.layout_a_ << "," << problem.layout_b_ << "," << problem.layout_c_ - << "," << problem.structured_sparsity_ << "," << name << "," << std::fixed - << std::setprecision(4) << perf.latency_ << "," << std::fixed - << std::setprecision(4) << perf.tflops_ << "," << std::fixed - << std::setprecision(4) << perf.bandwidth_ << "," << get_metric_name(metric) - << "\n"; - - if(!file) - { - std::cerr << "Warning: Error occurred while writing to CSV file." << std::endl; - } - } - } - - return kernel_instance; - } - - GemmProfiler(const GemmProfiler&) = delete; - GemmProfiler& operator=(const GemmProfiler&) = delete; - - private: - ~GemmProfiler() { kernel_instances_.clear(); } - GemmProfiler(Setting setting) : setting_(setting) {} - - Setting setting_; - - std::vector kernel_instances_; }; diff --git a/tile_engine/ops/gemm/gemm_profiler.hpp b/tile_engine/ops/gemm/gemm_profiler.hpp new file mode 100644 index 00000000000..4c9b706fe35 --- /dev/null +++ b/tile_engine/ops/gemm/gemm_profiler.hpp @@ -0,0 +1,200 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + + +#include "ck_tile/host/device_prop.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "gemm_benchmark.hpp" + +template +class GemmProfiler +{ +public: + static Gemm& instance(Setting setting) + { + static Gemm instance{setting}; + return instance; + } + + // Overload for single kernel benchmarking + void benchmark(Problem& gemm_problem, + std::function + kernel_func) + { + // Create a vector with a single callable that returns both name and time + std::vector(GemmArgs&, + const ck_tile::stream_config&)>> + callables; + + callables.push_back( + [kernel_func](GemmArgs& args, const ck_tile::stream_config& stream) { + float time = kernel_func(args, stream); + return std::make_tuple(std::string(KERNEL_NAME), time); + }); + + benchmark(gemm_problem, callables); // TODO: need to cast this? 
+ } + + virtual void benchmark(Problem& gemm_problem, + std::vector( + GemmArgs&, const ck_tile::stream_config&)>>& callables) = 0; + + void process_result(const Problem& gemm_problem, + ck_tile::DeviceMem& c_m_n_dev_buf, + ck_tile::HostTensor& c_m_n_host_result, + ck_tile::HostTensor& c_m_n_dev_result, + const std::tuple& kernel_run_result) + { + auto [name, avg_time] = kernel_run_result; + using DDataType = typename get_DsDataType::type; + + KernelInstance kernel_instance{name, gemm_problem, {-1.0f, -1.0f, -1.0f}}; + + // compute performance metric + std::size_t flop = std::size_t(2) * gemm_problem.m_ * gemm_problem.n_ * gemm_problem.k_; + std::size_t num_byte = sizeof(ADataType) * gemm_problem.m_ * gemm_problem.k_ + + sizeof(BDataType) * gemm_problem.n_ * gemm_problem.k_ + + sizeof(CDataType) * gemm_problem.m_ * gemm_problem.n_; + + + if constexpr (!std::is_void_v) + { + ck_tile::static_for<0, DDataType::size(), 1>{}([&](auto i) { + using DType = ck_tile::remove_cvref_t>; + num_byte += sizeof(DType) * gemm_problem.m_ * gemm_problem.n_; + flop += sizeof(DType) * gemm_problem.m_ * gemm_problem.n_; + }); + } + + + // update + kernel_instance.perf_result_.latency_ = avg_time; + kernel_instance.perf_result_.tflops_ = static_cast(flop) / 1.E9 / avg_time; + kernel_instance.perf_result_.bandwidth_ = num_byte / 1.E6 / avg_time; + + if(setting_.log_ > 0 && !setting_.json_output_) + { + std::cout << kernel_instance << std::endl; + } + + // verify result + c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); + int split_k = 1; + if constexpr (std::is_same_v) + { + split_k = gemm_problem.split_k_; + } + bool verified_correct = + !setting_.verify_ || + compare( + name, gemm_problem.k_, split_k, c_m_n_dev_result, c_m_n_host_result); + + if(verified_correct) + { + kernel_instances_.emplace_back(kernel_instance); + } + else + { + std::cout << "Verification failed, skip kernel: " << name << std::endl; + } + + // clear tensor + c_m_n_dev_buf.SetZero(); + c_m_n_dev_result.SetZero(); + } + + KernelInstance select_best_instance(Metric metric) + { + if(kernel_instances_.empty()) + throw std::runtime_error("Empty instances"); + + auto kernel_instance = *std::max_element(kernel_instances_.begin(), + kernel_instances_.end(), + [metric](const auto& a, const auto& b) { + return PerformanceResult::compare( + b.perf_result_, a.perf_result_, metric); + }); + + if(setting_.json_output_) + { + // Output clean JSON only + std::cout << kernel_instance << std::endl; + } + else + { + std::cout << "**********************************" << std::endl; + std::cout << "According to given metrics: " << get_metric_name(metric) << "\n" + << "Current kernel performance is: " << kernel_instance << std::endl; + std::cout << "**********************************" << std::endl; + } + + if(!setting_.csv_filename_.empty()) + { + std::ofstream file(setting_.csv_filename_ + ".csv", std::ios::app); + + if(!file.is_open()) + { + std::cerr << "Warning: Failed to open CSV file for writing." 
<< std::endl; + } + else + { + if(file.tellp() == 0) + { + file << "rocm_version,device_name," + << "split_k,m,n,k,stride_a,stride_b,stride_c," + << "dtype_a,dtype_b,dtype_acc,dtype_c," << "layout_a,layout_b,layout_c," + << "structured_sparsity," << "name," + << "latency(ms),tflops(TFlops),bandwidth(GB/s),metric\n"; + } + + const auto& problem = kernel_instance.problem_; + const auto& name = kernel_instance.name_; + const auto& perf = kernel_instance.perf_result_; + + file << get_rocm_version() << "," << ck_tile::get_device_name() << "," + << problem.split_k_ << "," << problem.m_ << "," << problem.n_ << "," + << problem.k_ << "," << problem.stride_a_ << "," << problem.stride_b_ << "," + << problem.stride_c_ << "," << problem.dtype_a_ << "," << problem.dtype_b_ + << "," << problem.dtype_acc_ << "," << problem.dtype_c_ << "," + << problem.layout_a_ << "," << problem.layout_b_ << "," << problem.layout_c_ + << "," << problem.structured_sparsity_ << "," << name << "," << std::fixed + << std::setprecision(4) << perf.latency_ << "," << std::fixed + << std::setprecision(4) << perf.tflops_ << "," << std::fixed + << std::setprecision(4) << perf.bandwidth_ << "," << get_metric_name(metric) + << "\n"; + + if(!file) + { + std::cerr << "Warning: Error occurred while writing to CSV file." << std::endl; + } + } + } + + return kernel_instance; + } + + GemmProfiler(const GemmProfiler&) = delete; + GemmProfiler& operator=(const GemmProfiler&) = delete; + + protected: + virtual ~GemmProfiler() { kernel_instances_.clear(); } + GemmProfiler(Setting setting) : setting_(setting) {} + + Setting setting_; + + std::vector> kernel_instances_; +}; + + diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.hpp b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.hpp index c7f4f470b0f..4cf272a3c82 100644 --- a/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.hpp +++ b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.hpp @@ -11,187 +11,11 @@ #include "ck_tile/core.hpp" #include "ck_tile/host.hpp" -#include "gemm_universal_common.hpp" +#include "gemm/gemm_benchmark.hpp" // Data types and Layouts are defined by the generated kernel headers // No hardcoded type definitions here to avoid conflicts -enum class Metric -{ - LATENCY = 0, - TFLOPS = 1, - BANDWIDTH = 2 -}; - -inline constexpr auto get_metric_name(Metric m) -{ - switch(m) - { - case Metric::LATENCY: return "latency"; - case Metric::TFLOPS: return "tflops"; - case Metric::BANDWIDTH: return "bandwidth"; - default: throw std::invalid_argument("Unsupported metric type"); - } -} - -struct GemmProblem -{ - int split_k_; - int m_, n_, k_; - int stride_a_, stride_b_, stride_c_; - - std::string dtype_a_, dtype_b_, dtype_acc_, dtype_c_; - std::string layout_a_, layout_b_, layout_c_; - - bool structured_sparsity_; - - friend std::ostream& operator<<(std::ostream& os, const GemmProblem& problem) - { - os << "{\n" - << " \"split_k\":" << problem.split_k_ << ",\n" - << " \"m\":" << problem.m_ << ",\n" - << " \"n\":" << problem.n_ << ",\n" - << " \"k\":" << problem.k_ << ",\n" - << " \"stride_a\":" << problem.stride_a_ << ",\n" - << " \"stride_b\":" << problem.stride_b_ << ",\n" - << " \"stride_c\":" << problem.stride_c_ << ",\n" - << " \"dtype_a\":\"" << problem.dtype_a_ << "\",\n" - << " \"dtype_b\":\"" << problem.dtype_b_ << "\",\n" - << " \"dtype_acc\":\"" << problem.dtype_acc_ << "\",\n" - << " \"dtype_c\":\"" << problem.dtype_c_ << "\",\n" - << " \"layout_a\":\"" << problem.layout_a_ << "\",\n" - << " \"layout_b\":\"" << 
problem.layout_b_ << "\",\n" - << " \"layout_c\":\"" << problem.layout_c_ << "\",\n" - << " \"structured_sparsity\":" << (problem.structured_sparsity_ ? "true" : "false") - << "\n" - << "}"; - return os; - } -}; - -struct PerformanceResult -{ - double latency_; - double tflops_; - double bandwidth_; - - static bool compare(const PerformanceResult& a, const PerformanceResult& b, Metric m) - { - switch(m) - { - case Metric::LATENCY: return a.latency_ < b.latency_; - case Metric::TFLOPS: return a.tflops_ > b.tflops_; - case Metric::BANDWIDTH: return a.bandwidth_ > b.bandwidth_; - default: throw std::invalid_argument("Unsupported metric type"); - } - } - - friend std::ostream& operator<<(std::ostream& os, const PerformanceResult& result) - { - os << "{\n" - << " \"latency(ms)\": " << std::fixed << std::setprecision(2) << result.latency_ - << ",\n" - << " \"tflops(TFlops)\": " << result.tflops_ << ",\n" - << " \"bandwidth(GB/s)\": " << result.bandwidth_ << "\n" - << "}"; - return os; - } -}; - -struct KernelInstance -{ - std::string name_; - GemmProblem problem_; - PerformanceResult perf_result_; - - static bool compare(const KernelInstance& a, const KernelInstance& b, Metric m) - { - return PerformanceResult::compare(a.perf_result_, b.perf_result_, m); - } - - friend std::ostream& operator<<(std::ostream& os, const KernelInstance& obj) - { - os << "{\n" - << " \"name\": \"" << obj.name_ << "\",\n" - << " \"problem\": " << obj.problem_ << ",\n" - << " \"perf_result\": " << obj.perf_result_ << "\n" - << "}"; - return os; - } -}; - -struct Setting -{ - int n_warmup_; - int n_repeat_; - bool is_gpu_timer_; - int verify_; - int init_method_; - bool log_; - std::string csv_filename_; - bool flush_cache_; - int rotating_count_; - bool json_output_; -}; - -inline std::string get_rocm_version() -{ - std::ifstream version_file("/opt/rocm/.info/version"); - if(version_file.is_open()) - { - std::string version; - std::getline(version_file, version); - return version; - } - return "Unknown"; -} - -template -auto calculate_rtol_atol(const ck_tile::index_t K, - const ck_tile::index_t kbatch, - const float max_accumulated_value) -{ - using ComputeType = - std::conditional_t; - // Calculate thresholds - const auto rtol = ck_tile::get_relative_threshold( - ck_tile::integer_divide_ceil(K, kbatch)); - const auto atol = ck_tile::get_absolute_threshold( - max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch)); - // Calculate error due to split_k accumulation - const auto rtol_split_k = - ck_tile::get_relative_threshold(kbatch); - const auto atol_split_k = ck_tile::get_absolute_threshold( - max_accumulated_value, kbatch); - // Use higher threshold - return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); -} - -/// @brief Function to compare the results of the device and host computations -bool compare(std::string instanceName, - ck_tile::index_t K, - ck_tile::index_t kbatch, - ck_tile::HostTensor& c_m_n_dev_result, - ck_tile::HostTensor& c_m_n_host_result) -{ - const float max_accumulated_value = - *std::max_element(c_m_n_host_result.mData.begin(), c_m_n_host_result.mData.end()); - const auto rtol_atol = calculate_rtol_atol( - K, kbatch, max_accumulated_value); - bool pass = ck_tile::check_err(c_m_n_dev_result, - c_m_n_host_result, - "Error: Incorrect results!", - rtol_atol.at(ck_tile::number<0>{}), - rtol_atol.at(ck_tile::number<1>{})); - - std::cout << "For " << instanceName << " Relative error threshold is " - << rtol_atol.at(ck_tile::number<0>{}) << " Absolute 
error threshold is " - << rtol_atol.at(ck_tile::number<1>{}) << std::endl; - std::cout << "The verification result is:" << (pass ? "correct" : "fail") << std::endl; - - return pass; -} - /// @brief Function to get the kernel output with reference implementation on CPU/GPU void gemm_host_reference(int verify, ck_tile::HostTensor& a_m_k, diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py index 88ed4465af3..4fa5d5dee0e 100755 --- a/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py +++ b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py @@ -36,7 +36,7 @@ def _import_benchmark_utils(): # Load the module dynamically spec = importlib.util.spec_from_file_location( "benchmark_utils", - os.path.join(parent_dir, "commons", "benchmark_utils.py"), + os.path.join(parent_dir, "common", "benchmark_utils.py"), ) benchmark_utils = importlib.util.module_from_spec(spec) spec.loader.exec_module(benchmark_utils) diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark_single.cpp b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark_single.cpp index 613a42ff80b..bf81acabd29 100644 --- a/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark_single.cpp +++ b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark_single.cpp @@ -12,20 +12,18 @@ #include "ck_tile/core.hpp" #include "ck_tile/host.hpp" #include "gemm_universal_profiler.hpp" -#include "gemm_universal_common.hpp" // The kernel header is included via the compile command line with -include flag // It defines SelectedKernel struct and KERNEL_NAME -// DataTypeTraits are now defined in gemm_common.hpp void benchmark_single(const ck_tile::ArgParser& arg_parser) { // Use DataTypeTraits to get the actual type names from the generated header // The generated header defines ADataType, BDataType, AccDataType, CDataType - std::string dtype_a = DataTypeTraits::name; - std::string dtype_b = DataTypeTraits::name; - std::string dtype_acc = DataTypeTraits::name; - std::string dtype_c = DataTypeTraits::name; + std::string dtype_a = ck_tile::DataTypeTraits::name; + std::string dtype_b = ck_tile::DataTypeTraits::name; + std::string dtype_acc = ck_tile::DataTypeTraits::name; + std::string dtype_c = ck_tile::DataTypeTraits::name; // Layout names from the layout types std::string layout_a = ALayout::name; @@ -62,7 +60,7 @@ void benchmark_single(const ck_tile::ArgParser& arg_parser) arg_parser.get_bool("json_output")}; // Get the profiler instance - auto& profiler = GemmProfiler::instance(setting); + auto& profiler = UniversalGemmProfiler::GemmProfiler::instance(setting); try { diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_universal_common.hpp b/tile_engine/ops/gemm/gemm_universal/gemm_universal_common.hpp deleted file mode 100644 index 899221547f6..00000000000 --- a/tile_engine/ops/gemm/gemm_universal/gemm_universal_common.hpp +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
-// SPDX-License-Identifier: MIT - -#pragma once - -#include -#include "ck_tile/core.hpp" -#include "ck_tile/host.hpp" -#include "ck_tile/core/numeric/integer.hpp" -#include "ck_tile/core/numeric/pk_int4.hpp" - -//[TODO] This can be moved to commons -// DataTypeTraits for all supported types -template -struct DataTypeTraits; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp32"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp64"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp16"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "bf16"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "fp8"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "bf8"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "int8"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "int32"; -}; - -template <> -struct DataTypeTraits -{ - static constexpr const char* name = "pk_int4_t"; -}; - -// Helper function to determine if a layout is row-major -template -constexpr auto is_row_major(Layout) -{ - return ck_tile::bool_constant>{}; -} - -// Structure to hold kernel traits for dispatcher -struct KernelTraits -{ - std::string pipeline; // compv3, compv4, mem - std::string scheduler; // intrawave, interwave - std::string epilogue; // cshuffle, default - bool pad_m; - bool pad_n; - bool pad_k; - bool persistent; - - // Constructor with defaults - KernelTraits() - : pipeline("compv3"), - scheduler("intrawave"), - epilogue("cshuffle"), - pad_m(false), - pad_n(false), - pad_k(false), - persistent(false) - { - } -}; diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_universal_profiler.hpp b/tile_engine/ops/gemm/gemm_universal/gemm_universal_profiler.hpp index 9b728c52d6a..a1d749d6784 100644 --- a/tile_engine/ops/gemm/gemm_universal/gemm_universal_profiler.hpp +++ b/tile_engine/ops/gemm/gemm_universal/gemm_universal_profiler.hpp @@ -10,38 +10,24 @@ #include "ck_tile/host/device_prop.hpp" #include "ck_tile/ops/gemm.hpp" #include "gemm_universal_benchmark.hpp" +#include "gemm/gemm_profiler.hpp" -class GemmProfiler +class UniversalGemmProfiler : public GemmProfiler { - public: - static GemmProfiler& instance(Setting setting) - { - static GemmProfiler instance{setting}; - return instance; - } - - // Overload for single kernel benchmarking - void benchmark(GemmProblem& gemm_problem, - std::function - kernel_func) - { - // Create a vector with a single callable that returns both name and time - std::vector(ck_tile::GemmHostArgs&, - const ck_tile::stream_config&)>> - callables; - - callables.push_back( - [kernel_func](ck_tile::GemmHostArgs& args, const ck_tile::stream_config& stream) { - float time = kernel_func(args, stream); - return std::make_tuple(std::string(KERNEL_NAME), time); - }); +public: + using BaseGemm = GemmProfiler; + using BaseGemm::benchmark; - benchmark(gemm_problem, callables); - } + UniversalGemmProfiler(Setting setting) + : GemmProfiler(setting) {} void benchmark(GemmProblem& gemm_problem, std::vector( - ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>>& callables) + ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>>& callables) override { const ALayout layout_a = ALayout{}; const BLayout layout_b = BLayout{}; @@ -158,132 +144,4 @@ class GemmProfiler kernel_run_result); } } - - void process_result(const GemmProblem& 
gemm_problem, - ck_tile::DeviceMem& c_m_n_dev_buf, - ck_tile::HostTensor& c_m_n_host_result, - ck_tile::HostTensor& c_m_n_dev_result, - const std::tuple& kernel_run_result) - { - auto [name, avg_time] = kernel_run_result; - - KernelInstance kernel_instance{name, gemm_problem, {-1.0f, -1.0f, -1.0f}}; - - // compute performance metric - std::size_t flop = std::size_t(2) * gemm_problem.m_ * gemm_problem.n_ * gemm_problem.k_; - std::size_t num_byte = sizeof(ADataType) * gemm_problem.m_ * gemm_problem.k_ + - sizeof(BDataType) * gemm_problem.n_ * gemm_problem.k_ + - sizeof(CDataType) * gemm_problem.m_ * gemm_problem.n_; - - // update - kernel_instance.perf_result_.latency_ = avg_time; - kernel_instance.perf_result_.tflops_ = static_cast(flop) / 1.E9 / avg_time; - kernel_instance.perf_result_.bandwidth_ = num_byte / 1.E6 / avg_time; - - if(setting_.log_ > 0 && !setting_.json_output_) - { - std::cout << kernel_instance << std::endl; - } - - // verify result - c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); - bool verified_correct = - !setting_.verify_ || - compare( - name, gemm_problem.k_, gemm_problem.split_k_, c_m_n_dev_result, c_m_n_host_result); - - if(verified_correct) - { - kernel_instances_.emplace_back(kernel_instance); - } - else - { - std::cout << "Verification failed, skip kernel: " << name << std::endl; - } - - // clear tensor - c_m_n_dev_buf.SetZero(); - c_m_n_dev_result.SetZero(); - } - - KernelInstance select_best_instance(Metric metric) - { - if(kernel_instances_.empty()) - throw std::runtime_error("Empty instances"); - - auto kernel_instance = *std::max_element(kernel_instances_.begin(), - kernel_instances_.end(), - [metric](const auto& a, const auto& b) { - return PerformanceResult::compare( - b.perf_result_, a.perf_result_, metric); - }); - - if(setting_.json_output_) - { - // Output clean JSON only - std::cout << kernel_instance << std::endl; - } - else - { - std::cout << "**********************************" << std::endl; - std::cout << "According to given metrics: " << get_metric_name(metric) << "\n" - << "Current kernel performance is: " << kernel_instance << std::endl; - std::cout << "**********************************" << std::endl; - } - - if(!setting_.csv_filename_.empty()) - { - std::ofstream file(setting_.csv_filename_ + ".csv", std::ios::app); - - if(!file.is_open()) - { - std::cerr << "Warning: Failed to open CSV file for writing." 
<< std::endl; - } - else - { - if(file.tellp() == 0) - { - file << "rocm_version,device_name," - << "split_k,m,n,k,stride_a,stride_b,stride_c," - << "dtype_a,dtype_b,dtype_acc,dtype_c," << "layout_a,layout_b,layout_c," - << "structured_sparsity," << "name," - << "latency(ms),tflops(TFlops),bandwidth(GB/s),metric\n"; - } - - const auto& problem = kernel_instance.problem_; - const auto& name = kernel_instance.name_; - const auto& perf = kernel_instance.perf_result_; - - file << get_rocm_version() << "," << ck_tile::get_device_name() << "," - << problem.split_k_ << "," << problem.m_ << "," << problem.n_ << "," - << problem.k_ << "," << problem.stride_a_ << "," << problem.stride_b_ << "," - << problem.stride_c_ << "," << problem.dtype_a_ << "," << problem.dtype_b_ - << "," << problem.dtype_acc_ << "," << problem.dtype_c_ << "," - << problem.layout_a_ << "," << problem.layout_b_ << "," << problem.layout_c_ - << "," << problem.structured_sparsity_ << "," << name << "," << std::fixed - << std::setprecision(4) << perf.latency_ << "," << std::fixed - << std::setprecision(4) << perf.tflops_ << "," << std::fixed - << std::setprecision(4) << perf.bandwidth_ << "," << get_metric_name(metric) - << "\n"; - - if(!file) - { - std::cerr << "Warning: Error occurred while writing to CSV file." << std::endl; - } - } - } - - return kernel_instance; - } - - GemmProfiler(const GemmProfiler&) = delete; - GemmProfiler& operator=(const GemmProfiler&) = delete; - - private: - ~GemmProfiler() { kernel_instances_.clear(); } - GemmProfiler(Setting setting) : setting_(setting) {} - - Setting setting_; - - std::vector kernel_instances_; }; From bede2963ea5016ecb7a1e8bdb9ceb503f47fec5f Mon Sep 17 00:00:00 2001 From: Astha Date: Sun, 11 Jan 2026 23:15:28 -0500 Subject: [PATCH 3/3] Adding README back into the gemm directory and integrate new preshuffle functions --- Jenkinsfile | 6 +- test/ck_tile/gemm_tile_engine/CMakeLists.txt | 10 +- tile_engine/ops/common/utils.hpp | 83 ---- tile_engine/ops/gemm/README.md | 442 ++++++++++++++++++ tile_engine/ops/gemm/gemm_benchmark.hpp | 22 +- tile_engine/ops/gemm/gemm_common.hpp | 96 ++++ .../gemm_multi_d_benchmark_single.cpp | 45 +- .../gemm_multi_d/gemm_multi_d_profiler.hpp | 22 +- .../gemm_preshuffle_benchmark.hpp | 17 + .../gemm_preshuffle_benchmark_single.cpp | 83 +--- .../gemm_preshuffle_common.hpp | 64 +-- .../gemm_preshuffle_profiler.hpp | 42 +- tile_engine/ops/gemm/gemm_profiler.hpp | 66 ++- .../gemm_universal_benchmark.py | 21 +- .../gemm_universal_benchmark_single.cpp | 1 + .../gemm_universal_profiler.hpp | 18 +- 16 files changed, 697 insertions(+), 341 deletions(-) create mode 100644 tile_engine/ops/gemm/README.md create mode 100644 tile_engine/ops/gemm/gemm_common.hpp diff --git a/Jenkinsfile b/Jenkinsfile index 2f2229c7a5f..1b0034474dd 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1644,7 +1644,7 @@ pipeline { -D GEMM_PRESHUFFLE_LAYOUT="rcr" \ -D GEMM_PRESHUFFLE_CONFIG_FILE="default_ci_config.json" .. && \ ninja -j${nthreads()} benchmark_gemm_universal_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all && \ - python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ + python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ python3 ../tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py . 
--problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ python3 ../tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """ } @@ -1685,7 +1685,7 @@ pipeline { -D GEMM_PRESHUFFLE_DATATYPE="fp16;fp8;bf16;bf8" \ -D GEMM_PRESHUFFLE_LAYOUT="rcr" .. && \ ninja -j${nthreads()} benchmark_gemm_universal_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all benchmark_gemm_streamk_all && \ - python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ + python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ python3 ../tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \ python3 ../tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """ } @@ -1710,7 +1710,7 @@ pipeline { -D GEMM_UNIVERSAL_DATATYPE="fp16" \ -D GEMM_UNIVERSAL_LAYOUT="rcr;rrr;crr;ccr" .. && \ ninja -j${nthreads()} benchmark_gemm_universal_all && \ - python3 ../tile_engine/ops/gemm/gemm_universal/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """ + python3 ../tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """ } steps{ buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) diff --git a/test/ck_tile/gemm_tile_engine/CMakeLists.txt b/test/ck_tile/gemm_tile_engine/CMakeLists.txt index 33effcc1206..dc148d45e78 100644 --- a/test/ck_tile/gemm_tile_engine/CMakeLists.txt +++ b/test/ck_tile/gemm_tile_engine/CMakeLists.txt @@ -10,7 +10,7 @@ # ============================================================================ # Locate tile_engine GEMM scripts directory -set(TILE_ENGINE_GEMM_DIR "${PROJECT_SOURCE_DIR}/tile_engine/ops/gemm") +set(TILE_ENGINE_GEMM_DIR "${PROJECT_SOURCE_DIR}/tile_engine/ops/gemm/gemm_universal") if(NOT EXISTS ${TILE_ENGINE_GEMM_DIR}) message(WARNING "Tile engine directory not found: ${TILE_ENGINE_GEMM_DIR}") @@ -32,11 +32,11 @@ endif() # config_json - Full path to JSON configuration file # ============================================================================ function(create_individual_gemm_test_target datatype layout config_name trait tile_config config_json) - set(target_name "test_gemm_tile_engine_${datatype}_${layout}_${config_name}_${trait}_${tile_config}") + set(target_name "test_gemm_universal_tile_engine_${datatype}_${layout}_${config_name}_${trait}_${tile_config}") set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${layout}/${config_name}") # Generated header path (already created during cmake configuration) - set(test_header "${working_path}/gemm_single_${datatype}_${layout}_${trait}_${tile_config}.hpp") + set(test_header "${working_path}/gemm_universal_single_${datatype}_${layout}_${trait}_${tile_config}.hpp") set(test_params_header "${working_path}/test_params.hpp") # Verify header exists (should have been generated during cmake configuration) @@ -118,7 +118,7 @@ function(build_gemm_test_targets datatype layout config_name) # STEP 1: Discovery phase - list all valid 
kernel configurations execute_process( - COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_instance_builder.py + COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_universal_instance_builder.py --working_path ${working_path} --datatype ${datatype} --layout ${layout} @@ -178,7 +178,7 @@ function(build_gemm_test_targets datatype layout config_name) # Generate header using --gen_single execute_process( - COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_instance_builder.py + COMMAND ${Python3_EXECUTABLE} -u ${TILE_ENGINE_GEMM_DIR}/gemm_universal_instance_builder.py --working_path ${working_path} --gpu_target "${GEMM_TEST_GPU_TARGETS}" --datatype ${datatype} diff --git a/tile_engine/ops/common/utils.hpp b/tile_engine/ops/common/utils.hpp index 20994578e64..56bfbde5a07 100644 --- a/tile_engine/ops/common/utils.hpp +++ b/tile_engine/ops/common/utils.hpp @@ -20,89 +20,6 @@ constexpr auto is_row_major(Layout) return ck_tile::bool_constant>{}; } -// Structure to hold kernel traits for dispatcher -struct KernelTraits -{ - std::string pipeline; // compv3, compv4, mem - std::string scheduler; // intrawave, interwave - std::string epilogue; // cshuffle, default - bool pad_m; - bool pad_n; - bool pad_k; - bool persistent; - - // Constructor with defaults - KernelTraits() - : pipeline("compv3"), - scheduler("intrawave"), - epilogue("cshuffle"), - pad_m(false), - pad_n(false), - pad_k(false), - persistent(false) - { - } -}; - - -// Create argument parser -inline auto create_args(int argc, char* argv[]) -{ - ck_tile::ArgParser arg_parser; - arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.") - .insert("n", "4096", "The value for n dimension. Default is 4096.") - .insert("k", "2048", "The value for k dimension. Default is 2048.") - .insert("stride_a", "0", "The stride value for tensor A. Default is 0.") - .insert("stride_b", "0", "The stride value for tensor B. Default is 0.") - .insert("stride_ds", "0", "The stride value for tensor Ds . Default is 0.") - .insert("stride_c", "0", "The stride value for tensor C. Default is 0.") - .insert("split_k", "1", "The split value for k dimension. Default is 1.") - .insert("verify", - "2", - "The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 " - "for validation on GPU. Default is 2, GPU validation.") - .insert("log", - "false", - "Whether output kernel instance information or not. Possible values are true or " - "false. Default is false") - .insert( - "warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.") - .insert( - "repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.") - .insert("timer", - "true", - "Whether if the timer is gpu timer or not. Possible values are false or true. " - "Default is true.") - .insert("init", - "0", - "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 " - "for constant(1). Default is 0, random.") - .insert("flush_cache", - "true", - "To flush cache, possible values are true or false. " - "Default is false.") - .insert("rotating_count", "1000", "number of iterations to rotate the cache. default is 5.") - .insert("metric", - "0", - "Metric with which to measure kernel performance. Set to 0 for latency, 1 for " - "tflops, or 2 for bandwidth. Default is 0, latency.") - .insert("csv_filename", - "", - "The filename of benchmark result. Default is empty (no CSV output).") - .insert("structured_sparsity", - "false", - "Whether use sparsity kernel or not. 
Possible values are true or false. Default is " - "false") - .insert("json_output", - "false", - "Whether to output results in JSON format only. Possible values are true or false. " - "Default is " - "false"); - - bool result = arg_parser.parse(argc, argv); - return std::make_tuple(result, arg_parser); -} - enum class Metric { LATENCY = 0, diff --git a/tile_engine/ops/gemm/README.md b/tile_engine/ops/gemm/README.md new file mode 100644 index 00000000000..5e0bae70806 --- /dev/null +++ b/tile_engine/ops/gemm/README.md @@ -0,0 +1,442 @@ +# CK Tile Engine GEMM Operations + +## Overview + +The CK Tile Engine GEMM module provides a comprehensive system for generating, building, and benchmarking GEMM (General Matrix Multiplication) kernels with various configurations. It supports multiple data types, layouts, and optimization strategies. The system has evolved from a monolithic build approach (where all kernels compile into a single executable) to a more flexible individual kernel compilation system, providing better build parallelism and targeted testing capabilities. + +## Table of Contents + +1. [Build System Architecture](#build-system-architecture) +2. [Build Instructions](#build-instructions) +3. [Running Benchmarks](#running-benchmarks) +4. [Configuration System](#configuration-system) +5. [Scripts and Tools](#scripts-and-tools) +6. [Command Line Options](#command-line-options) +7. [Understanding Kernel Names](#understanding-kernel-names) +8. [Troubleshooting](#troubleshooting) +9. [Performance Tips](#performance-tips) + +## Build System Architecture + +### Individual Kernel Compilation (New Approach) + +The new tile engine benchmark system compiles each kernel configuration into a separate executable. This provides: +- Better build parallelism +- Faster incremental builds +- More targeted testing +- Easier debugging of specific configurations + +Each benchmark executable follows the naming pattern: +``` +benchmark_gemm____ +``` + +### Monolithic Build (Legacy Approach) + +The original system compiles all kernels into a single executable (`benchmark_gemm_[Datatype]_[Layout]`), which can then be filtered at runtime using command-line arguments. + +## Build Instructions + +### Prerequisites +- ROCm installation +- CMake 3.16 or higher +- C++17 compatible compiler + +### Basic Build + +```bash +# In the root of composable kernel, create build directory +mkdir build && cd build + +# Configure with specific datatypes and layouts +# Replace [Arch] with your GPU architecture (e.g., gfx90a, gfx942) +# Replace [Datatype1;Datatype2;...] with datatypes (fp8, bf8, int8, fp16, bf16, fp32, fp64) +# Replace [Layout1;Layout2;...] with layouts (rcr, rrr, crr, ccr) +../script/cmake-ck-dev.sh ../ [Arch] -DGEMM_DATATYPE="[Datatype1;Datatype2]" -DGEMM_LAYOUT="[Layout1;Layout2]" + +# Build specific benchmarks +make benchmark_gemm_[Datatype1]_[Layout1] -j +``` + +### Configuration Options + +The build system supports several configuration options: + +#### Using Custom Config Files +```bash +# Method 1: CMake variable (config file must be in configs/ directory) +cmake -DGEMM_CONFIG_FILE=my_custom_config.json ... + +# Method 2: Environment variable (takes precedence over CMake variable) +export GEMM_CONFIG_FILE=my_custom_config.json +cmake ... +``` + +#### Config File Priority Order +1. **Environment variable** `GEMM_CONFIG_FILE` (highest priority) +2. **CMake variable** `GEMM_CONFIG_FILE` +3. 
**Default config** (default_config.json for all layouts) + +**Note**: All custom config files must be placed in the `tile_engine/ops/gemm/configs/` directory. + +### Example Build Commands + +```bash +# Build for gfx942 with fp8 and fp16 datatypes, rcr layout +mkdir build && cd build +../script/cmake-ck-dev.sh ../ gfx942 -DGEMM_DATATYPE="fp8;fp16" -DGEMM_LAYOUT="rcr;ccr;rrr;crr" +make benchmark_gemm_universal_fp8_rcr -j +make benchmark_gemm_universal_fp16_rcr -j +``` + +### Building Individual Kernels + +```bash +# Build a specific kernel configuration +make benchmark_gemm_universal_fp8_rcr_compv4_default_intrawave_False_False_False_False_256x256x32_1x4x1_32x32x32 + +# Build all fp16 benchmarks in parallel +make -j$(nproc) $(make help | grep benchmark_gemm_fp16 | awk '{print $2}') +``` + +### Rebuilding After Configuration Changes + +If you modify the configuration file, you must rebuild: +```bash +rm -rf tile_engine/ && make benchmark_gemm_universal_[Datatype]_[Layout] -j +``` + +## Running Benchmarks + +### Individual Kernel Execution + +```bash +cd /path/to/build/directory +./bin/benchmark_gemm_universal_fp16_rcr_compv3_default_intrawave_False_False_False_False_256x128x32_4x1x1_32x32x16 \ + -m=512 -n=512 -k=512 -verify=1 +``` + +### Monolithic Executable (Legacy) + +```bash +# Run specific pipeline/scheduler/epilogue combination +./bin/benchmark_gemm_universal_[Datatype]_[Layout] -pipeline=compv3 -scheduler=intrawave -epilogue=default +``` + +### Automated Testing + +Use the provided test script to run multiple benchmarks: +```bash +cd /path/to/composable_kernel/tile_engine/ops/gemm +./test_benchmark.sh [build_directory] +``` + +## Configuration System + +### Configuration Files + +The system uses JSON configuration files to specify kernel parameters: + +- `configs/default_config.json` - Default configurations for various datatypes +- `configs/user_provided_config.json` - User-customizable configurations + +### Configuration Structure + +```json +{ + "tile_config": { + "tile_m": {"values": [256, 128]}, + "tile_n": {"values": [256, 128]}, + "tile_k": {"values": [64, 32]}, + "warp_m": {"values": [2, 4]}, + "warp_n": {"values": [2, 1]}, + "warp_k": {"values": [1]}, + "warp_tile_m": {"values": [32, 16]}, + "warp_tile_n": {"values": [32, 16]}, + "warp_tile_k": {"values": [16, 32]} + }, + "trait_config": { + "pipeline": {"values": ["compv3", "compv4", "mem"]}, + "scheduler": {"values": ["intrawave", "interwave"]}, + "epilogue": {"values": ["default", "cshuffle"]}, + "pad_m": {"values": [false]}, + "pad_n": {"values": [false]}, + "pad_k": {"values": [false]}, + "persistent": {"values": [false]} + } +} +``` + +## Scripts and Tools + +### Python Scripts + +#### gemm_universal_instance_builder.py +**Purpose**: Main kernel instance generation script that creates C++ kernel implementations based on configuration files. + +**Key Features**: +- Generates individual kernel header files for separate compilation +- Supports multiple data types (fp16, fp8, bf16, fp32, fp64) +- Validates tile configurations for correctness +- Creates CMake integration files + +**Usage**: +```bash +python gemm_universal_instance_builder.py \ + --working_path ./generated \ + --datatype fp16 \ + --layout rcr \ + --config_json configs/user_provided_config.json \ + --gen_all_individual +``` + +#### gemm_instance_builder_parallel.py +**Purpose**: Parallel version of the instance builder for faster generation of multiple kernel configurations. 
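+
+**Usage** (a sketch only, not taken from the script itself — it assumes the parallel builder accepts the same arguments as `gemm_universal_instance_builder.py` shown above; check the script's argument parser for the exact flags):
+```bash
+# Hypothetical invocation mirroring the serial builder's flags; the parallel
+# script is expected to fan the generation work out across worker processes.
+python gemm_instance_builder_parallel.py \
+    --working_path ./generated \
+    --datatype fp16 \
+    --layout rcr \
+    --config_json configs/user_provided_config.json \
+    --gen_all_individual
+```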
+ +**Features**: +- Multi-threaded kernel generation +- Improved performance for large configuration spaces + +#### validation_utils.py +**Purpose**: Provides comprehensive validation functions for kernel configurations. + +**Key Functions**: +- `is_tile_config_valid()` - Validates tile dimensions and alignments +- `is_trait_combination_valid()` - Checks if pipeline/epilogue/scheduler combinations are supported +- `validate_warp_tile_combination()` - GPU-specific warp tile validation +- `validate_lds_capacity()` - Ensures configurations fit in LDS memory + +**Validation Checks**: +- Dimension alignment (tile dimensions must be divisible by warp dimensions) +- LDS capacity constraints +- GPU-specific warp tile support +- Unsupported trait combinations + +#### test_validation.py +**Purpose**: Test suite for the validation logic to ensure correctness. + +**Usage**: +```bash +python test_validation.py +``` + +**Tests**: +- Warp tile combination validation +- Trait combination validation +- Full tile configuration validation + +#### gemm_universal_benchmark.py +**Purpose**: Python script for running and analyzing GEMM benchmarks. + +**Features**: +- Automated benchmark execution +- Performance data collection +- Result analysis and reporting + +#### json_config.py +**Purpose**: Configuration file parsing and management. + +**Features**: +- JSON configuration loading +- Default configuration handling +- Configuration validation + +#### codegen_utils.py +**Purpose**: Utility functions for code generation. + +**Features**: +- Template processing +- Code formatting utilities +- File generation helpers + +### Shell Scripts + +#### test_benchmark.sh +**Purpose**: Automated benchmark testing script that finds and runs all built benchmark executables. + +**Features**: +- Automatic build directory detection +- Batch execution of multiple benchmarks +- CSV result collection +- Colored output for easy reading +- Example command generation + +**Usage**: +```bash +# Auto-detect build directory +./test_benchmark.sh + +# Specify build directory +./test_benchmark.sh /path/to/build/directory +``` + +**What it does**: +1. Finds all benchmark executables in the build directory +2. Runs each with multiple problem sizes (512, 1024, 2048) +3. Performs GPU verification +4. Saves results to timestamped CSV file +5. 
Provides summary statistics + +## Command Line Options + +All benchmark executables support the following options: + +### Matrix Dimensions +- `-m=` - M dimension (default: 3840) +- `-n=` - N dimension (default: 4096) +- `-k=` - K dimension (default: 2048) + +### Strides +- `-stride_a=` - Stride for matrix A (default: 0, auto-calculated) +- `-stride_b=` - Stride for matrix B (default: 0, auto-calculated) +- `-stride_c=` - Stride for matrix C (default: 0, auto-calculated) + +### Verification +- `-verify=<0|1|2>` - Verification mode + - 0: No verification (default) + - 1: CPU verification + - 2: GPU verification + +### Performance Testing +- `-warmup=` - Warmup iterations (default: 50) +- `-repeat=` - Benchmark iterations (default: 100) +- `-timer=` - Use GPU timer (default: true) +- `-flush_cache=` - Flush cache between runs (default: true) +- `-rotating_count=` - Cache rotation count (default: 1000) + +### Initialization +- `-init=<0|1|2>` - Tensor initialization method + - 0: Random values [-1, 1] (default) + - 1: Linear sequence (i % 17) + - 2: Constant value (1.0) + +### Output Options +- `-log=` - Enable verbose logging (default: false) +- `-metric=<0|1|2>` - Performance metric + - 0: Latency in ms (default) + - 1: TFLOPS + - 2: Bandwidth in GB/s +- `-json_output=` - JSON format output (default: false) +- `-csv_filename=` - Save results to CSV +- `-csv_format=` - CSV format (default: comprehensive) + +### Advanced Options +- `-split_k=` - Split-K factor (default: 1) +- `-structured_sparsity=` - Enable structured sparsity (default: false) +- `-pipeline=` - Pipeline type (default: compv3) +- `-scheduler=` - Scheduler type (default: intrawave) +- `-epilogue=` - Epilogue type (default: cshuffle) +- `-pad_m=` - Pad M dimension (default: false) +- `-pad_n=` - Pad N dimension (default: false) +- `-pad_k=` - Pad K dimension (default: false) +- `-persistent=` - Use persistent kernel (default: false) + +## Understanding Kernel Names + +The kernel naming convention encodes the configuration: + +``` +benchmark_gemm_universal_fp16_rcr_compv3_default_intrawave_False_False_False_False_256x128x32_4x1x1_32x32x16 + ^^^^ ^^^ ^^^^^^ ^^^^^^^ ^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^ ^^^^^^^ ^^^^^^^^^ + | | | | | | | | | + | | | | | Padding & flags | | Warp tile + | | | | Scheduler | Thread tile + | | | Epilogue Block tile + | | Pipeline + | Layout (Row-Column-Row) + Data type +``` + +### Components: +- **Data type**: fp16, fp32, bf16, fp8, bf8, int8 +- **Layout**: rcr (Row-Column-Row), rrr, crr, ccr +- **Pipeline**: mem, compv3, compv4 +- **Epilogue**: default, cshuffle +- **Scheduler**: intrawave, interwave +- **Flags**: pad_m, pad_n, pad_k, persistent (4 boolean flags) +- **Tile sizes**: BlockTile x ThreadTile x WarpTile + +## Troubleshooting + +### Common Issues + +1. **Kernel not found** + - Ensure the specific benchmark executable is built + - Check the build directory bin/ folder + +2. **Verification failures** + - Try GPU verification (-verify=2) which may be more accurate + - Check data type compatibility + - Verify stride calculations + +3. **Build failures** + - Check GPU architecture compatibility + - Ensure ROCm is properly installed + - Verify configuration file syntax + +4. **Performance variations** + - Increase warmup iterations + - Disable CPU frequency scaling + - Use GPU timer for accurate measurements + +### Debug Options + +Enable verbose logging: +```bash +./bin/benchmark_gemm_... 
-log=true -verify=1 +``` + +Test validation logic: +```bash +python test_validation.py +``` + +## Performance Tips + +1. **Optimal Problem Sizes**: Use sizes that are multiples of tile dimensions +2. **Warmup**: Use at least 50-100 warmup iterations +3. **GPU Timer**: Always use `-timer=true` for accurate measurements +4. **Cache Management**: Enable cache flushing for consistent results +5. **Thread Affinity**: Set CPU affinity to reduce variation + +## Integration Examples + +### Python Integration + +```python +import subprocess +import json + +# Run benchmark with JSON output +result = subprocess.run([ + './bin/benchmark_gemm_universal_fp16_rcr_...', + '-m=1024', '-n=1024', '-k=1024', + '-json_output=true' +], capture_output=True, text=True) + +# Parse results +data = json.loads(result.stdout) +print(f"Performance: {data['tflops']} TFLOPS") +``` + +### Batch Testing Script + +```bash +#!/bin/bash +SIZES="512 1024 2048 4096" +for size in $SIZES; do + echo "Testing ${size}x${size}x${size}" + ./bin/benchmark_gemm_... -m=$size -n=$size -k=$size \ + -verify=2 -csv_filename=results.csv +done +``` + +## Contributing + +When adding new features or configurations: +1. Update validation logic in `validation_utils.py` +2. Add tests to `test_validation.py` +3. Update configuration examples +4. Document new command-line options + +For more information about the Composable Kernel project, visit the main repository documentation. diff --git a/tile_engine/ops/gemm/gemm_benchmark.hpp b/tile_engine/ops/gemm/gemm_benchmark.hpp index 6ff09186a43..7439264a391 100644 --- a/tile_engine/ops/gemm/gemm_benchmark.hpp +++ b/tile_engine/ops/gemm/gemm_benchmark.hpp @@ -52,19 +52,27 @@ struct GemmProblem // Detect Problem::DsDataType, default to void when absent template -struct get_DsDataType { using type = void; }; +struct get_DsDataType +{ + using type = void; +}; template -struct get_DsDataType> { +struct get_DsDataType> +{ using type = typename T::DsDataType; }; // Detect Problem::D0DataType, default to void when absent template -struct get_D0DataType { using type = void; }; +struct get_D0DataType +{ + using type = void; +}; template -struct get_D0DataType> { +struct get_D0DataType> +{ using type = typename T::D0DataType; }; @@ -79,10 +87,10 @@ bool compare(std::string instanceName, using DDataType = typename get_D0DataType::type; const float max_accumulated_value = *std::max_element(c_m_n_host_result.mData.begin(), c_m_n_host_result.mData.end()); - //const auto rtol_atol = calculate_rtol_atol( - //K, kbatch, max_accumulated_value); + // const auto rtol_atol = calculate_rtol_atol( + // K, kbatch, max_accumulated_value); auto rtol_atol = [&] { - if constexpr (std::is_void_v) + if constexpr(std::is_void_v) { return calculate_rtol_atol( K, kbatch, max_accumulated_value); diff --git a/tile_engine/ops/gemm/gemm_common.hpp b/tile_engine/ops/gemm/gemm_common.hpp new file mode 100644 index 00000000000..3a9aed2bc6d --- /dev/null +++ b/tile_engine/ops/gemm/gemm_common.hpp @@ -0,0 +1,96 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT
+
+#pragma once
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+
+// Structure to hold kernel traits for dispatcher
+struct KernelTraits
+{
+    std::string pipeline;  // compv3, compv4, mem
+    std::string scheduler; // intrawave, interwave
+    std::string epilogue;  // cshuffle, default
+    bool pad_m;
+    bool pad_n;
+    bool pad_k;
+    bool persistent;
+
+    // Constructor with defaults
+    KernelTraits()
+        : pipeline("compv3"),
+          scheduler("intrawave"),
+          epilogue("cshuffle"),
+          pad_m(false),
+          pad_n(false),
+          pad_k(false),
+          persistent(false)
+    {
+    }
+};
+
+// Create argument parser
+inline auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.")
+        .insert("n", "4096", "The value for n dimension. Default is 4096.")
+        .insert("k", "2048", "The value for k dimension. Default is 2048.")
+        .insert("stride_a", "0", "The stride value for tensor A. Default is 0.")
+        .insert("stride_b", "0", "The stride value for tensor B. Default is 0.")
+        .insert("stride_ds", "0", "The stride value for tensor Ds. Default is 0.")
+        .insert("stride_c", "0", "The stride value for tensor C. Default is 0.")
+        .insert("split_k", "1", "The split value for k dimension. Default is 1.")
+        .insert("verify",
+                "2",
+                "The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 "
+                "for validation on GPU. Default is 2, GPU validation.")
+        .insert("log",
+                "false",
+                "Whether to output kernel instance information or not. Possible values are true or "
+                "false. Default is false.")
+        .insert(
+            "warmup", "50", "The number of iterations before benchmarking the kernel. Default is 50.")
+        .insert(
+            "repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.")
+        .insert("timer",
+                "true",
+                "Whether the timer is a GPU timer or not. Possible values are true or false. "
+                "Default is true.")
+        .insert("init",
+                "0",
+                "The method of tensor initialization. Set to 0 for random, 1 for linear, or 2 "
+                "for constant(1). Default is 0, random.")
+        .insert("flush_cache",
+                "true",
+                "Whether to flush the cache between runs. Possible values are true or false. "
+                "Default is true.")
+        .insert("rotating_count",
+                "1000",
+                "The number of iterations to rotate the cache. Default is 1000.")
+        .insert("metric",
+                "0",
+                "Metric with which to measure kernel performance. Set to 0 for latency, 1 for "
+                "tflops, or 2 for bandwidth. Default is 0, latency.")
+        .insert("csv_filename",
+                "",
+                "The filename of the benchmark result. Default is empty (no CSV output).")
+        .insert("structured_sparsity",
+                "false",
+                "Whether to use the sparsity kernel or not. Possible values are true or false. "
+                "Default is false")
+        .insert("json_output",
+                "false",
+                "Whether to output results in JSON format only. Possible values are true or false. 
" + "Default is " + "false"); + + bool result = arg_parser.parse(argc, argv); + return std::make_tuple(result, arg_parser); +} diff --git a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark_single.cpp b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark_single.cpp index e72ceb0d76a..767e8eda6ef 100644 --- a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark_single.cpp +++ b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_benchmark_single.cpp @@ -11,6 +11,7 @@ #include "ck_tile/core.hpp" #include "ck_tile/host.hpp" +#include "gemm/gemm_common.hpp" #include "gemm_multi_d_profiler.hpp" // The kernel header is included via the compile command line with -include flag @@ -35,29 +36,27 @@ void benchmark_single(const ck_tile::ArgParser& arg_parser) std::string layout_d1 = D1Layout::name; // Create GemmMultiDProblem struct - GemmMultiDProblem gemm_multi_d_problem{ - GemmProblem{ - arg_parser.get_int("split_k"), - arg_parser.get_int("m"), - arg_parser.get_int("n"), - arg_parser.get_int("k"), - arg_parser.get_int("stride_a"), - arg_parser.get_int("stride_b"), - arg_parser.get_int("stride_c"), - dtype_a, - dtype_b, - dtype_acc, - dtype_c, - layout_a, - layout_b, - layout_c, - arg_parser.get_bool("structured_sparsity")}, - arg_parser.get_int("stride_ds"), - arg_parser.get_int("stride_ds"), - dtype_d0, - dtype_d1, - layout_d0, - layout_d1}; + GemmMultiDProblem gemm_multi_d_problem{GemmProblem{arg_parser.get_int("split_k"), + arg_parser.get_int("m"), + arg_parser.get_int("n"), + arg_parser.get_int("k"), + arg_parser.get_int("stride_a"), + arg_parser.get_int("stride_b"), + arg_parser.get_int("stride_c"), + dtype_a, + dtype_b, + dtype_acc, + dtype_c, + layout_a, + layout_b, + layout_c, + arg_parser.get_bool("structured_sparsity")}, + arg_parser.get_int("stride_ds"), + arg_parser.get_int("stride_ds"), + dtype_d0, + dtype_d1, + layout_d0, + layout_d1}; // Create Setting struct Setting setting{arg_parser.get_int("warmup"), diff --git a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_profiler.hpp b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_profiler.hpp index 583dfd85a7f..aeac6c984dc 100644 --- a/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_profiler.hpp +++ b/tile_engine/ops/gemm/gemm_multi_d/gemm_multi_d_profiler.hpp @@ -11,25 +11,28 @@ #include #include - #include "ck_tile/host/device_prop.hpp" #include "ck_tile/ops/gemm.hpp" #include "gemm/gemm_profiler.hpp" +#include "common/utils.hpp" #include "gemm_multi_d_benchmark.hpp" -class GemmMultiDProfiler: public GemmProfiler> +class GemmMultiDProfiler : public GemmProfiler> { -public: + public: using BaseGemm = GemmProfiler>; + GemmMultiDProblem, + ck_tile::GemmMultiDHostArgs>; using BaseGemm::benchmark; GemmMultiDProfiler(Setting setting) - : GemmProfiler>(setting) {} - + : GemmProfiler>(setting) + { + } void benchmark( GemmMultiDProblem& gemm_multi_d_problem, @@ -157,5 +160,4 @@ class GemmMultiDProfiler: public GemmProfiler& a_m_k, diff --git a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark_single.cpp b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark_single.cpp index 4fbb25f0c90..d03b35f2b46 100644 --- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark_single.cpp +++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_benchmark_single.cpp @@ -11,78 +11,21 @@ #include "ck_tile/core.hpp" #include "ck_tile/host.hpp" +#include "gemm/gemm_common.hpp" #include "gemm_preshuffle_profiler.hpp" #include "gemm_preshuffle_common.hpp" // The kernel header is included via the compile command line with 
-include flag // It defines SelectedKernel struct and KERNEL_NAME -// DataTypeTraits are now defined in gemm_common.hpp - -// Create argument parser -inline auto create_args(int argc, char* argv[]) -{ - ck_tile::ArgParser arg_parser; - arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.") - .insert("n", "4096", "The value for n dimension. Default is 4096.") - .insert("k", "2048", "The value for k dimension. Default is 2048.") - .insert("stride_a", "0", "The stride value for tensor A. Default is 0.") - .insert("stride_b", "0", "The stride value for tensor B. Default is 0.") - .insert("stride_c", "0", "The stride value for tensor C. Default is 0.") - .insert("split_k", "1", "The split value for k dimension. Default is 1.") - .insert("verify", - "2", - "The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 " - "for validation on GPU. Default is 0, no validation.") - .insert("log", - "false", - "Whether output kernel instance information or not. Possible values are true or " - "false. Default is false") - .insert( - "warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.") - .insert( - "repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.") - .insert("timer", - "true", - "Whether if the timer is gpu timer or not. Possible values are false or true. " - "Default is true.") - .insert("init", - "0", - "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 " - "for constant(1). Default is 0, random.") - .insert("flush_cache", - "true", - "To flush cache, possible values are true or false. " - "Default is false.") - .insert("rotating_count", "1000", "number of iterations to rotate the cache. default is 5.") - .insert("metric", - "0", - "Metric with which to measure kernel performance. Set to 0 for latency, 1 for " - "tflops, or 2 for bandwidth. Default is 0, latency.") - .insert("csv_filename", - "", - "The filename of benchmark result. Default is empty (no CSV output).") - .insert("structured_sparsity", - "false", - "Whether use sparsity kernel or not. Possible values are true or false. Default is " - "false") - .insert("json_output", - "false", - "Whether to output results in JSON format only. Possible values are true or false. 
" - "Default is " - "false"); - - bool result = arg_parser.parse(argc, argv); - return std::make_tuple(result, arg_parser); -} void benchmark_single(const ck_tile::ArgParser& arg_parser) { // Use DataTypeTraits to get the actual type names from the generated header // The generated header defines ADataType, BDataType, AccDataType, CDataType - std::string dtype_a = DataTypeTraits::name; - std::string dtype_b = DataTypeTraits::name; - std::string dtype_acc = DataTypeTraits::name; - std::string dtype_c = DataTypeTraits::name; + std::string dtype_a = ck_tile::DataTypeTraits::name; + std::string dtype_b = ck_tile::DataTypeTraits::name; + std::string dtype_acc = ck_tile::DataTypeTraits::name; + std::string dtype_c = ck_tile::DataTypeTraits::name; // Layout names from the layout types std::string layout_a = ALayout::name; @@ -119,29 +62,17 @@ void benchmark_single(const ck_tile::ArgParser& arg_parser) arg_parser.get_bool("json_output")}; // Get the profiler instance - auto& profiler = GemmProfiler::instance(setting); + auto& profiler = GemmPreshuffleProfiler::instance(setting); try { - // Create a lambda that wraps the kernel launch - std::tuple warp_tile_dims = std::make_tuple( - SelectedKernel::WarpTileM, SelectedKernel::WarpTileN, SelectedKernel::WarpTileK); - std::tuple tile_dims = - std::make_tuple(SelectedKernel::TileM, SelectedKernel::TileN, SelectedKernel::TileK); - std::tuple warp_dims = std::make_tuple(SelectedKernel::WarpPerBlock_M, - SelectedKernel::WarpPerBlock_N, - SelectedKernel::WarpPerBlock_K); - bool permuteN = SelectedKernel::PermuteN; - - KernelConfig config{tile_dims, warp_dims, warp_tile_dims, permuteN}; - auto kernel_func = [](const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& stream) { return SelectedKernel::launch(args, stream); }; // Benchmark the kernel - profiler.benchmark(gemm_problem, kernel_func, config); + profiler.benchmark(gemm_problem, kernel_func); // Select best instance based on metric profiler.select_best_instance(static_cast(arg_parser.get_int("metric"))); diff --git a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_common.hpp b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_common.hpp index 7d3164f9d48..21cda28f754 100644 --- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_common.hpp +++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_common.hpp @@ -8,35 +8,20 @@ #include "ck_tile/host.hpp" #include "ck_tile/core/numeric/integer.hpp" #include "ck_tile/core/numeric/pk_int4.hpp" +#include "gemm/gemm_common.hpp" // Structure to hold kernel traits for dispatcher -struct KernelTraits +struct PreshuffleKernelTraits : KernelTraits { - std::string pipeline; // preshufflev2 - std::string scheduler; // intrawave, interwave, default - std::string epilogue; // cshuffle, default - bool pad_m; - bool pad_n; - bool pad_k; - bool persistent; // Constructor with defaults - KernelTraits() - : pipeline("preshufflev2"), - scheduler("default"), - epilogue("default"), - pad_m(false), - pad_n(false), - pad_k(false), - persistent(false) - { - } + PreshuffleKernelTraits() : KernelTraits() { this->pipeline = "preshufflev2"; } }; // Helper to extract traits from kernel name -inline KernelTraits extract_traits_from_name(const std::string& kernel_name) +inline PreshuffleKernelTraits extract_traits_from_name(const std::string& kernel_name) { - KernelTraits traits; + PreshuffleKernelTraits traits; // Extract pipeline if(kernel_name.find("preshufflev2") != std::string::npos) @@ -74,42 +59,3 @@ inline KernelTraits extract_traits_from_name(const 
std::string& kernel_name) return traits; } - -template -auto shuffle_b(const ck_tile::HostTensor& t, - ck_tile::index_t N_Warp_Tile, - ck_tile::index_t K_Warp_Tile) -{ - assert(t.get_lengths().size() == 2); - int n_ = t.get_lengths()[1]; - int k_ = t.get_lengths()[0]; - int divisor = N_Warp_Tile == 32 ? 2 : 4; - ck_tile::HostTensor t_view( - {n_ / N_Warp_Tile, N_Warp_Tile, k_ / K_Warp_Tile, divisor, K_Warp_Tile / divisor}); - std::copy(t.begin(), t.end(), t_view.begin()); - return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4}); -} - -template -auto shuffle_b_permuteN(const ck_tile::HostTensor& t, - ck_tile::index_t N_Warp_Tile, - ck_tile::index_t K_Warp_Tile, - ck_tile::index_t N_Tile, - ck_tile::index_t N_Warp) -{ - assert(t.get_lengths().size() == 2); - - int n_ = t.get_lengths()[1]; - int k_ = t.get_lengths()[0]; - int divisor = N_Warp_Tile == 32 ? 2 : 4; - int NRepeat = N_Tile / N_Warp_Tile / N_Warp; - ck_tile::HostTensor t_view({n_ / N_Tile, - N_Warp, - N_Warp_Tile, - NRepeat, - k_ / K_Warp_Tile, - divisor, - K_Warp_Tile / divisor}); - std::copy(t.begin(), t.end(), t_view.begin()); - return ck_tile::reference_permute(t_view, {0, 3, 1, 4, 5, 2, 6}); -} diff --git a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_profiler.hpp b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_profiler.hpp index 4cf980dbf73..e7af0738776 100644 --- a/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_profiler.hpp +++ b/tile_engine/ops/gemm/gemm_preshuffle/gemm_preshuffle_profiler.hpp @@ -4,28 +4,26 @@ #pragma once #include "ck_tile/host/device_prop.hpp" +#include "ck_tile/host/tensor_shuffle_utils.hpp" #include "ck_tile/ops/gemm.hpp" #include "gemm/gemm_profiler.hpp" #include "gemm_preshuffle_benchmark.hpp" -class GemmPreshuffleProfiler : public GemmProfiler +class GemmPreshuffleProfiler + : public GemmProfiler { -public: - using BaseGemm = GemmProfiler; + public: + using BaseGemm = GemmProfiler; using BaseGemm::benchmark; GemmPreshuffleProfiler(Setting setting) - : GemmProfiler(setting) {} - + : GemmProfiler(setting) + { + } void benchmark(GemmProblem& gemm_problem, std::vector( - ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>>& callables, - KernelConfig& config) override + ck_tile::GemmHostArgs&, const ck_tile::stream_config&)>>& callables) override { const ALayout layout_a = ALayout{}; const BLayout layout_b = BLayout{}; @@ -97,21 +95,28 @@ class GemmPreshuffleProfiler : public GemmProfiler(config.warp_tile_dims); - ck_tile::index_t K_Warp_Tile = std::get<2>(config.warp_tile_dims); - ck_tile::index_t N_Tile = std::get<1>(config.tile_dims); - ck_tile::index_t N_Warp = std::get<1>(config.warp_dims); - ck_tile::HostTensor b_shuffle_host = [&]() { if(config.permuteN) { - return shuffle_b_permuteN(b_k_n, N_Warp_Tile, K_Warp_Tile, N_Tile, N_Warp); + return ck_tile::shuffle_b_permuteN(b_k_n, config); } else { - return shuffle_b(b_k_n, N_Warp_Tile, K_Warp_Tile); + return ck_tile::shuffle_b(b_k_n, config); } }(); @@ -144,5 +149,4 @@ class GemmPreshuffleProfiler : public GemmProfiler #include - #include "ck_tile/host/device_prop.hpp" #include "ck_tile/ops/gemm.hpp" #include "gemm_benchmark.hpp" -template +template class GemmProfiler { -public: + public: static Gemm& instance(Setting setting) { static Gemm instance{setting}; @@ -30,27 +27,25 @@ class GemmProfiler // Overload for single kernel benchmarking void benchmark(Problem& gemm_problem, - std::function - kernel_func) + std::function kernel_func) { // Create a vector with a single callable that returns both name and time - 
std::vector(GemmArgs&, - const ck_tile::stream_config&)>> + std::vector< + std::function(GemmArgs&, const ck_tile::stream_config&)>> callables; - callables.push_back( - [kernel_func](GemmArgs& args, const ck_tile::stream_config& stream) { - float time = kernel_func(args, stream); - return std::make_tuple(std::string(KERNEL_NAME), time); - }); + callables.push_back([kernel_func](GemmArgs& args, const ck_tile::stream_config& stream) { + float time = kernel_func(args, stream); + return std::make_tuple(std::string(KERNEL_NAME), time); + }); - benchmark(gemm_problem, callables); // TODO: need to cast this? + benchmark(gemm_problem, callables); } virtual void benchmark(Problem& gemm_problem, - std::vector( - GemmArgs&, const ck_tile::stream_config&)>>& callables) = 0; - + std::vector( + GemmArgs&, const ck_tile::stream_config&)>>& callables) = 0; + void process_result(const Problem& gemm_problem, ck_tile::DeviceMem& c_m_n_dev_buf, ck_tile::HostTensor& c_m_n_host_result, @@ -58,7 +53,7 @@ class GemmProfiler const std::tuple& kernel_run_result) { auto [name, avg_time] = kernel_run_result; - using DDataType = typename get_DsDataType::type; + using DDataType = typename get_DsDataType::type; KernelInstance kernel_instance{name, gemm_problem, {-1.0f, -1.0f, -1.0f}}; @@ -68,16 +63,14 @@ class GemmProfiler sizeof(BDataType) * gemm_problem.n_ * gemm_problem.k_ + sizeof(CDataType) * gemm_problem.m_ * gemm_problem.n_; - - if constexpr (!std::is_void_v) - { - ck_tile::static_for<0, DDataType::size(), 1>{}([&](auto i) { - using DType = ck_tile::remove_cvref_t>; - num_byte += sizeof(DType) * gemm_problem.m_ * gemm_problem.n_; - flop += sizeof(DType) * gemm_problem.m_ * gemm_problem.n_; - }); - } - + if constexpr(!std::is_void_v) + { + ck_tile::static_for<0, DDataType::size(), 1>{}([&](auto i) { + using DType = ck_tile::remove_cvref_t>; + num_byte += sizeof(DType) * gemm_problem.m_ * gemm_problem.n_; + flop += sizeof(DType) * gemm_problem.m_ * gemm_problem.n_; + }); + } // update kernel_instance.perf_result_.latency_ = avg_time; @@ -91,15 +84,14 @@ class GemmProfiler // verify result c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); - int split_k = 1; - if constexpr (std::is_same_v) - { - split_k = gemm_problem.split_k_; - } + int split_k = 1; + if constexpr(std::is_same_v) + { + split_k = gemm_problem.split_k_; + } bool verified_correct = !setting_.verify_ || - compare( - name, gemm_problem.k_, split_k, c_m_n_dev_result, c_m_n_host_result); + compare(name, gemm_problem.k_, split_k, c_m_n_dev_result, c_m_n_host_result); if(verified_correct) { @@ -196,5 +188,3 @@ class GemmProfiler std::vector> kernel_instances_; }; - - diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py index 4fa5d5dee0e..008ffaa14fa 100755 --- a/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py +++ b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark.py @@ -4,14 +4,10 @@ import os import sys -import json -import subprocess import argparse -import csv import time import importlib.util -from pathlib import Path -from typing import List, Dict, Tuple, Optional + def _import_gemm_benchmark(): """Import validation utilities from commons directory.""" @@ -28,6 +24,7 @@ def _import_gemm_benchmark(): return gemm_benchmark_module.GemmBenchmark + def _import_benchmark_utils(): """Import benchmark utilities from commons directory.""" current_dir = os.path.dirname(os.path.abspath(__file__)) @@ -43,16 +40,20 @@ def _import_benchmark_utils(): 
return benchmark_utils + GemmBenchmark = _import_gemm_benchmark() benchmark_utils = _import_benchmark_utils() + class GemmUniversalBenchmark(GemmBenchmark): def __init__(self, build_dir: str, verbose: bool = False): - super().__init__(build_dir, verbose, name="benchmark_gemm_") + super().__init__(build_dir, verbose, name="benchmark_gemm_universal_") def main(): - parser = argparse.ArgumentParser(description="GEMM Kernel Benchmarking Tool") + parser = argparse.ArgumentParser( + description="Universal GEMM Kernel Benchmarking Tool" + ) parser.add_argument( "build_dir", help="Build directory containing kernel executables" ) @@ -67,7 +68,9 @@ def main(): ) parser.add_argument("--verify", action="store_true", help="Enable verification") parser.add_argument( - "--csv", default="gemm_benchmark_results.csv", help="CSV output filename" + "--csv", + default="gemm_universal_benchmark_results.csv", + help="CSV output filename", ) parser.add_argument( "--best", default="best_kernels.txt", help="Best kernels output filename" @@ -115,7 +118,7 @@ def main(): benchmark = GemmUniversalBenchmark(args.build_dir, verbose=args.verbose) # Run benchmark sweep - print("Starting GEMM kernel benchmark sweep...") + print("Starting Universal GEMM kernel benchmark sweep...") start_time = time.time() best_kernels = benchmark.benchmark_sweep( diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark_single.cpp b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark_single.cpp index bf81acabd29..b2015f8571d 100644 --- a/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark_single.cpp +++ b/tile_engine/ops/gemm/gemm_universal/gemm_universal_benchmark_single.cpp @@ -11,6 +11,7 @@ #include "ck_tile/core.hpp" #include "ck_tile/host.hpp" +#include "gemm/gemm_common.hpp" #include "gemm_universal_profiler.hpp" // The kernel header is included via the compile command line with -include flag diff --git a/tile_engine/ops/gemm/gemm_universal/gemm_universal_profiler.hpp b/tile_engine/ops/gemm/gemm_universal/gemm_universal_profiler.hpp index a1d749d6784..6cfdcab8009 100644 --- a/tile_engine/ops/gemm/gemm_universal/gemm_universal_profiler.hpp +++ b/tile_engine/ops/gemm/gemm_universal/gemm_universal_profiler.hpp @@ -9,21 +9,21 @@ #include "ck_tile/host/device_prop.hpp" #include "ck_tile/ops/gemm.hpp" -#include "gemm_universal_benchmark.hpp" +#include "gemm/gemm_benchmark.hpp" #include "gemm/gemm_profiler.hpp" +#include "gemm_universal_benchmark.hpp" -class UniversalGemmProfiler : public GemmProfiler +class UniversalGemmProfiler + : public GemmProfiler { -public: - using BaseGemm = GemmProfiler; + public: + using BaseGemm = GemmProfiler; using BaseGemm::benchmark; UniversalGemmProfiler(Setting setting) - : GemmProfiler(setting) {} + : GemmProfiler(setting) + { + } void benchmark(GemmProblem& gemm_problem, std::vector(