ROCm · ErwinTerpstra · Jan 14, 2026 · Jan 14, 2026 · Jan 14, 2026 · Jan 15, 2026
@@ -13,6 +13,9 @@ if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12")
     add_executable(${EXE_NAME}
         gemm_quant.cpp
         gemm_abquant_quantgrouped.cpp
+        gemm_abquant_quantgrouped_fp8.cpp
+        gemm_abquant_quantgrouped_bf8.cpp
+        gemm_abquant_quantgrouped_fp4.cpp
         gemm_aquant_quantgrouped.cpp
         gemm_aquant_quantgrouped_preshufflequant.cpp
         gemm_bquant_quantgrouped_bf8i4.cpp

@@ -6,127 +6,17 @@
 template <typename T>
 using GemmConfig = GemmConfigQuantPrefill<T>;
 
+void abquant_quantgrouped_fp4_instance_factory(
+    std::unordered_map<size_t, std::function<int(const ck_tile::ArgParser&)>>& lut);
+void abquant_quantgrouped_fp8_instance_factory(
+    std::unordered_map<size_t, std::function<int(const ck_tile::ArgParser&)>>& lut);
+void abquant_quantgrouped_bf8_instance_factory(
+    std::unordered_map<size_t, std::function<int(const ck_tile::ArgParser&)>>& lut);
+
 void abquant_quantgrouped_instance_factory(
     std::unordered_map<size_t, std::function<int(const ck_tile::ArgParser&)>>& lut)
 {
-    lut[hash_multiple_strings({"fp8",
-                               "abquant",
-                               "non-preshuffleb",
-                               "non-preshufflequant",
-                               "1x1x128"})] = [](const ck_tile::ArgParser& arg_parser) {
-        using AQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
-        using BQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
-        using TypeConfig =
-            decltype(GemmQuantTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t, float>{});
-        return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
-                                          TypeConfig,
-                                          AQuantGroupSize,
-                                          BQuantGroupSize,
-                                          ck_tile::QuantType::ABQuantGrouped>(arg_parser);
-    };
-    lut[hash_multiple_strings({"fp8",
-                               "abquant",
-                               "non-preshuffleb",
-                               "non-preshufflequant",
-                               "1x128x128"})] = [](const ck_tile::ArgParser& arg_parser) {
-        using AQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
-        using BQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 128, 128>>;
-        using TypeConfig =
-            decltype(GemmQuantTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t, float>{});
-        return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
-                                          TypeConfig,
-                                          AQuantGroupSize,
-                                          BQuantGroupSize,
-                                          ck_tile::QuantType::ABQuantGrouped>(arg_parser);
-    };
-    lut[hash_multiple_strings({"bf8",
-                               "abquant",
-                               "non-preshuffleb",
-                               "non-preshufflequant",
-                               "1x1x128"})] = [](const ck_tile::ArgParser& arg_parser) {
-        using AQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
-        using BQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
-        using TypeConfig =
-            decltype(GemmQuantTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t, float>{});
-        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>,
-                                          TypeConfig,
-                                          AQuantGroupSize,
-                                          BQuantGroupSize,
-                                          ck_tile::QuantType::ABQuantGrouped>(arg_parser);
-    };
-    lut[hash_multiple_strings({"bf8",
-                               "abquant",
-                               "non-preshuffleb",
-                               "non-preshufflequant",
-                               "1x128x128"})] = [](const ck_tile::ArgParser& arg_parser) {
-        using AQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
-        using BQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 128, 128>>;
-        using TypeConfig =
-            decltype(GemmQuantTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t, float>{});
-        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>,
-                                          TypeConfig,
-                                          AQuantGroupSize,
-                                          BQuantGroupSize,
-                                          ck_tile::QuantType::ABQuantGrouped>(arg_parser);
-    };
-    lut[hash_multiple_strings({"fp8",
-                               "abquant",
-                               "preshuffleb",
-                               "non-preshufflequant",
-                               "1x1x128"})] = [](const ck_tile::ArgParser& arg_parser) {
-        using AQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
-        using BQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
-        using TypeConfig =
-            decltype(GemmQuantTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t, float>{});
-        return run_gemm_example_prec_type<GemmConfigPreshuffleB_BQuant_Prefill<ck_tile::fp8_t>,
-                                          TypeConfig,
-                                          AQuantGroupSize,
-                                          BQuantGroupSize,
-                                          ck_tile::QuantType::ABQuantGrouped>(arg_parser);
-    };
-    lut[hash_multiple_strings({"fp8",
-                               "abquant",
-                               "preshuffleb",
-                               "non-preshufflequant",
-                               "1x128x128"})] = [](const ck_tile::ArgParser& arg_parser) {
-        using AQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
-        using BQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 128, 128>>;
-        using TypeConfig =
-            decltype(GemmQuantTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t, float>{});
-        return run_gemm_example_prec_type<GemmConfigPreshuffleB_BQuant_Prefill<ck_tile::fp8_t>,
-                                          TypeConfig,
-                                          AQuantGroupSize,
-                                          BQuantGroupSize,
-                                          ck_tile::QuantType::ABQuantGrouped>(arg_parser);
-    };
-    lut[hash_multiple_strings({"bf8",
-                               "abquant",
-                               "preshuffleb",
-                               "non-preshufflequant",
-                               "1x1x128"})] = [](const ck_tile::ArgParser& arg_parser) {
-        using AQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
-        using BQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
-        using TypeConfig =
-            decltype(GemmQuantTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t, float>{});
-        return run_gemm_example_prec_type<GemmConfigPreshuffleB_BQuant_Prefill<ck_tile::bf8_t>,
-                                          TypeConfig,
-                                          AQuantGroupSize,
-                                          BQuantGroupSize,
-                                          ck_tile::QuantType::ABQuantGrouped>(arg_parser);
-    };
-    lut[hash_multiple_strings({"bf8",
-                               "abquant",
-                               "preshuffleb",
-                               "non-preshufflequant",
-                               "1x128x128"})] = [](const ck_tile::ArgParser& arg_parser) {
-        using AQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
-        using BQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 128, 128>>;
-        using TypeConfig =
-            decltype(GemmQuantTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t, float>{});
-        return run_gemm_example_prec_type<GemmConfigPreshuffleB_BQuant_Prefill<ck_tile::bf8_t>,
-                                          TypeConfig,
-                                          AQuantGroupSize,
-                                          BQuantGroupSize,
-                                          ck_tile::QuantType::ABQuantGrouped>(arg_parser);
-    };
+    abquant_quantgrouped_fp4_instance_factory(lut);
+    abquant_quantgrouped_fp8_instance_factory(lut);
+    abquant_quantgrouped_bf8_instance_factory(lut);
 }
@@ -0,0 +1,72 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "run_gemm_quant_example.inc"
+
+template <typename T>
+using GemmConfig = GemmConfigQuantPrefill<T>;
+
+void abquant_quantgrouped_bf8_instance_factory(
+    std::unordered_map<size_t, std::function<int(const ck_tile::ArgParser&)>>& lut)
+{
+    lut[hash_multiple_strings({"bf8",
+                               "abquant",
+                               "non-preshuffleb",
+                               "non-preshufflequant",
+                               "1x1x128"})] = [](const ck_tile::ArgParser& arg_parser) {
+        using AQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
+        using BQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
+        using TypeConfig =
+            decltype(GemmQuantTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t, float>{});
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>,
+                                          TypeConfig,
+                                          AQuantGroupSize,
+                                          BQuantGroupSize,
+                                          ck_tile::QuantType::ABQuantGrouped>(arg_parser);
+    };
+    lut[hash_multiple_strings({"bf8",
+                               "abquant",
+                               "non-preshuffleb",
+                               "non-preshufflequant",
+                               "1x128x128"})] = [](const ck_tile::ArgParser& arg_parser) {
+        using AQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
+        using BQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 128, 128>>;
+        using TypeConfig =
+            decltype(GemmQuantTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t, float>{});
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>,
+                                          TypeConfig,
+                                          AQuantGroupSize,
+                                          BQuantGroupSize,
+                                          ck_tile::QuantType::ABQuantGrouped>(arg_parser);
+    };
+    lut[hash_multiple_strings({"bf8",
+                               "abquant",
+                               "preshuffleb",
+                               "non-preshufflequant",
+                               "1x1x128"})] = [](const ck_tile::ArgParser& arg_parser) {
+        using AQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
+        using BQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
+        using TypeConfig =
+            decltype(GemmQuantTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t, float>{});
+        return run_gemm_example_prec_type<GemmConfigPreshuffleB_BQuant_Prefill<ck_tile::bf8_t>,
+                                          TypeConfig,
+                                          AQuantGroupSize,
+                                          BQuantGroupSize,
+                                          ck_tile::QuantType::ABQuantGrouped>(arg_parser);
+    };
+    lut[hash_multiple_strings({"bf8",
+                               "abquant",
+                               "preshuffleb",
+                               "non-preshufflequant",
+                               "1x128x128"})] = [](const ck_tile::ArgParser& arg_parser) {
+        using AQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
+        using BQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 128, 128>>;
+        using TypeConfig =
+            decltype(GemmQuantTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t, float>{});
+        return run_gemm_example_prec_type<GemmConfigPreshuffleB_BQuant_Prefill<ck_tile::bf8_t>,
+                                          TypeConfig,
+                                          AQuantGroupSize,
+                                          BQuantGroupSize,
+                                          ck_tile::QuantType::ABQuantGrouped>(arg_parser);
+    };
+}
@@ -0,0 +1,27 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "run_gemm_quant_example.inc"
+
+template <typename T>
+using GemmConfig = GemmConfigQuantPrefill<T>;
+
+void abquant_quantgrouped_fp4_instance_factory(
+    std::unordered_map<size_t, std::function<int(const ck_tile::ArgParser&)>>& lut)
+{
+    lut[hash_multiple_strings(
+        {"fp4", "abquant", "non-preshuffleb", "non-preshufflequant", "1x128x128"})] =
+        [](const ck_tile::ArgParser& arg_parser) {
+            using AQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
+            using BQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 128, 128>>;
+            using TypeConfig      = decltype(GemmQuantTypeConfig<ck_tile::pk_fp4_t,
+                                                                 ck_tile::pk_fp4_t,
+                                                                 ck_tile::half_t,
+                                                                 float>{});
+            return run_gemm_example_prec_type<GemmConfig<ck_tile::pk_fp4_raw_t>,
+                                              TypeConfig,
+                                              AQuantGroupSize,
+                                              BQuantGroupSize,
+                                              ck_tile::QuantType::ABQuantGrouped>(arg_parser);
+        };
+}
@@ -0,0 +1,72 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "run_gemm_quant_example.inc"
+
+template <typename T>
+using GemmConfig = GemmConfigQuantPrefill<T>;
+
+void abquant_quantgrouped_fp8_instance_factory(
+    std::unordered_map<size_t, std::function<int(const ck_tile::ArgParser&)>>& lut)
+{
+    lut[hash_multiple_strings({"fp8",
+                               "abquant",
+                               "non-preshuffleb",
+                               "non-preshufflequant",
+                               "1x1x128"})] = [](const ck_tile::ArgParser& arg_parser) {
+        using AQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
+        using BQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
+        using TypeConfig =
+            decltype(GemmQuantTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t, float>{});
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
+                                          TypeConfig,
+                                          AQuantGroupSize,
+                                          BQuantGroupSize,
+                                          ck_tile::QuantType::ABQuantGrouped>(arg_parser);
+    };
+    lut[hash_multiple_strings({"fp8",
+                               "abquant",
+                               "non-preshuffleb",
+                               "non-preshufflequant",
+                               "1x128x128"})] = [](const ck_tile::ArgParser& arg_parser) {
+        using AQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
+        using BQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 128, 128>>;
+        using TypeConfig =
+            decltype(GemmQuantTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t, float>{});
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
+                                          TypeConfig,
+                                          AQuantGroupSize,
+                                          BQuantGroupSize,
+                                          ck_tile::QuantType::ABQuantGrouped>(arg_parser);
+    };
+    lut[hash_multiple_strings({"fp8",
+                               "abquant",
+                               "preshuffleb",
+                               "non-preshufflequant",
+                               "1x1x128"})] = [](const ck_tile::ArgParser& arg_parser) {
+        using AQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
+        using BQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
+        using TypeConfig =
+            decltype(GemmQuantTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t, float>{});
+        return run_gemm_example_prec_type<GemmConfigPreshuffleB_BQuant_Prefill<ck_tile::fp8_t>,
+                                          TypeConfig,
+                                          AQuantGroupSize,
+                                          BQuantGroupSize,
+                                          ck_tile::QuantType::ABQuantGrouped>(arg_parser);
+    };
+    lut[hash_multiple_strings({"fp8",
+                               "abquant",
+                               "preshuffleb",
+                               "non-preshufflequant",
+                               "1x128x128"})] = [](const ck_tile::ArgParser& arg_parser) {
+        using AQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
+        using BQuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 128, 128>>;
+        using TypeConfig =
+            decltype(GemmQuantTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t, float>{});
+        return run_gemm_example_prec_type<GemmConfigPreshuffleB_BQuant_Prefill<ck_tile::fp8_t>,
+                                          TypeConfig,
+                                          AQuantGroupSize,
+                                          BQuantGroupSize,
+                                          ck_tile::QuantType::ABQuantGrouped>(arg_parser);
+    };
+}
@@ -32,7 +32,7 @@ auto create_args(int argc, char* argv[])
         .insert("prec",
                 "fp8",
                 "Data type. For AQuant: fp8, bf8, i4fp8, or i4bf8;  for Bquant: fp8, bf8, fp8i4, "
-                "or bf8i4;  for ABQuant: fp8, bf8")
+                "or bf8i4;  for ABQuant: fp8, bf8, fp4")
         .insert("warmup", "50", "Number of iterations before benchmarking the kernel")
         .insert("repeat", "1000", "Number of iterations to benchmark the kernel")
         .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")