diff --git a/.buildkite/kernel_microbenchmarks/all_gather_matmul/w16a16.yml b/.buildkite/kernel_microbenchmarks/all_gather_matmul/w16a16.yml
new file mode 100644
index 000000000..478a0bc14
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/all_gather_matmul/w16a16.yml
@@ -0,0 +1,45 @@
+# all-gather-matmul-w16a16
+# kernel support matrix microbenchmarks
+steps:  # four steps: correctness test, record result, performance test, record result
+  - label: "Correctness tests for all-gather-matmul-w16a16"
+    key: "all-gather-matmul-w16a16_CorrectnessTest"
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "all-gather-matmul-w16a16_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for all-gather-matmul-w16a16"
+    key: "record_all-gather-matmul-w16a16_CorrectnessTest"
+    depends_on: "all-gather-matmul-w16a16_CorrectnessTest"  # waits for the correctness step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "all-gather-matmul-w16a16"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the correctness step
+      - |
+        .buildkite/scripts/record_step_result.sh all-gather-matmul-w16a16_CorrectnessTest
+
+  - label: "Performance tests for all-gather-matmul-w16a16"
+    key: "all-gather-matmul-w16a16_PerformanceTest"
+    depends_on: "record_all-gather-matmul-w16a16_CorrectnessTest"  # sequenced after the correctness record step
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "all-gather-matmul-w16a16_PerformanceTest" "to be added"
+  - label: "Record performance test result for all-gather-matmul-w16a16"
+    key: "record_all-gather-matmul-w16a16_PerformanceTest"
+    depends_on: "all-gather-matmul-w16a16_PerformanceTest"  # waits for the performance step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "all-gather-matmul-w16a16"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the performance step
+      - |
+        .buildkite/scripts/record_step_result.sh all-gather-matmul-w16a16_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/all_gather_matmul/w4a16.yml b/.buildkite/kernel_microbenchmarks/all_gather_matmul/w4a16.yml
new file mode 100644
index 000000000..1a8b32847
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/all_gather_matmul/w4a16.yml
@@ -0,0 +1,45 @@
+# all-gather-matmul-w4a16
+# kernel support matrix microbenchmarks
+steps:  # four steps: correctness test, record result, performance test, record result
+  - label: "Correctness tests for all-gather-matmul-w4a16"
+    key: "all-gather-matmul-w4a16_CorrectnessTest"
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "all-gather-matmul-w4a16_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for all-gather-matmul-w4a16"
+    key: "record_all-gather-matmul-w4a16_CorrectnessTest"
+    depends_on: "all-gather-matmul-w4a16_CorrectnessTest"  # waits for the correctness step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "all-gather-matmul-w4a16"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the correctness step
+      - |
+        .buildkite/scripts/record_step_result.sh all-gather-matmul-w4a16_CorrectnessTest
+
+  - label: "Performance tests for all-gather-matmul-w4a16"
+    key: "all-gather-matmul-w4a16_PerformanceTest"
+    depends_on: "record_all-gather-matmul-w4a16_CorrectnessTest"  # sequenced after the correctness record step
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "all-gather-matmul-w4a16_PerformanceTest" "to be added"
+  - label: "Record performance test result for all-gather-matmul-w4a16"
+    key: "record_all-gather-matmul-w4a16_PerformanceTest"
+    depends_on: "all-gather-matmul-w4a16_PerformanceTest"  # waits for the performance step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "all-gather-matmul-w4a16"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the performance step
+      - |
+        .buildkite/scripts/record_step_result.sh all-gather-matmul-w4a16_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/all_gather_matmul/w4a4.yml b/.buildkite/kernel_microbenchmarks/all_gather_matmul/w4a4.yml
new file mode 100644
index 000000000..da764ad44
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/all_gather_matmul/w4a4.yml
@@ -0,0 +1,45 @@
+# all-gather-matmul-w4a4
+# kernel support matrix microbenchmarks
+steps:  # four steps: correctness test, record result, performance test, record result
+  - label: "Correctness tests for all-gather-matmul-w4a4"
+    key: "all-gather-matmul-w4a4_CorrectnessTest"
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "all-gather-matmul-w4a4_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for all-gather-matmul-w4a4"
+    key: "record_all-gather-matmul-w4a4_CorrectnessTest"
+    depends_on: "all-gather-matmul-w4a4_CorrectnessTest"  # waits for the correctness step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "all-gather-matmul-w4a4"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the correctness step
+      - |
+        .buildkite/scripts/record_step_result.sh all-gather-matmul-w4a4_CorrectnessTest
+
+  - label: "Performance tests for all-gather-matmul-w4a4"
+    key: "all-gather-matmul-w4a4_PerformanceTest"
+    depends_on: "record_all-gather-matmul-w4a4_CorrectnessTest"  # sequenced after the correctness record step
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "all-gather-matmul-w4a4_PerformanceTest" "to be added"
+  - label: "Record performance test result for all-gather-matmul-w4a4"
+    key: "record_all-gather-matmul-w4a4_PerformanceTest"
+    depends_on: "all-gather-matmul-w4a4_PerformanceTest"  # waits for the performance step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "all-gather-matmul-w4a4"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the performance step
+      - |
+        .buildkite/scripts/record_step_result.sh all-gather-matmul-w4a4_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/all_gather_matmul/w4a8.yml b/.buildkite/kernel_microbenchmarks/all_gather_matmul/w4a8.yml
new file mode 100644
index 000000000..520dde456
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/all_gather_matmul/w4a8.yml
@@ -0,0 +1,45 @@
+# all-gather-matmul-w4a8
+# kernel support matrix microbenchmarks
+steps:  # four steps: correctness test, record result, performance test, record result
+  - label: "Correctness tests for all-gather-matmul-w4a8"
+    key: "all-gather-matmul-w4a8_CorrectnessTest"
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "all-gather-matmul-w4a8_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for all-gather-matmul-w4a8"
+    key: "record_all-gather-matmul-w4a8_CorrectnessTest"
+    depends_on: "all-gather-matmul-w4a8_CorrectnessTest"  # waits for the correctness step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "all-gather-matmul-w4a8"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the correctness step
+      - |
+        .buildkite/scripts/record_step_result.sh all-gather-matmul-w4a8_CorrectnessTest
+
+  - label: "Performance tests for all-gather-matmul-w4a8"
+    key: "all-gather-matmul-w4a8_PerformanceTest"
+    depends_on: "record_all-gather-matmul-w4a8_CorrectnessTest"  # sequenced after the correctness record step
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "all-gather-matmul-w4a8_PerformanceTest" "to be added"
+  - label: "Record performance test result for all-gather-matmul-w4a8"
+    key: "record_all-gather-matmul-w4a8_PerformanceTest"
+    depends_on: "all-gather-matmul-w4a8_PerformanceTest"  # waits for the performance step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "all-gather-matmul-w4a8"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the performance step
+      - |
+        .buildkite/scripts/record_step_result.sh all-gather-matmul-w4a8_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/all_gather_matmul/w8a16.yml b/.buildkite/kernel_microbenchmarks/all_gather_matmul/w8a16.yml
new file mode 100644
index 000000000..c8982d8d4
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/all_gather_matmul/w8a16.yml
@@ -0,0 +1,45 @@
+# all-gather-matmul-w8a16
+# kernel support matrix microbenchmarks
+steps:  # four steps: correctness test, record result, performance test, record result
+  - label: "Correctness tests for all-gather-matmul-w8a16"
+    key: "all-gather-matmul-w8a16_CorrectnessTest"
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "all-gather-matmul-w8a16_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for all-gather-matmul-w8a16"
+    key: "record_all-gather-matmul-w8a16_CorrectnessTest"
+    depends_on: "all-gather-matmul-w8a16_CorrectnessTest"  # waits for the correctness step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "all-gather-matmul-w8a16"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the correctness step
+      - |
+        .buildkite/scripts/record_step_result.sh all-gather-matmul-w8a16_CorrectnessTest
+
+  - label: "Performance tests for all-gather-matmul-w8a16"
+    key: "all-gather-matmul-w8a16_PerformanceTest"
+    depends_on: "record_all-gather-matmul-w8a16_CorrectnessTest"  # sequenced after the correctness record step
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "all-gather-matmul-w8a16_PerformanceTest" "to be added"
+  - label: "Record performance test result for all-gather-matmul-w8a16"
+    key: "record_all-gather-matmul-w8a16_PerformanceTest"
+    depends_on: "all-gather-matmul-w8a16_PerformanceTest"  # waits for the performance step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "all-gather-matmul-w8a16"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the performance step
+      - |
+        .buildkite/scripts/record_step_result.sh all-gather-matmul-w8a16_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/all_gather_matmul/w8a8.yml b/.buildkite/kernel_microbenchmarks/all_gather_matmul/w8a8.yml
new file mode 100644
index 000000000..9129b6962
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/all_gather_matmul/w8a8.yml
@@ -0,0 +1,45 @@
+# all-gather-matmul-w8a8
+# kernel support matrix microbenchmarks
+steps:  # four steps: correctness test, record result, performance test, record result
+  - label: "Correctness tests for all-gather-matmul-w8a8"
+    key: "all-gather-matmul-w8a8_CorrectnessTest"
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "all-gather-matmul-w8a8_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for all-gather-matmul-w8a8"
+    key: "record_all-gather-matmul-w8a8_CorrectnessTest"
+    depends_on: "all-gather-matmul-w8a8_CorrectnessTest"  # waits for the correctness step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "all-gather-matmul-w8a8"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the correctness step
+      - |
+        .buildkite/scripts/record_step_result.sh all-gather-matmul-w8a8_CorrectnessTest
+
+  - label: "Performance tests for all-gather-matmul-w8a8"
+    key: "all-gather-matmul-w8a8_PerformanceTest"
+    depends_on: "record_all-gather-matmul-w8a8_CorrectnessTest"  # sequenced after the correctness record step
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "all-gather-matmul-w8a8_PerformanceTest" "to be added"
+  - label: "Record performance test result for all-gather-matmul-w8a8"
+    key: "record_all-gather-matmul-w8a8_PerformanceTest"
+    depends_on: "all-gather-matmul-w8a8_PerformanceTest"  # waits for the performance step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "all-gather-matmul-w8a8"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the performance step
+      - |
+        .buildkite/scripts/record_step_result.sh all-gather-matmul-w8a8_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/for_attention_kernels_KV_cache/w16a16.yml b/.buildkite/kernel_microbenchmarks/for_attention_kernels_KV_cache/w16a16.yml
new file mode 100644
index 000000000..69f471de1
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/for_attention_kernels_KV_cache/w16a16.yml
@@ -0,0 +1,46 @@
+# attention_kernels-w16a16
+# kernel support matrix microbenchmarks
+# For attention kernels, W[x]A[y] means: W = KV cache, A = compute, and x, y = their bit precisions (here: w16a16).
+steps:  # four steps: correctness test, record result, performance test, record result
+  - label: "Correctness tests for attention kernels-w16a16"
+    key: "attention_kernels-w16a16_CorrectnessTest"
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "attention_kernels-w16a16_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for attention kernels-w16a16"
+    key: "record_attention_kernels-w16a16_CorrectnessTest"
+    depends_on: "attention_kernels-w16a16_CorrectnessTest"  # waits for the correctness step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "attention_kernels-w16a16"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the correctness step
+      - |
+        .buildkite/scripts/record_step_result.sh attention_kernels-w16a16_CorrectnessTest
+
+  - label: "Performance tests for attention kernels-w16a16"
+    key: "attention_kernels-w16a16_PerformanceTest"
+    depends_on: "record_attention_kernels-w16a16_CorrectnessTest"  # sequenced after the correctness record step
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "attention_kernels-w16a16_PerformanceTest" "to be added"
+  - label: "Record performance test result for attention kernels-w16a16"
+    key: "record_attention_kernels-w16a16_PerformanceTest"
+    depends_on: "attention_kernels-w16a16_PerformanceTest"  # waits for the performance step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "attention_kernels-w16a16"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the performance step
+      - |
+        .buildkite/scripts/record_step_result.sh attention_kernels-w16a16_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/for_attention_kernels_KV_cache/w4a16.yml b/.buildkite/kernel_microbenchmarks/for_attention_kernels_KV_cache/w4a16.yml
new file mode 100644
index 000000000..43f79ebbd
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/for_attention_kernels_KV_cache/w4a16.yml
@@ -0,0 +1,46 @@
+# attention_kernels-w4a16
+# kernel support matrix microbenchmarks
+# For attention kernels, W[x]A[y] means: W = KV cache, A = compute, and x, y = their bit precisions (here: w4a16).
+steps:  # four steps: correctness test, record result, performance test, record result
+  - label: "Correctness tests for attention kernels-w4a16"
+    key: "attention_kernels-w4a16_CorrectnessTest"
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "attention_kernels-w4a16_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for attention kernels-w4a16"
+    key: "record_attention_kernels-w4a16_CorrectnessTest"
+    depends_on: "attention_kernels-w4a16_CorrectnessTest"  # waits for the correctness step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "attention_kernels-w4a16"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the correctness step
+      - |
+        .buildkite/scripts/record_step_result.sh attention_kernels-w4a16_CorrectnessTest
+
+  - label: "Performance tests for attention kernels-w4a16"
+    key: "attention_kernels-w4a16_PerformanceTest"
+    depends_on: "record_attention_kernels-w4a16_CorrectnessTest"  # sequenced after the correctness record step
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "attention_kernels-w4a16_PerformanceTest" "to be added"
+  - label: "Record performance test result for attention kernels-w4a16"
+    key: "record_attention_kernels-w4a16_PerformanceTest"
+    depends_on: "attention_kernels-w4a16_PerformanceTest"  # waits for the performance step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "attention_kernels-w4a16"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the performance step
+      - |
+        .buildkite/scripts/record_step_result.sh attention_kernels-w4a16_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/for_attention_kernels_KV_cache/w4a4.yml b/.buildkite/kernel_microbenchmarks/for_attention_kernels_KV_cache/w4a4.yml
new file mode 100644
index 000000000..7bf43b97b
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/for_attention_kernels_KV_cache/w4a4.yml
@@ -0,0 +1,46 @@
+# attention_kernels-w4a4
+# kernel support matrix microbenchmarks
+# For attention kernels, W[x]A[y] means: W = KV cache, A = compute, and x, y = their bit precisions (here: w4a4).
+steps:  # four steps: correctness test, record result, performance test, record result
+  - label: "Correctness tests for attention kernels-w4a4"
+    key: "attention_kernels-w4a4_CorrectnessTest"
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "attention_kernels-w4a4_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for attention kernels-w4a4"
+    key: "record_attention_kernels-w4a4_CorrectnessTest"
+    depends_on: "attention_kernels-w4a4_CorrectnessTest"  # waits for the correctness step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "attention_kernels-w4a4"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the correctness step
+      - |
+        .buildkite/scripts/record_step_result.sh attention_kernels-w4a4_CorrectnessTest
+
+  - label: "Performance tests for attention kernels-w4a4"
+    key: "attention_kernels-w4a4_PerformanceTest"
+    depends_on: "record_attention_kernels-w4a4_CorrectnessTest"  # sequenced after the correctness record step
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "attention_kernels-w4a4_PerformanceTest" "to be added"
+  - label: "Record performance test result for attention kernels-w4a4"
+    key: "record_attention_kernels-w4a4_PerformanceTest"
+    depends_on: "attention_kernels-w4a4_PerformanceTest"  # waits for the performance step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "attention_kernels-w4a4"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the performance step
+      - |
+        .buildkite/scripts/record_step_result.sh attention_kernels-w4a4_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/for_attention_kernels_KV_cache/w4a8.yml b/.buildkite/kernel_microbenchmarks/for_attention_kernels_KV_cache/w4a8.yml
new file mode 100644
index 000000000..e930b238f
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/for_attention_kernels_KV_cache/w4a8.yml
@@ -0,0 +1,46 @@
+# attention_kernels-w4a8
+# kernel support matrix microbenchmarks
+# For attention kernels, W[x]A[y] means: W = KV cache, A = compute, and x, y = their bit precisions (here: w4a8).
+steps:  # four steps: correctness test, record result, performance test, record result
+  - label: "Correctness tests for attention kernels-w4a8"
+    key: "attention_kernels-w4a8_CorrectnessTest"
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "attention_kernels-w4a8_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for attention kernels-w4a8"
+    key: "record_attention_kernels-w4a8_CorrectnessTest"
+    depends_on: "attention_kernels-w4a8_CorrectnessTest"  # waits for the correctness step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "attention_kernels-w4a8"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the correctness step
+      - |
+        .buildkite/scripts/record_step_result.sh attention_kernels-w4a8_CorrectnessTest
+
+  - label: "Performance tests for attention kernels-w4a8"
+    key: "attention_kernels-w4a8_PerformanceTest"
+    depends_on: "record_attention_kernels-w4a8_CorrectnessTest"  # sequenced after the correctness record step
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "attention_kernels-w4a8_PerformanceTest" "to be added"
+  - label: "Record performance test result for attention kernels-w4a8"
+    key: "record_attention_kernels-w4a8_PerformanceTest"
+    depends_on: "attention_kernels-w4a8_PerformanceTest"  # waits for the performance step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "attention_kernels-w4a8"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the performance step
+      - |
+        .buildkite/scripts/record_step_result.sh attention_kernels-w4a8_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/for_attention_kernels_KV_cache/w8a16.yml b/.buildkite/kernel_microbenchmarks/for_attention_kernels_KV_cache/w8a16.yml
new file mode 100644
index 000000000..9e69303e8
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/for_attention_kernels_KV_cache/w8a16.yml
@@ -0,0 +1,46 @@
+# attention_kernels-w8a16
+# kernel support matrix microbenchmarks
+# For attention kernels, W[x]A[y] means: W = KV cache, A = compute, and x, y = their bit precisions (here: w8a16).
+steps:  # four steps: correctness test, record result, performance test, record result
+  - label: "Correctness tests for attention kernels-w8a16"
+    key: "attention_kernels-w8a16_CorrectnessTest"
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "attention_kernels-w8a16_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for attention kernels-w8a16"
+    key: "record_attention_kernels-w8a16_CorrectnessTest"
+    depends_on: "attention_kernels-w8a16_CorrectnessTest"  # waits for the correctness step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "attention_kernels-w8a16"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the correctness step
+      - |
+        .buildkite/scripts/record_step_result.sh attention_kernels-w8a16_CorrectnessTest
+
+  - label: "Performance tests for attention kernels-w8a16"
+    key: "attention_kernels-w8a16_PerformanceTest"
+    depends_on: "record_attention_kernels-w8a16_CorrectnessTest"  # sequenced after the correctness record step
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "attention_kernels-w8a16_PerformanceTest" "to be added"
+  - label: "Record performance test result for attention kernels-w8a16"
+    key: "record_attention_kernels-w8a16_PerformanceTest"
+    depends_on: "attention_kernels-w8a16_PerformanceTest"  # waits for the performance step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "attention_kernels-w8a16"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the performance step
+      - |
+        .buildkite/scripts/record_step_result.sh attention_kernels-w8a16_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/for_attention_kernels_KV_cache/w8a8.yml b/.buildkite/kernel_microbenchmarks/for_attention_kernels_KV_cache/w8a8.yml
new file mode 100644
index 000000000..c9e8aef64
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/for_attention_kernels_KV_cache/w8a8.yml
@@ -0,0 +1,46 @@
+# attention_kernels-w8a8
+# kernel support matrix microbenchmarks
+# For attention kernels, W[x]A[y] means: W = KV cache, A = compute, and x, y = their bit precisions (here: w8a8).
+steps:  # four steps: correctness test, record result, performance test, record result
+  - label: "Correctness tests for attention kernels-w8a8"
+    key: "attention_kernels-w8a8_CorrectnessTest"
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "attention_kernels-w8a8_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for attention kernels-w8a8"
+    key: "record_attention_kernels-w8a8_CorrectnessTest"
+    depends_on: "attention_kernels-w8a8_CorrectnessTest"  # waits for the correctness step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "attention_kernels-w8a8"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the correctness step
+      - |
+        .buildkite/scripts/record_step_result.sh attention_kernels-w8a8_CorrectnessTest
+
+  - label: "Performance tests for attention kernels-w8a8"
+    key: "attention_kernels-w8a8_PerformanceTest"
+    depends_on: "record_attention_kernels-w8a8_CorrectnessTest"  # sequenced after the correctness record step
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "attention_kernels-w8a8_PerformanceTest" "to be added"
+  - label: "Record performance test result for attention kernels-w8a8"
+    key: "record_attention_kernels-w8a8_PerformanceTest"
+    depends_on: "attention_kernels-w8a8_PerformanceTest"  # waits for the performance step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "attention_kernels-w8a8"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the performance step
+      - |
+        .buildkite/scripts/record_step_result.sh attention_kernels-w8a8_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/fused_moe/w16a16.yml b/.buildkite/kernel_microbenchmarks/fused_moe/w16a16.yml
new file mode 100644
index 000000000..278489120
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/fused_moe/w16a16.yml
@@ -0,0 +1,45 @@
+# fused moe-w16a16
+# kernel support matrix microbenchmarks
+steps:  # four steps: correctness test, record result, performance test, record result
+  - label: "Correctness tests for fused moe-w16a16"
+    key: "fused_moe-w16a16_CorrectnessTest"
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "fused_moe-w16a16_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for fused moe w16a16"  # NOTE(review): no hyphen before the quant suffix, unlike the other labels here
+    key: "record_fused_moe-w16a16_CorrectnessTest"
+    depends_on: "fused_moe-w16a16_CorrectnessTest"  # waits for the correctness step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "fused moe-w16a16"  # NOTE(review): uses a space where the step keys use "fused_moe"; confirm
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the correctness step
+      - |
+        .buildkite/scripts/record_step_result.sh fused_moe-w16a16_CorrectnessTest
+
+  - label: "Performance tests for fused moe-w16a16"
+    key: "fused_moe-w16a16_PerformanceTest"
+    depends_on: "record_fused_moe-w16a16_CorrectnessTest"  # sequenced after the correctness record step
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "fused_moe-w16a16_PerformanceTest" "to be added"
+  - label: "Record performance test result for fused moe-w16a16"
+    key: "record_fused_moe-w16a16_PerformanceTest"
+    depends_on: "fused_moe-w16a16_PerformanceTest"  # waits for the performance step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "fused moe-w16a16"  # NOTE(review): uses a space where the step keys use "fused_moe"; confirm
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the performance step
+      - |
+        .buildkite/scripts/record_step_result.sh fused_moe-w16a16_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/fused_moe/w4a16.yml b/.buildkite/kernel_microbenchmarks/fused_moe/w4a16.yml
new file mode 100644
index 000000000..c3506cd37
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/fused_moe/w4a16.yml
@@ -0,0 +1,45 @@
+# fused moe-w4a16
+# kernel support matrix microbenchmarks
+steps:  # four steps: correctness test, record result, performance test, record result
+  - label: "Correctness tests for fused moe-w4a16"
+    key: "fused_moe-w4a16_CorrectnessTest"
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "fused_moe-w4a16_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for fused moe w4a16"  # NOTE(review): no hyphen before the quant suffix, unlike the other labels here
+    key: "record_fused_moe-w4a16_CorrectnessTest"
+    depends_on: "fused_moe-w4a16_CorrectnessTest"  # waits for the correctness step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "fused moe-w4a16"  # NOTE(review): uses a space where the step keys use "fused_moe"; confirm
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the correctness step
+      - |
+        .buildkite/scripts/record_step_result.sh fused_moe-w4a16_CorrectnessTest
+
+  - label: "Performance tests for fused moe-w4a16"
+    key: "fused_moe-w4a16_PerformanceTest"
+    depends_on: "record_fused_moe-w4a16_CorrectnessTest"  # sequenced after the correctness record step
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "fused_moe-w4a16_PerformanceTest" "to be added"
+  - label: "Record performance test result for fused moe-w4a16"
+    key: "record_fused_moe-w4a16_PerformanceTest"
+    depends_on: "fused_moe-w4a16_PerformanceTest"  # waits for the performance step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "fused moe-w4a16"  # NOTE(review): uses a space where the step keys use "fused_moe"; confirm
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the performance step
+      - |
+        .buildkite/scripts/record_step_result.sh fused_moe-w4a16_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/fused_moe/w4a4.yml b/.buildkite/kernel_microbenchmarks/fused_moe/w4a4.yml
new file mode 100644
index 000000000..4f44597c3
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/fused_moe/w4a4.yml
@@ -0,0 +1,45 @@
+# fused moe-w4a4
+# kernel support matrix microbenchmarks
+steps:  # four steps: correctness test, record result, performance test, record result
+  - label: "Correctness tests for fused moe-w4a4"
+    key: "fused_moe-w4a4_CorrectnessTest"
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "fused_moe-w4a4_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for fused moe w4a4"  # NOTE(review): no hyphen before the quant suffix, unlike the other labels here
+    key: "record_fused_moe-w4a4_CorrectnessTest"
+    depends_on: "fused_moe-w4a4_CorrectnessTest"  # waits for the correctness step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "fused moe-w4a4"  # NOTE(review): uses a space where the step keys use "fused_moe"; confirm
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the correctness step
+      - |
+        .buildkite/scripts/record_step_result.sh fused_moe-w4a4_CorrectnessTest
+
+  - label: "Performance tests for fused moe-w4a4"
+    key: "fused_moe-w4a4_PerformanceTest"
+    depends_on: "record_fused_moe-w4a4_CorrectnessTest"  # sequenced after the correctness record step
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "fused_moe-w4a4_PerformanceTest" "to be added"
+  - label: "Record performance test result for fused moe-w4a4"
+    key: "record_fused_moe-w4a4_PerformanceTest"
+    depends_on: "fused_moe-w4a4_PerformanceTest"  # waits for the performance step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "fused moe-w4a4"  # NOTE(review): uses a space where the step keys use "fused_moe"; confirm
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the performance step
+      - |
+        .buildkite/scripts/record_step_result.sh fused_moe-w4a4_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/fused_moe/w4a8.yml b/.buildkite/kernel_microbenchmarks/fused_moe/w4a8.yml
new file mode 100644
index 000000000..2c14d3daa
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/fused_moe/w4a8.yml
@@ -0,0 +1,45 @@
+# fused moe-w4a8
+# kernel support matrix microbenchmarks
+steps:  # four steps: correctness test, record result, performance test, record result
+  - label: "Correctness tests for fused moe-w4a8"
+    key: "fused_moe-w4a8_CorrectnessTest"
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "fused_moe-w4a8_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for fused moe w4a8"  # NOTE(review): no hyphen before the quant suffix, unlike the other labels here
+    key: "record_fused_moe-w4a8_CorrectnessTest"
+    depends_on: "fused_moe-w4a8_CorrectnessTest"  # waits for the correctness step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "fused moe-w4a8"  # NOTE(review): uses a space where the step keys use "fused_moe"; confirm
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the correctness step
+      - |
+        .buildkite/scripts/record_step_result.sh fused_moe-w4a8_CorrectnessTest
+
+  - label: "Performance tests for fused moe-w4a8"
+    key: "fused_moe-w4a8_PerformanceTest"
+    depends_on: "record_fused_moe-w4a8_CorrectnessTest"  # sequenced after the correctness record step
+    soft_fail: true  # non-zero exit is a soft failure; it does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only writes "to be added" to build meta-data
+      - |
+        buildkite-agent meta-data set "fused_moe-w4a8_PerformanceTest" "to be added"
+  - label: "Record performance test result for fused moe-w4a8"
+    key: "record_fused_moe-w4a8_PerformanceTest"
+    depends_on: "fused_moe-w4a8_PerformanceTest"  # waits for the performance step above
+    env:  # presumably consumed by record_step_result.sh; confirm
+      CI_TARGET: "fused moe-w4a8"  # NOTE(review): uses a space where the step keys use "fused_moe"; confirm
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:  # argument is the key of the performance step
+      - |
+        .buildkite/scripts/record_step_result.sh fused_moe-w4a8_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/fused_moe/w8a16.yml b/.buildkite/kernel_microbenchmarks/fused_moe/w8a16.yml
new file mode 100644
index 000000000..b9d1d51ff
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/fused_moe/w8a16.yml
@@ -0,0 +1,45 @@
+# fused moe-w8a16
+# kernel support matrix microbenchmarks
+steps:
+  - label: "Correctness tests for fused moe-w8a16"
+    key: "fused_moe-w8a16_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "fused_moe-w8a16_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for fused moe-w8a16"
+    key: "record_fused_moe-w8a16_CorrectnessTest"
+    depends_on: "fused_moe-w8a16_CorrectnessTest"
+    env:
+      CI_TARGET: "fused moe-w8a16"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh fused_moe-w8a16_CorrectnessTest
+
+  - label: "Performance tests for fused moe-w8a16"
+    key: "fused_moe-w8a16_PerformanceTest"
+    depends_on: "record_fused_moe-w8a16_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "fused_moe-w8a16_PerformanceTest" "to be added"
+  - label: "Record performance test result for fused moe-w8a16"
+    key: "record_fused_moe-w8a16_PerformanceTest"
+    depends_on: "fused_moe-w8a16_PerformanceTest"
+    env:
+      CI_TARGET: "fused moe-w8a16"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh fused_moe-w8a16_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/fused_moe/w8a8.yml b/.buildkite/kernel_microbenchmarks/fused_moe/w8a8.yml
new file mode 100644
index 000000000..dda161b7e
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/fused_moe/w8a8.yml
@@ -0,0 +1,45 @@
+# fused moe-w8a8
+# kernel support matrix microbenchmarks
+steps:
+  - label: "Correctness tests for fused moe-w8a8"
+    key: "fused_moe-w8a8_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "fused_moe-w8a8_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for fused moe-w8a8"
+    key: "record_fused_moe-w8a8_CorrectnessTest"
+    depends_on: "fused_moe-w8a8_CorrectnessTest"
+    env:
+      CI_TARGET: "fused moe-w8a8"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh fused_moe-w8a8_CorrectnessTest
+
+  - label: "Performance tests for fused moe-w8a8"
+    key: "fused_moe-w8a8_PerformanceTest"
+    depends_on: "record_fused_moe-w8a8_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "fused_moe-w8a8_PerformanceTest" "to be added"
+  - label: "Record performance test result for fused moe-w8a8"
+    key: "record_fused_moe-w8a8_PerformanceTest"
+    depends_on: "fused_moe-w8a8_PerformanceTest"
+    env:
+      CI_TARGET: "fused moe-w8a8"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh fused_moe-w8a8_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/generic_ragged_paged_attention_v3/w16a16.yml b/.buildkite/kernel_microbenchmarks/generic_ragged_paged_attention_v3/w16a16.yml
new file mode 100644
index 000000000..66bba1e28
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/generic_ragged_paged_attention_v3/w16a16.yml
@@ -0,0 +1,45 @@
+# generic ragged paged attention v3-w16a16
+# kernel support matrix microbenchmarks
+steps:
+  - label: "Correctness tests for generic ragged paged attention v3-w16a16"
+    key: "generic_ragged_paged_attention_v3-w16a16_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "generic_ragged_paged_attention_v3-w16a16_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for generic ragged paged attention v3-w16a16"
+    key: "record_generic_ragged_paged_attention_v3-w16a16_CorrectnessTest"
+    depends_on: "generic_ragged_paged_attention_v3-w16a16_CorrectnessTest"
+    env:
+      CI_TARGET: "generic ragged paged attention v3-w16a16"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh generic_ragged_paged_attention_v3-w16a16_CorrectnessTest
+
+  - label: "Performance tests for generic ragged paged attention v3-w16a16"
+    key: "generic_ragged_paged_attention_v3-w16a16_PerformanceTest"
+    depends_on: "record_generic_ragged_paged_attention_v3-w16a16_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "generic_ragged_paged_attention_v3-w16a16_PerformanceTest" "to be added"
+  - label: "Record performance test result for generic ragged paged attention v3-w16a16"
+    key: "record_generic_ragged_paged_attention_v3-w16a16_PerformanceTest"
+    depends_on: "generic_ragged_paged_attention_v3-w16a16_PerformanceTest"
+    env:
+      CI_TARGET: "generic ragged paged attention v3-w16a16"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh generic_ragged_paged_attention_v3-w16a16_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/generic_ragged_paged_attention_v3/w4a16.yml b/.buildkite/kernel_microbenchmarks/generic_ragged_paged_attention_v3/w4a16.yml
new file mode 100644
index 000000000..8194672ad
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/generic_ragged_paged_attention_v3/w4a16.yml
@@ -0,0 +1,45 @@
+# generic ragged paged attention v3-w4a16
+# kernel support matrix microbenchmarks
+steps:
+  - label: "Correctness tests for generic ragged paged attention v3-w4a16"
+    key: "generic_ragged_paged_attention_v3-w4a16_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "generic_ragged_paged_attention_v3-w4a16_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for generic ragged paged attention v3-w4a16"
+    key: "record_generic_ragged_paged_attention_v3-w4a16_CorrectnessTest"
+    depends_on: "generic_ragged_paged_attention_v3-w4a16_CorrectnessTest"
+    env:
+      CI_TARGET: "generic ragged paged attention v3-w4a16"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh generic_ragged_paged_attention_v3-w4a16_CorrectnessTest
+
+  - label: "Performance tests for generic ragged paged attention v3-w4a16"
+    key: "generic_ragged_paged_attention_v3-w4a16_PerformanceTest"
+    depends_on: "record_generic_ragged_paged_attention_v3-w4a16_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "generic_ragged_paged_attention_v3-w4a16_PerformanceTest" "to be added"
+  - label: "Record performance test result for generic ragged paged attention v3-w4a16"
+    key: "record_generic_ragged_paged_attention_v3-w4a16_PerformanceTest"
+    depends_on: "generic_ragged_paged_attention_v3-w4a16_PerformanceTest"
+    env:
+      CI_TARGET: "generic ragged paged attention v3-w4a16"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh generic_ragged_paged_attention_v3-w4a16_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/generic_ragged_paged_attention_v3/w4a4.yml b/.buildkite/kernel_microbenchmarks/generic_ragged_paged_attention_v3/w4a4.yml
new file mode 100644
index 000000000..4debaf7d4
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/generic_ragged_paged_attention_v3/w4a4.yml
@@ -0,0 +1,45 @@
+# generic ragged paged attention v3-w4a4
+# kernel support matrix microbenchmarks
+steps:
+  - label: "Correctness tests for generic ragged paged attention v3-w4a4"
+    key: "generic_ragged_paged_attention_v3-w4a4_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "generic_ragged_paged_attention_v3-w4a4_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for generic ragged paged attention v3-w4a4"
+    key: "record_generic_ragged_paged_attention_v3-w4a4_CorrectnessTest"
+    depends_on: "generic_ragged_paged_attention_v3-w4a4_CorrectnessTest"
+    env:
+      CI_TARGET: "generic ragged paged attention v3-w4a4"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh generic_ragged_paged_attention_v3-w4a4_CorrectnessTest
+
+  - label: "Performance tests for generic ragged paged attention v3-w4a4"
+    key: "generic_ragged_paged_attention_v3-w4a4_PerformanceTest"
+    depends_on: "record_generic_ragged_paged_attention_v3-w4a4_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "generic_ragged_paged_attention_v3-w4a4_PerformanceTest" "to be added"
+  - label: "Record performance test result for generic ragged paged attention v3-w4a4"
+    key: "record_generic_ragged_paged_attention_v3-w4a4_PerformanceTest"
+    depends_on: "generic_ragged_paged_attention_v3-w4a4_PerformanceTest"
+    env:
+      CI_TARGET: "generic ragged paged attention v3-w4a4"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh generic_ragged_paged_attention_v3-w4a4_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/generic_ragged_paged_attention_v3/w4a8.yml b/.buildkite/kernel_microbenchmarks/generic_ragged_paged_attention_v3/w4a8.yml
new file mode 100644
index 000000000..c46503d13
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/generic_ragged_paged_attention_v3/w4a8.yml
@@ -0,0 +1,45 @@
+# generic ragged paged attention v3-w4a8
+# kernel support matrix microbenchmarks
+steps:
+  - label: "Correctness tests for generic ragged paged attention v3-w4a8"
+    key: "generic_ragged_paged_attention_v3-w4a8_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "generic_ragged_paged_attention_v3-w4a8_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for generic ragged paged attention v3-w4a8"
+    key: "record_generic_ragged_paged_attention_v3-w4a8_CorrectnessTest"
+    depends_on: "generic_ragged_paged_attention_v3-w4a8_CorrectnessTest"
+    env:
+      CI_TARGET: "generic ragged paged attention v3-w4a8"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh generic_ragged_paged_attention_v3-w4a8_CorrectnessTest
+
+  - label: "Performance tests for generic ragged paged attention v3-w4a8"
+    key: "generic_ragged_paged_attention_v3-w4a8_PerformanceTest"
+    depends_on: "record_generic_ragged_paged_attention_v3-w4a8_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "generic_ragged_paged_attention_v3-w4a8_PerformanceTest" "to be added"
+  - label: "Record performance test result for generic ragged paged attention v3-w4a8"
+    key: "record_generic_ragged_paged_attention_v3-w4a8_PerformanceTest"
+    depends_on: "generic_ragged_paged_attention_v3-w4a8_PerformanceTest"
+    env:
+      CI_TARGET: "generic ragged paged attention v3-w4a8"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh generic_ragged_paged_attention_v3-w4a8_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/generic_ragged_paged_attention_v3/w8a16.yml b/.buildkite/kernel_microbenchmarks/generic_ragged_paged_attention_v3/w8a16.yml
new file mode 100644
index 000000000..0dad69212
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/generic_ragged_paged_attention_v3/w8a16.yml
@@ -0,0 +1,45 @@
+# generic ragged paged attention v3-w8a16
+# kernel support matrix microbenchmarks
+steps:
+  - label: "Correctness tests for generic ragged paged attention v3-w8a16"
+    key: "generic_ragged_paged_attention_v3-w8a16_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "generic_ragged_paged_attention_v3-w8a16_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for generic ragged paged attention v3-w8a16"
+    key: "record_generic_ragged_paged_attention_v3-w8a16_CorrectnessTest"
+    depends_on: "generic_ragged_paged_attention_v3-w8a16_CorrectnessTest"
+    env:
+      CI_TARGET: "generic ragged paged attention v3-w8a16"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh generic_ragged_paged_attention_v3-w8a16_CorrectnessTest
+
+  - label: "Performance tests for generic ragged paged attention v3-w8a16"
+    key: "generic_ragged_paged_attention_v3-w8a16_PerformanceTest"
+    depends_on: "record_generic_ragged_paged_attention_v3-w8a16_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "generic_ragged_paged_attention_v3-w8a16_PerformanceTest" "to be added"
+  - label: "Record performance test result for generic ragged paged attention v3-w8a16"
+    key: "record_generic_ragged_paged_attention_v3-w8a16_PerformanceTest"
+    depends_on: "generic_ragged_paged_attention_v3-w8a16_PerformanceTest"
+    env:
+      CI_TARGET: "generic ragged paged attention v3-w8a16"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh generic_ragged_paged_attention_v3-w8a16_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/generic_ragged_paged_attention_v3/w8a8.yml b/.buildkite/kernel_microbenchmarks/generic_ragged_paged_attention_v3/w8a8.yml
new file mode 100644
index 000000000..810182edc
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/generic_ragged_paged_attention_v3/w8a8.yml
@@ -0,0 +1,45 @@
+# generic ragged paged attention v3-w8a8
+# kernel support matrix microbenchmarks
+steps:
+  - label: "Correctness tests for generic ragged paged attention v3-w8a8"
+    key: "generic_ragged_paged_attention_v3-w8a8_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "generic_ragged_paged_attention_v3-w8a8_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for generic ragged paged attention v3-w8a8"
+    key: "record_generic_ragged_paged_attention_v3-w8a8_CorrectnessTest"
+    depends_on: "generic_ragged_paged_attention_v3-w8a8_CorrectnessTest"
+    env:
+      CI_TARGET: "generic ragged paged attention v3-w8a8"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh generic_ragged_paged_attention_v3-w8a8_CorrectnessTest
+
+  - label: "Performance tests for generic ragged paged attention v3-w8a8"
+    key: "generic_ragged_paged_attention_v3-w8a8_PerformanceTest"
+    depends_on: "record_generic_ragged_paged_attention_v3-w8a8_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "generic_ragged_paged_attention_v3-w8a8_PerformanceTest" "to be added"
+  - label: "Record performance test result for generic ragged paged attention v3-w8a8"
+    key: "record_generic_ragged_paged_attention_v3-w8a8_PerformanceTest"
+    depends_on: "generic_ragged_paged_attention_v3-w8a8_PerformanceTest"
+    env:
+      CI_TARGET: "generic ragged paged attention v3-w8a8"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh generic_ragged_paged_attention_v3-w8a8_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/gmm/w16a16.yml b/.buildkite/kernel_microbenchmarks/gmm/w16a16.yml
new file mode 100644
index 000000000..4ba87dbb9
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/gmm/w16a16.yml
@@ -0,0 +1,45 @@
+# gmm-w16a16
+# kernel support matrix microbenchmarks
+steps:
+  - label: "Correctness tests for gmm-w16a16"
+    key: "gmm-w16a16_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "gmm-w16a16_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for gmm-w16a16"
+    key: "record_gmm-w16a16_CorrectnessTest"
+    depends_on: "gmm-w16a16_CorrectnessTest"
+    env:
+      CI_TARGET: "gmm-w16a16"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh gmm-w16a16_CorrectnessTest
+
+  - label: "Performance tests for gmm-w16a16"
+    key: "gmm-w16a16_PerformanceTest"
+    depends_on: "record_gmm-w16a16_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "gmm-w16a16_PerformanceTest" "to be added"
+  - label: "Record performance test result for gmm-w16a16"
+    key: "record_gmm-w16a16_PerformanceTest"
+    depends_on: "gmm-w16a16_PerformanceTest"
+    env:
+      CI_TARGET: "gmm-w16a16"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh gmm-w16a16_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/gmm/w4a16.yml b/.buildkite/kernel_microbenchmarks/gmm/w4a16.yml
new file mode 100644
index 000000000..40dc81973
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/gmm/w4a16.yml
@@ -0,0 +1,45 @@
+# gmm-w4a16
+# kernel support matrix microbenchmarks
+steps:
+  - label: "Correctness tests for gmm-w4a16"
+    key: "gmm-w4a16_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "gmm-w4a16_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for gmm-w4a16"
+    key: "record_gmm-w4a16_CorrectnessTest"
+    depends_on: "gmm-w4a16_CorrectnessTest"
+    env:
+      CI_TARGET: "gmm-w4a16"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh gmm-w4a16_CorrectnessTest
+
+  - label: "Performance tests for gmm-w4a16"
+    key: "gmm-w4a16_PerformanceTest"
+    depends_on: "record_gmm-w4a16_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "gmm-w4a16_PerformanceTest" "to be added"
+  - label: "Record performance test result for gmm-w4a16"
+    key: "record_gmm-w4a16_PerformanceTest"
+    depends_on: "gmm-w4a16_PerformanceTest"
+    env:
+      CI_TARGET: "gmm-w4a16"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh gmm-w4a16_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/gmm/w4a4.yml b/.buildkite/kernel_microbenchmarks/gmm/w4a4.yml
new file mode 100644
index 000000000..36273f1d5
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/gmm/w4a4.yml
@@ -0,0 +1,45 @@
+# gmm-w4a4
+# kernel support matrix microbenchmarks
+steps:
+  - label: "Correctness tests for gmm-w4a4"
+    key: "gmm-w4a4_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "gmm-w4a4_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for gmm-w4a4"
+    key: "record_gmm-w4a4_CorrectnessTest"
+    depends_on: "gmm-w4a4_CorrectnessTest"
+    env:
+      CI_TARGET: "gmm-w4a4"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh gmm-w4a4_CorrectnessTest
+
+  - label: "Performance tests for gmm-w4a4"
+    key: "gmm-w4a4_PerformanceTest"
+    depends_on: "record_gmm-w4a4_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "gmm-w4a4_PerformanceTest" "to be added"
+  - label: "Record performance test result for gmm-w4a4"
+    key: "record_gmm-w4a4_PerformanceTest"
+    depends_on: "gmm-w4a4_PerformanceTest"
+    env:
+      CI_TARGET: "gmm-w4a4"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh gmm-w4a4_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/gmm/w4a8.yml b/.buildkite/kernel_microbenchmarks/gmm/w4a8.yml
new file mode 100644
index 000000000..f08f2d4c2
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/gmm/w4a8.yml
@@ -0,0 +1,45 @@
+# gmm-w4a8
+# kernel support matrix microbenchmarks
+steps:
+  - label: "Correctness tests for gmm-w4a8"
+    key: "gmm-w4a8_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "gmm-w4a8_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for gmm-w4a8"
+    key: "record_gmm-w4a8_CorrectnessTest"
+    depends_on: "gmm-w4a8_CorrectnessTest"
+    env:
+      CI_TARGET: "gmm-w4a8"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh gmm-w4a8_CorrectnessTest
+
+  - label: "Performance tests for gmm-w4a8"
+    key: "gmm-w4a8_PerformanceTest"
+    depends_on: "record_gmm-w4a8_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "gmm-w4a8_PerformanceTest" "to be added"
+  - label: "Record performance test result for gmm-w4a8"
+    key: "record_gmm-w4a8_PerformanceTest"
+    depends_on: "gmm-w4a8_PerformanceTest"
+    env:
+      CI_TARGET: "gmm-w4a8"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh gmm-w4a8_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/gmm/w8a16.yml b/.buildkite/kernel_microbenchmarks/gmm/w8a16.yml
new file mode 100644
index 000000000..16a50ed16
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/gmm/w8a16.yml
@@ -0,0 +1,45 @@
+# gmm-w8a16
+# kernel support matrix microbenchmarks
+steps:
+  - label: "Correctness tests for gmm-w8a16"
+    key: "gmm-w8a16_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "gmm-w8a16_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for gmm-w8a16"
+    key: "record_gmm-w8a16_CorrectnessTest"
+    depends_on: "gmm-w8a16_CorrectnessTest"
+    env:
+      CI_TARGET: "gmm-w8a16"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh gmm-w8a16_CorrectnessTest
+
+  - label: "Performance tests for gmm-w8a16"
+    key: "gmm-w8a16_PerformanceTest"
+    depends_on: "record_gmm-w8a16_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "gmm-w8a16_PerformanceTest" "to be added"
+  - label: "Record performance test result for gmm-w8a16"
+    key: "record_gmm-w8a16_PerformanceTest"
+    depends_on: "gmm-w8a16_PerformanceTest"
+    env:
+      CI_TARGET: "gmm-w8a16"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh gmm-w8a16_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/gmm/w8a8.yml b/.buildkite/kernel_microbenchmarks/gmm/w8a8.yml
new file mode 100644
index 000000000..51bde7e92
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/gmm/w8a8.yml
@@ -0,0 +1,45 @@
+# gmm-w8a8
+# kernel support matrix microbenchmarks
+steps:
+  - label: "Correctness tests for gmm-w8a8"
+    key: "gmm-w8a8_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "gmm-w8a8_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for gmm-w8a8"
+    key: "record_gmm-w8a8_CorrectnessTest"
+    depends_on: "gmm-w8a8_CorrectnessTest"
+    env:
+      CI_TARGET: "gmm-w8a8"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh gmm-w8a8_CorrectnessTest
+
+  - label: "Performance tests for gmm-w8a8"
+    key: "gmm-w8a8_PerformanceTest"
+    depends_on: "record_gmm-w8a8_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "gmm-w8a8_PerformanceTest" "to be added"
+  - label: "Record performance test result for gmm-w8a8"
+    key: "record_gmm-w8a8_PerformanceTest"
+    depends_on: "gmm-w8a8_PerformanceTest"
+    env:
+      CI_TARGET: "gmm-w8a8"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh gmm-w8a8_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/mla/w16a16.yml b/.buildkite/kernel_microbenchmarks/mla/w16a16.yml
new file mode 100644
index 000000000..a0c8df8e9
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/mla/w16a16.yml
@@ -0,0 +1,45 @@
+# mla-w16a16
+# kernel support matrix microbenchmarks
+steps:
+  - label: "Correctness tests for mla-w16a16"
+    key: "mla-w16a16_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "mla-w16a16_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for mla-w16a16"
+    key: "record_mla-w16a16_CorrectnessTest"
+    depends_on: "mla-w16a16_CorrectnessTest"
+    env:
+      CI_TARGET: "mla-w16a16"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh mla-w16a16_CorrectnessTest
+
+  - label: "Performance tests for mla-w16a16"
+    key: "mla-w16a16_PerformanceTest"
+    depends_on: "record_mla-w16a16_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "mla-w16a16_PerformanceTest" "to be added"
+  - label: "Record performance test result for mla-w16a16"
+    key: "record_mla-w16a16_PerformanceTest"
+    depends_on: "mla-w16a16_PerformanceTest"
+    env:
+      CI_TARGET: "mla-w16a16"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh mla-w16a16_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/mla/w4a16.yml b/.buildkite/kernel_microbenchmarks/mla/w4a16.yml
new file mode 100644
index 000000000..a28c8456b
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/mla/w4a16.yml
@@ -0,0 +1,45 @@
+# mla-w4a16
+# kernel support matrix microbenchmarks
+steps:
+  - label: "Correctness tests for mla-w4a16"
+    key: "mla-w4a16_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "mla-w4a16_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for mla-w4a16"
+    key: "record_mla-w4a16_CorrectnessTest"
+    depends_on: "mla-w4a16_CorrectnessTest"
+    env:
+      CI_TARGET: "mla-w4a16"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh mla-w4a16_CorrectnessTest
+
+  - label: "Performance tests for mla-w4a16"
+    key: "mla-w4a16_PerformanceTest"
+    depends_on: "record_mla-w4a16_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "mla-w4a16_PerformanceTest" "to be added"
+  - label: "Record performance test result for mla-w4a16"
+    key: "record_mla-w4a16_PerformanceTest"
+    depends_on: "mla-w4a16_PerformanceTest"
+    env:
+      CI_TARGET: "mla-w4a16"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh mla-w4a16_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/mla/w4a4.yml b/.buildkite/kernel_microbenchmarks/mla/w4a4.yml
new file mode 100644
index 000000000..74e226d50
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/mla/w4a4.yml
@@ -0,0 +1,45 @@
+# mla-w4a4
+# kernel support matrix microbenchmarks
+steps:
+  - label: "Correctness tests for mla-w4a4"
+    key: "mla-w4a4_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "mla-w4a4_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for mla-w4a4"
+    key: "record_mla-w4a4_CorrectnessTest"
+    depends_on: "mla-w4a4_CorrectnessTest"
+    env:
+      CI_TARGET: "mla-w4a4"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh mla-w4a4_CorrectnessTest
+
+  - label: "Performance tests for mla-w4a4"
+    key: "mla-w4a4_PerformanceTest"
+    depends_on: "record_mla-w4a4_CorrectnessTest"
+    soft_fail: true
+    agents:
+      queue: tpu_v6e_queue
+    commands:
+      - |
+        buildkite-agent meta-data set "mla-w4a4_PerformanceTest" "to be added"
+  - label: "Record performance test result for mla-w4a4"
+    key: "record_mla-w4a4_PerformanceTest"
+    depends_on: "mla-w4a4_PerformanceTest"
+    env:
+      CI_TARGET: "mla-w4a4"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh mla-w4a4_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/mla/w4a8.yml b/.buildkite/kernel_microbenchmarks/mla/w4a8.yml
new file mode 100644
index 000000000..fc5219b2b
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/mla/w4a8.yml
@@ -0,0 +1,45 @@
+# mla-w4a8
+# kernel support matrix microbenchmarks
+steps:
+  - label: "Correctness tests for mla-w4a8"
+    key: "mla-w4a8_CorrectnessTest"
+    soft_fail: true  # a failure is reported but does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only stores "to be added" under this step's key
+      - |
+        buildkite-agent meta-data set "mla-w4a8_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for mla-w4a8"
+    key: "record_mla-w4a8_CorrectnessTest"
+    depends_on: "mla-w4a8_CorrectnessTest"
+    env:  # exported to the recording command below
+      CI_TARGET: "mla-w4a8"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh mla-w4a8_CorrectnessTest
+
+  - label: "Performance tests for mla-w4a8"
+    key: "mla-w4a8_PerformanceTest"
+    depends_on: "record_mla-w4a8_CorrectnessTest"  # waits for the correctness recording step
+    soft_fail: true  # a failure is reported but does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only stores "to be added" under this step's key
+      - |
+        buildkite-agent meta-data set "mla-w4a8_PerformanceTest" "to be added"
+  - label: "Record performance test result for mla-w4a8"
+    key: "record_mla-w4a8_PerformanceTest"
+    depends_on: "mla-w4a8_PerformanceTest"
+    env:  # exported to the recording command below
+      CI_TARGET: "mla-w4a8"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh mla-w4a8_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/mla/w8a16.yml b/.buildkite/kernel_microbenchmarks/mla/w8a16.yml
new file mode 100644
index 000000000..d4a730268
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/mla/w8a16.yml
@@ -0,0 +1,45 @@
+# mla-w8a16
+# kernel support matrix microbenchmarks
+steps:
+  - label: "Correctness tests for mla-w8a16"
+    key: "mla-w8a16_CorrectnessTest"
+    soft_fail: true  # a failure is reported but does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only stores "to be added" under this step's key
+      - |
+        buildkite-agent meta-data set "mla-w8a16_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for mla-w8a16"
+    key: "record_mla-w8a16_CorrectnessTest"
+    depends_on: "mla-w8a16_CorrectnessTest"
+    env:  # exported to the recording command below
+      CI_TARGET: "mla-w8a16"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh mla-w8a16_CorrectnessTest
+
+  - label: "Performance tests for mla-w8a16"
+    key: "mla-w8a16_PerformanceTest"
+    depends_on: "record_mla-w8a16_CorrectnessTest"  # waits for the correctness recording step
+    soft_fail: true  # a failure is reported but does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only stores "to be added" under this step's key
+      - |
+        buildkite-agent meta-data set "mla-w8a16_PerformanceTest" "to be added"
+  - label: "Record performance test result for mla-w8a16"
+    key: "record_mla-w8a16_PerformanceTest"
+    depends_on: "mla-w8a16_PerformanceTest"
+    env:  # exported to the recording command below
+      CI_TARGET: "mla-w8a16"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh mla-w8a16_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/mla/w8a8.yml b/.buildkite/kernel_microbenchmarks/mla/w8a8.yml
new file mode 100644
index 000000000..0a338a941
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/mla/w8a8.yml
@@ -0,0 +1,45 @@
+# mla-w8a8
+# kernel support matrix microbenchmarks
+steps:
+  - label: "Correctness tests for mla-w8a8"
+    key: "mla-w8a8_CorrectnessTest"
+    soft_fail: true  # a failure is reported but does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only stores "to be added" under this step's key
+      - |
+        buildkite-agent meta-data set "mla-w8a8_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for mla-w8a8"
+    key: "record_mla-w8a8_CorrectnessTest"
+    depends_on: "mla-w8a8_CorrectnessTest"
+    env:  # exported to the recording command below
+      CI_TARGET: "mla-w8a8"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh mla-w8a8_CorrectnessTest
+
+  - label: "Performance tests for mla-w8a8"
+    key: "mla-w8a8_PerformanceTest"
+    depends_on: "record_mla-w8a8_CorrectnessTest"  # waits for the correctness recording step
+    soft_fail: true  # a failure is reported but does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only stores "to be added" under this step's key
+      - |
+        buildkite-agent meta-data set "mla-w8a8_PerformanceTest" "to be added"
+  - label: "Record performance test result for mla-w8a8"
+    key: "record_mla-w8a8_PerformanceTest"
+    depends_on: "mla-w8a8_PerformanceTest"
+    env:  # exported to the recording command below
+      CI_TARGET: "mla-w8a8"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh mla-w8a8_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/ragged_paged_attention_v3_head_dim_64/w16a16.yml b/.buildkite/kernel_microbenchmarks/ragged_paged_attention_v3_head_dim_64/w16a16.yml
new file mode 100644
index 000000000..20d05f26b
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/ragged_paged_attention_v3_head_dim_64/w16a16.yml
@@ -0,0 +1,45 @@
+# ragged paged attention v3 head_dim 64-w16a16
+# kernel support matrix microbenchmarks
+steps:
+  - label: "Correctness tests for ragged paged attention v3 head_dim 64-w16a16"
+    key: "ragged_paged_attention_v3_head_dim_64-w16a16_CorrectnessTest"
+    soft_fail: true  # a failure is reported but does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only stores "to be added" under this step's key
+      - |
+        buildkite-agent meta-data set "ragged_paged_attention_v3_head_dim_64-w16a16_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for ragged paged attention v3 head_dim 64-w16a16"
+    key: "record_ragged_paged_attention_v3_head_dim_64-w16a16_CorrectnessTest"
+    depends_on: "ragged_paged_attention_v3_head_dim_64-w16a16_CorrectnessTest"
+    env:  # exported to the recording command below
+      CI_TARGET: "ragged paged attention v3 head_dim 64-w16a16"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh ragged_paged_attention_v3_head_dim_64-w16a16_CorrectnessTest
+
+  - label: "Performance tests for ragged paged attention v3 head_dim 64-w16a16"
+    key: "ragged_paged_attention_v3_head_dim_64-w16a16_PerformanceTest"
+    depends_on: "record_ragged_paged_attention_v3_head_dim_64-w16a16_CorrectnessTest"  # waits for the correctness recording step
+    soft_fail: true  # a failure is reported but does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only stores "to be added" under this step's key
+      - |
+        buildkite-agent meta-data set "ragged_paged_attention_v3_head_dim_64-w16a16_PerformanceTest" "to be added"
+  - label: "Record performance test result for ragged paged attention v3 head_dim 64-w16a16"
+    key: "record_ragged_paged_attention_v3_head_dim_64-w16a16_PerformanceTest"
+    depends_on: "ragged_paged_attention_v3_head_dim_64-w16a16_PerformanceTest"
+    env:  # exported to the recording command below
+      CI_TARGET: "ragged paged attention v3 head_dim 64-w16a16"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh ragged_paged_attention_v3_head_dim_64-w16a16_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/ragged_paged_attention_v3_head_dim_64/w4a16.yml b/.buildkite/kernel_microbenchmarks/ragged_paged_attention_v3_head_dim_64/w4a16.yml
new file mode 100644
index 000000000..802aa9755
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/ragged_paged_attention_v3_head_dim_64/w4a16.yml
@@ -0,0 +1,45 @@
+# ragged paged attention v3 head_dim 64-w4a16
+# kernel support matrix microbenchmarks
+steps:
+  - label: "Correctness tests for ragged paged attention v3 head_dim 64-w4a16"
+    key: "ragged_paged_attention_v3_head_dim_64-w4a16_CorrectnessTest"
+    soft_fail: true  # a failure is reported but does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only stores "to be added" under this step's key
+      - |
+        buildkite-agent meta-data set "ragged_paged_attention_v3_head_dim_64-w4a16_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for ragged paged attention v3 head_dim 64-w4a16"
+    key: "record_ragged_paged_attention_v3_head_dim_64-w4a16_CorrectnessTest"
+    depends_on: "ragged_paged_attention_v3_head_dim_64-w4a16_CorrectnessTest"
+    env:  # exported to the recording command below
+      CI_TARGET: "ragged paged attention v3 head_dim 64-w4a16"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh ragged_paged_attention_v3_head_dim_64-w4a16_CorrectnessTest
+
+  - label: "Performance tests for ragged paged attention v3 head_dim 64-w4a16"
+    key: "ragged_paged_attention_v3_head_dim_64-w4a16_PerformanceTest"
+    depends_on: "record_ragged_paged_attention_v3_head_dim_64-w4a16_CorrectnessTest"  # waits for the correctness recording step
+    soft_fail: true  # a failure is reported but does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only stores "to be added" under this step's key
+      - |
+        buildkite-agent meta-data set "ragged_paged_attention_v3_head_dim_64-w4a16_PerformanceTest" "to be added"
+  - label: "Record performance test result for ragged paged attention v3 head_dim 64-w4a16"
+    key: "record_ragged_paged_attention_v3_head_dim_64-w4a16_PerformanceTest"
+    depends_on: "ragged_paged_attention_v3_head_dim_64-w4a16_PerformanceTest"
+    env:  # exported to the recording command below
+      CI_TARGET: "ragged paged attention v3 head_dim 64-w4a16"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh ragged_paged_attention_v3_head_dim_64-w4a16_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/ragged_paged_attention_v3_head_dim_64/w4a4.yml b/.buildkite/kernel_microbenchmarks/ragged_paged_attention_v3_head_dim_64/w4a4.yml
new file mode 100644
index 000000000..3529047fb
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/ragged_paged_attention_v3_head_dim_64/w4a4.yml
@@ -0,0 +1,45 @@
+# ragged paged attention v3 head_dim 64-w4a4
+# kernel support matrix microbenchmarks
+steps:
+  - label: "Correctness tests for ragged paged attention v3 head_dim 64-w4a4"
+    key: "ragged_paged_attention_v3_head_dim_64-w4a4_CorrectnessTest"
+    soft_fail: true  # a failure is reported but does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only stores "to be added" under this step's key
+      - |
+        buildkite-agent meta-data set "ragged_paged_attention_v3_head_dim_64-w4a4_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for ragged paged attention v3 head_dim 64-w4a4"
+    key: "record_ragged_paged_attention_v3_head_dim_64-w4a4_CorrectnessTest"
+    depends_on: "ragged_paged_attention_v3_head_dim_64-w4a4_CorrectnessTest"
+    env:  # exported to the recording command below
+      CI_TARGET: "ragged paged attention v3 head_dim 64-w4a4"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh ragged_paged_attention_v3_head_dim_64-w4a4_CorrectnessTest
+
+  - label: "Performance tests for ragged paged attention v3 head_dim 64-w4a4"
+    key: "ragged_paged_attention_v3_head_dim_64-w4a4_PerformanceTest"
+    depends_on: "record_ragged_paged_attention_v3_head_dim_64-w4a4_CorrectnessTest"  # waits for the correctness recording step
+    soft_fail: true  # a failure is reported but does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only stores "to be added" under this step's key
+      - |
+        buildkite-agent meta-data set "ragged_paged_attention_v3_head_dim_64-w4a4_PerformanceTest" "to be added"
+  - label: "Record performance test result for ragged paged attention v3 head_dim 64-w4a4"
+    key: "record_ragged_paged_attention_v3_head_dim_64-w4a4_PerformanceTest"
+    depends_on: "ragged_paged_attention_v3_head_dim_64-w4a4_PerformanceTest"
+    env:  # exported to the recording command below
+      CI_TARGET: "ragged paged attention v3 head_dim 64-w4a4"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh ragged_paged_attention_v3_head_dim_64-w4a4_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/ragged_paged_attention_v3_head_dim_64/w4a8.yml b/.buildkite/kernel_microbenchmarks/ragged_paged_attention_v3_head_dim_64/w4a8.yml
new file mode 100644
index 000000000..af76a046d
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/ragged_paged_attention_v3_head_dim_64/w4a8.yml
@@ -0,0 +1,45 @@
+# ragged paged attention v3 head_dim 64-w4a8
+# kernel support matrix microbenchmarks
+steps:
+  - label: "Correctness tests for ragged paged attention v3 head_dim 64-w4a8"
+    key: "ragged_paged_attention_v3_head_dim_64-w4a8_CorrectnessTest"
+    soft_fail: true  # a failure is reported but does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only stores "to be added" under this step's key
+      - |
+        buildkite-agent meta-data set "ragged_paged_attention_v3_head_dim_64-w4a8_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for ragged paged attention v3 head_dim 64-w4a8"
+    key: "record_ragged_paged_attention_v3_head_dim_64-w4a8_CorrectnessTest"
+    depends_on: "ragged_paged_attention_v3_head_dim_64-w4a8_CorrectnessTest"
+    env:  # exported to the recording command below
+      CI_TARGET: "ragged paged attention v3 head_dim 64-w4a8"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh ragged_paged_attention_v3_head_dim_64-w4a8_CorrectnessTest
+
+  - label: "Performance tests for ragged paged attention v3 head_dim 64-w4a8"
+    key: "ragged_paged_attention_v3_head_dim_64-w4a8_PerformanceTest"
+    depends_on: "record_ragged_paged_attention_v3_head_dim_64-w4a8_CorrectnessTest"  # waits for the correctness recording step
+    soft_fail: true  # a failure is reported but does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only stores "to be added" under this step's key
+      - |
+        buildkite-agent meta-data set "ragged_paged_attention_v3_head_dim_64-w4a8_PerformanceTest" "to be added"
+  - label: "Record performance test result for ragged paged attention v3 head_dim 64-w4a8"
+    key: "record_ragged_paged_attention_v3_head_dim_64-w4a8_PerformanceTest"
+    depends_on: "ragged_paged_attention_v3_head_dim_64-w4a8_PerformanceTest"
+    env:  # exported to the recording command below
+      CI_TARGET: "ragged paged attention v3 head_dim 64-w4a8"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh ragged_paged_attention_v3_head_dim_64-w4a8_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/ragged_paged_attention_v3_head_dim_64/w8a16.yml b/.buildkite/kernel_microbenchmarks/ragged_paged_attention_v3_head_dim_64/w8a16.yml
new file mode 100644
index 000000000..bd3671059
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/ragged_paged_attention_v3_head_dim_64/w8a16.yml
@@ -0,0 +1,45 @@
+# ragged paged attention v3 head_dim 64-w8a16
+# kernel support matrix microbenchmarks
+steps:
+  - label: "Correctness tests for ragged paged attention v3 head_dim 64-w8a16"
+    key: "ragged_paged_attention_v3_head_dim_64-w8a16_CorrectnessTest"
+    soft_fail: true  # a failure is reported but does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only stores "to be added" under this step's key
+      - |
+        buildkite-agent meta-data set "ragged_paged_attention_v3_head_dim_64-w8a16_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for ragged paged attention v3 head_dim 64-w8a16"
+    key: "record_ragged_paged_attention_v3_head_dim_64-w8a16_CorrectnessTest"
+    depends_on: "ragged_paged_attention_v3_head_dim_64-w8a16_CorrectnessTest"
+    env:  # exported to the recording command below
+      CI_TARGET: "ragged paged attention v3 head_dim 64-w8a16"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh ragged_paged_attention_v3_head_dim_64-w8a16_CorrectnessTest
+
+  - label: "Performance tests for ragged paged attention v3 head_dim 64-w8a16"
+    key: "ragged_paged_attention_v3_head_dim_64-w8a16_PerformanceTest"
+    depends_on: "record_ragged_paged_attention_v3_head_dim_64-w8a16_CorrectnessTest"  # waits for the correctness recording step
+    soft_fail: true  # a failure is reported but does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only stores "to be added" under this step's key
+      - |
+        buildkite-agent meta-data set "ragged_paged_attention_v3_head_dim_64-w8a16_PerformanceTest" "to be added"
+  - label: "Record performance test result for ragged paged attention v3 head_dim 64-w8a16"
+    key: "record_ragged_paged_attention_v3_head_dim_64-w8a16_PerformanceTest"
+    depends_on: "ragged_paged_attention_v3_head_dim_64-w8a16_PerformanceTest"
+    env:  # exported to the recording command below
+      CI_TARGET: "ragged paged attention v3 head_dim 64-w8a16"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh ragged_paged_attention_v3_head_dim_64-w8a16_PerformanceTest
diff --git a/.buildkite/kernel_microbenchmarks/ragged_paged_attention_v3_head_dim_64/w8a8.yml b/.buildkite/kernel_microbenchmarks/ragged_paged_attention_v3_head_dim_64/w8a8.yml
new file mode 100644
index 000000000..067224a64
--- /dev/null
+++ b/.buildkite/kernel_microbenchmarks/ragged_paged_attention_v3_head_dim_64/w8a8.yml
@@ -0,0 +1,45 @@
+# ragged paged attention v3 head_dim 64-w8a8
+# kernel support matrix microbenchmarks
+steps:
+  - label: "Correctness tests for ragged paged attention v3 head_dim 64-w8a8"
+    key: "ragged_paged_attention_v3_head_dim_64-w8a8_CorrectnessTest"
+    soft_fail: true  # a failure is reported but does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only stores "to be added" under this step's key
+      - |
+        buildkite-agent meta-data set "ragged_paged_attention_v3_head_dim_64-w8a8_CorrectnessTest" "to be added"
+  - label: "Record correctness test result for ragged paged attention v3 head_dim 64-w8a8"
+    key: "record_ragged_paged_attention_v3_head_dim_64-w8a8_CorrectnessTest"
+    depends_on: "ragged_paged_attention_v3_head_dim_64-w8a8_CorrectnessTest"
+    env:  # exported to the recording command below
+      CI_TARGET: "ragged paged attention v3 head_dim 64-w8a8"
+      CI_STAGE: "CorrectnessTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh ragged_paged_attention_v3_head_dim_64-w8a8_CorrectnessTest
+
+  - label: "Performance tests for ragged paged attention v3 head_dim 64-w8a8"
+    key: "ragged_paged_attention_v3_head_dim_64-w8a8_PerformanceTest"
+    depends_on: "record_ragged_paged_attention_v3_head_dim_64-w8a8_CorrectnessTest"  # waits for the correctness recording step
+    soft_fail: true  # a failure is reported but does not fail the build
+    agents:
+      queue: tpu_v6e_queue
+    commands:  # placeholder: only stores "to be added" under this step's key
+      - |
+        buildkite-agent meta-data set "ragged_paged_attention_v3_head_dim_64-w8a8_PerformanceTest" "to be added"
+  - label: "Record performance test result for ragged paged attention v3 head_dim 64-w8a8"
+    key: "record_ragged_paged_attention_v3_head_dim_64-w8a8_PerformanceTest"
+    depends_on: "ragged_paged_attention_v3_head_dim_64-w8a8_PerformanceTest"
+    env:  # exported to the recording command below
+      CI_TARGET: "ragged paged attention v3 head_dim 64-w8a8"
+      CI_STAGE: "PerformanceTest"
+      CI_CATEGORY: "kernel support matrix microbenchmarks"
+    agents:
+      queue: cpu
+    commands:
+      - |
+        .buildkite/scripts/record_step_result.sh ragged_paged_attention_v3_head_dim_64-w8a8_PerformanceTest
diff --git a/.buildkite/pipeline_generation/add_feature_to_ci.py b/.buildkite/pipeline_generation/add_feature_to_ci.py
index 4787d2652..39a65c225 100644
--- a/.buildkite/pipeline_generation/add_feature_to_ci.py
+++ b/.buildkite/pipeline_generation/add_feature_to_ci.py
@@ -13,6 +13,7 @@
 class FeatureCategory(str, Enum):
     FEATURE_SUPPORT = "feature support matrix"
     KERNEL_SUPPORT = "kernel support matrix"
+    KERNEL_SUPPORT_MICROBENCHMARKS = "kernel support matrix microbenchmarks"
 
 
 def generate_from_template(feature_name: str, feature_category: str,
@@ -102,7 +103,8 @@ def main():
         '--category',
         choices=[
             FeatureCategory.FEATURE_SUPPORT.value,
-            FeatureCategory.KERNEL_SUPPORT.value
+            FeatureCategory.KERNEL_SUPPORT.value,
+            FeatureCategory.KERNEL_SUPPORT_MICROBENCHMARKS.value
         ],
         default='feature support matrix',
         help='[OPTIONAL] Category of feature. (Default: feature support matrix)'
diff --git a/.buildkite/scripts/generate_support_matrices.sh b/.buildkite/scripts/generate_support_matrices.sh
index 52c3aeebe..be3aed8e8 100644
--- a/.buildkite/scripts/generate_support_matrices.sh
+++ b/.buildkite/scripts/generate_support_matrices.sh
@@ -6,13 +6,15 @@ ANY_FAILED=false
 MODEL_LIST_KEY="model-list"
 FEATURE_LIST_KEY="feature-list"
 DEFAULT_FEATURES_FILE=".buildkite/features/default_features.txt"
+LOCAL_TPU_VERSION="${BUILDKITE_TAG:-nightly_$(date +%Y%m%d)}"
 
 # Note: This script assumes the metadata keys contain newline-separated lists.
 mapfile -t model_list < <(buildkite-agent meta-data get "${MODEL_LIST_KEY}" --default "")
 mapfile -t metadata_feature_list < <(buildkite-agent meta-data get "${FEATURE_LIST_KEY}" --default "")
 MODEL_STAGES=("UnitTest" "IntegrationTest" "Benchmark")
 FEATURE_STAGES=("CorrectnessTest" "PerformanceTest")
-FEATURE_STAGES_QUANTIZATION=("RecommendedTPUGenerations" "CorrectnessTest" "PerformanceTest")
+FEATURE_STAGES_QUANTIZATION=("QuantizationMethods" "RecommendedTPUGenerations" "CorrectnessTest" "PerformanceTest")
+FEATURE_STAGES_MICROBENCHMARKS=("CorrectnessTest" "PerformanceTest" "TPU Versions")
 
 declare -A TPU_GENERATIONS=(
     ["INT8 W8A8"]="\"v5, v6\""
@@ -22,6 +24,14 @@ declare -A TPU_GENERATIONS=(
     ["FP4 W4A16"]="v7"
     ["AWQ INT4"]="\"v5, v6\""
 )
+declare -A QUANTIZATION_METHODS=(
+    ["INT8 W8A8"]="compressed-tensor"
+    ["INT4 W4A16"]="awq"
+    ["FP8 W8A8"]="compressed-tensor"
+    ["FP8 W8A16"]="compressed-tensor"
+    ["FP4 W4A16"]="mxfp4"
+    ["AWQ INT4"]=""
+)
 declare -a model_csv_files=()
 declare -a feature_csv_files=()
 declare -a default_feature_names=()
@@ -106,11 +116,16 @@ process_features() {
         local stages_to_use=("${FEATURE_STAGES[@]}")
         local header="Feature,CorrectnessTest,PerformanceTest"
         local is_quantization_matrix=false
+        local is_kernel_microbenchmarks=false
 
         if [ "$category" == "quantization support matrix" ]; then
             is_quantization_matrix=true
             stages_to_use=("${FEATURE_STAGES_QUANTIZATION[@]}")
-            header="Feature,Recommended TPU Generations,CorrectnessTest,PerformanceTest"
+            header="Quantization dtype,Quantization methods,Recommended TPU Generations,CorrectnessTest,PerformanceTest"
+        elif [ "$category" == "kernel support matrix microbenchmarks" ]; then
+            is_kernel_microbenchmarks=true
+            stages_to_use=("${FEATURE_STAGES_MICROBENCHMARKS[@]}")
+            header="kernels,CorrectnessTest,PerformanceTest,TPU Versions"
         fi
 
         if [ ! -f "$category_csv" ]; then
@@ -123,11 +138,15 @@ process_features() {
         local stage_index=0
         for stage in "${stages_to_use[@]}"; do
             local result
-
             if [ "$is_quantization_matrix" = true ] && [ "$stage" == "RecommendedTPUGenerations" ]; then
                 # If it's the quantization matrix, hardcode the TPU generation
                 result="${TPU_GENERATIONS["$feature"]:-N/A}"
-
+            elif [ "$is_quantization_matrix" = true ] && [ "$stage" == "QuantizationMethods" ]; then
+                # If it's the quantization matrix, hardcode the quantization methods
+                result="${QUANTIZATION_METHODS["$feature"]:-N/A}"
+            elif [ "$is_kernel_microbenchmarks" = true ] && [ "$stage" == "TPU Versions" ]; then
+                # If it's kernel microbenchmarks matrix, hardcode the tpu version
+                result="${LOCAL_TPU_VERSION}"
             elif [[ "$mode" == "DEFAULT" ]]; then
                 result="✅"
             else
@@ -136,8 +155,8 @@ process_features() {
 
             row="$row,$result"
 
-            # Check for failure (exclude the hardcoded TPU generation column)
-            if [ "$stage" != "RecommendedTPUGenerations" ] && [ "${result}" != "✅" ] && [ "${result}" != "N/A" ] && [ "${result}" != "to be added" ]; then
+            # Check for failure (exclude the hardcoded TPU generation column and Quantization Methods column)
+            if [ "$stage" != "TPU Versions" ] && [ "$stage" != "QuantizationMethods" ] && [ "$stage" != "RecommendedTPUGenerations" ] && [ "${result}" != "✅" ] && [ "${result}" != "N/A" ] && [ "${result}" != "to be added" ]; then
                 ANY_FAILED=true
             fi
 
@@ -147,6 +166,99 @@ process_features() {
     done
 }
 
+# Pivot kernel_support_matrix_microbenchmarks.csv (one row per "<kernel>-wXaY")
+# into kernel_support_matrix-microbenchmarks.csv: one row per kernel carrying a
+# (correctness, performance, tpu versions) triple for every quantization type.
+# Prints the pivoted file and uploads it as a Buildkite artifact; returns early
+# with a warning when the input CSV does not exist.
+process_kernel_matrix_to_pivot() {
+    local input_csv="kernel_support_matrix_microbenchmarks.csv"
+    local output_file="kernel_support_matrix-microbenchmarks.csv"
+    # Column order of the pivot, shared by the shell headers and the awk rows.
+    local quant_cols_list="w16a16 w8a8 w8a16 w4a4 w4a8 w4a16"
+    local quant_type header_line1="," header_line2="kernels"
+
+    if [ ! -f "$input_csv" ]; then
+        echo "Warning: Input CSV $input_csv not found. Skipping pivot."
+        return
+    fi
+
+    # Line 1: ,w16a16,,,w8a8,,,w8a16,,,w4a4,,,w4a8,,,w4a16,,
+    # Line 2: kernels,correctness,performance,tpu versions,correctness,...
+    for quant_type in $quant_cols_list; do
+        header_line1="${header_line1}${quant_type},,,"
+        header_line2="${header_line2},correctness,performance,tpu versions"
+    done
+    # Drop one trailing comma so line 1 has exactly as many fields as line 2.
+    header_line1="${header_line1%,}"
+
+    # Write the two-line header; ">" truncates output left by an earlier run.
+    echo "$header_line1" > "$output_file"
+    echo "$header_line2" >> "$output_file"
+
+    # Data rows. Kept to POSIX awk: SUBSEP keys instead of gawk-only arrays of
+    # arrays, and counted loops because "for (k in arr)" order is unspecified
+    # and would otherwise shuffle the columns relative to the header.
+    awk -v AWK_QUANT_COLS="$quant_cols_list" '
+        BEGIN { FS=","; OFS=","; num_kernels = 0 }
+        # Skip the input header line and any blank lines.
+        NR > 1 && NF > 0 {
+            gsub(/"/, "", $1);
+
+            # Split "<kernel>-wXaY" into kernel and quantization type; names
+            # without that suffix are filed under w16a16.
+            if (match($1, /-(w[0-9]+a[0-9]+)$/)) {
+                quant_type = substr($1, RSTART + 1, RLENGTH - 1);
+                base_kernel_key = substr($1, 1, RSTART - 1);
+            } else {
+                base_kernel_key = $1;
+                quant_type = "w16a16";
+            }
+
+            matrix[base_kernel_key, quant_type] = $2 OFS $3 OFS $4;
+
+            # Remember first-seen order so rows come out in input order.
+            if (! (base_kernel_key in kernels)) {
+                kernels[base_kernel_key] = 1;
+                kernel_list[num_kernels++] = base_kernel_key;
+            }
+        }
+        END {
+            num_cols = split(AWK_QUANT_COLS, quant_cols, " ");
+            default_val = "N/A" OFS "N/A" OFS "N/A";
+
+            for (i = 0; i < num_kernels; i++) {
+                local_original_key = kernel_list[i];
+                kernel_for_output = local_original_key;
+
+                # Apply desired renames/modifications to the output name
+                if (kernel_for_output == "generic ragged paged attention v3") {
+                    kernel_for_output = "generic ragged paged attention v3*";
+                } else if (kernel_for_output == "mla") {
+                    kernel_for_output = "mla*";
+                } else if (kernel_for_output == "attention_kernels") {
+                    kernel_for_output = "* For attention kernels, W[x]A[y] denotes KV cache as W, A as compute, and x, y as bit precision";
+                } else if (kernel_for_output == "ragged paged attention v3 head_dim 64") {
+                    kernel_for_output = "ragged paged attention v3 head_dim 64*";
+                }
+
+                row = "\"" kernel_for_output "\"";
+
+                # Append one triple per quantization type, in header order.
+                for (j = 1; j <= num_cols; j++) {
+                    row = row OFS (((local_original_key, quant_cols[j]) in matrix) ? matrix[local_original_key, quant_cols[j]] : default_val);
+                }
+                print row;
+            }
+        }
+    ' "$input_csv" >> "$output_file"
+
+    # Display the pivoted result
+    echo "--- $output_file ---"
+    cat "$output_file"
+    buildkite-agent artifact upload "$output_file"
+}
+
 if [ ${#model_list[@]} -gt 0 ]; then
     process_models
 fi
@@ -173,17 +285,21 @@ done
 # Feature support matrices
 for csv_file in "${feature_csv_files[@]}"; do
     if [[ -f "$csv_file" ]]; then
-        echo "--- $csv_file ---"
         sorted_content=$(tail -n +2 "$csv_file" | sort -V)
         header=$(head -n 1 "$csv_file")
         echo "$header" > "$csv_file"
         echo "$sorted_content" >> "$csv_file"
-
-        cat "$csv_file"
-        buildkite-agent artifact upload "$csv_file"
+        if [[ "$csv_file" != "kernel_support_matrix_microbenchmarks.csv" ]]; then
+            echo "--- $csv_file ---"
+            cat "$csv_file"
+            buildkite-agent artifact upload "$csv_file"
+        fi
     fi
 done
 
+# Process the Kernel Matrix into the pivoted format
+process_kernel_matrix_to_pivot
+
 echo "Reports uploaded successfully."
 
 # Cleanup
diff --git a/.buildkite/scripts/upload_models_and_features.sh b/.buildkite/scripts/upload_models_and_features.sh
index 875177bdc..603c5d1fb 100644
--- a/.buildkite/scripts/upload_models_and_features.sh
+++ b/.buildkite/scripts/upload_models_and_features.sh
@@ -1,10 +1,29 @@
 #!/bin/bash
 
+
 BUILDKITE_DIR=".buildkite"
-TARGET_FOLDERS="models features parallelism quantization"
 MODEL_LIST_KEY="model-list"
 FEATURE_LIST_KEY="feature-list"
 
+declare -a TARGET_FOLDERS=(
+    "quantization"
+    "parallelism"
+    "models"
+    "features"
+)
+
+# Append each kernel_microbenchmarks/<kernel> directory; sorted because find's output order is unspecified
+KERNEL_PARENT_DIR="${BUILDKITE_DIR}/kernel_microbenchmarks"
+
+if [[ -d "$KERNEL_PARENT_DIR" ]]; then
+    while IFS= read -r dir; do
+        folder_path_to_add="${dir#"${BUILDKITE_DIR}"/}"  # path relative to BUILDKITE_DIR
+        TARGET_FOLDERS+=("$folder_path_to_add")
+    done < <(find "$KERNEL_PARENT_DIR" -maxdepth 1 -mindepth 1 -type d | sort)
+else
+    echo "Warning: Kernel microbenchmarks directory '$KERNEL_PARENT_DIR' not found. Skipping dynamic folder discovery."
+fi
+
 declare -a pipeline_steps
 
 # Declare separate arrays for each list
@@ -12,7 +31,7 @@ declare -a model_list
 declare -a feature_list
 
 
-for folder_path in $TARGET_FOLDERS; do
+for folder_path in "${TARGET_FOLDERS[@]}"; do
   folder=$BUILDKITE_DIR/$folder_path
   # Check if the folder exists
   if [[ ! -d "$folder" ]]; then
@@ -37,13 +56,7 @@ for folder_path in $TARGET_FOLDERS; do
         "models")
           model_list+=("${subject_name}")
           ;;
-        "features")
-          feature_list+=("${subject_name}")
-          ;;
-        "parallelism")
-          feature_list+=("${subject_name}")
-          ;;
-        "quantization")
+        "features" | "parallelism" | "quantization" | "kernel_microbenchmarks"/*)
           feature_list+=("${subject_name}")
           ;;
       esac
diff --git a/support_matrices/hardware_acceleration_and_dtype_support_matrix.csv b/support_matrices/hardware_acceleration_and_dtype_support_matrix.csv
new file mode 100644
index 000000000..31952946c
--- /dev/null
+++ b/support_matrices/hardware_acceleration_and_dtype_support_matrix.csv
@@ -0,0 +1,4 @@
+TPU Generations,Natively Supported FP4/FP8
+v5,"fp8_e4m3b11, fp8_e5m2"
+v6,"fp8_e4m3b11, fp8_e5m2"
+v7,"fp8_e4m3fn, fp8_e5m2, fp4"
diff --git a/support_matrices/nightly/hardware_acceleration_and_dtype_support_matrix.csv b/support_matrices/nightly/hardware_acceleration_and_dtype_support_matrix.csv
new file mode 100644
index 000000000..31952946c
--- /dev/null
+++ b/support_matrices/nightly/hardware_acceleration_and_dtype_support_matrix.csv
@@ -0,0 +1,4 @@
+TPU Generations,Natively Supported FP4/FP8
+v5,"fp8_e4m3b11, fp8_e5m2"
+v6,"fp8_e4m3b11, fp8_e5m2"
+v7,"fp8_e4m3fn, fp8_e5m2, fp4"