From 6a21687bcd8d8a1282d1a3cf7ea0157cf3a0ec5d Mon Sep 17 00:00:00 2001 From: LI MOU Date: Thu, 19 Mar 2026 10:57:25 +0000 Subject: [PATCH 1/3] optimize kimi-k2.5-fp4 on amd mi355x gpu --- .github/configs/amd-master.yaml | 6 ++--- benchmarks/single_node/kimik2.5_fp4_mi355x.sh | 23 ++++++++++++++----- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 5551860f2..292081163 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -349,15 +349,15 @@ kimik2.5-fp4-mi355x-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } minimaxm2.5-fp8-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.15.1 diff --git a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh index bb522b396..a6168922a 100755 --- a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh @@ -31,9 +31,21 @@ fi SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} -# do not enable aiter due to Aiter MLA not currently supporting num_heads=8 -# https://github.com/vllm-project/vllm/issues/35641 -# export VLLM_ROCM_USE_AITER=1 +# If the machine runs a MEC FW older than 177, RCCL +# cannot reclaim some memory. +# Disable that feature to avoid crashes. 
+# This is related to the changes in the driver at: +# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates +version=$(rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}') +if [[ "$version" == "" || $version -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_USE_AITER_MLA=1 +export VLLM_ROCM_USE_AITER_MOE=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT8 +export VLLM_ROCM_USE_AITER_TRITON_ROPE=1 # following AMD andy luo's recipe # https://x.com/linluo77/status/2017024513595301985 @@ -44,10 +56,9 @@ start_gpu_monitor set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ ---gpu-memory-utilization 0.95 \ +--gpu-memory-utilization 0.90 \ --max-model-len $MAX_MODEL_LEN \ ---block-size=64 \ ---disable-log-requests \ +--block-size=1 \ --trust-remote-code \ --mm-encoder-tp-mode data > $SERVER_LOG 2>&1 & From d4040782aaf428677b1f9620c987d26540de67cc Mon Sep 17 00:00:00 2001 From: LI MOU Date: Fri, 20 Mar 2026 03:36:17 +0000 Subject: [PATCH 2/3] add expert parallel for kimik2.5-fp4-mi355x --- .github/configs/amd-master.yaml | 1 + benchmarks/single_node/kimik2.5_fp4_mi355x.sh | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 292081163..4ffeadb8c 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -354,6 +354,7 @@ kimik2.5-fp4-mi355x-vllm: osl: 8192 search-space: - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: diff --git a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh index a6168922a..b42501910 100755 --- a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh @@ -47,6 +47,12 @@ export VLLM_ROCM_USE_AITER_MOE=1 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT8 export 
VLLM_ROCM_USE_AITER_TRITON_ROPE=1 +if [ "${EP_SIZE:-1}" -gt 1 ]; then + EP=" --enable-expert-parallel" +else + EP=" " +fi + # following AMD andy luo's recipe # https://x.com/linluo77/status/2017024513595301985 @@ -56,6 +62,7 @@ start_gpu_monitor set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ +$EP \ --gpu-memory-utilization 0.90 \ --max-model-len $MAX_MODEL_LEN \ --block-size=1 \ From 5410ce5b872ea0493b4ff6bf5c986ed7b7f971d8 Mon Sep 17 00:00:00 2001 From: Li Date: Sat, 21 Mar 2026 23:33:55 -0700 Subject: [PATCH 3/3] Clean up AITER env vars, switch quickreduce to INT4, add TP8 search space - Remove redundant VLLM_ROCM_USE_AITER_MLA=1 and VLLM_ROCM_USE_AITER_MOE=1 (both default to True in vllm envs.py, only master switch needed) - Remove VLLM_ROCM_USE_AITER_TRITON_ROPE=1 (noop without --compilation-config custom_ops+=+rotary_embedding) - Switch VLLM_ROCM_QUICK_REDUCE_QUANTIZATION from INT8 to INT4 for better TTFT/TPOT (2.2x vs 1.17x per quickreduce benchmarks) - Add TP8EP1 back to all search spaces alongside TP4EP1 and TP4EP4 so InferenceX can sweep and determine optimal config empirically Made-with: Cursor --- .github/configs/amd-master.yaml | 3 +++ benchmarks/single_node/kimik2.5_fp4_mi355x.sh | 5 +---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 4ffeadb8c..66b4c9ffe 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -349,15 +349,18 @@ kimik2.5-fp4-mi355x-vllm: - isl: 1024 osl: 1024 search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } minimaxm2.5-fp8-mi355x-vllm: diff --git 
a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh index b42501910..bf14aca47 100755 --- a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh @@ -42,10 +42,7 @@ if [[ "$version" == "" || $version -lt 177 ]]; then fi export VLLM_ROCM_USE_AITER=1 -export VLLM_ROCM_USE_AITER_MLA=1 -export VLLM_ROCM_USE_AITER_MOE=1 -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT8 -export VLLM_ROCM_USE_AITER_TRITON_ROPE=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 if [ "${EP_SIZE:-1}" -gt 1 ]; then EP=" --enable-expert-parallel"