diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 5551860f2..66b4c9ffe 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -350,14 +350,18 @@ kimik2.5-fp4-mi355x-vllm:
     osl: 1024
     search-space:
     - { tp: 8, conc-start: 4, conc-end: 64 }
+    - { tp: 4, conc-start: 4, conc-end: 64 }
   - isl: 1024
     osl: 8192
     search-space:
     - { tp: 8, conc-start: 4, conc-end: 64 }
+    - { tp: 4, conc-start: 4, conc-end: 64 }
+    - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 }
   - isl: 8192
     osl: 1024
     search-space:
     - { tp: 8, conc-start: 4, conc-end: 64 }
+    - { tp: 4, conc-start: 4, conc-end: 64 }
 
 minimaxm2.5-fp8-mi355x-vllm:
   image: vllm/vllm-openai-rocm:v0.15.1
diff --git a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh
index bb522b396..bf14aca47 100755
--- a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh
+++ b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh
@@ -31,9 +31,24 @@ fi
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
 
-# do not enable aiter due to Aiter MLA not currently supporting num_heads=8
-# https://github.com/vllm-project/vllm/issues/35641
-# export VLLM_ROCM_USE_AITER=1
+# If the machine runs a MEC FW older than 177, RCCL
+# cannot reclaim some memory.
+# Disable that feature to avoid crashes (also when the FW version cannot be parsed).
+# This is related to the changes in the driver at:
+# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates
+version=$(rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}')
+if [[ ! "$version" =~ ^[0-9]+$ || "$version" -lt 177 ]]; then
+  export HSA_NO_SCRATCH_RECLAIM=1
+fi
+
+export VLLM_ROCM_USE_AITER=1  # NOTE(review): was left disabled because Aiter MLA lacked num_heads=8 support (vllm issue 35641) - confirm the tp: 8 configs are unaffected
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+
+if [ "${EP_SIZE:-1}" -gt 1 ]; then  # treat an unset/empty EP_SIZE as 1 so the integer test cannot fail
+  EP=" --enable-expert-parallel"
+else
+  EP=" "
+fi
 
 # following AMD andy luo's recipe
 # https://x.com/linluo77/status/2017024513595301985
@@ -44,10 +59,10 @@ start_gpu_monitor
 set -x
 vllm serve $MODEL --port $PORT \
 --tensor-parallel-size=$TP \
---gpu-memory-utilization 0.95 \
+$EP \
+--gpu-memory-utilization 0.90 \
 --max-model-len $MAX_MODEL_LEN \
---block-size=64 \
---disable-log-requests \
+--block-size=1 \
 --trust-remote-code \
 --mm-encoder-tp-mode data > $SERVER_LOG 2>&1 &