From 6a21687bcd8d8a1282d1a3cf7ea0157cf3a0ec5d Mon Sep 17 00:00:00 2001 From: LI MOU Date: Thu, 19 Mar 2026 10:57:25 +0000 Subject: [PATCH 1/3] optimize kimi-k2.5-fp4 on amd mi355x gpu --- .github/configs/amd-master.yaml | 6 ++--- benchmarks/single_node/kimik2.5_fp4_mi355x.sh | 23 ++++++++++++++----- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 5551860f2..292081163 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -349,15 +349,15 @@ kimik2.5-fp4-mi355x-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } minimaxm2.5-fp8-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.15.1 diff --git a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh index bb522b396..a6168922a 100755 --- a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh @@ -31,9 +31,21 @@ fi SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} -# do not enable aiter due to Aiter MLA not currently supporting num_heads=8 -# https://github.com/vllm-project/vllm/issues/35641 -# export VLLM_ROCM_USE_AITER=1 +# If the machine runs a MEC FW older than 177, RCCL +# cannot reclaim some memory. +# Disable that feature to avoid crashes. 
+# This is related to the changes in the driver at: +# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates +version=$(rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}') +if [[ "$version" == "" || $version -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_USE_AITER_MLA=1 +export VLLM_ROCM_USE_AITER_MOE=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT8 +export VLLM_ROCM_USE_AITER_TRITON_ROPE=1 # following AMD andy luo's recipe # https://x.com/linluo77/status/2017024513595301985 @@ -44,10 +56,9 @@ start_gpu_monitor set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ ---gpu-memory-utilization 0.95 \ +--gpu-memory-utilization 0.90 \ --max-model-len $MAX_MODEL_LEN \ ---block-size=64 \ ---disable-log-requests \ +--block-size=1 \ --trust-remote-code \ --mm-encoder-tp-mode data > $SERVER_LOG 2>&1 & From d4040782aaf428677b1f9620c987d26540de67cc Mon Sep 17 00:00:00 2001 From: LI MOU Date: Fri, 20 Mar 2026 03:36:17 +0000 Subject: [PATCH 2/3] add expert parallel for kimik2.5-fp4-mi355x --- .github/configs/amd-master.yaml | 1 + benchmarks/single_node/kimik2.5_fp4_mi355x.sh | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 292081163..4ffeadb8c 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -354,6 +354,7 @@ kimik2.5-fp4-mi355x-vllm: osl: 8192 search-space: - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: diff --git a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh index a6168922a..b42501910 100755 --- a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh @@ -47,6 +47,12 @@ export VLLM_ROCM_USE_AITER_MOE=1 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT8 export 
VLLM_ROCM_USE_AITER_TRITON_ROPE=1 +if [ "${EP_SIZE:-1}" -gt 1 ]; then + EP=" --enable-expert-parallel" +else + EP=" " +fi + # following AMD andy luo's recipe # https://x.com/linluo77/status/2017024513595301985 @@ -56,6 +62,7 @@ start_gpu_monitor set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ +$EP \ --gpu-memory-utilization 0.90 \ --max-model-len $MAX_MODEL_LEN \ --block-size=1 \ From 5410ce5b872ea0493b4ff6bf5c986ed7b7f971d8 Mon Sep 17 00:00:00 2001 From: Li Date: Sat, 21 Mar 2026 23:33:55 -0700 Subject: [PATCH 3/3] Clean up AITER env vars, switch quickreduce to INT4, add TP8 search space - Remove redundant VLLM_ROCM_USE_AITER_MLA=1 and VLLM_ROCM_USE_AITER_MOE=1 (both default to True in vllm envs.py, only master switch needed) - Remove VLLM_ROCM_USE_AITER_TRITON_ROPE=1 (noop without --compilation-config custom_ops+=+rotary_embedding) - Switch VLLM_ROCM_QUICK_REDUCE_QUANTIZATION from INT8 to INT4 for better TTFT/TPOT (2.2x vs 1.17x per quickreduce benchmarks) - Add TP8EP1 back to all search spaces alongside TP4EP1 and TP4EP4 so InferenceX can sweep and determine optimal config empirically Made-with: Cursor --- .github/configs/amd-master.yaml | 3 +++ benchmarks/single_node/kimik2.5_fp4_mi355x.sh | 5 +---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 4ffeadb8c..66b4c9ffe 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -349,15 +349,18 @@ kimik2.5-fp4-mi355x-vllm: - isl: 1024 osl: 1024 search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } minimaxm2.5-fp8-mi355x-vllm: diff --git 
a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh index b42501910..bf14aca47 100755 --- a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh @@ -42,10 +42,7 @@ if [[ "$version" == "" || $version -lt 177 ]]; then fi export VLLM_ROCM_USE_AITER=1 -export VLLM_ROCM_USE_AITER_MLA=1 -export VLLM_ROCM_USE_AITER_MOE=1 -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT8 -export VLLM_ROCM_USE_AITER_TRITON_ROPE=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 if [ "${EP_SIZE:-1}" -gt 1 ]; then EP=" --enable-expert-parallel"