diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 5551860f2..66b4c9ffe 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -350,14 +350,18 @@ kimik2.5-fp4-mi355x-vllm:
     osl: 1024
     search-space:
     - { tp: 8, conc-start: 4, conc-end: 64 }
+    - { tp: 4, conc-start: 4, conc-end: 64 }
   - isl: 1024
     osl: 8192
     search-space:
     - { tp: 8, conc-start: 4, conc-end: 64 }
+    - { tp: 4, conc-start: 4, conc-end: 64 }
+    - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 }
   - isl: 8192
     osl: 1024
     search-space:
     - { tp: 8, conc-start: 4, conc-end: 64 }
+    - { tp: 4, conc-start: 4, conc-end: 64 }
 
 minimaxm2.5-fp8-mi355x-vllm:
   image: vllm/vllm-openai-rocm:v0.15.1
diff --git a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh
index bb522b396..bf14aca47 100755
--- a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh
+++ b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh
@@ -31,9 +31,30 @@ fi
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
 
-# do not enable aiter due to Aiter MLA not currently supporting num_heads=8
-# https://github.com/vllm-project/vllm/issues/35641
-# export VLLM_ROCM_USE_AITER=1
+# If the machine runs a MEC FW older than 177, RCCL
+# cannot reclaim some memory.
+# Disable that feature to avoid crashes.
+# This is related to the changes in the driver at:
+# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates
+version=$(rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}')
+if [[ -z "$version" || "$version" -lt 177 ]]; then
+    export HSA_NO_SCRATCH_RECLAIM=1
+fi
+
+# NOTE(review): the comment removed above said AITER was left off because
+# Aiter MLA did not support num_heads=8 (vllm issue 35641); tp: 8 entries are
+# still in the search space -- confirm that limitation no longer applies.
+export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+
+# Add --enable-expert-parallel only when EP_SIZE is greater than 1.
+# EP_SIZE defaults to 1 so an unset or empty value does not break the test.
+# $EP is expanded unquoted below, so a blank value adds no argument.
+if [ "${EP_SIZE:-1}" -gt 1 ]; then
+    EP=" --enable-expert-parallel"
+else
+    EP=" "
+fi
 
 # following AMD andy luo's recipe
 # https://x.com/linluo77/status/2017024513595301985
@@ -44,10 +65,10 @@ start_gpu_monitor
 set -x
 vllm serve $MODEL --port $PORT \
 --tensor-parallel-size=$TP \
---gpu-memory-utilization 0.95 \
+$EP \
+--gpu-memory-utilization 0.90 \
 --max-model-len $MAX_MODEL_LEN \
---block-size=64 \
---disable-log-requests \
+--block-size=1 \
 --trust-remote-code \
 --mm-encoder-tp-mode data > $SERVER_LOG 2>&1 &
 