From 1ca7a2bfb32f5eb4956c0d56434840a7b143fd79 Mon Sep 17 00:00:00 2001 From: zhutaoyu Date: Thu, 5 Mar 2026 07:55:12 +0000 Subject: [PATCH 1/4] add minimax tp8 with ep and remove tp-4 --- .github/configs/amd-master.yaml | 6 +++--- benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh | 10 ++++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 00fd01936..c1c3d7e5f 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -327,17 +327,17 @@ minimaxm2.5-fp8-mi355x-vllm: osl: 1024 search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 32, conc-end: 512 } - isl: 1024 osl: 8192 search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 32, conc-end: 512 } - isl: 8192 osl: 1024 search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 32, conc-end: 512 } minimaxm2.5-fp8-mi300x-vllm: image: vllm/vllm-openai-rocm:v0.16.0 diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh index b6ea5f65d..c70ec7679 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh @@ -5,6 +5,7 @@ source "$(dirname "$0")/../benchmark_lib.sh" check_env_vars \ MODEL \ TP \ + EP_SIZE \ CONC \ ISL \ OSL \ @@ -28,10 +29,19 @@ export VLLM_ROCM_USE_AITER=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +if [ "$EP_SIZE" -gt 1 ]; then + EP=" --enable-expert-parallel" +else + EP=" " +fi + set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ +$EP \ --gpu-memory-utilization 0.95 \ +--max-num-seqs 512 \ +--max-num-batched-tokens 16384 \ --max-model-len $MAX_MODEL_LEN \ --block-size=32 \ --disable-log-requests \ From 4edc25ba677543579070791534fdd54b94f39553 Mon Sep 17 00:00:00 2001 From: zhutaoyu Date: Thu, 5 Mar 2026 08:04:44 +0000 Subject: [PATCH 2/4] update changelog --- perf-changelog.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 15d00da6d..9b16a2878 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -875,3 +875,10 @@ - "Server: reasoning-parser qwen3, tool-call-parser qwen3_coder, enable-flashinfer-allreduce-fusion, mem-fraction-static 0.8" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/855 +- config-keys: + - minimaxm2.5-fp8-mi355x-vllm + description: + - "ADD minimax TP=8 with EP, in config of 1k1k, 1k8k, and 8k1k sequence lengths" + - "Config concurrency: 32-512" + - "Remove TP=4 configs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/868 \ No newline at end of file From 8ae81142b01f775512cdc918c561cc9ee304778c Mon Sep 17 00:00:00 2001 From: Taoyu Zhu Date: Wed, 11 Mar 2026 23:51:41 +0800 Subject: [PATCH 3/4] Update amd-master.yaml for 1k8k & 8k1k CONC to 256 --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index c1c3d7e5f..d7062d500 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -332,12 +332,12 @@ minimaxm2.5-fp8-mi355x-vllm: osl: 8192 search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 32, conc-end: 512 } + - { tp: 8, ep: 8, conc-start: 32, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 32, conc-end: 512 } + - { tp: 8, ep: 8, conc-start: 32, conc-end: 256 } minimaxm2.5-fp8-mi300x-vllm: image: vllm/vllm-openai-rocm:v0.16.0 From 26c5c5f76b027e61b0526405346514514f3d99fe Mon Sep 17 00:00:00 2001 From: Taoyu Zhu Date: Sat, 21 Mar 2026 19:47:29 +0800 Subject: [PATCH 4/4] fix --- .github/configs/amd-master.yaml | 7 +++++-- benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh | 2 -- perf-changelog.yaml | 16 ++++++++-------- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index b6ba472eb..c362604f1 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -360,7 +360,7 @@ kimik2.5-fp4-mi355x-vllm: - { tp: 8, conc-start: 4, conc-end: 64 } minimaxm2.5-fp8-mi355x-vllm: - image: vllm/vllm-openai-rocm:v0.15.1 + image: vllm/vllm-openai-rocm:v0.18.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi355x @@ -372,16 +372,19 @@ minimaxm2.5-fp8-mi355x-vllm: osl: 1024 search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 32, conc-end: 512 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 32, conc-end: 256 } - isl: 1024 osl: 8192 search-space: - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, ep: 8, conc-start: 32, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, ep: 8, conc-start: 32, conc-end: 256 } minimaxm2.5-fp8-mi300x-vllm: diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh index 6c0627a05..4180f5675 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh @@ -43,8 +43,6 @@ vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ $EP \ --gpu-memory-utilization 0.95 \ ---max-num-seqs 512 \ ---max-num-batched-tokens 16384 \ --max-model-len $MAX_MODEL_LEN \ --block-size=32 \ --disable-log-requests \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 01871723c..7927a629f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -897,14 +897,6 @@ - "Server: reasoning-parser qwen3, tool-call-parser qwen3_coder, enable-flashinfer-allreduce-fusion, mem-fraction-static 0.8" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/855 -- config-keys: - - minimaxm2.5-fp8-mi355x-vllm - description: - - "ADD minimax TP=8 with EP, in config of 1k1k, 1k8k, and 8k1k sequence lengths" - - "Config concurrency: 32-512" - - "Remove TP=4 configs" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/868 - - config-keys: - dsr1-fp8-mi355x-sglang description: @@ -1002,3 +994,11 @@ - "EAGLE speculative decoding: num-steps 3, draft-tokens 4, topk 1" - "New script: benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/898 + +- config-keys: + - minimaxm2.5-fp8-mi355x-vllm + description: + - "ADD minimax TP=8 with EP, in config of 1k1k, 1k8k, and 8k1k sequence lengths" + - "Config concurrency: 32-256" + - "update image to vllm/vllm-openai-rocm:v0.18.0" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/868 \ No newline at end of file