From d601a120fa78e78abc0f1927407bdd0956386be8 Mon Sep 17 00:00:00 2001
From: Anish Shanbhag
Date: Thu, 7 May 2026 16:02:00 -0700
Subject: [PATCH] Tune MiniMax M2.5 FP8 H200 vLLM agg

---
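Reviewer note (this text sits below the --- line, so `git am` drops it and it
never enters the commit message): for convenience, here is roughly what the
tuned serve command in benchmarks/single_node/minimaxm2.5_fp8_h200.sh expands
to once the new defaults land. This is an illustrative sketch, not captured
output: TP=4, CONC=128, and EP_SIZE=1 are assumed values chosen to exercise
the empty expert-parallel array and the --block-size 32 branch, and $PORT and
$MAX_MODEL_LEN still come from the harness.

    # Sketch only; "${EP[@]}" expands to nothing when EP_SIZE=1
    vllm serve MiniMaxAI/MiniMax-M2.5 --port "$PORT" \
      --tensor-parallel-size=4 \
      --gpu-memory-utilization 0.95 \
      --max-model-len "$MAX_MODEL_LEN" \
      --max-num-seqs 512 \
      --max-num-batched-tokens 32768 \
      --block-size 32 \
      --kv-cache-dtype fp8 \
      --moe-backend triton \
      --attention-backend FLASHINFER \
      --enable-flashinfer-autotune \
      --compilation-config '{"mode":3,"cudagraph_mode":"PIECEWISE","pass_config":{"fuse_minimax_qk_norm":true}}' \
      --no-enable-prefix-caching \
      --trust-remote-code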
 .github/configs/nvidia-master.yaml            |  6 +--
 .../single_node/minimaxm2.5_fp8_h200.sh       | 46 +++++++++++++++----
 perf-changelog.yaml                           |  7 +++
 3 files changed, 48 insertions(+), 11 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index e8286bff6..94b8b36a3 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -4331,7 +4331,7 @@ gptoss-fp4-h200-vllm:
         - { tp: 8, conc-start: 4, conc-end: 32 }
 
 minimaxm2.5-fp8-h200-vllm:
-  image: vllm/vllm-openai:v0.18.0
+  image: vllm/vllm-openai:v0.20.1-ubuntu2404
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: h200
@@ -4343,11 +4343,11 @@ minimaxm2.5-fp8-h200-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-        - { tp: 8, conc-start: 4, conc-end: 128 }
+        - { tp: 4, conc-start: 1, conc-end: 256 }
     - isl: 8192
       osl: 1024
       search-space:
-        - { tp: 8, conc-start: 4, conc-end: 128 }
+        - { tp: 4, conc-start: 1, conc-end: 256 }
 
 dsr1-fp4-gb200-dynamo-trt:
   image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2

diff --git a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh b/benchmarks/single_node/minimaxm2.5_fp8_h200.sh
index 84e73b65c..2eab27a84 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_h200.sh
@@ -27,23 +27,53 @@ if [ "${EVAL_ONLY}" = "true" ]; then
     MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
 fi
 
-if [ "$EP_SIZE" -ge 1 ]; then
-    EP=" --enable-expert-parallel"
+export PYTHONNOUSERSITE=1
+export SAFETENSORS_FAST_GPU=1
+export VLLM_USE_DEEP_GEMM=0
+export VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER=0
+export VLLM_FLOAT32_MATMUL_PRECISION=high
+
+COMPILATION_CONFIG=${COMPILATION_CONFIG:-'{"mode":3,"cudagraph_mode":"PIECEWISE","pass_config":{"fuse_minimax_qk_norm":true}}'}
+MAX_NUM_SEQS=${MAX_NUM_SEQS:-512}
+MAX_NUM_BATCHED_TOKENS=${MAX_NUM_BATCHED_TOKENS:-32768}
+
+case "$CONC" in
+    128|256)
+        BLOCK_SIZE=${BLOCK_SIZE:-32}
+        ;;
+esac
+
+if [ "$EP_SIZE" -gt 1 ]; then
+    EP=(--enable-expert-parallel)
+else
+    EP=()
+fi
+
+if [[ -n "${BLOCK_SIZE:-}" ]]; then
+    BLOCK_SIZE_ARG=(--block-size "$BLOCK_SIZE")
 else
-    EP=" "
+    BLOCK_SIZE_ARG=()
 fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
 set -x
-vllm serve $MODEL --port $PORT \
---tensor-parallel-size=$TP \
-$EP \
+vllm serve "$MODEL" --port "$PORT" \
+--tensor-parallel-size="$TP" \
+"${EP[@]}" \
 --gpu-memory-utilization 0.95 \
---max-model-len $MAX_MODEL_LEN \
+--max-model-len "$MAX_MODEL_LEN" \
+--max-num-seqs "$MAX_NUM_SEQS" \
+--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
+"${BLOCK_SIZE_ARG[@]}" \
+--kv-cache-dtype fp8 \
+--moe-backend triton \
+--attention-backend FLASHINFER \
+--enable-flashinfer-autotune \
+--compilation-config "$COMPILATION_CONFIG" \
 --no-enable-prefix-caching \
---trust-remote-code > $SERVER_LOG 2>&1 &
+--trust-remote-code > "$SERVER_LOG" 2>&1 &
 
 SERVER_PID=$!
 

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 715d6f177..ba070a5a3 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2307,3 +2307,10 @@
     - "Tune DSv4 FP4 MI355X SGLang runtime envs: enable aiter MHC pre/post, and enable triton swa prepare kernel."
     - "Add --context-length. Add --enable-prefill-delayer for dp config"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1300
+
+- config-keys:
+  - minimaxm2.5-fp8-h200-vllm
+  description:
+    - "Update MiniMax-M2.5 FP8 H200 vLLM to vllm/vllm-openai:v0.20.1-ubuntu2404 and retune the search space to TP 4 with a concurrency sweep of 1-256"
+    - "Set vLLM serving knobs in benchmarks/single_node/minimaxm2.5_fp8_h200.sh: max-num-seqs and max-num-batched-tokens defaults, fp8 KV cache, FlashInfer attention and autotune, Triton MoE backend, MiniMax QK norm fusion via the compilation config, a concurrency-specific KV block size, and runtime env tuning (DeepGEMM and FlashInfer block-scale FP8 GEMM disabled, fp32 matmul precision high)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1298