29 changes: 29 additions & 0 deletions .github/configs/nvidia-master.yaml
@@ -2083,6 +2083,35 @@ qwen3.5-fp4-b200-sglang-mtp:
search-space:
- { tp: 4, ep: 1, conc-start: 4, conc-end: 128, spec-decoding: mtp }

qwen3.5-fp4-b200-trt:
image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc12
model: nvidia/Qwen3.5-397B-A17B-NVFP4
model-prefix: qwen3.5
runner: b200
precision: fp4
framework: trt
multinode: false
scenarios:
fixed-seq-len:
- isl: 1024
osl: 1024
search-space:
- { tp: 4, ep: 1, conc-start: 4, conc-end: 16 }
- { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
- { tp: 4, ep: 4, conc-start: 16, conc-end: 256 }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 1024, conc-end: 1024 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 512, conc-end: 1024 }
- isl: 8192
osl: 1024
search-space:
- { tp: 2, ep: 1, conc-start: 4, conc-end: 32 }
- { tp: 4, ep: 1, conc-start: 4, conc-end: 8 }
- { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
- { tp: 2, ep: 2, conc-start: 64, conc-end: 64 }
- { tp: 4, ep: 4, conc-start: 16, conc-end: 16 }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 1024 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 512, conc-end: 1024 }

glm5-fp8-b200-sglang:
image: lmsysorg/sglang:nightly-dev-cu13-20260317-1eea7448
model: zai-org/GLM-5-FP8
196 changes: 196 additions & 0 deletions benchmarks/single_node/qwen3.5_fp4_b200_trt.sh
@@ -0,0 +1,196 @@
#!/usr/bin/env bash

source "$(dirname "$0")/../benchmark_lib.sh"

check_env_vars \
MODEL \
TP \
EP_SIZE \
CONC \
ISL \
OSL \
MAX_MODEL_LEN \
RANDOM_RANGE_RATIO \
RESULT_FILENAME \
DP_ATTENTION

if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

echo "TP: $TP, EP: $EP_SIZE, CONC: $CONC, ISL: $ISL, OSL: $OSL, DP_ATTENTION: $DP_ATTENTION"

hf download "$MODEL"

# Derive max_batch_size from (TP, DP_ATTENTION). For dp-attn the DEP-8 case uses 128, DEP-4 uses 256.
# For non-dp-attn, TP=2 uses 256 (retained for compat), else 512.
if [[ "$DP_ATTENTION" == "true" ]]; then
if [[ "$TP" == "8" ]]; then
MAX_BATCH_SIZE=128
else
MAX_BATCH_SIZE=256
fi
else
if [[ "$TP" == "2" ]]; then
MAX_BATCH_SIZE=256
else
MAX_BATCH_SIZE=512
fi
fi

# cuda_graph batch_sizes: powers of 2 up to min(256, MAX_BATCH_SIZE), plus 384,512 when MAX_BATCH_SIZE=512.
CUDA_GRAPH_BATCH_SIZES="1, 2, 4, 8, 16, 32, 64, 128"
if (( MAX_BATCH_SIZE >= 256 )); then
CUDA_GRAPH_BATCH_SIZES="$CUDA_GRAPH_BATCH_SIZES, 256"
fi
if (( MAX_BATCH_SIZE >= 512 )); then
CUDA_GRAPH_BATCH_SIZES="$CUDA_GRAPH_BATCH_SIZES, 384, 512"
fi

# MoE backend: CUTEDSL for dp-attn configs, TRTLLM otherwise.
if [[ "$DP_ATTENTION" == "true" ]]; then
MOE_BACKEND=CUTEDSL
else
MOE_BACKEND=TRTLLM
fi

# max_num_tokens scales with input seq length.
case "$ISL" in
8192) MAX_NUM_TOKENS=33792 ;;
*) MAX_NUM_TOKENS=16384 ;;
esac

# Hand-tuned hybrid: 8k/1k DEP-4 at conc 256 wants TRTLLM MoE and a tighter
# token budget instead of the CUTEDSL default that other dp-attn points use.
if [[ "$ISL" == "8192" && "$TP" == "4" && "$EP_SIZE" == "4" \
&& "$DP_ATTENTION" == "true" && "$CONC" == "256" ]]; then
MOE_BACKEND=TRTLLM
MAX_NUM_TOKENS=24576
fi

# batch_wait_max_tokens_ratio (non-dp-attn only) scales with concurrency.
case "$CONC" in
4|8|16) BATCH_WAIT_RATIO=0.0625 ;;
32) BATCH_WAIT_RATIO=0.125 ;;
64) BATCH_WAIT_RATIO=0.25 ;;
128) BATCH_WAIT_RATIO=0.5 ;;
*) BATCH_WAIT_RATIO=0.75 ;;
esac

EXTRA_CONFIG_FILE="$(pwd)/extra-llm-api-config.yml"

cat > "$EXTRA_CONFIG_FILE" << EOF
max_batch_size: $MAX_BATCH_SIZE
max_num_tokens: $MAX_NUM_TOKENS
num_postprocess_workers: 4
backend: pytorch
print_iter_log: true
enable_layerwise_nvtx_marker: false
disable_overlap_scheduler: false
enable_iter_perf_stats: true
enable_chunked_prefill: false
stream_interval: 20
scheduler_config:
capacity_scheduler_policy: MAX_UTILIZATION
context_chunking_policy: FIRST_COME_FIRST_SERVED
kv_cache_config:
free_gpu_memory_fraction: 0.9
enable_block_reuse: false
dtype: fp8
cuda_graph_config:
enable_padding: true
batch_sizes: [$CUDA_GRAPH_BATCH_SIZES]
moe_config:
backend: $MOE_BACKEND
EOF

if [[ "$DP_ATTENTION" == "true" ]]; then
cat >> "$EXTRA_CONFIG_FILE" << EOF
enable_attention_dp: true
attention_dp_config:
enable_balance: true
batching_wait_iters: 10
timeout_iters: 500
EOF
else
cat >> "$EXTRA_CONFIG_FILE" << EOF
batch_wait_timeout_iters: 50
batch_wait_max_tokens_ratio: $BATCH_WAIT_RATIO
EOF
fi

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

set -x

if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
fi
Comment on lines +127 to +130
Contributor

🔴 EVAL_ONLY only updates MAX_MODEL_LEN, not MAX_NUM_TOKENS — this diverges from every sibling TRT script (dsr1/dsv4/gptoss/h200 variants), which all set both to EVAL_MAX_MODEL_LEN inside the EVAL_ONLY block. As a secondary issue, even if MAX_NUM_TOKENS were updated here, the YAML on lines 80-105 has already been written with the original value before the EVAL_ONLY block at line 127, so the on-disk config would still hold the stale max_num_tokens. Suggested fix: add MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" inside the EVAL_ONLY block AND move that block above the EXTRA_CONFIG_FILE write (or rewrite the YAML afterwards).

Extended reasoning

What is wrong

In benchmarks/single_node/qwen3.5_fp4_b200_trt.sh lines 127-130:

if [ "${EVAL_ONLY}" = "true" ]; then
    setup_eval_context
    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
fi

Only MAX_MODEL_LEN is overridden. Every comparable TRT-LLM script in the repo overrides both:

| Script | MAX_MODEL_LEN | MAX_NUM_TOKENS |
| --- | --- | --- |
| dsr1_fp4_b200_trt.sh (80-84) | yes | yes |
| dsv4_fp4_b300_trt.sh (99-101) | yes | yes |
| gptoss_fp4_b200_trt.sh (81-85) | yes | yes |
| gptoss_fp4_h200_trt.sh (51-53) | yes | yes |
| dsr1_fp8_h200_trt.sh (68-70) | yes | yes |
| qwen3.5_fp4_b200_trt.sh (127-130) | yes | no |

There is also an ordering bug unique to this script. The YAML at line 82-105 already embeds max_num_tokens: $MAX_NUM_TOKENS before the EVAL_ONLY block runs. The sibling scripts that share this pattern (dsr1_fp4_b200_trt.sh, gptoss_fp4_b200_trt.sh) deliberately do not put max_num_tokens in the YAML — they only pass it via the --max_num_tokens CLI flag, so a late variable update is sufficient. Qwen does both, so even a one-line fix to update the variable would leave the YAML on disk holding the stale value.

Step-by-step proof — EVAL_ONLY=true, ISL=1024, OSL=1024

  1. Lines 57-61: MAX_NUM_TOKENS=16384.
  2. Line 84: YAML written with max_num_tokens: 16384.
  3. Line 128: setup_eval_context computes EVAL_MAX_MODEL_LEN = 1024+1024+256 = 2304 (see benchmark_lib.sh:674-676).
  4. Line 129: MAX_MODEL_LEN=2304. MAX_NUM_TOKENS stays at 16384.
  5. Line 165-166: server launched with --max_seq_len=2304 --max_num_tokens=16384.

For ISL=8192 it is even more pronounced: max_seq_len=9472, max_num_tokens=33792. Compare against gptoss_fp4_b200_trt.sh where the analogous flow ends with --max_seq_len=2304 --max_num_tokens=2304 because line 84 there also clamps MAX_NUM_TOKENS.
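For concreteness, here is a minimal sketch of the length math that step 3 relies on. This is a hypothetical reconstruction: the actual helper lives in benchmark_lib.sh:674-676 and may use different variable names.

```bash
# Assumed shape of setup_eval_context's length computation,
# reconstructed from the 1024+1024+256 = 2304 arithmetic above; not verified code.
EVAL_HEADROOM=256
EVAL_MAX_MODEL_LEN=$(( ISL + OSL + EVAL_HEADROOM ))   # 1024 + 1024 + 256 = 2304
```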

Addressing the refutation

The refuter argues that max_num_tokens is batch-level (not per-sequence) so a value larger than max_seq_len is a valid TRT-LLM config and not wasteful. That is technically true for steady-state correctness — the server will start and lm_eval will run. The refutation is right that this is not a crash. However:

  • The cuda-graph capture set is sized off this value. Lines 100-102 declare cuda_graph_config.batch_sizes up to 512, and TRT-LLM caps cuda-graph capture by max_num_tokens. Leaving max_num_tokens=33792 while max_seq_len=9472 causes EVAL_ONLY startup to capture graphs sized for the full benchmark scenario instead of the eval scenario, which is exactly what every other sibling script is trying to avoid.
  • gptoss_fp4_b200_trt.sh is a direct precedent, not just dsr1. The refuter dismisses dsr1 because its MAX_NUM_TOKENS is computed dynamically from CONC+ISL and so "makes sense" to override. But gptoss_fp4_b200_trt.sh:79 uses a hand-tuned static value (MAX_NUM_TOKENS=20000) just like qwen3.5, and still clamps it down in the EVAL_ONLY block (gptoss_fp4_b200_trt.sh:84). That removes the refuter's main escape hatch — the pattern applies regardless of how the original value was derived.
  • "YAML/CLI consistency is preserved" is technically true (both stay 16384/33792) but misses the goal of EVAL_ONLY mode: shrink the server to the eval workload. Both are wrong relative to that goal, so the fact that they are wrong in the same way is not a defense.

Severity

Marking normal rather than nit because (a) the pattern is unambiguous across 5+ sibling TRT scripts, (b) the YAML-vs-CLI ordering issue is a separate, qwen3.5-specific bug that any naive one-liner fix would still leave behind, and (c) the wasted cuda-graph capture meaningfully extends EVAL_ONLY startup. It does not corrupt results, however, so it is on the lower end of normal.
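A minimal sketch of the suggested fix, assuming the script structure shown in this diff. The clamp mirrors the sibling TRT scripts; this is the reviewer's suggestion, not tested code.

```bash
# Run the EVAL_ONLY clamp BEFORE writing extra-llm-api-config.yml, so the
# on-disk YAML picks up the clamped values rather than the benchmark ones.
if [ "${EVAL_ONLY}" = "true" ]; then
    setup_eval_context                     # provided by benchmark_lib.sh
    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
    MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN"   # the clamp every sibling TRT script applies
fi

# ...and only then: cat > "$EXTRA_CONFIG_FILE" << EOF  (max_num_tokens: $MAX_NUM_TOKENS) ...
```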


SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

# --- audit: dump env + generated config before launching the server ---
echo "=============== env (resolved) ==============="
printf ' %-22s = %s\n' \
MODEL "$MODEL" \
TP "$TP" \
EP_SIZE "$EP_SIZE" \
DP_ATTENTION "$DP_ATTENTION" \
CONC "$CONC" \
ISL "$ISL" \
OSL "$OSL" \
MAX_MODEL_LEN "$MAX_MODEL_LEN" \
RANDOM_RANGE_RATIO "$RANDOM_RANGE_RATIO" \
RESULT_FILENAME "$RESULT_FILENAME" \
MAX_BATCH_SIZE "$MAX_BATCH_SIZE" \
MAX_NUM_TOKENS "$MAX_NUM_TOKENS" \
MOE_BACKEND "$MOE_BACKEND" \
BATCH_WAIT_RATIO "$BATCH_WAIT_RATIO" \
CUDA_GRAPH_BATCH_SIZES "$CUDA_GRAPH_BATCH_SIZES" \
SERVER_LOG "$SERVER_LOG" \
PORT "$PORT" \
EVAL_ONLY "${EVAL_ONLY:-}"
echo "=============== $EXTRA_CONFIG_FILE ==============="
ls -la "$EXTRA_CONFIG_FILE"
cat "$EXTRA_CONFIG_FILE"
echo "=============================================="

mpirun -n 1 --oversubscribe --allow-run-as-root \
trtllm-serve "$MODEL" --port="$PORT" \
--trust_remote_code \
--backend=pytorch \
--max_seq_len="$MAX_MODEL_LEN" \
--max_num_tokens="$MAX_NUM_TOKENS" \
--tp_size="$TP" --ep_size="$EP_SIZE" \
--extra_llm_api_options="$EXTRA_CONFIG_FILE" \
> "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

run_benchmark_serving \
--model "$MODEL" \
--port "$PORT" \
--backend openai \
--input-len "$ISL" \
--output-len "$OSL" \
--random-range-ratio "$RANDOM_RANGE_RATIO" \
--num-prompts $(( CONC * 10 )) \
--max-concurrency "$CONC" \
--result-filename "$RESULT_FILENAME" \
--result-dir /workspace/

# After throughput, run evaluation only if RUN_EVAL is true
if [ "${RUN_EVAL}" = "true" ]; then
run_eval --framework lm-eval --port "$PORT"
append_lm_eval_summary
fi

# Stop GPU monitoring
stop_gpu_monitor
set +x
10 changes: 10 additions & 0 deletions perf-changelog.yaml
@@ -2214,3 +2214,13 @@
- "Bump --speculative-config num_speculative_tokens from 1 to 2 (`{\"method\":\"mtp\",\"num_speculative_tokens\":2}`)"
- "Re-test whether H200 MTP kernels accept 2 draft tokens — Blackwell MTP runs at 2 (per @wzhao18's vLLM Blackwell MTP submission); checking if H200 has parity now"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1279

- config-keys:
- qwen3.5-fp4-b200-trt
description:
- "Add Qwen3.5-397B FP4 B200 TensorRT-LLM benchmark"
- "Image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc12"
- "Model: nvidia/Qwen3.5-397B-A17B-NVFP4"
- "1k1k: TP-only (tp4 conc 4-16, tp8 conc 4), TEP (tp4ep4 conc 16-256), DEP (tp4ep4 dp-attn conc 1024, tp8ep8 dp-attn conc 512-1024)"
- "8k1k: TP-only (tp2 conc 4-32, tp4 conc 4-8, tp8 conc 4), TEP (tp2ep2 conc 64, tp4ep4 conc 16), DEP (tp4ep4 dp-attn conc 256-1024, tp8ep8 dp-attn conc 512-1024)"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1280