From 7108fd7987cfdb5b09460762f45166246c452e16 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 8 May 2026 21:10:08 -0700 Subject: [PATCH 1/7] Add DSV4 B300 Dynamo vLLM disagg --- .github/configs/nvidia-master.yaml | 82 +++++++++++ .../8k1k/disagg-b300-high-tpt-megamoe.yaml | 136 ++++++++++++++++++ .../8k1k/disagg-b300-low-latency.yaml | 128 +++++++++++++++++ .../8k1k/disagg-b300-low-middle-curve.yaml | 129 +++++++++++++++++ .../8k1k/disagg-b300-max-tpt-megamoe.yaml | 136 ++++++++++++++++++ .../8k1k/disagg-b300-mid-curve-megamoe.yaml | 136 ++++++++++++++++++ perf-changelog.yaml | 9 ++ runners/launch_b300-nv.sh | 28 +++- 8 files changed, 781 insertions(+), 3 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-latency.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-middle-curve.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-max-tpt-megamoe.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-mid-curve-megamoe.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index e8286bff6..032a9679b 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -8122,6 +8122,88 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2: ep: 8 dp-attn: true +dsv4-fp4-b300-dynamo-vllm: + image: vllm/vllm-openai:v0.20.1 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: b300 + precision: fp4 + framework: dynamo-vllm + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 8192 + osl: 1024 + search-space: + # B300 adaptation of the DSV4 B200/GB200 vLLM disagg recipes. Each + # prefill/decode worker maps to one full 8-GPU B300 node. + - conc-list: [1] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-latency.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + - conc-list: [32, 256, 512] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-middle-curve.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 1 + dp-attn: false + - conc-list: [512, 1024] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-b300-mid-curve-megamoe.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [4096] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [4096, 8192, 12288] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-b300-max-tpt-megamoe.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + dsv4-fp4-gb300-dynamo-vllm: image: vllm/vllm-openai:v0.20.0-ubuntu2404 model: deepseek-ai/DeepSeek-V4-Pro diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml new file mode 100644 index 000000000..2dfe37d69 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml @@ -0,0 +1,136 @@ +name: "svf-vllm-disagg-b300-high-tpt-megamoe" + +# B300 adaptation of the DSV4 GB200/B200 vLLM disagg recipe. Each worker uses +# one full 8-GPU B300 node, plus a dedicated NATS/etcd infra node. +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-cu130" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 2 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + moe-backend: deep_gemm_mega_moe + attention-config: '{"use_fp4_indexer_cache": true}' + enforce-eager: true + max-model-len: 9280 + max-num-seqs: 16 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' + gpu-memory-utilization: 0.85 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + tokenizer-mode: deepseek_v4 + tool-call-parser: deepseek_v4 + enable-auto-tool-choice: true + reasoning-parser: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + moe-backend: deep_gemm_mega_moe + attention-config: '{"use_fp4_indexer_cache": true}' + max-model-len: 9280 + max-num-seqs: 512 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' + gpu-memory-utilization: 0.85 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + tool-call-parser: deepseek_v4 + enable-auto-tool-choice: true + reasoning-parser: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:v0.20.0-cu130" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-latency.yaml new file mode 100644 index 000000000..dd2241301 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-latency.yaml @@ -0,0 +1,128 @@ +name: "svf-vllm-disagg-b300-low-latency" + +# B300 adaptation of the DSV4 GB200/B200 vLLM disagg recipe. Each worker uses +# one full 8-GPU B300 node, plus a dedicated NATS/etcd infra node. +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-cu130" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + attention-config: '{"use_fp4_indexer_cache": true}' + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + tool-call-parser: deepseek_v4 + enable-auto-tool-choice: true + reasoning-parser: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + tool-call-parser: deepseek_v4 + enable-auto-tool-choice: true + reasoning-parser: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + container: + image: "vllm/vllm-openai:v0.20.0-cu130" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-middle-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-middle-curve.yaml new file mode 100644 index 000000000..3dc60ba93 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-middle-curve.yaml @@ -0,0 +1,129 @@ +name: "svf-vllm-disagg-b300-low-middle-curve" + +# B300 adaptation of the DSV4 GB200/B200 vLLM disagg recipe. Each worker uses +# one full 8-GPU B300 node, plus a dedicated NATS/etcd infra node. +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-cu130" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + attention-config: '{"use_fp4_indexer_cache": true}' + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + tool-call-parser: deepseek_v4 + enable-auto-tool-choice: true + reasoning-parser: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + tool-call-parser: deepseek_v4 + enable-auto-tool-choice: true + reasoning-parser: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + container: + image: "vllm/vllm-openai:v0.20.0-cu130" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-max-tpt-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-max-tpt-megamoe.yaml new file mode 100644 index 000000000..f3bcdc831 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-max-tpt-megamoe.yaml @@ -0,0 +1,136 @@ +name: "svf-vllm-disagg-b300-max-tpt-megamoe" + +# B300 adaptation of the DSV4 GB200/B200 vLLM disagg recipe. Each worker uses +# one full 8-GPU B300 node, plus a dedicated NATS/etcd infra node. +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-cu130" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 3 + decode_nodes: 1 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + moe-backend: deep_gemm_mega_moe + attention-config: '{"use_fp4_indexer_cache": true}' + enforce-eager: true + max-model-len: 9280 + max-num-seqs: 16 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' + gpu-memory-utilization: 0.85 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + tokenizer-mode: deepseek_v4 + tool-call-parser: deepseek_v4 + enable-auto-tool-choice: true + reasoning-parser: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + moe-backend: deep_gemm_mega_moe + attention-config: '{"use_fp4_indexer_cache": true}' + max-model-len: 9280 + max-num-seqs: 512 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' + gpu-memory-utilization: 0.85 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + tool-call-parser: deepseek_v4 + enable-auto-tool-choice: true + reasoning-parser: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:v0.20.0-cu130" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-mid-curve-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-mid-curve-megamoe.yaml new file mode 100644 index 000000000..dbfbfef71 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-mid-curve-megamoe.yaml @@ -0,0 +1,136 @@ +name: "svf-vllm-disagg-b300-mid-curve-megamoe" + +# B300 adaptation of the DSV4 GB200/B200 vLLM disagg recipe. Each worker uses +# one full 8-GPU B300 node, plus a dedicated NATS/etcd infra node. +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-cu130" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + NCCL_CUMEM_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + moe-backend: deep_gemm_mega_moe + attention-config: '{"use_fp4_indexer_cache": true}' + enforce-eager: true + max-model-len: 9280 + max-num-seqs: 16 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' + gpu-memory-utilization: 0.85 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + tokenizer-mode: deepseek_v4 + tool-call-parser: deepseek_v4 + enable-auto-tool-choice: true + reasoning-parser: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + moe-backend: deep_gemm_mega_moe + attention-config: '{"use_fp4_indexer_cache": true}' + max-model-len: 9280 + max-num-seqs: 512 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' + gpu-memory-utilization: 0.85 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + tool-call-parser: deepseek_v4 + enable-auto-tool-choice: true + reasoning-parser: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512x1024" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:v0.20.0-cu130" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 715d6f177..f41e7dcdc 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2307,3 +2307,12 @@ - "Tune DSv4 FP4 MI355X SGLang runtime envs: enable aiter MHC pre/post, and enable triton swa prepare kernel." - "Add --context-length. Add --enable-prefill-delayer for dp config" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1300 + +- config-keys: + - dsv4-fp4-b300-dynamo-vllm + description: + - "Add DeepSeek-V4-Pro FP4 B300 disaggregated multi-node coverage using Dynamo vLLM" + - "Adapt the B200/GB200 DSV4 vLLM disagg recipes to B300 by mapping each worker to one full 8-GPU B300 node" + - "Carry over B200 single-node vLLM DSv4 optimizations: FP4 indexer cache, FULL_AND_PIECEWISE compilation, DSv4 tool/reasoning parser flags, larger cudagraph capture, and lower GMU on Mega-MoE DP paths" + - "Update the B300 NV Slurm launcher to support dsv4/fp4 with dynamo-vllm and local recipe overlays" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1303 diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index e81bf91a3..0bf203022 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -9,8 +9,8 @@ set -x if [[ "$IS_MULTINODE" == "true" ]]; then # Validate framework -if [[ $FRAMEWORK != "dynamo-sglang" && $FRAMEWORK != "dynamo-trt" ]]; then - echo "Unsupported framework: $FRAMEWORK. Supported frameworks are: dynamo-trt, dynamo-sglang" +if [[ $FRAMEWORK != "dynamo-sglang" && $FRAMEWORK != "dynamo-trt" && $FRAMEWORK != "dynamo-vllm" ]]; then + echo "Unsupported framework: $FRAMEWORK. Supported frameworks are: dynamo-trt, dynamo-sglang, dynamo-vllm" exit 1 fi @@ -25,8 +25,22 @@ elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then export MODEL_PATH="/data/models/dsr1-fp8" export SERVED_MODEL_NAME="deepseek-r1-fp8" export SRT_SLURM_MODEL_PREFIX="dsr1-fp8" +elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" && $FRAMEWORK == "dynamo-vllm" ]]; then + SELECTED_MODEL_PATH="" + if [[ -n "${MODEL_PATH:-}" && -d "${MODEL_PATH}" ]]; then + SELECTED_MODEL_PATH="$MODEL_PATH" + else + for candidate in /data/models/dsv4-pro /data/models/deepseek-v4-pro /data/models/DeepSeek-V4-Pro; do + if [[ -d "$candidate" ]]; then + SELECTED_MODEL_PATH="$candidate" + break + fi + done + fi + export MODEL_PATH="${SELECTED_MODEL_PATH:-/data/models/dsv4-pro}" + export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" else - echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8" + echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4 with dynamo-vllm" exit 1 fi @@ -41,6 +55,12 @@ fi if [[ "$IS_AGENTIC" == "1" ]]; then git clone --branch cam/sa-submission-q2-2026 --single-branch https://github.com/cquil11/srt-slurm-nv.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1 +elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + cd "$SRT_REPO_DIR" || exit 1 + git checkout aflowers/vllm-gb200-v0.20.0 + mkdir -p recipes/vllm/deepseek-v4 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 else git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1 @@ -96,6 +116,8 @@ model_paths: containers: dynamo-trtllm: "${SQUASH_FILE}" dynamo-sglang: "${SQUASH_FILE}" + dynamo-vllm: "${SQUASH_FILE}" + "${IMAGE}": "${SQUASH_FILE}" nginx-sqsh: "${NGINX_SQUASH_FILE}" use_exclusive_sbatch_directive: true default_mounts: From 8f9bfabd0bdcdf01f80c532e4cc2d208e69c98ff Mon Sep 17 00:00:00 2001 From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 8 May 2026 21:35:54 -0700 Subject: [PATCH 2/7] Update perf-changelog.yaml --- perf-changelog.yaml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index f41e7dcdc..908416979 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2312,7 +2312,4 @@ - dsv4-fp4-b300-dynamo-vllm description: - "Add DeepSeek-V4-Pro FP4 B300 disaggregated multi-node coverage using Dynamo vLLM" - - "Adapt the B200/GB200 DSV4 vLLM disagg recipes to B300 by mapping each worker to one full 8-GPU B300 node" - - "Carry over B200 single-node vLLM DSv4 optimizations: FP4 indexer cache, FULL_AND_PIECEWISE compilation, DSv4 tool/reasoning parser flags, larger cudagraph capture, and lower GMU on Mega-MoE DP paths" - - "Update the B300 NV Slurm launcher to support dsv4/fp4 with dynamo-vllm and local recipe overlays" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1303 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1304 From a3cdd83075ef2e5cc1e61bb5438b7e59976167da Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 8 May 2026 23:44:03 -0700 Subject: [PATCH 3/7] Update B300 Dynamo vLLM image --- .../deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml | 6 +++--- .../vllm/deepseek-v4/8k1k/disagg-b300-low-latency.yaml | 6 +++--- .../deepseek-v4/8k1k/disagg-b300-low-middle-curve.yaml | 6 +++--- .../vllm/deepseek-v4/8k1k/disagg-b300-max-tpt-megamoe.yaml | 6 +++--- .../deepseek-v4/8k1k/disagg-b300-mid-curve-megamoe.yaml | 6 +++--- perf-changelog.yaml | 7 +++++++ 6 files changed, 22 insertions(+), 15 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml index 2dfe37d69..7368074a8 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml @@ -4,7 +4,7 @@ name: "svf-vllm-disagg-b300-high-tpt-megamoe" # one full 8-GPU B300 node, plus a dedicated NATS/etcd infra node. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-cu130" + container: "vllm/vllm-openai:v0.20.1" precision: "fp4" dynamo: @@ -130,7 +130,7 @@ identity: repo: "deepseek-ai/DeepSeek-V4-Pro" revision: "0366e4e064385807ea86b088a5c6c878ff23343b" container: - image: "vllm/vllm-openai:v0.20.0-cu130" + image: "vllm/vllm-openai:v0.20.1" frameworks: dynamo: "1.2.0.dev20260426" - vllm: "0.20.0" + vllm: "0.20.1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-latency.yaml index dd2241301..98e094f5c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-latency.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-latency.yaml @@ -4,7 +4,7 @@ name: "svf-vllm-disagg-b300-low-latency" # one full 8-GPU B300 node, plus a dedicated NATS/etcd infra node. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-cu130" + container: "vllm/vllm-openai:v0.20.1" precision: "fp4" dynamo: @@ -122,7 +122,7 @@ benchmark: identity: container: - image: "vllm/vllm-openai:v0.20.0-cu130" + image: "vllm/vllm-openai:v0.20.1" frameworks: dynamo: "1.2.0.dev20260426" - vllm: "0.20.0" + vllm: "0.20.1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-middle-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-middle-curve.yaml index 3dc60ba93..c51837305 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-middle-curve.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-middle-curve.yaml @@ -4,7 +4,7 @@ name: "svf-vllm-disagg-b300-low-middle-curve" # one full 8-GPU B300 node, plus a dedicated NATS/etcd infra node. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-cu130" + container: "vllm/vllm-openai:v0.20.1" precision: "fp4" dynamo: @@ -123,7 +123,7 @@ benchmark: identity: container: - image: "vllm/vllm-openai:v0.20.0-cu130" + image: "vllm/vllm-openai:v0.20.1" frameworks: dynamo: "1.2.0.dev20260426" - vllm: "0.20.0" + vllm: "0.20.1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-max-tpt-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-max-tpt-megamoe.yaml index f3bcdc831..1ad2663c3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-max-tpt-megamoe.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-max-tpt-megamoe.yaml @@ -4,7 +4,7 @@ name: "svf-vllm-disagg-b300-max-tpt-megamoe" # one full 8-GPU B300 node, plus a dedicated NATS/etcd infra node. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-cu130" + container: "vllm/vllm-openai:v0.20.1" precision: "fp4" dynamo: @@ -130,7 +130,7 @@ identity: repo: "deepseek-ai/DeepSeek-V4-Pro" revision: "0366e4e064385807ea86b088a5c6c878ff23343b" container: - image: "vllm/vllm-openai:v0.20.0-cu130" + image: "vllm/vllm-openai:v0.20.1" frameworks: dynamo: "1.2.0.dev20260426" - vllm: "0.20.0" + vllm: "0.20.1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-mid-curve-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-mid-curve-megamoe.yaml index dbfbfef71..ac638e74b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-mid-curve-megamoe.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-mid-curve-megamoe.yaml @@ -4,7 +4,7 @@ name: "svf-vllm-disagg-b300-mid-curve-megamoe" # one full 8-GPU B300 node, plus a dedicated NATS/etcd infra node. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-cu130" + container: "vllm/vllm-openai:v0.20.1" precision: "fp4" dynamo: @@ -130,7 +130,7 @@ identity: repo: "deepseek-ai/DeepSeek-V4-Pro" revision: "0366e4e064385807ea86b088a5c6c878ff23343b" container: - image: "vllm/vllm-openai:v0.20.0-cu130" + image: "vllm/vllm-openai:v0.20.1" frameworks: dynamo: "1.2.0.dev20260426" - vllm: "0.20.0" + vllm: "0.20.1" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 908416979..57ab5215e 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2313,3 +2313,10 @@ description: - "Add DeepSeek-V4-Pro FP4 B300 disaggregated multi-node coverage using Dynamo vLLM" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1304 + +- config-keys: + - dsv4-fp4-b300-dynamo-vllm + description: + - "Update B300 Dynamo vLLM recipes from vllm/vllm-openai:v0.20.0-cu130 to vllm/vllm-openai:v0.20.1" + - "Align recipe identity metadata with the nvidia-master image so srt-slurm launches the v0.20.1 stack" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1304 From efeac75040ca14aa1a10053f764150039a6119db Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 8 May 2026 23:52:57 -0700 Subject: [PATCH 4/7] Remove unsupported B300 Dynamo vLLM flags --- .../deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml | 4 ---- .../vllm/deepseek-v4/8k1k/disagg-b300-low-latency.yaml | 4 ---- .../deepseek-v4/8k1k/disagg-b300-low-middle-curve.yaml | 4 ---- .../vllm/deepseek-v4/8k1k/disagg-b300-max-tpt-megamoe.yaml | 4 ---- .../deepseek-v4/8k1k/disagg-b300-mid-curve-megamoe.yaml | 4 ---- perf-changelog.yaml | 7 +++++++ 6 files changed, 7 insertions(+), 20 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml index 7368074a8..0119719a5 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml @@ -83,8 +83,6 @@ backend: enable-sleep-mode: true numa-bind: true tokenizer-mode: deepseek_v4 - tool-call-parser: deepseek_v4 - enable-auto-tool-choice: true reasoning-parser: deepseek_v4 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -112,8 +110,6 @@ backend: no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true tokenizer-mode: deepseek_v4 - tool-call-parser: deepseek_v4 - enable-auto-tool-choice: true reasoning-parser: deepseek_v4 benchmark: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-latency.yaml index 98e094f5c..fc08b32ac 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-latency.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-latency.yaml @@ -84,8 +84,6 @@ backend: offload-num-in-group: 1 offload-prefetch-step: 2 tokenizer-mode: deepseek_v4 - tool-call-parser: deepseek_v4 - enable-auto-tool-choice: true reasoning-parser: deepseek_v4 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -107,8 +105,6 @@ backend: no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true tokenizer-mode: deepseek_v4 - tool-call-parser: deepseek_v4 - enable-auto-tool-choice: true reasoning-parser: deepseek_v4 benchmark: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-middle-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-middle-curve.yaml index c51837305..4cb7e15dd 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-middle-curve.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-middle-curve.yaml @@ -85,8 +85,6 @@ backend: offload-num-in-group: 1 offload-prefetch-step: 2 tokenizer-mode: deepseek_v4 - tool-call-parser: deepseek_v4 - enable-auto-tool-choice: true reasoning-parser: deepseek_v4 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -108,8 +106,6 @@ backend: no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true tokenizer-mode: deepseek_v4 - tool-call-parser: deepseek_v4 - enable-auto-tool-choice: true reasoning-parser: deepseek_v4 benchmark: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-max-tpt-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-max-tpt-megamoe.yaml index 1ad2663c3..ab6e3b4ed 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-max-tpt-megamoe.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-max-tpt-megamoe.yaml @@ -83,8 +83,6 @@ backend: enable-sleep-mode: true numa-bind: true tokenizer-mode: deepseek_v4 - tool-call-parser: deepseek_v4 - enable-auto-tool-choice: true reasoning-parser: deepseek_v4 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -112,8 +110,6 @@ backend: no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true tokenizer-mode: deepseek_v4 - tool-call-parser: deepseek_v4 - enable-auto-tool-choice: true reasoning-parser: deepseek_v4 benchmark: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-mid-curve-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-mid-curve-megamoe.yaml index ac638e74b..9961fa265 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-mid-curve-megamoe.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-mid-curve-megamoe.yaml @@ -83,8 +83,6 @@ backend: enable-sleep-mode: true numa-bind: true tokenizer-mode: deepseek_v4 - tool-call-parser: deepseek_v4 - enable-auto-tool-choice: true reasoning-parser: deepseek_v4 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -112,8 +110,6 @@ backend: no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true tokenizer-mode: deepseek_v4 - tool-call-parser: deepseek_v4 - enable-auto-tool-choice: true reasoning-parser: deepseek_v4 benchmark: diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 57ab5215e..55eeef7ba 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2320,3 +2320,10 @@ - "Update B300 Dynamo vLLM recipes from vllm/vllm-openai:v0.20.0-cu130 to vllm/vllm-openai:v0.20.1" - "Align recipe identity metadata with the nvidia-master image so srt-slurm launches the v0.20.1 stack" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1304 + +- config-keys: + - dsv4-fp4-b300-dynamo-vllm + description: + - "Remove unsupported Dynamo vLLM tool-call CLI flags from B300 recipes: --enable-auto-tool-choice and --tool-call-parser" + - "Keep DeepSeek-V4 tokenizer-mode and reasoning-parser settings for chat/reasoning compatibility" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1304 From fb5093ae7669c3d7576ba5258d6c003b81a3cba4 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 9 May 2026 00:21:31 -0700 Subject: [PATCH 5/7] Turn off B300 Dynamo vLLM MegaMoE --- .../deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml | 2 -- .../vllm/deepseek-v4/8k1k/disagg-b300-max-tpt-megamoe.yaml | 2 -- .../deepseek-v4/8k1k/disagg-b300-mid-curve-megamoe.yaml | 2 -- perf-changelog.yaml | 7 +++++++ 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml index 0119719a5..eb01e756f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml @@ -65,7 +65,6 @@ backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true enable-ep-weight-filter: true - moe-backend: deep_gemm_mega_moe attention-config: '{"use_fp4_indexer_cache": true}' enforce-eager: true max-model-len: 9280 @@ -94,7 +93,6 @@ backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true enable-ep-weight-filter: true - moe-backend: deep_gemm_mega_moe attention-config: '{"use_fp4_indexer_cache": true}' max-model-len: 9280 max-num-seqs: 512 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-max-tpt-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-max-tpt-megamoe.yaml index ab6e3b4ed..47d98e9af 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-max-tpt-megamoe.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-max-tpt-megamoe.yaml @@ -65,7 +65,6 @@ backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true enable-ep-weight-filter: true - moe-backend: deep_gemm_mega_moe attention-config: '{"use_fp4_indexer_cache": true}' enforce-eager: true max-model-len: 9280 @@ -94,7 +93,6 @@ backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true enable-ep-weight-filter: true - moe-backend: deep_gemm_mega_moe attention-config: '{"use_fp4_indexer_cache": true}' max-model-len: 9280 max-num-seqs: 512 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-mid-curve-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-mid-curve-megamoe.yaml index 9961fa265..0f9c078d2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-mid-curve-megamoe.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-mid-curve-megamoe.yaml @@ -65,7 +65,6 @@ backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true enable-ep-weight-filter: true - moe-backend: deep_gemm_mega_moe attention-config: '{"use_fp4_indexer_cache": true}' enforce-eager: true max-model-len: 9280 @@ -94,7 +93,6 @@ backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true enable-ep-weight-filter: true - moe-backend: deep_gemm_mega_moe attention-config: '{"use_fp4_indexer_cache": true}' max-model-len: 9280 max-num-seqs: 512 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 55eeef7ba..a8a11fbe2 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2327,3 +2327,10 @@ - "Remove unsupported Dynamo vLLM tool-call CLI flags from B300 recipes: --enable-auto-tool-choice and --tool-call-parser" - "Keep DeepSeek-V4 tokenizer-mode and reasoning-parser settings for chat/reasoning compatibility" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1304 + +- config-keys: + - dsv4-fp4-b300-dynamo-vllm + description: + - "Turn off explicit deep_gemm_mega_moe backend selection in B300 Dynamo vLLM throughput recipes" + - "Let vLLM choose the MoE backend automatically to avoid CUDA symmetric-memory rendezvous failures during startup" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1304 From 877dda5f09ba878133156d03d7bcdbea2400e6dc Mon Sep 17 00:00:00 2001 From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 9 May 2026 07:56:04 -0700 Subject: [PATCH 6/7] Modify concurrency settings in nvidia-master.yaml Updated concurrency lists and removed unused configurations. --- .github/configs/nvidia-master.yaml | 32 +++--------------------------- 1 file changed, 3 insertions(+), 29 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 032a9679b..592129409 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -8138,7 +8138,7 @@ dsv4-fp4-b300-dynamo-vllm: search-space: # B300 adaptation of the DSV4 B200/GB200 vLLM disagg recipes. Each # prefill/decode worker maps to one full 8-GPU B300 node. - - conc-list: [1] + - conc-list: [1, 64, 128] prefill: num-worker: 1 tp: 8 @@ -8151,20 +8151,7 @@ dsv4-fp4-b300-dynamo-vllm: tp: 8 ep: 1 dp-attn: false - - conc-list: [32, 256, 512] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-middle-curve.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 1 - dp-attn: false - - conc-list: [512, 1024] + - conc-list: [128, 256, 512, 1024] prefill: num-worker: 1 tp: 8 @@ -8177,7 +8164,7 @@ dsv4-fp4-b300-dynamo-vllm: tp: 8 ep: 8 dp-attn: true - - conc-list: [4096] + - conc-list: [4096, 8192] prefill: num-worker: 2 tp: 8 @@ -8190,19 +8177,6 @@ dsv4-fp4-b300-dynamo-vllm: tp: 8 ep: 8 dp-attn: true - - conc-list: [4096, 8192, 12288] - prefill: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-b300-max-tpt-megamoe.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true dsv4-fp4-gb300-dynamo-vllm: image: vllm/vllm-openai:v0.20.0-ubuntu2404 From c0c1e0be7a67d6dcdab771586c8ee29e213cbf92 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 9 May 2026 11:21:36 -0700 Subject: [PATCH 7/7] propagate --- .../vllm/deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml | 2 +- .../vllm/deepseek-v4/8k1k/disagg-b300-low-latency.yaml | 2 +- .../vllm/deepseek-v4/8k1k/disagg-b300-mid-curve-megamoe.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml index eb01e756f..6d754905b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml @@ -114,7 +114,7 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "4096" + concurrencies: "4096x8192" req_rate: "inf" use_chat_template: true custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-latency.yaml index fc08b32ac..4f5cf12bd 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-latency.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-latency.yaml @@ -111,7 +111,7 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "1" + concurrencies: "1x64x128" req_rate: "inf" use_chat_template: true custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-mid-curve-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-mid-curve-megamoe.yaml index 0f9c078d2..30cfdacd8 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-mid-curve-megamoe.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b300-mid-curve-megamoe.yaml @@ -114,7 +114,7 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "256x512x1024" + concurrencies: "128x256x512x1024" req_rate: "inf" use_chat_template: true custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer"