From 21915104854e9f8bd77ae1b725fca0fb87d753ed Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 8 May 2026 13:53:02 -0700 Subject: [PATCH 1/7] Add DSV4 B200 Dynamo vLLM disagg --- .github/configs/nvidia-master.yaml | 82 ++++++++++ .../8k1k/disagg-b200-high-tpt-megamoe.yaml | 154 ++++++++++++++++++ .../8k1k/disagg-b200-low-latency.yaml | 149 +++++++++++++++++ .../8k1k/disagg-b200-low-middle-curve.yaml | 150 +++++++++++++++++ .../8k1k/disagg-b200-max-tpt-megamoe.yaml | 154 ++++++++++++++++++ .../8k1k/disagg-b200-mid-curve-megamoe.yaml | 154 ++++++++++++++++++ perf-changelog.yaml | 8 + runners/launch_b200-dgxc.sh | 26 ++- 8 files changed, 875 insertions(+), 2 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-high-tpt-megamoe.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-latency.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-middle-curve.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-max-tpt-megamoe.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-mid-curve-megamoe.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index e8286bff6..a92ed574c 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7942,6 +7942,88 @@ kimik2.5-fp4-gb200-dynamo-vllm: ep: 16 dp-attn: true +dsv4-fp4-b200-dynamo-vllm: + image: vllm/vllm-openai:v0.20.0-ubuntu2404 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: b200-multinode + precision: fp4 + framework: dynamo-vllm + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 8192 + osl: 1024 + search-space: + # B200 adaptation of the DSV4 GB200 vLLM disagg recipes. Each worker + # maps to one full 8-GPU B200 node. 
+ - conc-list: [1] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-latency.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + - conc-list: [256, 512] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-middle-curve.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 1 + dp-attn: false + - conc-list: [256, 512, 1024] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-b200-mid-curve-megamoe.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [4096] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-b200-high-tpt-megamoe.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [4096] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-b200-max-tpt-megamoe.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + dsv4-fp4-gb200-dynamo-vllm: image: vllm/vllm-openai:v0.20.0-ubuntu2404 model: deepseek-ai/DeepSeek-V4-Pro diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-high-tpt-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-high-tpt-megamoe.yaml new file mode 100644 index 000000000..4ab1e0c88 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-high-tpt-megamoe.yaml @@ -0,0 +1,154 @@ +name: "svf-vllm-disagg-b200-high-tpt-megamoe" + +# Mirrored from NVIDIA/srt-slurm aflowers/vllm-gb200-v0.20.0 branch: +# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-high-tpt-megamoe.yaml +# +# B200 adaptation of the GB200 recipe below. 
Each prefill/decode worker uses +# one full 8-GPU B200 node, plus a dedicated NATS/etcd infra node. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in the launch script. +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to +# match nvidia-master.yaml image (which the launch script registers as +# the alias key in srtslurm.yaml). Upstream variants ship either the +# non-dynamo floating tag or a sha256 pin. +# * slurm.time_limit set to 8h; health_check to 1440 x 10s (4h) to +# absorb cold-cache model loads. +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 2 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + # 
VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + moe-backend: deep_gemm_mega_moe + enforce-eager: true + max-model-len: 9280 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.95 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + moe-backend: deep_gemm_mega_moe + max-model-len: 9280 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: 
"sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-latency.yaml new file mode 100644 index 000000000..d80e97f11 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-latency.yaml @@ -0,0 +1,149 @@ +name: "svf-vllm-disagg-b200-low-latency" + +# Mirrored from NVIDIA/srt-slurm aflowers/vllm-gb200-v0.20.0 branch: +# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-low-latency.yaml +# +# B200 adaptation of the GB200 recipe below. Each prefill/decode worker uses +# one full 8-GPU B200 node, plus a dedicated NATS/etcd infra node. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in the launch script. +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to +# match nvidia-master.yaml image (which the launch script registers as +# the alias key in srtslurm.yaml). Upstream variants ship either the +# non-dynamo floating tag or a sha256 pin. +# * slurm.time_limit set to 8h; health_check to 1440 x 10s (4h) to +# absorb cold-cache model loads. 
+model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + 
trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 +# data-parallel-size: 8 +# data-parallel-rpc-port: 13345 +# enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-middle-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-middle-curve.yaml new file mode 100644 index 000000000..5a2820eab --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-middle-curve.yaml @@ -0,0 +1,150 @@ +name: "svf-vllm-disagg-b200-low-middle-curve" + 
+# Mirrored from NVIDIA/srt-slurm aflowers/vllm-gb200-v0.20.0 branch: +# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-low-middle-curve.yaml +# +# B200 adaptation of the GB200 recipe below. Each prefill/decode worker uses +# one full 8-GPU B200 node, plus a dedicated NATS/etcd infra node. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in the launch script. +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to +# match nvidia-master.yaml image (which the launch script registers as +# the alias key in srtslurm.yaml). Upstream variants ship either the +# non-dynamo floating tag or a sha256 pin. +# * slurm.time_limit set to 8h; health_check to 1440 x 10s (4h) to +# absorb cold-cache model loads. +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + 
VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 +# data-parallel-size: 8 +# data-parallel-rpc-port: 13345 +# enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + 
enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-max-tpt-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-max-tpt-megamoe.yaml new file mode 100644 index 000000000..74cb4c08c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-max-tpt-megamoe.yaml @@ -0,0 +1,154 @@ +name: "svf-vllm-disagg-b200-max-tpt-megamoe" + +# Mirrored from NVIDIA/srt-slurm aflowers/vllm-gb200-v0.20.0 branch: +# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-max-tpt-megamoe.yaml +# +# B200 adaptation of the GB200 recipe below. Each prefill/decode worker uses +# one full 8-GPU B200 node, plus a dedicated NATS/etcd infra node. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in the launch script. +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to +# match nvidia-master.yaml image (which the launch script registers as +# the alias key in srtslurm.yaml). Upstream variants ship either the +# non-dynamo floating tag or a sha256 pin. +# * slurm.time_limit set to 8h; health_check to 1440 x 10s (4h) to +# absorb cold-cache model loads. 
+model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 3 + decode_nodes: 1 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + moe-backend: 
deep_gemm_mega_moe + enforce-eager: true + max-model-len: 9280 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.95 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + moe-backend: deep_gemm_mega_moe + max-model-len: 9280 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-mid-curve-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-mid-curve-megamoe.yaml new file mode 100644 index 000000000..0d4d53de0 --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-mid-curve-megamoe.yaml @@ -0,0 +1,154 @@ +name: "svf-vllm-disagg-b200-mid-curve-megamoe" + +# Mirrored from NVIDIA/srt-slurm aflowers/vllm-gb200-v0.20.0 branch: +# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-mid-curve-megamoe.yaml +# +# B200 adaptation of the GB200 recipe below. Each prefill/decode worker uses +# one full 8-GPU B200 node, plus a dedicated NATS/etcd infra node. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in the launch script. +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to +# match nvidia-master.yaml image (which the launch script registers as +# the alias key in srtslurm.yaml). Upstream variants ship either the +# non-dynamo floating tag or a sha256 pin. +# * slurm.time_limit set to 8h; health_check to 1440 x 10s (4h) to +# absorb cold-cache model loads. +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + 
UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + moe-backend: deep_gemm_mega_moe + enforce-eager: true + max-model-len: 9280 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.95 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + moe-backend: deep_gemm_mega_moe + max-model-len: 9280 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + compilation-config: 
'{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512x1024" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 715d6f177..db24b0903 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2307,3 +2307,11 @@ - "Tune DSv4 FP4 MI355X SGLang runtime envs: enable aiter MHC pre/post, and enable triton swa prepare kernel." - "Add --context-length. Add --enable-prefill-delayer for dp config" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1300 + +- config-keys: + - dsv4-fp4-b200-dynamo-vllm + description: + - "Add DeepSeek-V4-Pro FP4 B200 disaggregated multi-node coverage using Dynamo vLLM" + - "Adapt the existing DSV4 GB200 vLLM disagg recipes to B200 by mapping each worker to one full 8-GPU B200 node" + - "Update the B200 DGXC Slurm launcher to support dsv4/fp4 with dynamo-vllm and local recipe overlays" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1303 diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index e2681ccec..0a5d6fe21 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -9,8 +9,8 @@ set -x if [[ "$IS_MULTINODE" == "true" ]]; then # Validate framework - if [[ $FRAMEWORK != "dynamo-sglang" && $FRAMEWORK != "dynamo-trt" ]]; then - echo "Unsupported framework: $FRAMEWORK. 
Supported frameworks are: dynamo-trt, dynamo-sglang" + if [[ $FRAMEWORK != "dynamo-sglang" && $FRAMEWORK != "dynamo-trt" && $FRAMEWORK != "dynamo-vllm" ]]; then + echo "Unsupported framework: $FRAMEWORK. Supported frameworks are: dynamo-trt, dynamo-sglang, dynamo-vllm" exit 1 fi @@ -23,6 +23,20 @@ if [[ "$IS_MULTINODE" == "true" ]]; then elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then export MODEL_PATH="/lustre/fsw/models/dsr1-0528-fp8" export SRT_SLURM_MODEL_PREFIX="dsr1-fp8" + elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" && $FRAMEWORK == "dynamo-vllm" ]]; then + SELECTED_MODEL_PATH="" + if [[ -n "${MODEL_PATH:-}" && -d "${MODEL_PATH}" ]]; then + SELECTED_MODEL_PATH="$MODEL_PATH" + else + for candidate in /lustre/fsw/models/deepseek-v4-pro /lustre/fsw/models/dsv4-pro /lustre/fsw/models/DeepSeek-V4-Pro; do + if [[ -d "$candidate" ]]; then + SELECTED_MODEL_PATH="$candidate" + break + fi + done + fi + export MODEL_PATH="${SELECTED_MODEL_PATH:-/lustre/fsw/models/deepseek-v4-pro}" + export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" else echo "Unsupported model prefix/precision: $MODEL_PREFIX/$PRECISION" exit 1 @@ -40,6 +54,12 @@ if [[ "$IS_MULTINODE" == "true" ]]; then if [[ "$IS_AGENTIC" == "1" ]]; then git clone --branch cam/sa-submission-q2-2026 --single-branch https://github.com/cquil11/srt-slurm-nv.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1 + elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + cd "$SRT_REPO_DIR" || exit 1 + git checkout aflowers/vllm-gb200-v0.20.0 || exit 1 # abort rather than overlay recipes onto the default branch + mkdir -p recipes/vllm/deepseek-v4 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 || exit 1 # CONFIG_FILE points at these overlaid recipes else git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1 @@ -95,6 +115,8 @@ 
model_paths: containers: dynamo-trtllm: "${SQUASH_FILE}" dynamo-sglang: "${SQUASH_FILE}" + dynamo-vllm: "${SQUASH_FILE}" + 
"${IMAGE}": "${SQUASH_FILE}" nginx-sqsh: "${NGINX_SQUASH_FILE}" use_exclusive_sbatch_directive: true EOF From a1630bfd52f3626be79b0efa4fc29ac33d0ef2ce Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 8 May 2026 14:15:43 -0700 Subject: [PATCH 2/7] image change --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a92ed574c..d1c835ce1 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7943,7 +7943,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: dp-attn: true dsv4-fp4-b200-dynamo-vllm: - image: vllm/vllm-openai:v0.20.0-ubuntu2404 + image: vllm/vllm-openai:v0.20.0-cu130 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-multinode From be8c58f29d465278844c596793157b77fb07a4d8 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 8 May 2026 14:45:15 -0700 Subject: [PATCH 3/7] env vars --- .../8k1k/disagg-b200-high-tpt-megamoe.yaml | 18 +++--------------- .../8k1k/disagg-b200-low-latency.yaml | 16 +++------------- .../8k1k/disagg-b200-low-middle-curve.yaml | 16 +++------------- .../8k1k/disagg-b200-max-tpt-megamoe.yaml | 18 +++--------------- .../8k1k/disagg-b200-mid-curve-megamoe.yaml | 18 +++--------------- 5 files changed, 15 insertions(+), 71 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-high-tpt-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-high-tpt-megamoe.yaml index 4ab1e0c88..79d868af0 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-high-tpt-megamoe.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-high-tpt-megamoe.yaml @@ -9,7 +9,7 @@ name: "svf-vllm-disagg-b200-high-tpt-megamoe" # Local deltas vs upstream: # * model.path 
alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match # SRT_SLURM_MODEL_PREFIX in the launch script. -# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to +# * model.container set to vllm/vllm-openai:v0.20.0-cu130 to # match nvidia-master.yaml image (which the launch script registers as # the alias key in srtslurm.yaml). Upstream variants ship either the # non-dynamo floating tag or a sha256 pin. @@ -17,7 +17,7 @@ name: "svf-vllm-disagg-b200-high-tpt-megamoe" # absorb cold-cache model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + container: "vllm/vllm-openai:v0.20.0-cu130" precision: "fp4" dynamo: @@ -53,11 +53,7 @@ backend: connector: null prefill_environment: TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - TORCH_SYMMMEM: "NVSHMEM" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" @@ -66,23 +62,15 @@ backend: UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_P2P_LEVEL: NVL decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - TORCH_SYMMMEM: "NVSHMEM" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_P2P_LEVEL: NVL vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -148,7 +136,7 @@ identity: repo: "deepseek-ai/DeepSeek-V4-Pro" revision: "0366e4e064385807ea86b088a5c6c878ff23343b" container: - image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + image: "vllm/vllm-openai:v0.20.0-cu130" frameworks: dynamo: "1.2.0.dev20260426" vllm: "0.20.0" diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-latency.yaml index d80e97f11..c269ab3b5 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-latency.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-latency.yaml @@ -9,7 +9,7 @@ name: "svf-vllm-disagg-b200-low-latency" # Local deltas vs upstream: # * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match # SRT_SLURM_MODEL_PREFIX in the launch script. -# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to +# * model.container set to vllm/vllm-openai:v0.20.0-cu130 to # match nvidia-master.yaml image (which the launch script registers as # the alias key in srtslurm.yaml). Upstream variants ship either the # non-dynamo floating tag or a sha256 pin. @@ -17,7 +17,7 @@ name: "svf-vllm-disagg-b200-low-latency" # absorb cold-cache model loads. 
model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + container: "vllm/vllm-openai:v0.20.0-cu130" precision: "fp4" dynamo: @@ -53,10 +53,7 @@ backend: connector: null prefill_environment: TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" @@ -65,22 +62,15 @@ backend: UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_P2P_LEVEL: NVL decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_P2P_LEVEL: NVL vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -143,7 +133,7 @@ benchmark: identity: container: - image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + image: "vllm/vllm-openai:v0.20.0-cu130" frameworks: dynamo: "1.2.0.dev20260426" vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-middle-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-middle-curve.yaml index 5a2820eab..21452d719 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-middle-curve.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-middle-curve.yaml @@ -9,7 +9,7 @@ name: "svf-vllm-disagg-b200-low-middle-curve" # Local deltas vs upstream: # * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match # SRT_SLURM_MODEL_PREFIX in the launch script. 
-# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to +# * model.container set to vllm/vllm-openai:v0.20.0-cu130 to # match nvidia-master.yaml image (which the launch script registers as # the alias key in srtslurm.yaml). Upstream variants ship either the # non-dynamo floating tag or a sha256 pin. @@ -17,7 +17,7 @@ name: "svf-vllm-disagg-b200-low-middle-curve" # absorb cold-cache model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + container: "vllm/vllm-openai:v0.20.0-cu130" precision: "fp4" dynamo: @@ -53,10 +53,7 @@ backend: connector: null prefill_environment: TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" @@ -65,22 +62,15 @@ backend: UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_P2P_LEVEL: NVL decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_P2P_LEVEL: NVL vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -144,7 +134,7 @@ benchmark: identity: container: - image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + image: "vllm/vllm-openai:v0.20.0-cu130" frameworks: dynamo: "1.2.0.dev20260426" vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-max-tpt-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-max-tpt-megamoe.yaml index 74cb4c08c..abdd8bd6f 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-max-tpt-megamoe.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-max-tpt-megamoe.yaml @@ -9,7 +9,7 @@ name: "svf-vllm-disagg-b200-max-tpt-megamoe" # Local deltas vs upstream: # * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match # SRT_SLURM_MODEL_PREFIX in the launch script. -# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to +# * model.container set to vllm/vllm-openai:v0.20.0-cu130 to # match nvidia-master.yaml image (which the launch script registers as # the alias key in srtslurm.yaml). Upstream variants ship either the # non-dynamo floating tag or a sha256 pin. @@ -17,7 +17,7 @@ name: "svf-vllm-disagg-b200-max-tpt-megamoe" # absorb cold-cache model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + container: "vllm/vllm-openai:v0.20.0-cu130" precision: "fp4" dynamo: @@ -53,11 +53,7 @@ backend: connector: null prefill_environment: TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - TORCH_SYMMMEM: "NVSHMEM" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" @@ -66,23 +62,15 @@ backend: UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_P2P_LEVEL: NVL decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - TORCH_SYMMMEM: "NVSHMEM" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_P2P_LEVEL: NVL vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", 
"kv_role": "kv_both"}' @@ -148,7 +136,7 @@ identity: repo: "deepseek-ai/DeepSeek-V4-Pro" revision: "0366e4e064385807ea86b088a5c6c878ff23343b" container: - image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + image: "vllm/vllm-openai:v0.20.0-cu130" frameworks: dynamo: "1.2.0.dev20260426" vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-mid-curve-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-mid-curve-megamoe.yaml index 0d4d53de0..3eae84272 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-mid-curve-megamoe.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-mid-curve-megamoe.yaml @@ -9,7 +9,7 @@ name: "svf-vllm-disagg-b200-mid-curve-megamoe" # Local deltas vs upstream: # * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match # SRT_SLURM_MODEL_PREFIX in the launch script. -# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to +# * model.container set to vllm/vllm-openai:v0.20.0-cu130 to # match nvidia-master.yaml image (which the launch script registers as # the alias key in srtslurm.yaml). Upstream variants ship either the # non-dynamo floating tag or a sha256 pin. @@ -17,7 +17,7 @@ name: "svf-vllm-disagg-b200-mid-curve-megamoe" # absorb cold-cache model loads. 
model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + container: "vllm/vllm-openai:v0.20.0-cu130" precision: "fp4" dynamo: @@ -53,11 +53,7 @@ backend: connector: null prefill_environment: TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - TORCH_SYMMMEM: "NVSHMEM" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" @@ -66,23 +62,15 @@ backend: UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_P2P_LEVEL: NVL decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - TORCH_SYMMMEM: "NVSHMEM" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_P2P_LEVEL: NVL vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -148,7 +136,7 @@ identity: repo: "deepseek-ai/DeepSeek-V4-Pro" revision: "0366e4e064385807ea86b088a5c6c878ff23343b" container: - image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + image: "vllm/vllm-openai:v0.20.0-cu130" frameworks: dynamo: "1.2.0.dev20260426" vllm: "0.20.0" From f728288dbbfcd1271cc3783aafd886cb31b5f966 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 8 May 2026 15:40:26 -0700 Subject: [PATCH 4/7] Drop UCX_TLS override and deep_gemm_mega_moe backend from B200 recipes --- .../vllm/deepseek-v4/8k1k/disagg-b200-high-tpt-megamoe.yaml | 4 ---- .../vllm/deepseek-v4/8k1k/disagg-b200-low-latency.yaml | 2 -- .../vllm/deepseek-v4/8k1k/disagg-b200-low-middle-curve.yaml | 2 -- .../vllm/deepseek-v4/8k1k/disagg-b200-max-tpt-megamoe.yaml | 4 ----
.../vllm/deepseek-v4/8k1k/disagg-b200-mid-curve-megamoe.yaml | 4 ---- 5 files changed, 16 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-high-tpt-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-high-tpt-megamoe.yaml index 79d868af0..2fdbfc2e6 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-high-tpt-megamoe.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-high-tpt-megamoe.yaml @@ -61,7 +61,6 @@ backend: # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" NCCL_CUMEM_ENABLE: "1" @@ -70,7 +69,6 @@ backend: # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -82,7 +80,6 @@ backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true enable-ep-weight-filter: true - moe-backend: deep_gemm_mega_moe enforce-eager: true max-model-len: 9280 max-num-seqs: 16 @@ -107,7 +104,6 @@ backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true enable-ep-weight-filter: true - moe-backend: deep_gemm_mega_moe max-model-len: 9280 max-num-seqs: 512 max-cudagraph-capture-size: 512 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-latency.yaml index c269ab3b5..788544e14 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-latency.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-latency.yaml @@ -61,7 +61,6 @@ backend: # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: 
"uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" NCCL_CUMEM_ENABLE: "1" @@ -70,7 +69,6 @@ backend: # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-middle-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-middle-curve.yaml index 21452d719..2f2adb478 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-middle-curve.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-middle-curve.yaml @@ -61,7 +61,6 @@ backend: # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" NCCL_CUMEM_ENABLE: "1" @@ -70,7 +69,6 @@ backend: # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-max-tpt-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-max-tpt-megamoe.yaml index abdd8bd6f..c66d5bee8 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-max-tpt-megamoe.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-max-tpt-megamoe.yaml @@ -61,7 +61,6 @@ backend: # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: 
"cuda_copy,cuda_ipc,tcp" decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" NCCL_CUMEM_ENABLE: "1" @@ -70,7 +69,6 @@ backend: # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -82,7 +80,6 @@ backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true enable-ep-weight-filter: true - moe-backend: deep_gemm_mega_moe enforce-eager: true max-model-len: 9280 max-num-seqs: 16 @@ -107,7 +104,6 @@ backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true enable-ep-weight-filter: true - moe-backend: deep_gemm_mega_moe max-model-len: 9280 max-num-seqs: 512 max-cudagraph-capture-size: 512 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-mid-curve-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-mid-curve-megamoe.yaml index 3eae84272..f65203b5f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-mid-curve-megamoe.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-mid-curve-megamoe.yaml @@ -61,7 +61,6 @@ backend: # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" NCCL_CUMEM_ENABLE: "1" @@ -70,7 +69,6 @@ backend: # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -82,7 +80,6 @@ backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true enable-ep-weight-filter: true - moe-backend: deep_gemm_mega_moe enforce-eager: true max-model-len: 9280 max-num-seqs: 16 @@ -107,7 +104,6 @@ 
backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true enable-ep-weight-filter: true - moe-backend: deep_gemm_mega_moe max-model-len: 9280 max-num-seqs: 512 max-cudagraph-capture-size: 512 From 4ca8bb5421d610343c609e4016be4290f1ef4fc9 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 8 May 2026 20:05:33 -0700 Subject: [PATCH 5/7] refine pareto --- .github/configs/nvidia-master.yaml | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index d1c835ce1..f0c31e8a5 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7958,7 +7958,7 @@ dsv4-fp4-b200-dynamo-vllm: search-space: # B200 adaptation of the DSV4 GB200 vLLM disagg recipes. Each worker # maps to one full 8-GPU B200 node. - - conc-list: [1] + - conc-list: [1,64] prefill: num-worker: 1 tp: 8 @@ -7971,7 +7971,7 @@ dsv4-fp4-b200-dynamo-vllm: tp: 8 ep: 1 dp-attn: false - - conc-list: [256, 512] + - conc-list: [512] prefill: num-worker: 1 tp: 8 @@ -7984,20 +7984,7 @@ dsv4-fp4-b200-dynamo-vllm: tp: 8 ep: 1 dp-attn: false - - conc-list: [256, 512, 1024] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-b200-mid-curve-megamoe.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [4096] + - conc-list: [1024, 2048, 4096] prefill: num-worker: 2 tp: 8 @@ -8010,7 +7997,7 @@ dsv4-fp4-b200-dynamo-vllm: tp: 8 ep: 8 dp-attn: true - - conc-list: [4096] + - conc-list: [8192] prefill: num-worker: 3 tp: 8 From b3fa178eedeb19a5a3e64d348fe315b4d659c634 Mon Sep 17 00:00:00 2001 From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 8 May 2026 21:05:22 -0700 Subject: [PATCH 6/7] Bump dsv4-fp4-b200-dynamo-vllm image to vllm/vllm-openai:v0.20.1 --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff
--git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f0c31e8a5..30933ce2e 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7943,7 +7943,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: dp-attn: true dsv4-fp4-b200-dynamo-vllm: - image: vllm/vllm-openai:v0.20.0-cu130 + image: vllm/vllm-openai:v0.20.1 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-multinode From 26091a749f2d7008bbaddc4a7c575f5a5f92d9e6 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 8 May 2026 22:13:56 -0700 Subject: [PATCH 7/7] change image --- .../vllm/deepseek-v4/8k1k/disagg-b200-high-tpt-megamoe.yaml | 6 +++--- .../vllm/deepseek-v4/8k1k/disagg-b200-low-latency.yaml | 6 +++--- .../vllm/deepseek-v4/8k1k/disagg-b200-low-middle-curve.yaml | 6 +++--- .../vllm/deepseek-v4/8k1k/disagg-b200-max-tpt-megamoe.yaml | 6 +++--- .../deepseek-v4/8k1k/disagg-b200-mid-curve-megamoe.yaml | 6 +++--- perf-changelog.yaml | 1 + 6 files changed, 16 insertions(+), 15 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-high-tpt-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-high-tpt-megamoe.yaml index 2fdbfc2e6..241a6c8d0 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-high-tpt-megamoe.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-high-tpt-megamoe.yaml @@ -9,7 +9,7 @@ name: "svf-vllm-disagg-b200-high-tpt-megamoe" # Local deltas vs upstream: # * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match # SRT_SLURM_MODEL_PREFIX in the launch script. -# * model.container set to vllm/vllm-openai:v0.20.0-cu130 to +# * model.container set to vllm/vllm-openai:v0.20.1 to # match nvidia-master.yaml image (which the launch script registers as # the alias key in srtslurm.yaml). 
Upstream variants ship either the # non-dynamo floating tag or a sha256 pin. @@ -17,7 +17,7 @@ name: "svf-vllm-disagg-b200-high-tpt-megamoe" # absorb cold-cache model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-cu130" + container: "vllm/vllm-openai:v0.20.1" precision: "fp4" dynamo: @@ -132,7 +132,7 @@ identity: repo: "deepseek-ai/DeepSeek-V4-Pro" revision: "0366e4e064385807ea86b088a5c6c878ff23343b" container: - image: "vllm/vllm-openai:v0.20.0-cu130" + image: "vllm/vllm-openai:v0.20.1" frameworks: dynamo: "1.2.0.dev20260426" vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-latency.yaml index 788544e14..c0b56f222 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-latency.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-latency.yaml @@ -9,7 +9,7 @@ name: "svf-vllm-disagg-b200-low-latency" # Local deltas vs upstream: # * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match # SRT_SLURM_MODEL_PREFIX in the launch script. -# * model.container set to vllm/vllm-openai:v0.20.0-cu130 to +# * model.container set to vllm/vllm-openai:v0.20.1 to # match nvidia-master.yaml image (which the launch script registers as # the alias key in srtslurm.yaml). Upstream variants ship either the # non-dynamo floating tag or a sha256 pin. @@ -17,7 +17,7 @@ name: "svf-vllm-disagg-b200-low-latency" # absorb cold-cache model loads. 
model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-cu130" + container: "vllm/vllm-openai:v0.20.1" precision: "fp4" dynamo: @@ -131,7 +131,7 @@ benchmark: identity: container: - image: "vllm/vllm-openai:v0.20.0-cu130" + image: "vllm/vllm-openai:v0.20.1" frameworks: dynamo: "1.2.0.dev20260426" vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-middle-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-middle-curve.yaml index 2f2adb478..f11a73238 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-middle-curve.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-middle-curve.yaml @@ -9,7 +9,7 @@ name: "svf-vllm-disagg-b200-low-middle-curve" # Local deltas vs upstream: # * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match # SRT_SLURM_MODEL_PREFIX in the launch script. -# * model.container set to vllm/vllm-openai:v0.20.0-cu130 to +# * model.container set to vllm/vllm-openai:v0.20.1 to # match nvidia-master.yaml image (which the launch script registers as # the alias key in srtslurm.yaml). Upstream variants ship either the # non-dynamo floating tag or a sha256 pin. @@ -17,7 +17,7 @@ name: "svf-vllm-disagg-b200-low-middle-curve" # absorb cold-cache model loads. 
model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-cu130" + container: "vllm/vllm-openai:v0.20.1" precision: "fp4" dynamo: @@ -132,7 +132,7 @@ benchmark: identity: container: - image: "vllm/vllm-openai:v0.20.0-cu130" + image: "vllm/vllm-openai:v0.20.1" frameworks: dynamo: "1.2.0.dev20260426" vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-max-tpt-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-max-tpt-megamoe.yaml index c66d5bee8..23451b4ec 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-max-tpt-megamoe.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-max-tpt-megamoe.yaml @@ -9,7 +9,7 @@ name: "svf-vllm-disagg-b200-max-tpt-megamoe" # Local deltas vs upstream: # * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match # SRT_SLURM_MODEL_PREFIX in the launch script. -# * model.container set to vllm/vllm-openai:v0.20.0-cu130 to +# * model.container set to vllm/vllm-openai:v0.20.1 to # match nvidia-master.yaml image (which the launch script registers as # the alias key in srtslurm.yaml). Upstream variants ship either the # non-dynamo floating tag or a sha256 pin. @@ -17,7 +17,7 @@ name: "svf-vllm-disagg-b200-max-tpt-megamoe" # absorb cold-cache model loads. 
model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-cu130" + container: "vllm/vllm-openai:v0.20.1" precision: "fp4" dynamo: @@ -132,7 +132,7 @@ identity: repo: "deepseek-ai/DeepSeek-V4-Pro" revision: "0366e4e064385807ea86b088a5c6c878ff23343b" container: - image: "vllm/vllm-openai:v0.20.0-cu130" + image: "vllm/vllm-openai:v0.20.1" frameworks: dynamo: "1.2.0.dev20260426" vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-mid-curve-megamoe.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-mid-curve-megamoe.yaml index f65203b5f..a1dec32d0 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-mid-curve-megamoe.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-mid-curve-megamoe.yaml @@ -9,7 +9,7 @@ name: "svf-vllm-disagg-b200-mid-curve-megamoe" # Local deltas vs upstream: # * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match # SRT_SLURM_MODEL_PREFIX in the launch script. -# * model.container set to vllm/vllm-openai:v0.20.0-cu130 to +# * model.container set to vllm/vllm-openai:v0.20.1 to # match nvidia-master.yaml image (which the launch script registers as # the alias key in srtslurm.yaml). Upstream variants ship either the # non-dynamo floating tag or a sha256 pin. @@ -17,7 +17,7 @@ name: "svf-vllm-disagg-b200-mid-curve-megamoe" # absorb cold-cache model loads. 
model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-cu130" + container: "vllm/vllm-openai:v0.20.1" precision: "fp4" dynamo: @@ -132,7 +132,7 @@ identity: repo: "deepseek-ai/DeepSeek-V4-Pro" revision: "0366e4e064385807ea86b088a5c6c878ff23343b" container: - image: "vllm/vllm-openai:v0.20.0-cu130" + image: "vllm/vllm-openai:v0.20.1" frameworks: dynamo: "1.2.0.dev20260426" vllm: "0.20.0" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index db24b0903..12a5e4da5 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2312,6 +2312,7 @@ - dsv4-fp4-b200-dynamo-vllm description: - "Add DeepSeek-V4-Pro FP4 B200 disaggregated multi-node coverage using Dynamo vLLM" + - "9eff9734a30b6713a8566217d36f8277630fd2d31cec7f0a0292835901a23aa4" - "Adapt the existing DSV4 GB200 vLLM disagg recipes to B200 by mapping each worker to one full 8-GPU B200 node" - "Update the B200 DGXC Slurm launcher to support dsv4/fp4 with dynamo-vllm and local recipe overlays" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1303