69 changes: 69 additions & 0 deletions .github/configs/nvidia-master.yaml
@@ -7942,6 +7942,75 @@ kimik2.5-fp4-gb200-dynamo-vllm:
ep: 16
dp-attn: true

dsv4-fp4-b200-dynamo-vllm:
image: vllm/vllm-openai:v0.20.1
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b200-multinode
precision: fp4
framework: dynamo-vllm
multinode: true
disagg: true
scenarios:
fixed-seq-len:
- isl: 8192
osl: 1024
search-space:
# B200 adaptation of the DSV4 GB200 vLLM disagg recipes. Each worker
# maps to one full 8-GPU B200 node (node math sketched after the last
# entry below).
- conc-list: [1,64]
prefill:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-latency.yaml"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
- conc-list: [512]
prefill:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-middle-curve.yaml"
decode:
num-worker: 4
tp: 8
ep: 1
dp-attn: false
- conc-list: [1024, 2048, 4096]
prefill:
num-worker: 2
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-b200-high-tpt-megamoe.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
- conc-list: [8192]
prefill:
num-worker: 3
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-b200-max-tpt-megamoe.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
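# Node math for the four tiers above, as a sketch (assumes each tp-8
# worker occupies one full 8-GPU B200 node, per the comment at the top
# of this search space):
#   conc 1, 64:        1 prefill + 1 decode node  -> 16 GPUs
#   conc 512:          1 prefill + 4 decode nodes -> 40 GPUs
#   conc 1024..4096:   2 prefill + 1 decode node  -> 24 GPUs
#   conc 8192:         3 prefill + 1 decode node  -> 32 GPUs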

dsv4-fp4-gb200-dynamo-vllm:
image: vllm/vllm-openai:v0.20.0-ubuntu2404
model: deepseek-ai/DeepSeek-V4-Pro
138 changes: 138 additions & 0 deletions recipes/vllm/deepseek-v4/8k1k/disagg-b200-high-tpt-megamoe.yaml
@@ -0,0 +1,138 @@
name: "svf-vllm-disagg-b200-high-tpt-megamoe"

# Mirrored from NVIDIA/srt-slurm aflowers/vllm-gb200-v0.20.0 branch:
# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-high-tpt-megamoe.yaml
#
# B200 adaptation of the GB200 recipe above. Each prefill/decode worker uses
# one full 8-GPU B200 node, plus a dedicated NATS/etcd infra node.
#
# Local deltas vs upstream:
# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match
# SRT_SLURM_MODEL_PREFIX in the launch script.
# * model.container set to vllm/vllm-openai:v0.20.1 to
# match nvidia-master.yaml image (which the launch script registers as
# the alias key in srtslurm.yaml). Upstream variants ship either the
# non-dynamo floating tag or a sha256 pin.
# * slurm.time_limit + health_check set to 8h / 1440 attempts to
# absorb cold-cache model loads.
model:
path: "deepseek-v4-pro"
container: "vllm/vllm-openai:v0.20.1"
precision: "fp4"

dynamo:
install: true
wheel: "1.2.0.dev20260426"

setup_script: vllm-container-deps.sh

slurm:
time_limit: "8:00:00"

health_check:
max_attempts: 1440
interval_seconds: 10
resources:
gpu_type: "b200"
gpus_per_node: 8
prefill_nodes: 2
decode_nodes: 1
prefill_workers: 2
decode_workers: 1
gpus_per_prefill: 8
gpus_per_decode: 8
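# Sanity check on the figures above (assumes one worker per node, as the
# header comment states): prefill_workers x gpus_per_prefill = 2 x 8 = 16
# = prefill_nodes x gpus_per_node, and decode_workers x gpus_per_decode
# = 1 x 8 = 8 = decode_nodes x gpus_per_node. With the dedicated
# etcd/NATS node below, the job spans four nodes in total.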

infra:
etcd_nats_dedicated_node: true

frontend:
type: dynamo
enable_multiple_frontends: false
backend:
type: vllm
connector: null
prefill_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
NCCL_CUMEM_ENABLE: "1"
VLLM_SERVER_DEV_MODE: "1"
VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048"
# VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
# VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
UCX_MEMTYPE_CACHE: "n"
UCX_MEMTYPE_REG_WHOLE: "n"
decode_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
NCCL_CUMEM_ENABLE: "1"
VLLM_SERVER_DEV_MODE: "1"
# VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
# VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
UCX_MEMTYPE_CACHE: "n"
UCX_MEMTYPE_REG_WHOLE: "n"
vllm_config:
prefill:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
kv-cache-dtype: "fp8"
tensor-parallel-size: 1
pipeline-parallel-size: 1
data-parallel-size: 8
data-parallel-rpc-port: 13345
enable-expert-parallel: true
enable-ep-weight-filter: true
enforce-eager: true
max-model-len: 9280
max-num-seqs: 16
max-num-batched-tokens: 32768
trust-remote-code: true
no-enable-prefix-caching: true
no-enable-flashinfer-autotune: true
no-async-scheduling: true
block-size: 256
gpu-memory-utilization: 0.95
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true
numa-bind: true
tokenizer-mode: deepseek_v4
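# Layout note, as a reading of the flags above (assumes vLLM's usual EP
# width of tensor-parallel-size x data-parallel-size): 1 x 8 = 8 engine
# ranks, one per GPU of the prefill node, with enable-expert-parallel
# sharding the MoE experts across those 8 ranks (matching ep: 8 /
# dp-attn: true in nvidia-master.yaml).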
decode:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
kv-cache-dtype: "fp8"
tensor-parallel-size: 1
pipeline-parallel-size: 1
data-parallel-size: 8
data-parallel-rpc-port: 13345
enable-expert-parallel: true
enable-ep-weight-filter: true
max-model-len: 9280
max-num-seqs: 512
max-cudagraph-capture-size: 512
max-num-batched-tokens: 512
trust-remote-code: true
no-enable-prefix-caching: true
no-enable-flashinfer-autotune: true
block-size: 256
compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
gpu-memory-utilization: 0.9
stream-interval: 50
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true
tokenizer-mode: deepseek_v4
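# Decode-side note (an assumption about vLLM scheduler semantics, not
# stated in this file): with max-num-batched-tokens equal to
# max-num-seqs (512), each step admits at most one token per sequence,
# i.e. pure decode batching, which is also the shape the
# FULL_DECODE_ONLY cudagraph mode above is built for.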
benchmark:
type: "sa-bench"
isl: 8192
osl: 1024
concurrencies: "4096"
req_rate: "inf"
use_chat_template: true
custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer"

identity:
model:
repo: "deepseek-ai/DeepSeek-V4-Pro"
revision: "0366e4e064385807ea86b088a5c6c878ff23343b"
container:
image: "vllm/vllm-openai:v0.20.1"
frameworks:
dynamo: "1.2.0.dev20260426"
vllm: "0.20.0"
137 changes: 137 additions & 0 deletions recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-latency.yaml
@@ -0,0 +1,137 @@
name: "svf-vllm-disagg-b200-low-latency"

# Mirrored from NVIDIA/srt-slurm aflowers/vllm-gb200-v0.20.0 branch:
# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-low-latency.yaml
#
# B200 adaptation of the GB200 recipe above. Each prefill/decode worker uses
# one full 8-GPU B200 node, plus a dedicated NATS/etcd infra node.
#
# Local deltas vs upstream:
# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match
# SRT_SLURM_MODEL_PREFIX in the launch script.
# * model.container set to vllm/vllm-openai:v0.20.1 to
# match nvidia-master.yaml image (which the launch script registers as
# the alias key in srtslurm.yaml). Upstream variants ship either the
# non-dynamo floating tag or a sha256 pin.
# * slurm.time_limit + health_check set to 8h / 1440 attempts to
# absorb cold-cache model loads.
model:
path: "deepseek-v4-pro"
container: "vllm/vllm-openai:v0.20.1"
precision: "fp4"

dynamo:
install: true
wheel: "1.2.0.dev20260426"

setup_script: vllm-container-deps.sh

slurm:
time_limit: "8:00:00"

health_check:
max_attempts: 1440
interval_seconds: 10
resources:
gpu_type: "b200"
gpus_per_node: 8
prefill_nodes: 1
decode_nodes: 1
prefill_workers: 1
decode_workers: 1
gpus_per_prefill: 8
gpus_per_decode: 8
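# Sanity check (same one-worker-per-node assumption as the megamoe
# recipe): 1 prefill node + 1 decode node = 16 GPUs, plus the dedicated
# etcd/NATS node below for three nodes in total.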

infra:
etcd_nats_dedicated_node: true

frontend:
type: dynamo
enable_multiple_frontends: false
backend:
type: vllm
connector: null
prefill_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
NCCL_CUMEM_ENABLE: "1"
VLLM_SERVER_DEV_MODE: "1"
VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048"
# VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
# VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
UCX_MEMTYPE_CACHE: "n"
UCX_MEMTYPE_REG_WHOLE: "n"
decode_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
NCCL_CUMEM_ENABLE: "1"
VLLM_SERVER_DEV_MODE: "1"
# VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
# VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
UCX_MEMTYPE_CACHE: "n"
UCX_MEMTYPE_REG_WHOLE: "n"
vllm_config:
prefill:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
kv-cache-dtype: "fp8"
tensor-parallel-size: 1
pipeline-parallel-size: 1
data-parallel-size: 8
data-parallel-rpc-port: 13345
enable-expert-parallel: true
enforce-eager: true
max-model-len: 16384
max-num-seqs: 16
max-num-batched-tokens: 32768
trust-remote-code: true
no-enable-prefix-caching: true
no-enable-flashinfer-autotune: true
no-async-scheduling: true
block-size: 256
gpu-memory-utilization: 0.8
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true
numa-bind: true
offload-group-size: 3
offload-num-in-group: 1
offload-prefetch-step: 2
# offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts"
tokenizer-mode: deepseek_v4
decode:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
kv-cache-dtype: "fp8"
tensor-parallel-size: 8
pipeline-parallel-size: 1
# data-parallel-size: 8
# data-parallel-rpc-port: 13345
# enable-expert-parallel: true
max-model-len: 16384
max-num-seqs: 256
max-cudagraph-capture-size: 256
max-num-batched-tokens: 256
trust-remote-code: true
no-enable-prefix-caching: true
no-enable-flashinfer-autotune: true
block-size: 256
compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
gpu-memory-utilization: 0.9
stream-interval: 50
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true
tokenizer-mode: deepseek_v4
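# Layout note: unlike the high-tpt recipe, decode here runs a single
# tp-8 engine (tensor-parallel-size: 8 with the data-parallel/EP flags
# commented out above), matching tp: 8 / ep: 1 / dp-attn: false for the
# conc-list [1, 64] entry in nvidia-master.yaml.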
benchmark:
type: "sa-bench"
isl: 8192
osl: 1024
concurrencies: "1"
req_rate: "inf"
use_chat_template: true
custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer"

identity:
container:
image: "vllm/vllm-openai:v0.20.1"
frameworks:
dynamo: "1.2.0.dev20260426"
vllm: "0.20.0"