56 changes: 56 additions & 0 deletions .github/configs/nvidia-master.yaml
@@ -8178,6 +8178,62 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2:
        ep: 8
        dp-attn: true

dsv4-fp4-b300-dynamo-vllm:
  image: vllm/vllm-openai:v0.20.1
  model: deepseek-ai/DeepSeek-V4-Pro
  model-prefix: dsv4
  runner: b300
  precision: fp4
  framework: dynamo-vllm
  multinode: true
  disagg: true
  scenarios:
    fixed-seq-len:
      - isl: 8192
        osl: 1024
  search-space:
    # B300 adaptation of the DSV4 B200/GB200 vLLM disagg recipes. Each
    # prefill/decode worker maps to one full 8-GPU B300 node.
    - conc-list: [1, 64, 128]
      prefill:
        num-worker: 1
        tp: 8
        ep: 8
        dp-attn: true
        additional-settings:
          - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-latency.yaml"
      decode:
        num-worker: 1
        tp: 8
        ep: 1
        dp-attn: false
    - conc-list: [128, 256, 512, 1024]
      prefill:
        num-worker: 1
        tp: 8
        ep: 8
        dp-attn: true
        additional-settings:
          - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-b300-mid-curve-megamoe.yaml"
      decode:
        num-worker: 1
        tp: 8
        ep: 8
        dp-attn: true
    - conc-list: [4096, 8192]
      prefill:
        num-worker: 2
        tp: 8
        ep: 8
        dp-attn: true
        additional-settings:
          - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml"
      decode:
        num-worker: 1
        tp: 8
        ep: 8
        dp-attn: true

dsv4-fp4-gb300-dynamo-vllm:
  image: vllm/vllm-openai:v0.20.0-ubuntu2404
  model: deepseek-ai/DeepSeek-V4-Pro
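Each search-space tier above pairs a concurrency list with a recipe file, and the recipe's `benchmark.concurrencies` field repeats the same list as an `x`-separated string ("1x64x128" for the `[1, 64, 128]` tier, "4096x8192" for `[4096, 8192]`). A minimal consistency-check sketch, assuming plain PyYAML and the relative paths used above; `check_tier` and `parse_concurrencies` are illustrative helpers, not part of the harness:

```python
import yaml  # PyYAML

def parse_concurrencies(spec: str) -> list[int]:
    """Split an 'x'-separated concurrency string like '1x64x128' into ints."""
    return [int(tok) for tok in spec.split("x")]

def check_tier(recipe_path: str, conc_list: list[int]) -> None:
    """Assert a master-config tier and its recipe agree on the sweep."""
    with open(recipe_path) as f:
        recipe = yaml.safe_load(f)
    recorded = parse_concurrencies(recipe["benchmark"]["concurrencies"])
    assert recorded == conc_list, f"{recipe_path}: {recorded} != {conc_list}"

# Tiers as declared in nvidia-master.yaml (the mid-curve recipe referenced by
# the middle tier is not shown in this diff).
check_tier("recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-latency.yaml", [1, 64, 128])
check_tier("recipes/vllm/deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml", [4096, 8192])
```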
130 changes: 130 additions & 0 deletions recipes/vllm/deepseek-v4/8k1k/disagg-b300-high-tpt-megamoe.yaml
@@ -0,0 +1,130 @@
name: "svf-vllm-disagg-b300-high-tpt-megamoe"

# B300 adaptation of the DSV4 GB200/B200 vLLM disagg recipe. Each worker uses
# one full 8-GPU B300 node, plus a dedicated NATS/etcd infra node.
model:
path: "deepseek-v4-pro"
container: "vllm/vllm-openai:v0.20.1"
precision: "fp4"

dynamo:
install: true
wheel: "1.2.0.dev20260426"

setup_script: vllm-container-deps.sh

slurm:
time_limit: "8:00:00"

health_check:
max_attempts: 1440
interval_seconds: 10

resources:
gpu_type: "b300"
gpus_per_node: 8
prefill_nodes: 2
decode_nodes: 1
prefill_workers: 2
decode_workers: 1
gpus_per_prefill: 8
gpus_per_decode: 8

infra:
etcd_nats_dedicated_node: true

frontend:
type: dynamo
enable_multiple_frontends: false

backend:
type: vllm
connector: null
prefill_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
NCCL_CUMEM_ENABLE: "1"
VLLM_SERVER_DEV_MODE: "1"
VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048"
UCX_MEMTYPE_CACHE: "n"
UCX_MEMTYPE_REG_WHOLE: "n"
decode_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
NCCL_CUMEM_ENABLE: "1"
VLLM_SERVER_DEV_MODE: "1"
UCX_MEMTYPE_CACHE: "n"
UCX_MEMTYPE_REG_WHOLE: "n"
vllm_config:
prefill:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
kv-cache-dtype: "fp8"
tensor-parallel-size: 1
pipeline-parallel-size: 1
data-parallel-size: 8
data-parallel-rpc-port: 13345
enable-expert-parallel: true
enable-ep-weight-filter: true
attention-config: '{"use_fp4_indexer_cache": true}'
enforce-eager: true
max-model-len: 9280
max-num-seqs: 16
max-cudagraph-capture-size: 2048
max-num-batched-tokens: 32768
trust-remote-code: true
no-enable-prefix-caching: true
no-enable-flashinfer-autotune: true
no-async-scheduling: true
block-size: 256
compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'
gpu-memory-utilization: 0.85
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true
numa-bind: true
tokenizer-mode: deepseek_v4
reasoning-parser: deepseek_v4
decode:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
kv-cache-dtype: "fp8"
tensor-parallel-size: 1
pipeline-parallel-size: 1
data-parallel-size: 8
data-parallel-rpc-port: 13345
enable-expert-parallel: true
enable-ep-weight-filter: true
attention-config: '{"use_fp4_indexer_cache": true}'
max-model-len: 9280
max-num-seqs: 512
max-cudagraph-capture-size: 2048
max-num-batched-tokens: 512
trust-remote-code: true
no-enable-prefix-caching: true
no-enable-flashinfer-autotune: true
block-size: 256
compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'
gpu-memory-utilization: 0.85
stream-interval: 50
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true
tokenizer-mode: deepseek_v4
reasoning-parser: deepseek_v4

benchmark:
type: "sa-bench"
isl: 8192
osl: 1024
concurrencies: "4096x8192"
req_rate: "inf"
use_chat_template: true
custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer"

identity:
model:
repo: "deepseek-ai/DeepSeek-V4-Pro"
revision: "0366e4e064385807ea86b088a5c6c878ff23343b"
container:
image: "vllm/vllm-openai:v0.20.1"
frameworks:
dynamo: "1.2.0.dev20260426"
vllm: "0.20.1"
124 changes: 124 additions & 0 deletions recipes/vllm/deepseek-v4/8k1k/disagg-b300-low-latency.yaml
@@ -0,0 +1,124 @@
name: "svf-vllm-disagg-b300-low-latency"

# B300 adaptation of the DSV4 GB200/B200 vLLM disagg recipe. Each worker uses
# one full 8-GPU B300 node, plus a dedicated NATS/etcd infra node.
model:
path: "deepseek-v4-pro"
container: "vllm/vllm-openai:v0.20.1"
precision: "fp4"

dynamo:
install: true
wheel: "1.2.0.dev20260426"

setup_script: vllm-container-deps.sh

slurm:
time_limit: "8:00:00"

health_check:
max_attempts: 1440
interval_seconds: 10

resources:
gpu_type: "b300"
gpus_per_node: 8
prefill_nodes: 1
decode_nodes: 1
prefill_workers: 1
decode_workers: 1
gpus_per_prefill: 8
gpus_per_decode: 8

infra:
etcd_nats_dedicated_node: true

frontend:
type: dynamo
enable_multiple_frontends: false

backend:
type: vllm
connector: null
prefill_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
NCCL_CUMEM_ENABLE: "1"
VLLM_SERVER_DEV_MODE: "1"
VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048"
UCX_MEMTYPE_CACHE: "n"
UCX_MEMTYPE_REG_WHOLE: "n"
decode_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
NCCL_CUMEM_ENABLE: "1"
VLLM_SERVER_DEV_MODE: "1"
UCX_MEMTYPE_CACHE: "n"
UCX_MEMTYPE_REG_WHOLE: "n"
vllm_config:
prefill:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
kv-cache-dtype: "fp8"
tensor-parallel-size: 1
pipeline-parallel-size: 1
data-parallel-size: 8
data-parallel-rpc-port: 13345
enable-expert-parallel: true
attention-config: '{"use_fp4_indexer_cache": true}'
enforce-eager: true
max-model-len: 16384
max-num-seqs: 16
max-cudagraph-capture-size: 2048
max-num-batched-tokens: 32768
trust-remote-code: true
no-enable-prefix-caching: true
no-enable-flashinfer-autotune: true
no-async-scheduling: true
block-size: 256
compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'
gpu-memory-utilization: 0.8
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true
numa-bind: true
offload-group-size: 3
offload-num-in-group: 1
offload-prefetch-step: 2
tokenizer-mode: deepseek_v4
reasoning-parser: deepseek_v4
decode:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
kv-cache-dtype: "fp8"
tensor-parallel-size: 8
pipeline-parallel-size: 1
max-model-len: 16384
max-num-seqs: 256
max-cudagraph-capture-size: 2048
max-num-batched-tokens: 256
trust-remote-code: true
no-enable-prefix-caching: true
no-enable-flashinfer-autotune: true
block-size: 256
compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'
gpu-memory-utilization: 0.9
stream-interval: 50
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true
tokenizer-mode: deepseek_v4
reasoning-parser: deepseek_v4

benchmark:
type: "sa-bench"
isl: 8192
osl: 1024
concurrencies: "1x64x128"
req_rate: "inf"
use_chat_template: true
custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer"

identity:
container:
image: "vllm/vllm-openai:v0.20.1"
frameworks:
dynamo: "1.2.0.dev20260426"
vllm: "0.20.1"