Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 107 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8313,3 +8313,110 @@ dsv4-fp4-gb300-dynamo-sglang:
tp: 12
ep: 12
dp-attn: true

# MTP variant of dsv4-fp4-gb300-dynamo-sglang.
# Each search-space entry below pins one disaggregated topology
# (prefill/decode worker count, TP/EP width, DP-attention flag) to the
# recipe file that launches it; conc-list is the concurrency tracked for
# that entry in the results matrix (the max when the recipe sweeps several).
dsv4-fp4-gb300-dynamo-sglang-mtp:
image: lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: gb300-cw
precision: fp4
framework: dynamo-sglang
# Multi-node run with disaggregated (separate prefill/decode) serving.
multinode: true
disagg: true
# Benchmark scenario: fixed 8192-token input / 1024-token output lengths.
scenarios:
fixed-seq-len:
- isl: 8192
osl: 1024
search-space:
# Low-latency baseline: 1p1d-tp4-tp4. 2 nodes.
- spec-decoding: "mtp"
conc-list: [1]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml"
decode:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
# Low-latency 1p6d-dep4-tp4: 1P (DEP=4) + 6 TP=4 decode workers. 7 nodes.
# Recipe runs concurrencies=8x32x64; matrix tracks the max.
- spec-decoding: "mtp"
conc-list: [64]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml"
decode:
num-worker: 6
tp: 4
ep: 1
dp-attn: false
# Mid curve 1p1d-dep4-dep8. 3 nodes.
- spec-decoding: "mtp"
conc-list: [256]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
# Mid curve 1p1d-dep4-dep16. 5 nodes.
- spec-decoding: "mtp"
conc-list: [256]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true
# Mid curve 2p1d-dep4-dep8. 4 nodes.
- spec-decoding: "mtp"
conc-list: [512]
prefill:
num-worker: 2
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
# Mid curve 4p1d-dep4-dep8. 6 nodes.
- spec-decoding: "mtp"
conc-list: [1024]
prefill:
num-worker: 4
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
# Disaggregated low-latency recipe: DeepSeek-V4-Pro (mxfp4) on GB300,
# 8k-in / 1k-out, one TP=4 prefill worker + one TP=4 decode worker, with
# MTP (EAGLE-algo) speculative decoding enabled on the decode side only.
name: "dsv4-pro-gb300-disagg-8k1k-low-latency-1p1d-tp4-tp4-mtp"

frontend:
  type: dynamo
  enable_multiple_frontends: true
  num_additional_frontends: 8

dynamo:
  # Exact dynamo commit to check out and install inside the container.
  hash: "81d0555ee23519cea80a42b4fe824e30368b7300"
  install: true

model:
  path: "deepseek-v4-pro"
  container: "lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab"
  precision: "mxfp4"

# Slurm sbatch overrides (kept as strings so sbatch sees them verbatim;
# --mem=0 asks Slurm for all memory on the node).
sbatch_directives:
  cpus-per-task: "144"
  mem: "0"

resources:
  gpu_type: "gb300"
  gpus_per_node: 4
  prefill_nodes: 1
  prefill_workers: 1
  decode_nodes: 1
  decode_workers: 1

backend:
  type: sglang

prefill_environment:
  PYTHONUNBUFFERED: "1"
  SGLANG_RADIX_DISABLE_REUSE: "1"
  SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
  SGLANG_DEFAULT_THINKING: "1"
  SGLANG_DSV4_REASONING_EFFORT: "max"
  SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
  SGLANG_OPT_USE_JIT_NORM: "1"
  SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
  SGLANG_OPT_USE_TOPK_V2: "1"
  NCCL_MNNVL_ENABLE: "1"
  NCCL_CUMEM_ENABLE: "1"
  SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
  MC_FORCE_MNNVL: "1"
  SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
  SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"

decode_environment:
  PYTHONUNBUFFERED: "1"
  SGLANG_RADIX_DISABLE_REUSE: "1"
  SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
  SGLANG_DEFAULT_THINKING: "1"
  SGLANG_DSV4_REASONING_EFFORT: "max"
  SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
  SGLANG_OPT_USE_JIT_NORM: "1"
  SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
  SGLANG_OPT_USE_TOPK_V2: "1"
  NCCL_MNNVL_ENABLE: "1"
  NCCL_CUMEM_ENABLE: "1"
  SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
  MC_FORCE_MNNVL: "1"
  SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
  SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
  # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 is deliberately omitted here:
  # CAR_V2 is single-node only and corrupts results in 2-node decode setups.

sglang_config:
  prefill:
    served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
    model-path: "/model/"
    trust-remote-code: true
    tool-call-parser: deepseekv4  # gates the dsv4 chat-encoding spec.

    disaggregation-mode: "prefill"
    disaggregation-transfer-backend: mooncake

    tensor-parallel-size: 4
    data-parallel-size: 1
    expert-parallel-size: 1

    moe-runner-backend: "flashinfer_mxfp4"
    disable-flashinfer-autotune: true

    mem-fraction-static: 0.9
    max-running-requests: 8
    cuda-graph-max-bs: 8
    chunked-prefill-size: 32768

  decode:
    served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
    model-path: "/model/"
    trust-remote-code: true
    tool-call-parser: deepseekv4  # gates the dsv4 chat-encoding spec.

    disaggregation-mode: "decode"
    disaggregation-transfer-backend: mooncake

    tensor-parallel-size: 4
    data-parallel-size: 1
    expert-parallel-size: 1

    moe-runner-backend: "flashinfer_mxfp4"
    disable-flashinfer-autotune: true

    # Speculative decoding (the MTP part of this recipe's name).
    speculative-algo: "EAGLE"
    speculative-num-steps: 3
    speculative-eagle-topk: 1
    speculative-num-draft-tokens: 4

    mem-fraction-static: 0.9
    max-running-requests: 8
    cuda-graph-max-bs: 8
    swa-full-tokens-ratio: 0.1
    context-length: 16384

benchmark:
  type: "sa-bench"
  isl: 8192
  osl: 1024
  random_range_ratio: 0.8
  concurrencies: "1"
  req_rate: "inf"
  use_chat_template: true
  custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer"
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# Disaggregated low-latency recipe: DeepSeek-V4-Pro (mxfp4) on GB300,
# 8k-in / 1k-out, one DEP=4 prefill worker + six TP=4 decode workers
# (7 nodes total at 4 GPUs/node), with MTP (EAGLE-algo) speculative
# decoding enabled on the decode side only.
name: "dsv4-pro-gb300-disagg-8k1k-low-latency-1p6d-dep4-tp4-mtp"

frontend:
  type: dynamo
  enable_multiple_frontends: true
  num_additional_frontends: 8

dynamo:
  # Exact dynamo commit to check out and install inside the container.
  hash: "81d0555ee23519cea80a42b4fe824e30368b7300"
  install: true

model:
  path: "deepseek-v4-pro"
  container: "lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab"
  precision: "mxfp4"

# Slurm sbatch overrides (kept as strings so sbatch sees them verbatim;
# --mem=0 asks Slurm for all memory on the node).
sbatch_directives:
  cpus-per-task: "144"
  mem: "0"

resources:
  gpu_type: "gb300"
  gpus_per_node: 4
  prefill_nodes: 1
  prefill_workers: 1
  decode_nodes: 6
  decode_workers: 6

backend:
  type: sglang

prefill_environment:
  PYTHONUNBUFFERED: "1"
  SGLANG_RADIX_DISABLE_REUSE: "1"
  SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
  SGLANG_DEFAULT_THINKING: "1"
  SGLANG_DSV4_REASONING_EFFORT: "max"
  SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
  SGLANG_OPT_USE_JIT_NORM: "1"
  SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
  SGLANG_OPT_USE_TOPK_V2: "1"

  # Prefill-only knobs: CAR_V2 is safe here because prefill is a single
  # node, plus the DeepGEMM mega-MoE path and its sizing/fix switches.
  SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
  SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
  SGLANG_OPT_USE_FAST_MASK_EP: "1"
  SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
  SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
  SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216"
  SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
  SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
  SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"

  NCCL_MNNVL_ENABLE: "1"
  NCCL_CUMEM_ENABLE: "1"
  SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
  MC_FORCE_MNNVL: "1"
  SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
  SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"

decode_environment:
  PYTHONUNBUFFERED: "1"
  SGLANG_RADIX_DISABLE_REUSE: "1"
  SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
  SGLANG_DEFAULT_THINKING: "1"
  SGLANG_DSV4_REASONING_EFFORT: "max"
  SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
  SGLANG_OPT_USE_JIT_NORM: "1"
  SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
  SGLANG_OPT_USE_TOPK_V2: "1"
  NCCL_MNNVL_ENABLE: "1"
  NCCL_CUMEM_ENABLE: "1"
  SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
  MC_FORCE_MNNVL: "1"
  SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
  SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
  # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 is deliberately omitted here:
  # CAR_V2 is single-node only and corrupts results in 2-node decode setups.

sglang_config:
  prefill:
    served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
    model-path: "/model/"
    trust-remote-code: true
    tool-call-parser: deepseekv4  # gates the dsv4 chat-encoding spec.

    disaggregation-mode: "prefill"
    disaggregation-transfer-backend: mooncake

    # DEP=4 prefill: TP=4 with DP attention and expert parallelism.
    tensor-parallel-size: 4
    data-parallel-size: 4
    expert-parallel-size: 4

    enable-dp-attention: true
    enable-dp-lm-head: true

    moe-a2a-backend: "deepep"
    deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'

    mem-fraction-static: 0.9
    max-running-requests: 128
    cuda-graph-max-bs: 128
    chunked-prefill-size: 32768

  decode:
    served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
    model-path: "/model/"
    trust-remote-code: true
    tool-call-parser: deepseekv4  # gates the dsv4 chat-encoding spec.

    disaggregation-mode: "decode"
    disaggregation-transfer-backend: mooncake

    tensor-parallel-size: 4
    data-parallel-size: 1
    expert-parallel-size: 1

    moe-runner-backend: "flashinfer_mxfp4"
    disable-flashinfer-autotune: true

    # Speculative decoding (the MTP part of this recipe's name).
    speculative-algo: "EAGLE"
    speculative-num-steps: 3
    speculative-eagle-topk: 1
    speculative-num-draft-tokens: 4

    mem-fraction-static: 0.9
    max-running-requests: 128
    cuda-graph-max-bs: 128
    swa-full-tokens-ratio: 0.1
    context-length: 16384

benchmark:
  type: "sa-bench"
  isl: 8192
  osl: 1024
  random_range_ratio: 0.8
  # Swept concurrencies; the master matrix tracks the max (64).
  concurrencies: "8x32x64"
  req_rate: "inf"
  use_chat_template: true
  custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer"
Loading