diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index e8286bff6..1b23be292 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -8313,3 +8313,110 @@ dsv4-fp4-gb300-dynamo-sglang: tp: 12 ep: 12 dp-attn: true + +# MTP variant of dsv4-fp4-gb300-dynamo-sglang. +dsv4-fp4-gb300-dynamo-sglang-mtp: + image: lmsysorg/sglang:nightly-dev-cu13-20260509-9ee83034 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb300-cw + precision: fp4 + framework: dynamo-sglang + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 8192 + osl: 1024 + search-space: + # Low-latency baseline: 1p1d-tp4-tp4. 2 nodes. + - spec-decoding: "mtp" + conc-list: [1] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + # Low-latency 1p6d-dep4-tp4: 1P (DEP=4) + 6 TP=4 decode workers. 7 nodes. + # Recipe runs concurrencies=8x32x64; matrix tracks the max. + - spec-decoding: "mtp" + conc-list: [64] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml" + decode: + num-worker: 6 + tp: 4 + ep: 1 + dp-attn: false + # Mid curve 1p1d-dep4-dep8. 3 nodes. + - spec-decoding: "mtp" + conc-list: [256] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # Mid curve 1p1d-dep4-dep16. 5 nodes. 
+ - spec-decoding: "mtp" + conc-list: [256] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # Mid curve 2p1d-dep4-dep8. 4 nodes. + - spec-decoding: "mtp" + conc-list: [512] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # Mid curve 4p1d-dep4-dep8. 6 nodes. + - spec-decoding: "mtp" + conc-list: [1024] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml new file mode 100644 index 000000000..6e87911cd --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml @@ -0,0 +1,127 @@ +name: "dsv4-pro-gb300-disagg-8k1k-low-latency-1p1d-tp4-tp4-mtp" + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +dynamo: + hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" + install: true + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260509-9ee83034" + precision: "mxfp4" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + 
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec. + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.9 + max-running-requests: 8 + cuda-graph-max-bs: 8 + chunked-prefill-size: 32768 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec. 
+ + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 8 + cuda-graph-max-bs: 8 + swa-full-tokens-ratio: 0.1 + context-length: 16384 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml new file mode 100644 index 000000000..1ddb20a1e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml @@ -0,0 +1,141 @@ +name: "dsv4-pro-gb300-disagg-8k1k-low-latency-1p6d-dep4-tp4-mtp" + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +dynamo: + hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" + install: true + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260509-9ee83034" + precision: "mxfp4" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + 
SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec. 
+ + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.9 + max-running-requests: 128 + cuda-graph-max-bs: 128 + chunked-prefill-size: 32768 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec. + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 128 + cuda-graph-max-bs: 128 + swa-full-tokens-ratio: 0.1 + context-length: 16384 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "8x32x64" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml new file mode 100644 index 000000000..65a597eb8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml @@ -0,0 +1,157 @@ +name: "dsv4-pro-gb300-disagg-8k1k-mid-curve-1p1d-dep4-dep16-mtp" + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +dynamo: + hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" + install: 
true + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260509-9ee83034" + precision: "mxfp4" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 4 + decode_workers: 1 + gpus_per_decode: 16 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "2048" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + 
SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" # CAR_V2 is single-node only. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec. + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.9 + max-running-requests: 256 + cuda-graph-max-bs: 256 + chunked-prefill-size: 32768 + stream-interval: 60 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec. 
+ + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.94 + max-running-requests: 3072 + cuda-graph-max-bs: 256 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + stream-interval: 60 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "256" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml new file mode 100644 index 000000000..b21f60d23 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml @@ -0,0 +1,157 @@ +name: "dsv4-pro-gb300-disagg-8k1k-mid-curve-1p1d-dep4-dep8-mtp" + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +dynamo: + hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" + install: true + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260509-9ee83034" + precision: "mxfp4" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 2 + decode_workers: 1 + gpus_per_decode: 8 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" 
+ SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "2048" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" # CAR_V2 is single-node only. 
+ + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec. + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.9 + max-running-requests: 256 + cuda-graph-max-bs: 256 + chunked-prefill-size: 32768 + stream-interval: 60 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec. + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.94 + max-running-requests: 3072 + cuda-graph-max-bs: 256 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + stream-interval: 60 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "256" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml new file mode 100644 index 000000000..a11086da7 --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml @@ -0,0 +1,157 @@ +name: "dsv4-pro-gb300-disagg-8k1k-mid-curve-2p1d-dep4-dep8-mtp" + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +dynamo: + hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" + install: true + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260509-9ee83034" + precision: "mxfp4" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + decode_nodes: 2 + decode_workers: 1 + gpus_per_decode: 8 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + 
SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" # CAR_V2 is single-node only. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec. + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.9 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + stream-interval: 60 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec. 
+ + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 3072 + cuda-graph-max-bs: 512 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + stream-interval: 60 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "512" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml new file mode 100644 index 000000000..e7054c288 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml @@ -0,0 +1,157 @@ +name: "dsv4-pro-gb300-disagg-8k1k-mid-curve-4p1d-dep4-dep8-mtp" + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +dynamo: + hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" + install: true + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260509-9ee83034" + precision: "mxfp4" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 4 + prefill_workers: 4 + gpus_per_prefill: 4 + decode_nodes: 2 + decode_workers: 1 + gpus_per_decode: 8 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + 
SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" # CAR_V2 is single-node only. 
+ + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec. + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.9 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + chunked-prefill-size: 32768 + stream-interval: 60 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec. + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 3072 + cuda-graph-max-bs: 1024 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + stream-interval: 60 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1024" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 715d6f177..160f160af 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2300,6 +2300,14 @@ - "Re-enable lm-eval scoring for dsv4-fp4-gb300-dynamo-sglang now that the srt-slurm pin includes the lm-eval orchestrator path" pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/1295 +- config-keys: + - dsv4-fp4-gb300-dynamo-sglang-mtp + description: + - "Add DeepSeek-V4-Pro FP4 GB300 disaggregated SGLang MTP coverage using lmsysorg/sglang:nightly-dev-cu13-20260509-9ee83034" + - "Mirror the base 8k/1k disagg search space (1p1d-tp4, 1p6d-dep4-tp4, 1p1d-dep4-dep8, 1p1d-dep4-dep16, 2p1d-dep4-dep8, 4p1d-dep4-dep8) with spec-decoding: mtp and EAGLE on the decode side (num-steps=3, eagle-topk=1, num-draft-tokens=4)" + - "Recipes adapted from elvischenv/srt-slurm@dsv4-gb300-disagg-8k1k-mtp:recipes/dsv4-pro/sglang/gb300-fp4/8k1k/disagg/mtp/, repointed at the public lmsysorg/sglang nightly container and the deepseek-v4-pro model alias" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1297 + - config-keys: - dsv4-fp4-mi355x-sglang description: diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index 8c78613f1..38374475c 100644 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -12,8 +12,14 @@ if [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then export MODEL_PATH="/scratch/models/dsv4/" if [[ $FRAMEWORK == "dynamo-sglang" ]]; then - SRT_SLURM_RECIPES_REPO="https://github.com/NVIDIA/srt-slurm.git" - SRT_SLURM_RECIPES_REF="main" + # Pinned to ch-wan/srt-slurm fork @ cwan/fix-sglang-dsv4-tokenizer-callable + # while NVIDIA/srt-slurm#144 (sa-bench: make SGLangDeepseekV4Tokenizer + # callable) is in review. Without the fix, multi-node DSv4-Pro MTP + # sa-bench runs fail in calculate_metrics with + # ``TypeError: 'SGLangDeepseekV4Tokenizer' object is not callable``. + # Revert to ``NVIDIA/srt-slurm.git`` @ ``main`` once #144 merges. + SRT_SLURM_RECIPES_REPO="https://github.com/ch-wan/srt-slurm.git" + SRT_SLURM_RECIPES_REF="518476323f19226513a1691714c550582cffe343" SRT_RECIPE_SRC="$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" SRT_RECIPE_DST="recipes/sglang/deepseek-v4" elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then