From ce53cf13d1dd4d0df4289ed689a02f24fb144862 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Thu, 7 May 2026 23:05:54 -0700 Subject: [PATCH 01/19] add mtp configs --- .github/configs/nvidia-master.yaml | 107 ++++++++++++ .../8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml | 2 - .../mtp/disagg-low-latency-1p1d-tp4-tp4.yaml | 121 ++++++++++++++ .../mtp/disagg-low-latency-1p6d-dep4-tp4.yaml | 135 ++++++++++++++++ .../mtp/disagg-mid-curve-1p1d-dep4-dep16.yaml | 152 ++++++++++++++++++ .../mtp/disagg-mid-curve-1p1d-dep4-dep8.yaml | 152 ++++++++++++++++++ .../mtp/disagg-mid-curve-2p1d-dep4-dep8.yaml | 152 ++++++++++++++++++ .../mtp/disagg-mid-curve-4p1d-dep4-dep8.yaml | 152 ++++++++++++++++++ perf-changelog.yaml | 8 + 9 files changed, 979 insertions(+), 2 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p1d-tp4-tp4.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p6d-dep4-tp4.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep16.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep8.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-2p1d-dep4-dep8.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-4p1d-dep4-dep8.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index e8286bff6..a47a4be13 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -8313,3 +8313,110 @@ dsv4-fp4-gb300-dynamo-sglang: tp: 12 ep: 12 dp-attn: true + +# MTP variant of dsv4-fp4-gb300-dynamo-sglang. 
+dsv4-fp4-gb300-dynamo-sglang-mtp: + image: lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb300-cw + precision: fp4 + framework: dynamo-sglang + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 8192 + osl: 1024 + search-space: + # Low-latency baseline: 1p1d-tp4-tp4. 2 nodes. + - spec-decoding: "mtp" + conc-list: [1] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p1d-tp4-tp4.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + # Low-latency 1p6d-dep4-tp4: 1P (DEP=4) + 6 TP=4 decode workers. 7 nodes. + # Recipe runs concurrencies=8x32x64; matrix tracks the max. + - spec-decoding: "mtp" + conc-list: [64] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p6d-dep4-tp4.yaml" + decode: + num-worker: 6 + tp: 4 + ep: 1 + dp-attn: false + # Mid curve 1p1d-dep4-dep8. 3 nodes. + - spec-decoding: "mtp" + conc-list: [256] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep8.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # Mid curve 1p1d-dep4-dep16. 5 nodes. + - spec-decoding: "mtp" + conc-list: [256] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # Mid curve 2p1d-dep4-dep8. 4 nodes. 
+ - spec-decoding: "mtp" + conc-list: [512] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-2p1d-dep4-dep8.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # Mid curve 4p1d-dep4-dep8. 6 nodes. + - spec-decoding: "mtp" + conc-list: [1024] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-4p1d-dep4-dep8.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml index 1e6d8cc37..643693a1a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml @@ -121,7 +121,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true - disable-radix-cache: true disaggregation-mode: "prefill" disaggregation-transfer-backend: mooncake @@ -142,7 +141,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true - disable-radix-cache: true disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p1d-tp4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p1d-tp4-tp4.yaml new file mode 100644 index 000000000..c83f3e8c6 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p1d-tp4-tp4.yaml @@ -0,0 +1,121 @@ +name: "dsv4-pro-gb300-disagg-8k1k-low-latency-1p1d-tp4-tp4-mtp" + +frontend: + 
type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +dynamo: + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + install: true + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 is single-node only and corrupts results in + # multi-node decode setups. NOTE(review): decode here is a single node (TP=4) - confirm the flag must stay off.
+ + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.9 + max-running-requests: 8 + cuda-graph-max-bs: 8 + chunked-prefill-size: 32768 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 8 + cuda-graph-max-bs: 8 + swa-full-tokens-ratio: 0.1 + context-length: 16384 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p6d-dep4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p6d-dep4-tp4.yaml new file mode 100644 index 000000000..c5ad99b5d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p6d-dep4-tp4.yaml @@ -0,0 +1,135 @@ +name: "dsv4-pro-gb300-disagg-8k1k-low-latency-1p6d-dep4-tp4-mtp" + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +dynamo: + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + install: true + +model: + path: "deepseek-v4-pro" 
+ container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 is single-node only and corrupts results in + # multi-node decode setups. NOTE(review): each decode worker here is one node (TP=4) - confirm it must stay off.
+ + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.9 + max-running-requests: 128 + cuda-graph-max-bs: 128 + chunked-prefill-size: 32768 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 128 + cuda-graph-max-bs: 128 + swa-full-tokens-ratio: 0.1 + context-length: 16384 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "8x32x64" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep16.yaml new file mode 100644 index 000000000..6af7bb523 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep16.yaml @@ -0,0 +1,152 @@ +name: "dsv4-pro-gb300-disagg-8k1k-mid-curve-1p1d-dep4-dep16-mtp" + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + 
+dynamo: + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + install: true + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 4 + decode_workers: 1 + gpus_per_decode: 16 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "2048" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + 
SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in multi-node decode setups (4 nodes here). + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.9 + max-running-requests: 256 + cuda-graph-max-bs: 256 + chunked-prefill-size: 32768 + stream-interval: 60 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.94 + max-running-requests: 3072 + cuda-graph-max-bs: 256 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + stream-interval: 60 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "256" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: 
"sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep8.yaml new file mode 100644 index 000000000..0bcd03a65 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep8.yaml @@ -0,0 +1,152 @@ +name: "dsv4-pro-gb300-disagg-8k1k-mid-curve-1p1d-dep4-dep8-mtp" + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +dynamo: + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + install: true + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 2 + decode_workers: 1 + gpus_per_decode: 8 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + 
SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "2048" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. 
+ + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.9 + max-running-requests: 256 + cuda-graph-max-bs: 256 + chunked-prefill-size: 32768 + stream-interval: 60 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.94 + max-running-requests: 3072 + cuda-graph-max-bs: 256 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + stream-interval: 60 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "256" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-2p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-2p1d-dep4-dep8.yaml new file mode 100644 index 000000000..afb1d9733 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-2p1d-dep4-dep8.yaml @@ -0,0 +1,152 @@ +name: 
"dsv4-pro-gb300-disagg-8k1k-mid-curve-2p1d-dep4-dep8-mtp" + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +dynamo: + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + install: true + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + decode_nodes: 2 + decode_workers: 1 + gpus_per_decode: 8 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: 
"1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.9 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + stream-interval: 60 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 3072 + cuda-graph-max-bs: 512 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + stream-interval: 60 + +benchmark: + type: "sa-bench" + isl: 
8192 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "512" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-4p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-4p1d-dep4-dep8.yaml new file mode 100644 index 000000000..8fdb6c903 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-4p1d-dep4-dep8.yaml @@ -0,0 +1,152 @@ +name: "dsv4-pro-gb300-disagg-8k1k-mid-curve-4p1d-dep4-dep8-mtp" + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +dynamo: + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + install: true + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 4 + prefill_workers: 4 + gpus_per_prefill: 4 + decode_nodes: 2 + decode_workers: 1 + gpus_per_decode: 8 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + 
MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. 
+ + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.9 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + chunked-prefill-size: 32768 + stream-interval: 60 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 3072 + cuda-graph-max-bs: 1024 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + stream-interval: 60 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1024" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index e05c08e35..fff4e544a 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2299,3 +2299,11 @@ - "Rename and consolidate the per-concurrency recipe files to `disagg-gb300-{N}p1d-{topo}-{nodes}-c{conc}.yaml`" - "Re-enable lm-eval scoring for dsv4-fp4-gb300-dynamo-sglang now that the srt-slurm pin includes the lm-eval orchestrator path" pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/1295 + +- config-keys: + - dsv4-fp4-gb300-dynamo-sglang-mtp + description: + - "Add DeepSeek-V4-Pro FP4 GB300 disaggregated SGLang MTP coverage using lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" + - "Mirror the base 8k/1k disagg search space (1p1d-tp4, 1p6d-dep4-tp4, 1p1d-dep4-dep8, 1p1d-dep4-dep16, 2p1d-dep4-dep8, 4p1d-dep4-dep8) with spec-decoding: mtp and EAGLE on the decode side (num-steps=3, eagle-topk=1, num-draft-tokens=4)" + - "Recipes adapted from elvischenv/srt-slurm@dsv4-gb300-disagg-8k1k-mtp:recipes/dsv4-pro/sglang/gb300-fp4/8k1k/disagg/mtp/, repointed at the public sglang-staging container and the deepseek-v4-pro model alias" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1297 From debb7b9761a11bc2762886567bd1d34c53d59098 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Fri, 8 May 2026 00:23:14 -0700 Subject: [PATCH 02/19] Add sbatch_directives to MTP recipes (root-cause fix) Without `cpus-per-task: 144` and `mem: 0`, slurm hands out 1 CPU and ~4 MB per task, and the dynamo cold source build (~500 rust crates) is OOM-killed before any worker comes up. Manifests as `Sweep failed (exit code: 137)` ~30 s after orchestrator start. Mirrors the block already present in the working main 8k1k recipes (e.g. disagg-gb300-1p1d-tp4-tp4-2-c1.yaml). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../deepseek-v4/8k1k/mtp/disagg-low-latency-1p1d-tp4-tp4.yaml | 4 ++++ .../8k1k/mtp/disagg-low-latency-1p6d-dep4-tp4.yaml | 4 ++++ .../8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep16.yaml | 4 ++++ .../deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep8.yaml | 4 ++++ .../deepseek-v4/8k1k/mtp/disagg-mid-curve-2p1d-dep4-dep8.yaml | 4 ++++ .../deepseek-v4/8k1k/mtp/disagg-mid-curve-4p1d-dep4-dep8.yaml | 4 ++++ 6 files changed, 24 insertions(+) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p1d-tp4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p1d-tp4-tp4.yaml index c83f3e8c6..45f4c463f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p1d-tp4-tp4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p1d-tp4-tp4.yaml @@ -14,6 +14,10 @@ model: container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" precision: "mxfp4" +sbatch_directives: + cpus-per-task: "144" + mem: "0" + resources: gpu_type: "gb300" gpus_per_node: 4 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p6d-dep4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p6d-dep4-tp4.yaml index c5ad99b5d..5a49a22bd 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p6d-dep4-tp4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p6d-dep4-tp4.yaml @@ -14,6 +14,10 @@ model: container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" precision: "mxfp4" +sbatch_directives: + cpus-per-task: "144" + mem: "0" + resources: gpu_type: "gb300" gpus_per_node: 4 diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep16.yaml index 6af7bb523..c2ab03970 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep16.yaml @@ -14,6 +14,10 @@ model: container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" precision: "mxfp4" +sbatch_directives: + cpus-per-task: "144" + mem: "0" + resources: gpu_type: "gb300" gpus_per_node: 4 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep8.yaml index 0bcd03a65..a6799fdfc 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep8.yaml @@ -14,6 +14,10 @@ model: container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" precision: "mxfp4" +sbatch_directives: + cpus-per-task: "144" + mem: "0" + resources: gpu_type: "gb300" gpus_per_node: 4 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-2p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-2p1d-dep4-dep8.yaml index afb1d9733..2712c48d4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-2p1d-dep4-dep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-2p1d-dep4-dep8.yaml @@ -14,6 +14,10 @@ model: container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" precision: "mxfp4" 
+sbatch_directives: + cpus-per-task: "144" + mem: "0" + resources: gpu_type: "gb300" gpus_per_node: 4 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-4p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-4p1d-dep4-dep8.yaml index 8fdb6c903..fc0b7e5f3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-4p1d-dep4-dep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-4p1d-dep4-dep8.yaml @@ -14,6 +14,10 @@ model: container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" precision: "mxfp4" +sbatch_directives: + cpus-per-task: "144" + mem: "0" + resources: gpu_type: "gb300" gpus_per_node: 4 From 59f899decf4b3c1dea6b0ac166e1e5494ae93586 Mon Sep 17 00:00:00 2001 From: Baizhou Zhang Date: Fri, 8 May 2026 01:31:03 -0700 Subject: [PATCH 03/19] Change deepgemm flags --- .../deepseek-v4/8k1k/mtp/disagg-low-latency-1p1d-tp4-tp4.yaml | 4 ++-- .../8k1k/mtp/disagg-low-latency-1p6d-dep4-tp4.yaml | 4 ++-- .../8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep16.yaml | 4 ++-- .../deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep8.yaml | 4 ++-- .../deepseek-v4/8k1k/mtp/disagg-mid-curve-2p1d-dep4-dep8.yaml | 4 ++-- .../deepseek-v4/8k1k/mtp/disagg-mid-curve-4p1d-dep4-dep8.yaml | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p1d-tp4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p1d-tp4-tp4.yaml index 45f4c463f..f77afd6fc 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p1d-tp4-tp4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p1d-tp4-tp4.yaml @@ -32,7 +32,7 @@ backend: prefill_environment: PYTHONUNBUFFERED: "1" 
SGLANG_RADIX_DISABLE_REUSE: "1" - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_ENABLE_THINKING: "1" SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" @@ -50,7 +50,7 @@ backend: decode_environment: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_ENABLE_THINKING: "1" SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p6d-dep4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p6d-dep4-tp4.yaml index 5a49a22bd..f1ba0f1d1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p6d-dep4-tp4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p6d-dep4-tp4.yaml @@ -32,7 +32,7 @@ backend: prefill_environment: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_ENABLE_THINKING: "1" SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" @@ -61,7 +61,7 @@ backend: decode_environment: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_ENABLE_THINKING: "1" SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep16.yaml index c2ab03970..ff646668d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep16.yaml +++ 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep16.yaml @@ -34,7 +34,7 @@ backend: prefill_environment: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_ENABLE_THINKING: "1" SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" @@ -63,7 +63,7 @@ backend: decode_environment: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_ENABLE_THINKING: "1" SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep8.yaml index a6799fdfc..fe75e7a66 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep8.yaml @@ -34,7 +34,7 @@ backend: prefill_environment: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_ENABLE_THINKING: "1" SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" @@ -63,7 +63,7 @@ backend: decode_environment: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_ENABLE_THINKING: "1" SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-2p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-2p1d-dep4-dep8.yaml index 2712c48d4..0508f4e18 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-2p1d-dep4-dep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-2p1d-dep4-dep8.yaml @@ -34,7 +34,7 @@ backend: prefill_environment: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_ENABLE_THINKING: "1" SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" @@ -63,7 +63,7 @@ backend: decode_environment: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_ENABLE_THINKING: "1" SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-4p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-4p1d-dep4-dep8.yaml index fc0b7e5f3..c9224b37e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-4p1d-dep4-dep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-4p1d-dep4-dep8.yaml @@ -34,7 +34,7 @@ backend: prefill_environment: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_ENABLE_THINKING: "1" SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" @@ -63,7 +63,7 @@ backend: decode_environment: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_ENABLE_THINKING: "1" SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" From 5f028858821caf21fd9fce68b1eca86efb32f2fd Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Fri, 8 May 2026 01:33:21 -0700 Subject: [PATCH 04/19] Move MTP recipes up to 8k1k/ with -mtp 
filename suffix Mirrors the convention used elsewhere in the repo: per-config files at the same depth as their non-MTP siblings, distinguished only by the -mtp suffix. CONFIG_FILE references in nvidia-master.yaml updated accordingly. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 12 ++++++------ ...yaml => disagg-low-latency-1p1d-tp4-tp4-mtp.yaml} | 0 ...aml => disagg-low-latency-1p6d-dep4-tp4-mtp.yaml} | 0 ...aml => disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml} | 0 ...yaml => disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml} | 0 ...yaml => disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml} | 0 ...yaml => disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml} | 0 7 files changed, 6 insertions(+), 6 deletions(-) rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/{mtp/disagg-low-latency-1p1d-tp4-tp4.yaml => disagg-low-latency-1p1d-tp4-tp4-mtp.yaml} (100%) rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/{mtp/disagg-low-latency-1p6d-dep4-tp4.yaml => disagg-low-latency-1p6d-dep4-tp4-mtp.yaml} (100%) rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/{mtp/disagg-mid-curve-1p1d-dep4-dep16.yaml => disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml} (100%) rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/{mtp/disagg-mid-curve-1p1d-dep4-dep8.yaml => disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml} (100%) rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/{mtp/disagg-mid-curve-2p1d-dep4-dep8.yaml => disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml} (100%) rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/{mtp/disagg-mid-curve-4p1d-dep4-dep8.yaml => disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml} (100%) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a47a4be13..3a6005190 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -8338,7 +8338,7 @@ dsv4-fp4-gb300-dynamo-sglang-mtp: ep: 1 dp-attn: false 
additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p1d-tp4-tp4.yaml" + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml" decode: num-worker: 1 tp: 4 @@ -8354,7 +8354,7 @@ dsv4-fp4-gb300-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p6d-dep4-tp4.yaml" + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml" decode: num-worker: 6 tp: 4 @@ -8369,7 +8369,7 @@ dsv4-fp4-gb300-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep8.yaml" + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml" decode: num-worker: 1 tp: 8 @@ -8384,7 +8384,7 @@ dsv4-fp4-gb300-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep16.yaml" + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml" decode: num-worker: 1 tp: 16 @@ -8399,7 +8399,7 @@ dsv4-fp4-gb300-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-2p1d-dep4-dep8.yaml" + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml" decode: num-worker: 1 tp: 8 @@ -8414,7 +8414,7 @@ dsv4-fp4-gb300-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-4p1d-dep4-dep8.yaml" + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml" decode: num-worker: 1 tp: 8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p1d-tp4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml 
similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p1d-tp4-tp4.yaml rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p6d-dep4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-low-latency-1p6d-dep4-tp4.yaml rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-1p1d-dep4-dep8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-2p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-2p1d-dep4-dep8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-4p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/mtp/disagg-mid-curve-4p1d-dep4-dep8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml From 50c6c59be0bf3184ba687c30075959d7a1dd8ee9 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Fri, 8 May 2026 01:35:51 -0700 Subject: [PATCH 05/19] fix --- .../sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml index 643693a1a..1e6d8cc37 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml @@ -121,6 +121,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true + disable-radix-cache: true disaggregation-mode: "prefill" disaggregation-transfer-backend: mooncake @@ -141,6 +142,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true + disable-radix-cache: true disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake From 4bceea3def1416219511ceea6617f3ea333cfc92 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Fri, 8 May 2026 10:31:40 -0700 Subject: 
[PATCH 06/19] =?UTF-8?q?Drop=20custom=5Ftokenizer=20from=20MTP=20?= =?UTF-8?q?recipes=20=E2=80=94=20incompatible=20with=20sa-bench?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sa-bench's calculate_metrics calls `tokenizer(text)` to count output tokens, but `sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer` doesn't implement __call__: TypeError: 'SGLangDeepseekV4Tokenizer' object is not callable File "/srtctl-benchmarks/sa-bench/benchmark_serving.py", line 657 num_tokens = len(tokenizer(output.text_chunks[i], ...).input_ids) This is the actual cause of the benchmark-task failures while eval-only tasks succeed (lm-eval doesn't go through this path). Removing custom_tokenizer falls back to AutoTokenizer.from_pretrained(/model). The chat_template is stored in the model's tokenizer_config.json, so `use_chat_template: true` continues to apply via the HF tokenizer (required for MTP correctness per AGENTS.md). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml | 1 - .../deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml | 1 - .../deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml | 1 - .../deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml | 1 - .../deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml | 1 - .../deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml | 1 - 6 files changed, 6 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml index f77afd6fc..259f5566f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml @@ -122,4 +122,3 @@ benchmark: concurrencies: 
"1" req_rate: "inf" use_chat_template: true - custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml index f1ba0f1d1..e6a192537 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml @@ -136,4 +136,3 @@ benchmark: concurrencies: "8x32x64" req_rate: "inf" use_chat_template: true - custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml index ff646668d..05a20181d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml @@ -153,4 +153,3 @@ benchmark: concurrencies: "256" req_rate: "inf" use_chat_template: true - custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml index fe75e7a66..fa2747fd6 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml @@ -153,4 +153,3 @@ benchmark: concurrencies: "256" 
req_rate: "inf" use_chat_template: true - custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml index 0508f4e18..cf6e3b132 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml @@ -153,4 +153,3 @@ benchmark: concurrencies: "512" req_rate: "inf" use_chat_template: true - custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml index c9224b37e..14178e0ce 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml @@ -153,4 +153,3 @@ benchmark: concurrencies: "1024" req_rate: "inf" use_chat_template: true - custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" From e1a50819621aa41eadbbba564d38e6cc335b5c63 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Fri, 8 May 2026 11:02:17 -0700 Subject: [PATCH 07/19] Pin srt-slurm to fork w/ SGLangDeepseekV4Tokenizer callable + restore custom_tokenizer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NVIDIA/srt-slurm#144 adds __call__ / __getattr__ to SGLangDeepseekV4Tokenizer so sa-bench's calculate_metrics (benchmark_serving.py:657 — `tokenizer(text).input_ids`) can count generated tokens for DSv4-Pro multi-node MTP runs 
without throwing ``TypeError: 'SGLangDeepseekV4Tokenizer' object is not callable``. Until that PR merges, pin gb300-cw's sglang launcher to ``ch-wan/srt-slurm @ c901ad38`` (the same fix), and restore ``custom_tokenizer: sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer`` in the 6 MTP recipes. ``use_chat_template: true`` is required by AGENTS.md for MTP correctness (EAGLE acceptance regresses on raw random tokens). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml | 1 + .../8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml | 1 + .../8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml | 1 + .../8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml | 1 + .../8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml | 1 + .../8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml | 1 + runners/launch_gb300-cw.sh | 10 ++++++++-- 7 files changed, 14 insertions(+), 2 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml index 259f5566f..f77afd6fc 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml @@ -122,3 +122,4 @@ benchmark: concurrencies: "1" req_rate: "inf" use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml index e6a192537..f1ba0f1d1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml +++ 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml @@ -136,3 +136,4 @@ benchmark: concurrencies: "8x32x64" req_rate: "inf" use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml index 05a20181d..ff646668d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml @@ -153,3 +153,4 @@ benchmark: concurrencies: "256" req_rate: "inf" use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml index fa2747fd6..fe75e7a66 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml @@ -153,3 +153,4 @@ benchmark: concurrencies: "256" req_rate: "inf" use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml index cf6e3b132..0508f4e18 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml +++ 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml @@ -153,3 +153,4 @@ benchmark: concurrencies: "512" req_rate: "inf" use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml index 14178e0ce..c9224b37e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml @@ -153,3 +153,4 @@ benchmark: concurrencies: "1024" req_rate: "inf" use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index 8c78613f1..bf5405d4c 100644 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -12,8 +12,14 @@ if [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then export MODEL_PATH="/scratch/models/dsv4/" if [[ $FRAMEWORK == "dynamo-sglang" ]]; then - SRT_SLURM_RECIPES_REPO="https://github.com/NVIDIA/srt-slurm.git" - SRT_SLURM_RECIPES_REF="main" + # Pinned to ch-wan/srt-slurm fork @ cwan/fix-sglang-dsv4-tokenizer-callable + # while NVIDIA/srt-slurm#144 (sa-bench: make SGLangDeepseekV4Tokenizer + # callable) is in review. Without the fix, multi-node DSv4-Pro MTP + # sa-bench runs fail in calculate_metrics with + # ``TypeError: 'SGLangDeepseekV4Tokenizer' object is not callable``. + # Revert to ``NVIDIA/srt-slurm.git`` @ ``main`` once #144 merges. 
+ SRT_SLURM_RECIPES_REPO="https://github.com/ch-wan/srt-slurm.git" + SRT_SLURM_RECIPES_REF="c901ad38ac917a21ddc150d15c2d8cdeff6aa381" SRT_RECIPE_SRC="$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" SRT_RECIPE_DST="recipes/sglang/deepseek-v4" elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then From 530762b9c11b90d9961bd20f2c98bcd3d251163e Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Fri, 8 May 2026 14:40:26 -0700 Subject: [PATCH 08/19] Bump sglang container to nightly-dev-cu13-20260508-2cf1a4ab (latest main) Pinned to the multi-arch image produced by sgl-project/sglang Build and Push Development Docker Images run #25574279419 (head_sha 2cf1a4ab, HEAD of sglang main). Replaces the older staging image lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev (May 7). The nightly-dev-cu13 image carries the full sglang main as of 2026-05-08 21:06 UTC, including upstream fixes since the May-7 staging snapshot. Multi-arch manifest covers amd64 + arm64, so it works on the gb300 (Grace) compute nodes. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 4 ++-- .../deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml | 2 +- .../8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml | 2 +- .../8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml | 2 +- .../deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml | 2 +- .../deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml | 2 +- .../deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 3a6005190..87970e9c4 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -8216,7 +8216,7 @@ dsv4-fp4-gb300-dynamo-vllm: dp-attn: true dsv4-fp4-gb300-dynamo-sglang: - image: lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev + image: lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb300-cw @@ -8316,7 +8316,7 @@ dsv4-fp4-gb300-dynamo-sglang: # MTP variant of dsv4-fp4-gb300-dynamo-sglang. 
dsv4-fp4-gb300-dynamo-sglang-mtp: - image: lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev + image: lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb300-cw diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml index f77afd6fc..bc2186cce 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml @@ -11,7 +11,7 @@ dynamo: model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" + container: "lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab" precision: "mxfp4" sbatch_directives: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml index f1ba0f1d1..e4587f746 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml @@ -11,7 +11,7 @@ dynamo: model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" + container: "lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab" precision: "mxfp4" sbatch_directives: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml index ff646668d..ab03f6f4c 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml @@ -11,7 +11,7 @@ dynamo: model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" + container: "lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab" precision: "mxfp4" sbatch_directives: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml index fe75e7a66..6d602077a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml @@ -11,7 +11,7 @@ dynamo: model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" + container: "lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab" precision: "mxfp4" sbatch_directives: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml index 0508f4e18..a7a8c3396 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml @@ -11,7 +11,7 @@ dynamo: model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" + container: "lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab" precision: "mxfp4" sbatch_directives: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml index c9224b37e..d615bb7ab 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml @@ -11,7 +11,7 @@ dynamo: model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" + container: "lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab" precision: "mxfp4" sbatch_directives: From 6db9e2ccee12e47be363372b217113ebf03ff867 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Fri, 8 May 2026 14:41:51 -0700 Subject: [PATCH 09/19] Restore base dsv4-fp4-gb300-dynamo-sglang image to staging tag The previous commit accidentally bumped the non-MTP base entry's image too. The base 8k1k recipes still pin ``container: lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev``, and the launcher requires the matrix's ``image:`` to match the recipe's ``container:`` (it templates ``\"\${IMAGE}\": \${SQUASH_FILE}`` into srtslurm.yaml). Mismatching them would break the base sweep. Only the dsv4-fp4-gb300-dynamo-sglang-mtp entry needs the nightly-dev-cu13 bump (paired with the MTP recipe ``container:`` field). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 87970e9c4..39b173d1a 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -8216,7 +8216,7 @@ dsv4-fp4-gb300-dynamo-vllm: dp-attn: true dsv4-fp4-gb300-dynamo-sglang: - image: lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab + image: lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb300-cw From 164f5a2b7ebf49635d4835cbe228fbedfab41259 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Fri, 8 May 2026 15:16:36 -0700 Subject: [PATCH 10/19] Pin MTP recipes to dynamo 81d0555e (matches working base recipes) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 6 MTP recipes were imported with dynamo hash 9d3c913d from the upstream srt-slurm fork, but the working non-MTP base recipes already on this branch use 81d0555ee23519cea80a42b4fe824e30368b7300 — paired with the sglang nightly cu13 main builds. The 9d3c913d wheel is incompatible with sglang main 2cf1a4ab: the decode scheduler subprocess (rank 0) is SIGQUIT'd during sgl.Engine() init at dynamo.sglang.init_llm:77, surfacing as "Rank 0 scheduler died during initialization (exit code: -3)" in CI run 25580956722. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml | 2 +- .../deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml | 2 +- .../deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml | 2 +- .../deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml | 2 +- .../deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml | 2 +- .../deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml index bc2186cce..3ceb5d197 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml @@ -6,7 +6,7 @@ frontend: num_additional_frontends: 8 dynamo: - hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + hash: "81d0555ee23519cea80a42b4fe824e30368b7300" install: true model: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml index e4587f746..01febe59f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml @@ -6,7 +6,7 @@ frontend: num_additional_frontends: 8 dynamo: - hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + hash: "81d0555ee23519cea80a42b4fe824e30368b7300" install: true model: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml index ab03f6f4c..60fc36c62 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml @@ -6,7 +6,7 @@ frontend: num_additional_frontends: 8 dynamo: - hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + hash: "81d0555ee23519cea80a42b4fe824e30368b7300" install: true model: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml index 6d602077a..e0edbe830 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml @@ -6,7 +6,7 @@ frontend: num_additional_frontends: 8 dynamo: - hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + hash: "81d0555ee23519cea80a42b4fe824e30368b7300" install: true model: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml index a7a8c3396..ad1da36b6 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml @@ -6,7 +6,7 @@ frontend: num_additional_frontends: 8 dynamo: - hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + hash: "81d0555ee23519cea80a42b4fe824e30368b7300" install: true model: diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml index d615bb7ab..276ca1a31 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml @@ -6,7 +6,7 @@ frontend: num_additional_frontends: 8 dynamo: - hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + hash: "81d0555ee23519cea80a42b4fe824e30368b7300" install: true model: From 6d28994beb762df9928ca1ba3f1cc4a2ded3ebe8 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Fri, 8 May 2026 15:45:04 -0700 Subject: [PATCH 11/19] Explicitly disable CAR_V2 in multi-node decode MTP recipes The 4 multi-node decode MTP recipes had a comment saying SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 was "intentionally NOT set", but sglang main 2cf1a4ab defaults this on. CAR_V2 is single-node only, and on multi-node decode it silently fails to construct its backing ``self.obj``, then fails during cuda graph capture with an unhandled Python exception (not a segfault): AttributeError: 'CustomAllReduceV2' object has no attribute 'obj' at custom_all_reduce_v2.py:97 in capture() The scheduler is SIGQUIT'd, surfacing as "Rank 0 scheduler died during initialization (exit code: -3)" in dynamo's wrapper. Explicitly setting the env to "0" matches the intent of the pre-existing comment. Affects: dep4-dep8, dep4-dep16, 2p1d-dep4-dep8, 4p1d-dep4-dep8. Single-node decode recipes (1p1d-tp4-tp4, 1p6d-dep4-tp4) keep the default since CAR_V2 works in single-node.
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml | 6 ++++-- .../8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml | 6 ++++-- .../8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml | 6 ++++-- .../8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml | 6 ++++-- 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml index 60fc36c62..136c05848 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml @@ -87,8 +87,10 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" - # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 - # is single-node only and corrupts results in 2-node decode setups. + # CAR_V2 is single-node only and segfaults during cuda graph capture + # on multi-node decode ('CustomAllReduceV2' object has no attribute + # 'obj' at custom_all_reduce_v2.py:97). Explicitly disable. 
+ SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml index e0edbe830..851d22892 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml @@ -87,8 +87,10 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" - # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 - # is single-node only and corrupts results in 2-node decode setups. + # CAR_V2 is single-node only and segfaults during cuda graph capture + # on multi-node decode ('CustomAllReduceV2' object has no attribute + # 'obj' at custom_all_reduce_v2.py:97). Explicitly disable. + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml index ad1da36b6..3abde6022 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml @@ -87,8 +87,10 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" - # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 - # is single-node only and corrupts results in 2-node decode setups. 
+ # CAR_V2 is single-node only and segfaults during cuda graph capture + # on multi-node decode ('CustomAllReduceV2' object has no attribute + # 'obj' at custom_all_reduce_v2.py:97). Explicitly disable. + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml index 276ca1a31..1c65bf3c1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml @@ -87,8 +87,10 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" - # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 - # is single-node only and corrupts results in 2-node decode setups. + # CAR_V2 is single-node only and segfaults during cuda graph capture + # on multi-node decode ('CustomAllReduceV2' object has no attribute + # 'obj' at custom_all_reduce_v2.py:97). Explicitly disable. + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" sglang_config: prefill: From 9c4c2440ff8c9760db15a8bca1f3ca6d4c124b23 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Fri, 8 May 2026 15:49:45 -0700 Subject: [PATCH 12/19] Explicitly disable CAR_V2 in 8k1k base decode recipes too Apply the same explicit ``SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0"`` to the existing 8k1k base decode recipes that had only the ``intentionally NOT set`` comment. The MTP fix in 6d28994b proved the comment-only pattern is brittle: sglang main 2cf1a4ab defaults the env on, and CAR_V2 then raises an AttributeError during cuda graph capture on multi-node decode, which kills the scheduler during initialization. Make the disable explicit so a future image bump on the base sweep can't regress the same way.
Affects 6 recipes: 1p1d-tp4-tp4-2-c1, 1p1d-dep4-dep16-5-c1024, 4p1d-dep4-dep16-8-c1024, 8p1d-dep4-dep16-12-c4096, 10p1d-dep4-dep16-14-c8192, 12p1d-dep4-dep12-15-c21504. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../8k1k/disagg-gb300-10p1d-dep4-dep16-14-c8192.yaml | 6 ++++-- .../8k1k/disagg-gb300-12p1d-dep4-dep12-15-c21504.yaml | 6 ++++-- .../8k1k/disagg-gb300-1p1d-dep4-dep16-5-c1024.yaml | 6 ++++-- .../deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml | 6 ++++-- .../8k1k/disagg-gb300-4p1d-dep4-dep16-8-c1024.yaml | 6 ++++-- .../8k1k/disagg-gb300-8p1d-dep4-dep16-12-c4096.yaml | 6 ++++-- 6 files changed, 24 insertions(+), 12 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep16-14-c8192.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep16-14-c8192.yaml index 1bcd793c1..2c956b28a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep16-14-c8192.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep16-14-c8192.yaml @@ -136,8 +136,10 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 - # is single-node only and corrupts results in 2-node decode setups. + # CAR_V2 is single-node only and segfaults during cuda graph capture + # on multi-node decode ('CustomAllReduceV2' object has no attribute + # 'obj' at custom_all_reduce_v2.py:97). Explicitly disable. 
+ SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep12-15-c21504.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep12-15-c21504.yaml index 0fbab8d77..827844ebd 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep12-15-c21504.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep12-15-c21504.yaml @@ -136,8 +136,10 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 - # is single-node only and corrupts results in 2-node decode setups. + # CAR_V2 is single-node only and segfaults during cuda graph capture + # on multi-node decode ('CustomAllReduceV2' object has no attribute + # 'obj' at custom_all_reduce_v2.py:97). Explicitly disable. + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep16-5-c1024.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep16-5-c1024.yaml index 99bad72bc..26fed7fd2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep16-5-c1024.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep16-5-c1024.yaml @@ -136,8 +136,10 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 - # is single-node only and corrupts results in 2-node decode setups. 
+ # CAR_V2 is single-node only and segfaults during cuda graph capture + # on multi-node decode ('CustomAllReduceV2' object has no attribute + # 'obj' at custom_all_reduce_v2.py:97). Explicitly disable. + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml index 1e6d8cc37..6ea1bafe3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml @@ -113,8 +113,10 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" - # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 - # is single-node only and corrupts results in 2-node decode setups. + # CAR_V2 is single-node only and segfaults during cuda graph capture + # on multi-node decode ('CustomAllReduceV2' object has no attribute + # 'obj' at custom_all_reduce_v2.py:97). Explicitly disable. 
+ SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep16-8-c1024.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep16-8-c1024.yaml index 2a7cf4d28..56b852656 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep16-8-c1024.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep16-8-c1024.yaml @@ -136,8 +136,10 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 - # is single-node only and corrupts results in 2-node decode setups. + # CAR_V2 is single-node only and segfaults during cuda graph capture + # on multi-node decode ('CustomAllReduceV2' object has no attribute + # 'obj' at custom_all_reduce_v2.py:97). Explicitly disable. + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-8p1d-dep4-dep16-12-c4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-8p1d-dep4-dep16-12-c4096.yaml index cf7061eca..395536b6a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-8p1d-dep4-dep16-12-c4096.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-8p1d-dep4-dep16-12-c4096.yaml @@ -136,8 +136,10 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 - # is single-node only and corrupts results in 2-node decode setups. 
+ # CAR_V2 is single-node only and segfaults during cuda graph capture + # on multi-node decode ('CustomAllReduceV2' object has no attribute + # 'obj' at custom_all_reduce_v2.py:97). Explicitly disable. + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" sglang_config: prefill: From 9814b42d70e5d9f3d9abe2674e63ec0ec210e27e Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Fri, 8 May 2026 16:59:27 -0700 Subject: [PATCH 13/19] Set both old and new sglang thinking/reasoning env vars in MTP recipes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sglang main 2cf1a4ab renamed ``SGLANG_ENABLE_THINKING`` → ``SGLANG_DEFAULT_THINKING`` and ``SGLANG_REASONING_EFFORT`` → ``SGLANG_DSV4_REASONING_EFFORT``. The deprecation helper ``_print_deprecated_env`` (environ.py:642) only emits a warning — it does NOT propagate the value to the new name. So the old env vars had no effect beyond that warning: server defaulted to non-thinking mode with empty reasoning effort, dropping GSM8K accuracy from ~95% to ~40% (eval_results_all from run 25583345967: em_strict=0.4291 for 1p6d-dep4-tp4 conc=64, 0.4056 for 4p1d-dep4-dep8 conc=1024). Set both names in prefill_environment and decode_environment of all six MTP recipes: * old names — read by the sa-bench client tokenizer (sa_bench_tokenizers.sglang_deepseek_v4) for prompt-rendering parity with the server. * new names — read by the sglang server in 2cf1a4ab+.
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml | 14 ++++++++++++++ .../8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml | 14 ++++++++++++++ .../8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml | 14 ++++++++++++++ .../8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml | 14 ++++++++++++++ .../8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml | 14 ++++++++++++++ .../8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml | 14 ++++++++++++++ 6 files changed, 84 insertions(+) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml index 3ceb5d197..70de5d8df 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml @@ -33,8 +33,15 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + # sglang main 2cf1a4ab silently ignores the old env names (the + # _print_deprecated_env helper only warns, it does not propagate + # the value to the new name). Set both: old names for the sa-bench + # client-side tokenizer (sa_bench_tokenizers.sglang_deepseek_v4), + # new names for the sglang server. SGLANG_ENABLE_THINKING: "1" + SGLANG_DEFAULT_THINKING: "1" SGLANG_REASONING_EFFORT: "max" + SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -51,8 +58,15 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + # sglang main 2cf1a4ab silently ignores the old env names (the + # _print_deprecated_env helper only warns, it does not propagate + # the value to the new name). 
Set both: old names for the sa-bench + # client-side tokenizer (sa_bench_tokenizers.sglang_deepseek_v4), + # new names for the sglang server. SGLANG_ENABLE_THINKING: "1" + SGLANG_DEFAULT_THINKING: "1" SGLANG_REASONING_EFFORT: "max" + SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml index 01febe59f..a2f83d813 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml @@ -33,8 +33,15 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + # sglang main 2cf1a4ab silently ignores the old env names (the + # _print_deprecated_env helper only warns, it does not propagate + # the value to the new name). Set both: old names for the sa-bench + # client-side tokenizer (sa_bench_tokenizers.sglang_deepseek_v4), + # new names for the sglang server. SGLANG_ENABLE_THINKING: "1" + SGLANG_DEFAULT_THINKING: "1" SGLANG_REASONING_EFFORT: "max" + SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -62,8 +69,15 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + # sglang main 2cf1a4ab silently ignores the old env names (the + # _print_deprecated_env helper only warns, it does not propagate + # the value to the new name). Set both: old names for the sa-bench + # client-side tokenizer (sa_bench_tokenizers.sglang_deepseek_v4), + # new names for the sglang server. 
SGLANG_ENABLE_THINKING: "1" + SGLANG_DEFAULT_THINKING: "1" SGLANG_REASONING_EFFORT: "max" + SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml index 136c05848..ce5ac8796 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml @@ -35,8 +35,15 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + # sglang main 2cf1a4ab silently ignores the old env names (the + # _print_deprecated_env helper only warns, it does not propagate + # the value to the new name). Set both: old names for the sa-bench + # client-side tokenizer (sa_bench_tokenizers.sglang_deepseek_v4), + # new names for the sglang server. SGLANG_ENABLE_THINKING: "1" + SGLANG_DEFAULT_THINKING: "1" SGLANG_REASONING_EFFORT: "max" + SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -64,8 +71,15 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + # sglang main 2cf1a4ab silently ignores the old env names (the + # _print_deprecated_env helper only warns, it does not propagate + # the value to the new name). Set both: old names for the sa-bench + # client-side tokenizer (sa_bench_tokenizers.sglang_deepseek_v4), + # new names for the sglang server. 
SGLANG_ENABLE_THINKING: "1" + SGLANG_DEFAULT_THINKING: "1" SGLANG_REASONING_EFFORT: "max" + SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml index 851d22892..098c5947b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml @@ -35,8 +35,15 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + # sglang main 2cf1a4ab silently ignores the old env names (the + # _print_deprecated_env helper only warns, it does not propagate + # the value to the new name). Set both: old names for the sa-bench + # client-side tokenizer (sa_bench_tokenizers.sglang_deepseek_v4), + # new names for the sglang server. SGLANG_ENABLE_THINKING: "1" + SGLANG_DEFAULT_THINKING: "1" SGLANG_REASONING_EFFORT: "max" + SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -64,8 +71,15 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + # sglang main 2cf1a4ab silently ignores the old env names (the + # _print_deprecated_env helper only warns, it does not propagate + # the value to the new name). Set both: old names for the sa-bench + # client-side tokenizer (sa_bench_tokenizers.sglang_deepseek_v4), + # new names for the sglang server. 
SGLANG_ENABLE_THINKING: "1" + SGLANG_DEFAULT_THINKING: "1" SGLANG_REASONING_EFFORT: "max" + SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml index 3abde6022..8579a9786 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml @@ -35,8 +35,15 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + # sglang main 2cf1a4ab silently ignores the old env names (the + # _print_deprecated_env helper only warns, it does not propagate + # the value to the new name). Set both: old names for the sa-bench + # client-side tokenizer (sa_bench_tokenizers.sglang_deepseek_v4), + # new names for the sglang server. SGLANG_ENABLE_THINKING: "1" + SGLANG_DEFAULT_THINKING: "1" SGLANG_REASONING_EFFORT: "max" + SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -64,8 +71,15 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + # sglang main 2cf1a4ab silently ignores the old env names (the + # _print_deprecated_env helper only warns, it does not propagate + # the value to the new name). Set both: old names for the sa-bench + # client-side tokenizer (sa_bench_tokenizers.sglang_deepseek_v4), + # new names for the sglang server. 
SGLANG_ENABLE_THINKING: "1" + SGLANG_DEFAULT_THINKING: "1" SGLANG_REASONING_EFFORT: "max" + SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml index 1c65bf3c1..3003e5c91 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml @@ -35,8 +35,15 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + # sglang main 2cf1a4ab silently ignores the old env names (the + # _print_deprecated_env helper only warns, it does not propagate + # the value to the new name). Set both: old names for the sa-bench + # client-side tokenizer (sa_bench_tokenizers.sglang_deepseek_v4), + # new names for the sglang server. SGLANG_ENABLE_THINKING: "1" + SGLANG_DEFAULT_THINKING: "1" SGLANG_REASONING_EFFORT: "max" + SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -64,8 +71,15 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + # sglang main 2cf1a4ab silently ignores the old env names (the + # _print_deprecated_env helper only warns, it does not propagate + # the value to the new name). Set both: old names for the sa-bench + # client-side tokenizer (sa_bench_tokenizers.sglang_deepseek_v4), + # new names for the sglang server. 
SGLANG_ENABLE_THINKING: "1" + SGLANG_DEFAULT_THINKING: "1" SGLANG_REASONING_EFFORT: "max" + SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" From 3e049e8f0098412979abdeedcaa2277ddd5f44ca Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Fri, 8 May 2026 17:03:16 -0700 Subject: [PATCH 14/19] Set tool-call-parser=deepseekv4 to enable DSV4 chat encoding (gsm8k regression fix) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GSM8K accuracy on the latest sweep dropped from the expected ~95% to ~40% (em_strict=0.4291 for 1p6d-dep4-tp4 conc=64; 0.4056 for 4p1d-dep4-dep8 conc=1024 — run 25583345967 eval_results_all). Inspecting samples_gsm8k_*.jsonl revealed every response was prefixed with junk like "Weapon:" / "Weaponized" / "We黑白颠倒", and the reasoning often answered a different question than what was asked — classic symptom of a malformed chat-template prompt. Root cause in sglang main 2cf1a4ab (entrypoints/openai/serving_chat.py:296): def _resolve_chat_encoding_spec(self) -> Optional[str]: if self.tool_call_parser == "deepseekv4": return "dsv4" if self.tool_call_parser == "deepseekv32": return "dsv32" The dsv4 chat-encoding spec — which routes DSV4 prompts through ``encoding_dsv4.encode_messages`` with thinking-mode and reasoning-effort handling — only activates when ``--tool-call-parser deepseekv4`` is set. Without it the server falls back to the vanilla HF chat template (``apply_chat_template``), which doesn't know about DSV4's special tokens, ``<think>`` blocks, or the ``thinking_mode`` argument. The MTP recipes never set this flag, so ServerArgs reports ``tool_call_parser=None`` and the model receives a malformed prompt. Add ``tool-call-parser: deepseekv4`` to both prefill and decode ``sglang_config`` blocks in all 6 MTP recipes. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../disagg-low-latency-1p1d-tp4-tp4-mtp.yaml | 18 ++++++++++++++++++ .../disagg-low-latency-1p6d-dep4-tp4-mtp.yaml | 18 ++++++++++++++++++ .../disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml | 18 ++++++++++++++++++ .../disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml | 18 ++++++++++++++++++ .../disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml | 18 ++++++++++++++++++ .../disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml | 18 ++++++++++++++++++ 6 files changed, 108 insertions(+) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml index 70de5d8df..ce635bd17 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml @@ -86,6 +86,15 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true + # sglang main 2cf1a4ab gates the dsv4 chat-encoding spec + # (which routes DSV4 prompts through encoding_dsv4.py with the + # proper thinking-mode + reasoning-effort handling) on + # tool_call_parser=="deepseekv4". Without this flag the server + # falls back to the vanilla HF chat template, the prompt is + # rendered without DSV4 special tokens, generations come out + # with junk "Weapon:" prefixes, and gsm8k accuracy collapses + # from ~95% to ~40% (run 25583345967 eval_results_all). 
+ tool-call-parser: deepseekv4 disaggregation-mode: "prefill" disaggregation-transfer-backend: mooncake @@ -106,6 +115,15 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true + # sglang main 2cf1a4ab gates the dsv4 chat-encoding spec + # (which routes DSV4 prompts through encoding_dsv4.py with the + # proper thinking-mode + reasoning-effort handling) on + # tool_call_parser=="deepseekv4". Without this flag the server + # falls back to the vanilla HF chat template, the prompt is + # rendered without DSV4 special tokens, generations come out + # with junk "Weapon:" prefixes, and gsm8k accuracy collapses + # from ~95% to ~40% (run 25583345967 eval_results_all). + tool-call-parser: deepseekv4 disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml index a2f83d813..ab15ba34d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml @@ -97,6 +97,15 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true + # sglang main 2cf1a4ab gates the dsv4 chat-encoding spec + # (which routes DSV4 prompts through encoding_dsv4.py with the + # proper thinking-mode + reasoning-effort handling) on + # tool_call_parser=="deepseekv4". Without this flag the server + # falls back to the vanilla HF chat template, the prompt is + # rendered without DSV4 special tokens, generations come out + # with junk "Weapon:" prefixes, and gsm8k accuracy collapses + # from ~95% to ~40% (run 25583345967 eval_results_all). 
+ tool-call-parser: deepseekv4 disaggregation-mode: "prefill" disaggregation-transfer-backend: mooncake @@ -120,6 +129,15 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true + # sglang main 2cf1a4ab gates the dsv4 chat-encoding spec + # (which routes DSV4 prompts through encoding_dsv4.py with the + # proper thinking-mode + reasoning-effort handling) on + # tool_call_parser=="deepseekv4". Without this flag the server + # falls back to the vanilla HF chat template, the prompt is + # rendered without DSV4 special tokens, generations come out + # with junk "Weapon:" prefixes, and gsm8k accuracy collapses + # from ~95% to ~40% (run 25583345967 eval_results_all). + tool-call-parser: deepseekv4 disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml index ce5ac8796..61a221401 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml @@ -111,6 +111,15 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true + # sglang main 2cf1a4ab gates the dsv4 chat-encoding spec + # (which routes DSV4 prompts through encoding_dsv4.py with the + # proper thinking-mode + reasoning-effort handling) on + # tool_call_parser=="deepseekv4". Without this flag the server + # falls back to the vanilla HF chat template, the prompt is + # rendered without DSV4 special tokens, generations come out + # with junk "Weapon:" prefixes, and gsm8k accuracy collapses + # from ~95% to ~40% (run 25583345967 eval_results_all). 
+ tool-call-parser: deepseekv4 disaggregation-mode: "prefill" disaggregation-transfer-backend: mooncake @@ -135,6 +144,15 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true + # sglang main 2cf1a4ab gates the dsv4 chat-encoding spec + # (which routes DSV4 prompts through encoding_dsv4.py with the + # proper thinking-mode + reasoning-effort handling) on + # tool_call_parser=="deepseekv4". Without this flag the server + # falls back to the vanilla HF chat template, the prompt is + # rendered without DSV4 special tokens, generations come out + # with junk "Weapon:" prefixes, and gsm8k accuracy collapses + # from ~95% to ~40% (run 25583345967 eval_results_all). + tool-call-parser: deepseekv4 disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml index 098c5947b..611a36f2d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml @@ -111,6 +111,15 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true + # sglang main 2cf1a4ab gates the dsv4 chat-encoding spec + # (which routes DSV4 prompts through encoding_dsv4.py with the + # proper thinking-mode + reasoning-effort handling) on + # tool_call_parser=="deepseekv4". Without this flag the server + # falls back to the vanilla HF chat template, the prompt is + # rendered without DSV4 special tokens, generations come out + # with junk "Weapon:" prefixes, and gsm8k accuracy collapses + # from ~95% to ~40% (run 25583345967 eval_results_all). 
+ tool-call-parser: deepseekv4 disaggregation-mode: "prefill" disaggregation-transfer-backend: mooncake @@ -135,6 +144,15 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true + # sglang main 2cf1a4ab gates the dsv4 chat-encoding spec + # (which routes DSV4 prompts through encoding_dsv4.py with the + # proper thinking-mode + reasoning-effort handling) on + # tool_call_parser=="deepseekv4". Without this flag the server + # falls back to the vanilla HF chat template, the prompt is + # rendered without DSV4 special tokens, generations come out + # with junk "Weapon:" prefixes, and gsm8k accuracy collapses + # from ~95% to ~40% (run 25583345967 eval_results_all). + tool-call-parser: deepseekv4 disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml index 8579a9786..60c142111 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml @@ -111,6 +111,15 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true + # sglang main 2cf1a4ab gates the dsv4 chat-encoding spec + # (which routes DSV4 prompts through encoding_dsv4.py with the + # proper thinking-mode + reasoning-effort handling) on + # tool_call_parser=="deepseekv4". Without this flag the server + # falls back to the vanilla HF chat template, the prompt is + # rendered without DSV4 special tokens, generations come out + # with junk "Weapon:" prefixes, and gsm8k accuracy collapses + # from ~95% to ~40% (run 25583345967 eval_results_all). 
+ tool-call-parser: deepseekv4 disaggregation-mode: "prefill" disaggregation-transfer-backend: mooncake @@ -135,6 +144,15 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true + # sglang main 2cf1a4ab gates the dsv4 chat-encoding spec + # (which routes DSV4 prompts through encoding_dsv4.py with the + # proper thinking-mode + reasoning-effort handling) on + # tool_call_parser=="deepseekv4". Without this flag the server + # falls back to the vanilla HF chat template, the prompt is + # rendered without DSV4 special tokens, generations come out + # with junk "Weapon:" prefixes, and gsm8k accuracy collapses + # from ~95% to ~40% (run 25583345967 eval_results_all). + tool-call-parser: deepseekv4 disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml index 3003e5c91..400475319 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml @@ -111,6 +111,15 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true + # sglang main 2cf1a4ab gates the dsv4 chat-encoding spec + # (which routes DSV4 prompts through encoding_dsv4.py with the + # proper thinking-mode + reasoning-effort handling) on + # tool_call_parser=="deepseekv4". Without this flag the server + # falls back to the vanilla HF chat template, the prompt is + # rendered without DSV4 special tokens, generations come out + # with junk "Weapon:" prefixes, and gsm8k accuracy collapses + # from ~95% to ~40% (run 25583345967 eval_results_all). 
+ tool-call-parser: deepseekv4 disaggregation-mode: "prefill" disaggregation-transfer-backend: mooncake @@ -135,6 +144,15 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true + # sglang main 2cf1a4ab gates the dsv4 chat-encoding spec + # (which routes DSV4 prompts through encoding_dsv4.py with the + # proper thinking-mode + reasoning-effort handling) on + # tool_call_parser=="deepseekv4". Without this flag the server + # falls back to the vanilla HF chat template, the prompt is + # rendered without DSV4 special tokens, generations come out + # with junk "Weapon:" prefixes, and gsm8k accuracy collapses + # from ~95% to ~40% (run 25583345967 eval_results_all). + tool-call-parser: deepseekv4 disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake From 255e7fba2770ed90d00bda51f00d8560f52c9cb6 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Fri, 8 May 2026 17:06:16 -0700 Subject: [PATCH 15/19] Revert CAR_V2 explicit-disable in non-MTP base 8k1k recipes Restore the 6 base recipes to their state on origin/main; the explicit ``SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: \"0\"`` was added defensively in 9c4c2440, but the base sweep is happy on its current staging-dev image and shouldn't be touched in this PR. 
Reverts files: disagg-gb300-1p1d-tp4-tp4-2-c1.yaml disagg-gb300-1p1d-dep4-dep16-5-c1024.yaml disagg-gb300-4p1d-dep4-dep16-8-c1024.yaml disagg-gb300-8p1d-dep4-dep16-12-c4096.yaml disagg-gb300-10p1d-dep4-dep16-14-c8192.yaml disagg-gb300-12p1d-dep4-dep12-15-c21504.yaml Co-Authored-By: Claude Opus 4.7 (1M context) --- .../8k1k/disagg-gb300-10p1d-dep4-dep16-14-c8192.yaml | 6 ++---- .../8k1k/disagg-gb300-12p1d-dep4-dep12-15-c21504.yaml | 6 ++---- .../8k1k/disagg-gb300-1p1d-dep4-dep16-5-c1024.yaml | 6 ++---- .../deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml | 6 ++---- .../8k1k/disagg-gb300-4p1d-dep4-dep16-8-c1024.yaml | 6 ++---- .../8k1k/disagg-gb300-8p1d-dep4-dep16-12-c4096.yaml | 6 ++---- 6 files changed, 12 insertions(+), 24 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep16-14-c8192.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep16-14-c8192.yaml index 2c956b28a..1bcd793c1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep16-14-c8192.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep16-14-c8192.yaml @@ -136,10 +136,8 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - # CAR_V2 is single-node only and segfaults during cuda graph capture - # on multi-node decode ('CustomAllReduceV2' object has no attribute - # 'obj' at custom_all_reduce_v2.py:97). Explicitly disable. - SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. 
sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep12-15-c21504.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep12-15-c21504.yaml index 827844ebd..0fbab8d77 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep12-15-c21504.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep12-15-c21504.yaml @@ -136,10 +136,8 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - # CAR_V2 is single-node only and segfaults during cuda graph capture - # on multi-node decode ('CustomAllReduceV2' object has no attribute - # 'obj' at custom_all_reduce_v2.py:97). Explicitly disable. - SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep16-5-c1024.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep16-5-c1024.yaml index 26fed7fd2..99bad72bc 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep16-5-c1024.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep16-5-c1024.yaml @@ -136,10 +136,8 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - # CAR_V2 is single-node only and segfaults during cuda graph capture - # on multi-node decode ('CustomAllReduceV2' object has no attribute - # 'obj' at custom_all_reduce_v2.py:97). Explicitly disable. 
- SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml index 6ea1bafe3..1e6d8cc37 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml @@ -113,10 +113,8 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" - # CAR_V2 is single-node only and segfaults during cuda graph capture - # on multi-node decode ('CustomAllReduceV2' object has no attribute - # 'obj' at custom_all_reduce_v2.py:97). Explicitly disable. - SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. 
sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep16-8-c1024.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep16-8-c1024.yaml index 56b852656..2a7cf4d28 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep16-8-c1024.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep16-8-c1024.yaml @@ -136,10 +136,8 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - # CAR_V2 is single-node only and segfaults during cuda graph capture - # on multi-node decode ('CustomAllReduceV2' object has no attribute - # 'obj' at custom_all_reduce_v2.py:97). Explicitly disable. - SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-8p1d-dep4-dep16-12-c4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-8p1d-dep4-dep16-12-c4096.yaml index 395536b6a..cf7061eca 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-8p1d-dep4-dep16-12-c4096.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-8p1d-dep4-dep16-12-c4096.yaml @@ -136,10 +136,8 @@ backend: SGLANG_LOG_FORWARD_ITERS: "1" SGLANG_LOG_MS: "1" SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - # CAR_V2 is single-node only and segfaults during cuda graph capture - # on multi-node decode ('CustomAllReduceV2' object has no attribute - # 'obj' at custom_all_reduce_v2.py:97). Explicitly disable. 
- SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. sglang_config: prefill: From cb598072ffdb2cadede43d27c85242a92200dcd0 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Fri, 8 May 2026 17:10:24 -0700 Subject: [PATCH 16/19] Trim verbose comments and drop deprecated env var names in MTP recipes - Drop ``SGLANG_ENABLE_THINKING`` / ``SGLANG_REASONING_EFFORT`` (deprecated since sglang main 2cf1a4ab); keep only the new names ``SGLANG_DEFAULT_THINKING`` / ``SGLANG_DSV4_REASONING_EFFORT``. - Bump the srt-slurm fork pin to 51847632 so the sa-bench client tokenizer reads the new env names (with old names as fallback). - Trim multi-line block comments down to one-line tail comments for the CAR_V2 disable and ``tool-call-parser: deepseekv4`` flag. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../disagg-low-latency-1p1d-tp4-tp4-mtp.yaml | 34 +--------------- .../disagg-low-latency-1p6d-dep4-tp4-mtp.yaml | 34 +--------------- .../disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml | 39 ++----------------- .../disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml | 39 ++----------------- .../disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml | 39 ++----------------- .../disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml | 39 ++----------------- runners/launch_gb300-cw.sh | 2 +- 7 files changed, 17 insertions(+), 209 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml index ce635bd17..b21af9395 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml @@ -33,14 +33,7 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" 
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - # sglang main 2cf1a4ab silently ignores the old env names (the - # _print_deprecated_env helper only warns, it does not propagate - # the value to the new name). Set both: old names for the sa-bench - # client-side tokenizer (sa_bench_tokenizers.sglang_deepseek_v4), - # new names for the sglang server. - SGLANG_ENABLE_THINKING: "1" SGLANG_DEFAULT_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" @@ -58,14 +51,7 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - # sglang main 2cf1a4ab silently ignores the old env names (the - # _print_deprecated_env helper only warns, it does not propagate - # the value to the new name). Set both: old names for the sa-bench - # client-side tokenizer (sa_bench_tokenizers.sglang_deepseek_v4), - # new names for the sglang server. - SGLANG_ENABLE_THINKING: "1" SGLANG_DEFAULT_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" @@ -86,15 +72,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true - # sglang main 2cf1a4ab gates the dsv4 chat-encoding spec - # (which routes DSV4 prompts through encoding_dsv4.py with the - # proper thinking-mode + reasoning-effort handling) on - # tool_call_parser=="deepseekv4". Without this flag the server - # falls back to the vanilla HF chat template, the prompt is - # rendered without DSV4 special tokens, generations come out - # with junk "Weapon:" prefixes, and gsm8k accuracy collapses - # from ~95% to ~40% (run 25583345967 eval_results_all). - tool-call-parser: deepseekv4 + tool-call-parser: deepseekv4 # gates the dsv4 chat-encoding spec. 
disaggregation-mode: "prefill" disaggregation-transfer-backend: mooncake @@ -115,15 +93,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true - # sglang main 2cf1a4ab gates the dsv4 chat-encoding spec - # (which routes DSV4 prompts through encoding_dsv4.py with the - # proper thinking-mode + reasoning-effort handling) on - # tool_call_parser=="deepseekv4". Without this flag the server - # falls back to the vanilla HF chat template, the prompt is - # rendered without DSV4 special tokens, generations come out - # with junk "Weapon:" prefixes, and gsm8k accuracy collapses - # from ~95% to ~40% (run 25583345967 eval_results_all). - tool-call-parser: deepseekv4 + tool-call-parser: deepseekv4 # gates the dsv4 chat-encoding spec. disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml index ab15ba34d..23d7401cc 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml @@ -33,14 +33,7 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - # sglang main 2cf1a4ab silently ignores the old env names (the - # _print_deprecated_env helper only warns, it does not propagate - # the value to the new name). Set both: old names for the sa-bench - # client-side tokenizer (sa_bench_tokenizers.sglang_deepseek_v4), - # new names for the sglang server. 
- SGLANG_ENABLE_THINKING: "1" SGLANG_DEFAULT_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" @@ -69,14 +62,7 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - # sglang main 2cf1a4ab silently ignores the old env names (the - # _print_deprecated_env helper only warns, it does not propagate - # the value to the new name). Set both: old names for the sa-bench - # client-side tokenizer (sa_bench_tokenizers.sglang_deepseek_v4), - # new names for the sglang server. - SGLANG_ENABLE_THINKING: "1" SGLANG_DEFAULT_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" @@ -97,15 +83,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true - # sglang main 2cf1a4ab gates the dsv4 chat-encoding spec - # (which routes DSV4 prompts through encoding_dsv4.py with the - # proper thinking-mode + reasoning-effort handling) on - # tool_call_parser=="deepseekv4". Without this flag the server - # falls back to the vanilla HF chat template, the prompt is - # rendered without DSV4 special tokens, generations come out - # with junk "Weapon:" prefixes, and gsm8k accuracy collapses - # from ~95% to ~40% (run 25583345967 eval_results_all). - tool-call-parser: deepseekv4 + tool-call-parser: deepseekv4 # gates the dsv4 chat-encoding spec. disaggregation-mode: "prefill" disaggregation-transfer-backend: mooncake @@ -129,15 +107,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true - # sglang main 2cf1a4ab gates the dsv4 chat-encoding spec - # (which routes DSV4 prompts through encoding_dsv4.py with the - # proper thinking-mode + reasoning-effort handling) on - # tool_call_parser=="deepseekv4". 
Without this flag the server - # falls back to the vanilla HF chat template, the prompt is - # rendered without DSV4 special tokens, generations come out - # with junk "Weapon:" prefixes, and gsm8k accuracy collapses - # from ~95% to ~40% (run 25583345967 eval_results_all). - tool-call-parser: deepseekv4 + tool-call-parser: deepseekv4 # gates the dsv4 chat-encoding spec. disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml index 61a221401..07ed5596e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml @@ -35,14 +35,7 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - # sglang main 2cf1a4ab silently ignores the old env names (the - # _print_deprecated_env helper only warns, it does not propagate - # the value to the new name). Set both: old names for the sa-bench - # client-side tokenizer (sa_bench_tokenizers.sglang_deepseek_v4), - # new names for the sglang server. - SGLANG_ENABLE_THINKING: "1" SGLANG_DEFAULT_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" @@ -71,14 +64,7 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - # sglang main 2cf1a4ab silently ignores the old env names (the - # _print_deprecated_env helper only warns, it does not propagate - # the value to the new name). Set both: old names for the sa-bench - # client-side tokenizer (sa_bench_tokenizers.sglang_deepseek_v4), - # new names for the sglang server. 
- SGLANG_ENABLE_THINKING: "1" SGLANG_DEFAULT_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" @@ -101,25 +87,14 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" - # CAR_V2 is single-node only and segfaults during cuda graph capture - # on multi-node decode ('CustomAllReduceV2' object has no attribute - # 'obj' at custom_all_reduce_v2.py:97). Explicitly disable. - SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" # CAR_V2 is single-node only. sglang_config: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true - # sglang main 2cf1a4ab gates the dsv4 chat-encoding spec - # (which routes DSV4 prompts through encoding_dsv4.py with the - # proper thinking-mode + reasoning-effort handling) on - # tool_call_parser=="deepseekv4". Without this flag the server - # falls back to the vanilla HF chat template, the prompt is - # rendered without DSV4 special tokens, generations come out - # with junk "Weapon:" prefixes, and gsm8k accuracy collapses - # from ~95% to ~40% (run 25583345967 eval_results_all). - tool-call-parser: deepseekv4 + tool-call-parser: deepseekv4 # gates the dsv4 chat-encoding spec. disaggregation-mode: "prefill" disaggregation-transfer-backend: mooncake @@ -144,15 +119,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true - # sglang main 2cf1a4ab gates the dsv4 chat-encoding spec - # (which routes DSV4 prompts through encoding_dsv4.py with the - # proper thinking-mode + reasoning-effort handling) on - # tool_call_parser=="deepseekv4". 
Without this flag the server - # falls back to the vanilla HF chat template, the prompt is - # rendered without DSV4 special tokens, generations come out - # with junk "Weapon:" prefixes, and gsm8k accuracy collapses - # from ~95% to ~40% (run 25583345967 eval_results_all). - tool-call-parser: deepseekv4 + tool-call-parser: deepseekv4 # gates the dsv4 chat-encoding spec. disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml index 611a36f2d..8036e2e18 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml @@ -35,14 +35,7 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - # sglang main 2cf1a4ab silently ignores the old env names (the - # _print_deprecated_env helper only warns, it does not propagate - # the value to the new name). Set both: old names for the sa-bench - # client-side tokenizer (sa_bench_tokenizers.sglang_deepseek_v4), - # new names for the sglang server. - SGLANG_ENABLE_THINKING: "1" SGLANG_DEFAULT_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" @@ -71,14 +64,7 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - # sglang main 2cf1a4ab silently ignores the old env names (the - # _print_deprecated_env helper only warns, it does not propagate - # the value to the new name). Set both: old names for the sa-bench - # client-side tokenizer (sa_bench_tokenizers.sglang_deepseek_v4), - # new names for the sglang server. 
- SGLANG_ENABLE_THINKING: "1" SGLANG_DEFAULT_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" @@ -101,25 +87,14 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" - # CAR_V2 is single-node only and segfaults during cuda graph capture - # on multi-node decode ('CustomAllReduceV2' object has no attribute - # 'obj' at custom_all_reduce_v2.py:97). Explicitly disable. - SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" # CAR_V2 is single-node only. sglang_config: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true - # sglang main 2cf1a4ab gates the dsv4 chat-encoding spec - # (which routes DSV4 prompts through encoding_dsv4.py with the - # proper thinking-mode + reasoning-effort handling) on - # tool_call_parser=="deepseekv4". Without this flag the server - # falls back to the vanilla HF chat template, the prompt is - # rendered without DSV4 special tokens, generations come out - # with junk "Weapon:" prefixes, and gsm8k accuracy collapses - # from ~95% to ~40% (run 25583345967 eval_results_all). - tool-call-parser: deepseekv4 + tool-call-parser: deepseekv4 # gates the dsv4 chat-encoding spec. disaggregation-mode: "prefill" disaggregation-transfer-backend: mooncake @@ -144,15 +119,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true - # sglang main 2cf1a4ab gates the dsv4 chat-encoding spec - # (which routes DSV4 prompts through encoding_dsv4.py with the - # proper thinking-mode + reasoning-effort handling) on - # tool_call_parser=="deepseekv4". 
Without this flag the server - # falls back to the vanilla HF chat template, the prompt is - # rendered without DSV4 special tokens, generations come out - # with junk "Weapon:" prefixes, and gsm8k accuracy collapses - # from ~95% to ~40% (run 25583345967 eval_results_all). - tool-call-parser: deepseekv4 + tool-call-parser: deepseekv4 # gates the dsv4 chat-encoding spec. disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml index 60c142111..cd8569a08 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml @@ -35,14 +35,7 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - # sglang main 2cf1a4ab silently ignores the old env names (the - # _print_deprecated_env helper only warns, it does not propagate - # the value to the new name). Set both: old names for the sa-bench - # client-side tokenizer (sa_bench_tokenizers.sglang_deepseek_v4), - # new names for the sglang server. - SGLANG_ENABLE_THINKING: "1" SGLANG_DEFAULT_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" @@ -71,14 +64,7 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - # sglang main 2cf1a4ab silently ignores the old env names (the - # _print_deprecated_env helper only warns, it does not propagate - # the value to the new name). Set both: old names for the sa-bench - # client-side tokenizer (sa_bench_tokenizers.sglang_deepseek_v4), - # new names for the sglang server. 
- SGLANG_ENABLE_THINKING: "1" SGLANG_DEFAULT_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" @@ -101,25 +87,14 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" - # CAR_V2 is single-node only and segfaults during cuda graph capture - # on multi-node decode ('CustomAllReduceV2' object has no attribute - # 'obj' at custom_all_reduce_v2.py:97). Explicitly disable. - SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" # CAR_V2 is single-node only. sglang_config: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true - # sglang main 2cf1a4ab gates the dsv4 chat-encoding spec - # (which routes DSV4 prompts through encoding_dsv4.py with the - # proper thinking-mode + reasoning-effort handling) on - # tool_call_parser=="deepseekv4". Without this flag the server - # falls back to the vanilla HF chat template, the prompt is - # rendered without DSV4 special tokens, generations come out - # with junk "Weapon:" prefixes, and gsm8k accuracy collapses - # from ~95% to ~40% (run 25583345967 eval_results_all). - tool-call-parser: deepseekv4 + tool-call-parser: deepseekv4 # gates the dsv4 chat-encoding spec. disaggregation-mode: "prefill" disaggregation-transfer-backend: mooncake @@ -144,15 +119,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true - # sglang main 2cf1a4ab gates the dsv4 chat-encoding spec - # (which routes DSV4 prompts through encoding_dsv4.py with the - # proper thinking-mode + reasoning-effort handling) on - # tool_call_parser=="deepseekv4". 
Without this flag the server - # falls back to the vanilla HF chat template, the prompt is - # rendered without DSV4 special tokens, generations come out - # with junk "Weapon:" prefixes, and gsm8k accuracy collapses - # from ~95% to ~40% (run 25583345967 eval_results_all). - tool-call-parser: deepseekv4 + tool-call-parser: deepseekv4 # gates the dsv4 chat-encoding spec. disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml index 400475319..715df595b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml @@ -35,14 +35,7 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - # sglang main 2cf1a4ab silently ignores the old env names (the - # _print_deprecated_env helper only warns, it does not propagate - # the value to the new name). Set both: old names for the sa-bench - # client-side tokenizer (sa_bench_tokenizers.sglang_deepseek_v4), - # new names for the sglang server. - SGLANG_ENABLE_THINKING: "1" SGLANG_DEFAULT_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" @@ -71,14 +64,7 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - # sglang main 2cf1a4ab silently ignores the old env names (the - # _print_deprecated_env helper only warns, it does not propagate - # the value to the new name). Set both: old names for the sa-bench - # client-side tokenizer (sa_bench_tokenizers.sglang_deepseek_v4), - # new names for the sglang server. 
- SGLANG_ENABLE_THINKING: "1" SGLANG_DEFAULT_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" @@ -101,25 +87,14 @@ backend: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" - # CAR_V2 is single-node only and segfaults during cuda graph capture - # on multi-node decode ('CustomAllReduceV2' object has no attribute - # 'obj' at custom_all_reduce_v2.py:97). Explicitly disable. - SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" # CAR_V2 is single-node only. sglang_config: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true - # sglang main 2cf1a4ab gates the dsv4 chat-encoding spec - # (which routes DSV4 prompts through encoding_dsv4.py with the - # proper thinking-mode + reasoning-effort handling) on - # tool_call_parser=="deepseekv4". Without this flag the server - # falls back to the vanilla HF chat template, the prompt is - # rendered without DSV4 special tokens, generations come out - # with junk "Weapon:" prefixes, and gsm8k accuracy collapses - # from ~95% to ~40% (run 25583345967 eval_results_all). - tool-call-parser: deepseekv4 + tool-call-parser: deepseekv4 # gates the dsv4 chat-encoding spec. disaggregation-mode: "prefill" disaggregation-transfer-backend: mooncake @@ -144,15 +119,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true - # sglang main 2cf1a4ab gates the dsv4 chat-encoding spec - # (which routes DSV4 prompts through encoding_dsv4.py with the - # proper thinking-mode + reasoning-effort handling) on - # tool_call_parser=="deepseekv4". 
Without this flag the server - # falls back to the vanilla HF chat template, the prompt is - # rendered without DSV4 special tokens, generations come out - # with junk "Weapon:" prefixes, and gsm8k accuracy collapses - # from ~95% to ~40% (run 25583345967 eval_results_all). - tool-call-parser: deepseekv4 + tool-call-parser: deepseekv4 # gates the dsv4 chat-encoding spec. disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index bf5405d4c..38374475c 100644 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -19,7 +19,7 @@ if [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then # ``TypeError: 'SGLangDeepseekV4Tokenizer' object is not callable``. # Revert to ``NVIDIA/srt-slurm.git`` @ ``main`` once #144 merges. SRT_SLURM_RECIPES_REPO="https://github.com/ch-wan/srt-slurm.git" - SRT_SLURM_RECIPES_REF="c901ad38ac917a21ddc150d15c2d8cdeff6aa381" + SRT_SLURM_RECIPES_REF="518476323f19226513a1691714c550582cffe343" SRT_RECIPE_SRC="$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" SRT_RECIPE_DST="recipes/sglang/deepseek-v4" elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then From 9ff03f21a817b7f7b6917830579ef45bc26a0cdb Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Fri, 8 May 2026 18:14:19 -0700 Subject: [PATCH 17/19] Revert MTP recipes to staging-dev container (gsm8k accuracy fix) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bump to ``lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab`` introduced an MTP-path accuracy regression: gsm8k em_strict dropped from the expected ~0.93 to ~0.42 (run 25585766931 eval_results_all shows 0.4200 for 4p1d-dep4-dep8 conc=1024). Local repro on the cluster: the failed 5-shot prompt sent through plain sglang chat completion returns the correct answer; through the dynamo+nightly pipeline it returns garbage prefixed with junk tokens. 
Restore the same staging-dev container the base ``dsv4-fp4-gb300- dynamo-sglang`` sweep already runs on. Drop the dependent flags that only existed because of the nightly bump: - container: nightly-dev-cu13-20260508-2cf1a4ab → sglang-staging: deepseek-v4-grace-blackwell-dev (matches the matrix entry's image) - ``tool-call-parser: deepseekv4`` removed (the chat-encoding-spec routing it gated on doesn't exist in staging-dev; HF chat_template handles DSV4 prompts directly via dynamo's native Rust formatter). - Env vars reverted to ``SGLANG_ENABLE_THINKING`` / ``SGLANG_REASONING_EFFORT`` (the names staging-dev recognizes). - nvidia-master.yaml MTP entry image updated to match. The dynamo hash, srt-slurm fork pin, sbatch_directives, and multi-node CAR_V2 disable all stay (still required). Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 2 +- .../8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml | 12 +++++------- .../8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml | 12 +++++------- .../8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml | 12 +++++------- .../8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml | 12 +++++------- .../8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml | 12 +++++------- .../8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml | 12 +++++------- 7 files changed, 31 insertions(+), 43 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 39b173d1a..3a6005190 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -8316,7 +8316,7 @@ dsv4-fp4-gb300-dynamo-sglang: # MTP variant of dsv4-fp4-gb300-dynamo-sglang. 
dsv4-fp4-gb300-dynamo-sglang-mtp: - image: lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab + image: lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb300-cw diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml index b21af9395..58b2ee47a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml @@ -11,7 +11,7 @@ dynamo: model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab" + container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" precision: "mxfp4" sbatch_directives: @@ -33,8 +33,8 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -51,8 +51,8 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -72,7 +72,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true - tool-call-parser: deepseekv4 # gates the dsv4 chat-encoding spec. 
disaggregation-mode: "prefill" disaggregation-transfer-backend: mooncake @@ -93,7 +92,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true - tool-call-parser: deepseekv4 # gates the dsv4 chat-encoding spec. disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml index 23d7401cc..5e335f64d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml @@ -11,7 +11,7 @@ dynamo: model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab" + container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" precision: "mxfp4" sbatch_directives: @@ -33,8 +33,8 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -62,8 +62,8 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -83,7 +83,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true - tool-call-parser: deepseekv4 # gates the dsv4 chat-encoding spec. 
disaggregation-mode: "prefill" disaggregation-transfer-backend: mooncake @@ -107,7 +106,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true - tool-call-parser: deepseekv4 # gates the dsv4 chat-encoding spec. disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml index 07ed5596e..2af8ef2d6 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml @@ -11,7 +11,7 @@ dynamo: model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab" + container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" precision: "mxfp4" sbatch_directives: @@ -35,8 +35,8 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -64,8 +64,8 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -94,7 +94,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true - tool-call-parser: deepseekv4 # gates the dsv4 chat-encoding spec. 
disaggregation-mode: "prefill" disaggregation-transfer-backend: mooncake @@ -119,7 +118,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true - tool-call-parser: deepseekv4 # gates the dsv4 chat-encoding spec. disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml index 8036e2e18..cbd47b932 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml @@ -11,7 +11,7 @@ dynamo: model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab" + container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" precision: "mxfp4" sbatch_directives: @@ -35,8 +35,8 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -64,8 +64,8 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -94,7 +94,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true - tool-call-parser: deepseekv4 # gates the dsv4 chat-encoding spec. 
disaggregation-mode: "prefill" disaggregation-transfer-backend: mooncake @@ -119,7 +118,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true - tool-call-parser: deepseekv4 # gates the dsv4 chat-encoding spec. disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml index cd8569a08..657c0b02d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml @@ -11,7 +11,7 @@ dynamo: model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab" + container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" precision: "mxfp4" sbatch_directives: @@ -35,8 +35,8 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -64,8 +64,8 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -94,7 +94,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true - tool-call-parser: deepseekv4 # gates the dsv4 chat-encoding spec. 
disaggregation-mode: "prefill" disaggregation-transfer-backend: mooncake @@ -119,7 +118,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true - tool-call-parser: deepseekv4 # gates the dsv4 chat-encoding spec. disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml index 715df595b..20b59b5c2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml @@ -11,7 +11,7 @@ dynamo: model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab" + container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" precision: "mxfp4" sbatch_directives: @@ -35,8 +35,8 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -64,8 +64,8 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_DEFAULT_THINKING: "1" - SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -94,7 +94,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true - tool-call-parser: deepseekv4 # gates the dsv4 chat-encoding spec. 
disaggregation-mode: "prefill" disaggregation-transfer-backend: mooncake @@ -119,7 +118,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true - tool-call-parser: deepseekv4 # gates the dsv4 chat-encoding spec. disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake From 9b0611356720d7ca8feabd0a18f61ae0d1def5c6 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Fri, 8 May 2026 19:11:54 -0700 Subject: [PATCH 18/19] Bump dynamo hash to 34d55a5 to fix DSV4 chat-template formatter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Local repro on the cluster (job 2226, slurm-gb300-139-009) confirmed the regression is in dynamo's wrapper, not in sglang main: - sglang main (2cf1a4ab) standalone, same failed 5-shot: prompt_tokens=1128, answer=18 (correct). - sglang main + dynamo 81d0555e (CI): answer="Weapon:#### 16" (em_strict=0.42). The pinned dynamo at 81d0555e ships an older Rust DSV4 prompt formatter whose ``render()`` always calls ``encode_messages(...)`` — which hardcodes ``reasoning_effort=None`` and ignores ``chat_template_kwargs`` entirely. That produces a prompt the model fails on under MTP. Dynamo PR #9322 (commit 34d55a5, "Deduplicate DeepSeek prompt encoders v3.2 and v4") rewrote ``render()`` to read ``reasoning_effort`` and ``drop_thinking`` from ``chat_template_args`` and plumb them into ``encode_messages_with_options``, fixing the DSV4 prompt rendering. Restore the changes the staging-dev revert had to undo: - container: nightly-dev-cu13-20260508-2cf1a4ab - tool-call-parser: deepseekv4 (gates the dsv4 chat-encoding spec) - SGLANG_DEFAULT_THINKING / SGLANG_DSV4_REASONING_EFFORT - dynamo.hash 81d0555e -> 34d55a5 - nvidia-master.yaml MTP entry image CAR_V2 disable on multi-node decode and the srt-slurm fork pin remain. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 2 +- .../8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml | 14 ++++++++------ .../8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml | 14 ++++++++------ .../8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml | 14 ++++++++------ .../8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml | 14 ++++++++------ .../8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml | 14 ++++++++------ .../8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml | 14 ++++++++------ 7 files changed, 49 insertions(+), 37 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 3a6005190..39b173d1a 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -8316,7 +8316,7 @@ dsv4-fp4-gb300-dynamo-sglang: # MTP variant of dsv4-fp4-gb300-dynamo-sglang. dsv4-fp4-gb300-dynamo-sglang-mtp: - image: lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev + image: lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb300-cw diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml index 58b2ee47a..1d45b2a1d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml @@ -6,12 +6,12 @@ frontend: num_additional_frontends: 8 dynamo: - hash: "81d0555ee23519cea80a42b4fe824e30368b7300" + hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" install: true model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" + container: "lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab" precision: "mxfp4" sbatch_directives: @@ -33,8 +33,8 @@ backend: 
PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_ENABLE_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -51,8 +51,8 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_ENABLE_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -72,6 +72,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true + tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec. disaggregation-mode: "prefill" disaggregation-transfer-backend: mooncake @@ -92,6 +93,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true + tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec. 
disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml index 5e335f64d..52bfe60cd 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml @@ -6,12 +6,12 @@ frontend: num_additional_frontends: 8 dynamo: - hash: "81d0555ee23519cea80a42b4fe824e30368b7300" + hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" install: true model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" + container: "lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab" precision: "mxfp4" sbatch_directives: @@ -33,8 +33,8 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_ENABLE_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -62,8 +62,8 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_ENABLE_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -83,6 +83,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true + tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec. 
disaggregation-mode: "prefill" disaggregation-transfer-backend: mooncake @@ -106,6 +107,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true + tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec. disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml index 2af8ef2d6..1c117b5fa 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml @@ -6,12 +6,12 @@ frontend: num_additional_frontends: 8 dynamo: - hash: "81d0555ee23519cea80a42b4fe824e30368b7300" + hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" install: true model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" + container: "lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab" precision: "mxfp4" sbatch_directives: @@ -35,8 +35,8 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_ENABLE_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -64,8 +64,8 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_ENABLE_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -94,6 +94,7 @@ backend: served-model-name: 
"deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true + tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec. disaggregation-mode: "prefill" disaggregation-transfer-backend: mooncake @@ -118,6 +119,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true + tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec. disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml index cbd47b932..78f8981d0 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml @@ -6,12 +6,12 @@ frontend: num_additional_frontends: 8 dynamo: - hash: "81d0555ee23519cea80a42b4fe824e30368b7300" + hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" install: true model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" + container: "lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab" precision: "mxfp4" sbatch_directives: @@ -35,8 +35,8 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_ENABLE_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -64,8 +64,8 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_ENABLE_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" 
SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -94,6 +94,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true + tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec. disaggregation-mode: "prefill" disaggregation-transfer-backend: mooncake @@ -118,6 +119,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true + tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec. disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml index 657c0b02d..a58e7c5f5 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml @@ -6,12 +6,12 @@ frontend: num_additional_frontends: 8 dynamo: - hash: "81d0555ee23519cea80a42b4fe824e30368b7300" + hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" install: true model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" + container: "lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab" precision: "mxfp4" sbatch_directives: @@ -35,8 +35,8 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_ENABLE_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -64,8 +64,8 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_ENABLE_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" + 
SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -94,6 +94,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true + tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec. disaggregation-mode: "prefill" disaggregation-transfer-backend: mooncake @@ -118,6 +119,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true + tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec. disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml index 20b59b5c2..fa8385d6e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml @@ -6,12 +6,12 @@ frontend: num_additional_frontends: 8 dynamo: - hash: "81d0555ee23519cea80a42b4fe824e30368b7300" + hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" install: true model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang-staging:deepseek-v4-grace-blackwell-dev" + container: "lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab" precision: "mxfp4" sbatch_directives: @@ -35,8 +35,8 @@ backend: PYTHONUNBUFFERED: "1" SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_ENABLE_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -64,8 +64,8 @@ backend: PYTHONUNBUFFERED: "1" 
SGLANG_RADIX_DISABLE_REUSE: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_ENABLE_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" @@ -94,6 +94,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true + tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec. disaggregation-mode: "prefill" disaggregation-transfer-backend: mooncake @@ -118,6 +119,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" model-path: "/model/" trust-remote-code: true + tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec. disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake From 36bf04050229251ce6e847e2a4e99671d48b485b Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Fri, 8 May 2026 19:16:43 -0700 Subject: [PATCH 19/19] Bump sglang container to nightly-dev-cu13-20260509-9ee83034 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Latest sglang main build (sgl-project/sglang Actions run 25586829316, head_sha 9ee83034, completed 2026-05-09 00:51 UTC). Pairs with the dynamo bump in 9b061135 (commit 34d55a5, PR #9322 — DSV4 chat- template formatter rewrite). Updated all 6 MTP recipe ``container:`` fields and the ``dsv4-fp4-gb300-dynamo-sglang-mtp`` matrix entry's ``image:`` in nvidia-master.yaml. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 2 +- .../deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml | 2 +- .../deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml | 2 +- .../deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml | 2 +- .../deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml | 2 +- .../deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml | 2 +- .../deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 39b173d1a..1b23be292 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -8316,7 +8316,7 @@ dsv4-fp4-gb300-dynamo-sglang: # MTP variant of dsv4-fp4-gb300-dynamo-sglang. dsv4-fp4-gb300-dynamo-sglang-mtp: - image: lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab + image: lmsysorg/sglang:nightly-dev-cu13-20260509-9ee83034 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb300-cw diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml index 1d45b2a1d..6e87911cd 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml @@ -11,7 +11,7 @@ dynamo: model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab" + container: "lmsysorg/sglang:nightly-dev-cu13-20260509-9ee83034" precision: "mxfp4" sbatch_directives: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml index 52bfe60cd..1ddb20a1e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml @@ -11,7 +11,7 @@ dynamo: model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab" + container: "lmsysorg/sglang:nightly-dev-cu13-20260509-9ee83034" precision: "mxfp4" sbatch_directives: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml index 1c117b5fa..65a597eb8 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml @@ -11,7 +11,7 @@ dynamo: model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab" + container: "lmsysorg/sglang:nightly-dev-cu13-20260509-9ee83034" precision: "mxfp4" sbatch_directives: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml index 78f8981d0..b21f60d23 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml @@ -11,7 +11,7 @@ dynamo: model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab" + container: "lmsysorg/sglang:nightly-dev-cu13-20260509-9ee83034" precision: "mxfp4" 
sbatch_directives: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml index a58e7c5f5..a11086da7 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml @@ -11,7 +11,7 @@ dynamo: model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab" + container: "lmsysorg/sglang:nightly-dev-cu13-20260509-9ee83034" precision: "mxfp4" sbatch_directives: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml index fa8385d6e..e7054c288 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml @@ -11,7 +11,7 @@ dynamo: model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260508-2cf1a4ab" + container: "lmsysorg/sglang:nightly-dev-cu13-20260509-9ee83034" precision: "mxfp4" sbatch_directives: