From 5fe1a358c5d2efae950d28b893991c07b4cedf04 Mon Sep 17 00:00:00 2001 From: haic0 Date: Tue, 5 May 2026 21:45:53 -0500 Subject: [PATCH 1/4] Update kimi k2.5 mtp support by using modelrunnerV2 and vllm v0.20.1 Signed-off-by: haic0 --- .github/configs/amd-master.yaml | 2421 ++++++++--------- .../single_node/kimik2.5_int4_mi300x_mtp.sh | 86 + .../single_node/kimik2.5_int4_mi325x_mtp.sh | 86 + .../single_node/kimik2.5_int4_mi355x_mtp.sh | 86 + 4 files changed, 1371 insertions(+), 1308 deletions(-) create mode 100644 benchmarks/single_node/kimik2.5_int4_mi300x_mtp.sh create mode 100644 benchmarks/single_node/kimik2.5_int4_mi325x_mtp.sh create mode 100644 benchmarks/single_node/kimik2.5_int4_mi355x_mtp.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 44045c274..b8b84d535 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -6,21 +6,16 @@ dsr1-fp4-mi355x-sglang: precision: fp4 framework: sglang multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 16, 32, 64, 128, 256] } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } dsr1-fp4-mi355x-atom: image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x @@ -30,18 +25,17 @@ dsr1-fp4-mi355x-atom: precision: fp4 framework: atom multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } dsr1-fp4-mi355x-atom-mtp: image: rocm/atom:rocm7.2.0-ubuntu24.04-pytorch2.9-atom0.1.1 @@ -52,18 +46,17 @@ dsr1-fp4-mi355x-atom-mtp: # WIP framework (no customers yet) framework: atom multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - #- { tp: 4, conc-start: 32, conc-end: 256, spec-decoding: mtp } - - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + #- { tp: 4, conc-start: 32, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } dsr1-fp8-mi300x-sglang: image: lmsysorg/sglang:v0.5.9-rocm700-mi30x @@ -73,16 +66,15 @@ dsr1-fp8-mi300x-sglang: precision: fp8 framework: sglang multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } dsr1-fp8-mi325x-sglang: image: lmsysorg/sglang:v0.5.9-rocm700-mi30x @@ -92,16 +84,15 @@ dsr1-fp8-mi325x-sglang: precision: fp8 framework: sglang multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } dsr1-fp8-mi355x-sglang: image: lmsysorg/sglang:v0.5.9-rocm700-mi35x @@ -111,17 +102,16 @@ dsr1-fp8-mi355x-sglang: precision: fp8 framework: sglang multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, conc-start: 32, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, conc-start: 32, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-bf16-mi355x-sglang: image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 @@ -131,16 +121,15 @@ qwen3.5-bf16-mi355x-sglang: precision: bf16 framework: sglang multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } qwen3.5-bf16-mi355x-sglang-mtp: image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 @@ -150,16 +139,15 @@ qwen3.5-bf16-mi355x-sglang-mtp: precision: bf16 framework: sglang multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } qwen3.5-bf16-mi300x-sglang: image: lmsysorg/sglang:v0.5.10-rocm720-mi30x @@ -169,16 +157,15 @@ qwen3.5-bf16-mi300x-sglang: precision: bf16 framework: sglang multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-bf16-mi325x-sglang: image: lmsysorg/sglang:v0.5.10-rocm720-mi30x @@ -188,16 +175,15 @@ qwen3.5-bf16-mi325x-sglang: precision: bf16 framework: sglang multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-fp8-mi325x-sglang: image: lmsysorg/sglang:v0.5.10-rocm720-mi30x @@ -207,16 +193,15 @@ qwen3.5-fp8-mi325x-sglang: precision: fp8 framework: sglang multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-fp8-mi355x-sglang: image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414 @@ -226,19 +211,18 @@ qwen3.5-fp8-mi355x-sglang: precision: fp8 framework: sglang multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - - { tp: 8, ep: 8, conc-start: 64, conc-end: 256 } - - { tp: 2, ep: 2, conc-start: 128, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, ep: 2, conc-start: 4, conc-end: 32 } - - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, conc-start: 64, conc-end: 256 } + - { tp: 2, ep: 2, conc-start: 128, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, ep: 2, conc-start: 4, conc-end: 32 } + - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 } qwen3.5-fp8-mi355x-sglang-mtp: image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414 @@ -248,63 +232,18 @@ qwen3.5-fp8-mi355x-sglang-mtp: precision: fp8 framework: sglang multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp } - - { tp: 8, ep: 8, conc-start: 64, conc-end: 256, spec-decoding: mtp } - - { tp: 2, ep: 2, conc-start: 128, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, ep: 2, conc-start: 4, conc-end: 32, spec-decoding: mtp } - - { tp: 4, ep: 1, conc-start: 32, conc-end: 256, spec-decoding: mtp } - -qwen3.5-fp8-mi355x-atom: - image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - runner: mi355x - precision: fp8 - framework: atom - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, ep: 1, conc-start: 4, conc-end: 256 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, ep: 1, conc-start: 4, conc-end: 256 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } - -qwen3.5-fp8-mi355x-atom-mtp: - image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - runner: mi355x - precision: fp8 - framework: atom - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp } + - { tp: 8, ep: 8, conc-start: 64, conc-end: 256, spec-decoding: mtp } + - { tp: 2, ep: 2, conc-start: 128, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, ep: 2, conc-start: 4, conc-end: 32, spec-decoding: mtp } + - { tp: 4, ep: 1, conc-start: 32, conc-end: 256, spec-decoding: mtp } qwen3.5-fp4-mi355x-sglang: image: rocm/sgl-dev:v0.5.10rc0-rocm720-mi35x-20260413 @@ -314,39 +253,17 @@ qwen3.5-fp4-mi355x-sglang: precision: fp4 framework: sglang multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 16 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 16 } - -qwen3.5-fp4-mi355x-atom: - image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post - model: amd/Qwen3.5-397B-A17B-MXFP4 - model-prefix: qwen3.5 - runner: mi355x - precision: fp4 - framework: atom - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 16 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 16 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 16 } qwen3.5-fp8-mi300x-sglang: image: lmsysorg/sglang:v0.5.10-rocm720-mi30x @@ -356,16 +273,15 @@ qwen3.5-fp8-mi300x-sglang: precision: fp8 framework: sglang multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } glm5-fp8-mi355x-sglang: image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260413 @@ -375,58 +291,51 @@ glm5-fp8-mi355x-sglang: precision: fp8 framework: sglang multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } glm5-fp8-mi355x-sglang-mtp: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260413 model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: mi355x precision: fp8 framework: sglang multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp } - - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp } - - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } glm5-fp8-mi355x-atom: - image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post + image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2.post model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: mi355x precision: fp8 framework: atom multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, conc-start: 4, conc-end: 256 } - - { tp: 8, conc-start: 4, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, conc-start: 4, conc-end: 256 } - - { tp: 8, conc-start: 4, conc-end: 256 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 256 } glm5.1-fp4-mi355x-sglang: image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 @@ -436,37 +345,17 @@ glm5.1-fp4-mi355x-sglang: precision: fp4 framework: sglang multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 16 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 16 } - -glm5.1-fp4-mi355x-atom: - image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post - model: amd/GLM-5.1-MXFP4 - model-prefix: glm5.1 - runner: mi355x - precision: fp4 - framework: atom - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, conc-start: 4, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, conc-start: 4, conc-end: 256 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 16 } kimik2.5-int4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.18.0 @@ -476,16 +365,15 @@ kimik2.5-int4-mi355x-vllm: precision: int4 framework: vllm multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } kimik2.5-int4-mi325x-vllm: image: vllm/vllm-openai-rocm:v0.18.0 @@ -495,16 +383,15 @@ kimik2.5-int4-mi325x-vllm: precision: int4 framework: vllm multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } kimik2.5-int4-mi300x-vllm: image: vllm/vllm-openai-rocm:v0.18.0 @@ -514,17 +401,70 @@ kimik2.5-int4-mi300x-vllm: precision: int4 framework: vllm multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +kimik2.5-int4-mi355x-vllm-mtp: + image: vllm/vllm-openai-rocm:v0.20.1 + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + runner: mi355x + precision: int4 + framework: vllm + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + +kimik2.5-int4-mi300x-vllm-mtp: + image: vllm/vllm-openai-rocm:v0.20.1 + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + runner: mi300x + precision: int4 + framework: vllm + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + +kimik2.5-int4-mi325x-vllm-mtp: + image: vllm/vllm-openai-rocm:v0.20.1 + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + runner: mi325x + precision: int4 + framework: vllm + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + kimik2.5-fp4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.18.0 model: amd/Kimi-K2.5-MXFP4 @@ -533,18 +473,17 @@ kimik2.5-fp4-mi355x-vllm: precision: fp4 framework: vllm multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } kimik2.5-fp4-mi355x-atom: image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 @@ -554,18 +493,17 @@ kimik2.5-fp4-mi355x-atom: precision: fp4 framework: atom multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } minimaxm2.5-fp8-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.19.0 @@ -575,66 +513,41 @@ minimaxm2.5-fp8-mi355x-vllm: precision: fp8 framework: vllm multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, ep: 2, conc-start: 2, conc-end: 512 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 256 } - - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, ep: 2, conc-start: 2, conc-end: 256 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 512 } - - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, ep: 2, conc-start: 2, conc-end: 512 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, ep: 2, conc-start: 2, conc-end: 256 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 512 } + - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 } minimaxm2.5-fp8-mi355x-atom: - image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post + image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi355x precision: fp8 framework: atom multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 256 } - -minimaxm2.5-fp4-mi355x-atom: - image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post - model: amd/MiniMax-M2.5-MXFP4 - model-prefix: minimaxm2.5 - runner: mi355x - precision: fp4 - framework: atom - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 1024 } - - { tp: 2, conc-start: 4, conc-end: 1024 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 1024 } - - { tp: 2, conc-start: 4, conc-end: 1024 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 16 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 32, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 32, conc-end: 256 } minimaxm2.5-fp4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.19.1 @@ -644,20 +557,19 @@ minimaxm2.5-fp4-mi355x-vllm: precision: fp4 framework: vllm multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 32 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 32 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 32 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 32 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } minimaxm2.5-fp8-mi300x-vllm: image: vllm/vllm-openai-rocm:v0.16.0 @@ -667,18 +579,17 @@ minimaxm2.5-fp8-mi300x-vllm: precision: fp8 framework: vllm multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } minimaxm2.5-fp8-mi325x-vllm: image: vllm/vllm-openai-rocm:v0.18.0 @@ -688,18 +599,17 @@ minimaxm2.5-fp8-mi325x-vllm: precision: fp8 framework: vllm multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 512 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 512 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 } gptoss-fp4-mi300x-vllm: image: vllm/vllm-openai-rocm:v0.17.0 @@ -709,22 +619,21 @@ gptoss-fp4-mi300x-vllm: precision: fp4 framework: vllm multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 64, conc-end: 256 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 1, conc-end: 16 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 1, conc-end: 16 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, conc-start: 64, conc-end: 256 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 1, conc-end: 16 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 1, conc-end: 16 } gptoss-fp4-mi325x-vllm: image: vllm/vllm-openai-rocm:v0.17.0 @@ -734,22 +643,21 @@ gptoss-fp4-mi325x-vllm: precision: fp4 framework: vllm multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 8 } - - { tp: 4, conc-start: 4, conc-end: 8 } - - { tp: 8, conc-start: 4, conc-end: 16 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 8 } + - { tp: 4, conc-start: 4, conc-end: 8 } + - { tp: 8, conc-start: 4, conc-end: 16 } gptoss-fp4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.17.0 @@ -759,41 +667,39 @@ gptoss-fp4-mi355x-vllm: precision: fp4 framework: vllm multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 8 } - - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 4 } - - { tp: 8, conc-start: 4, conc-end: 8 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 8 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 4 } + - { tp: 8, conc-start: 4, conc-end: 8 } gptoss-fp4-mi355x-atom: - image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post + image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x model: openai/gpt-oss-120b model-prefix: gptoss runner: mi355x precision: fp4 framework: atom multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 16, conc-end: 256 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 256 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 16 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, conc-start: 16, conc-end: 128 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 16 } dsr1-fp8-mi355x-atom: image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x @@ -804,16 +710,15 @@ dsr1-fp8-mi355x-atom: # WIP framework (no customers yet) framework: atom multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 128 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 128 } dsr1-fp8-mi355x-atom-mtp: image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 @@ -823,16 +728,15 @@ dsr1-fp8-mi355x-atom-mtp: precision: fp8 framework: atom multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } dsr1-fp8-mi355x-sglang-disagg: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 @@ -843,151 +747,150 @@ dsr1-fp8-mi355x-sglang-disagg: framework: sglang-disagg multinode: true disagg: true - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - # non-MTP configurations - # "Top of curve" (1 prefill workers each at DEP8 and 1 decode workers at DEP16) - - spec-decoding: "none" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # "Middle of curve" (1 prefill workers each at TP8 and 2 decode workers at DEP8) - - spec-decoding: "none" - conc-list: [ 1536, 1024, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - - # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - - spec-decoding: "none" - conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - - spec-decoding: "none" - conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - - isl: 8192 - osl: 1024 - search-space: - # non-MTP configurations - # "Top of curve" (2 prefill worker at DEP8 and 1 decode worker at DEP8) - - spec-decoding: "none" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8) - - spec-decoding: "none" - conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - - spec-decoding: "none" - conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # non-MTP configurations + # "Top of curve" (1 prefill workers each at DEP8 and 1 decode workers at DEP16) + - spec-decoding: "none" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # "Middle of curve" (1 prefill workers each at TP8 and 2 decode workers at DEP8) + - spec-decoding: "none" + conc-list: [ 1536, 1024, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + + # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) + - spec-decoding: "none" + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + - spec-decoding: "none" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + - isl: 8192 + osl: 1024 + search-space: + # non-MTP configurations + # "Top of curve" (2 prefill worker at DEP8 and 1 decode worker at DEP8) + - spec-decoding: "none" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8) + - spec-decoding: "none" + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + - spec-decoding: "none" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" dsr1-fp8-mi355x-sglang-disagg-mtp: @@ -999,151 +902,150 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: framework: sglang-disagg multinode: true disagg: true - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - # MTP configurations - # "Top of curve" (1 prefill worker at DEP8 and 1 decode worker at DEP16) - - spec-decoding: "mtp" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - # "Middle of curve" (1 prefill worker at TP8 and 2 decode workers each at DEP8) - - spec-decoding: "mtp" - conc-list: [ 1536, 1024, 512, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - - # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - - spec-decoding: "mtp" - conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=2" - - - spec-decoding: "mtp" - conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=2" - - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations - # "Top of curve" (2 prefill worker at DEP8 and 1 decode worker at DEP8) - - spec-decoding: "mtp" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8) - - spec-decoding: "mtp" - conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=2" - - - spec-decoding: "mtp" - conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=2" + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations + # "Top of curve" (1 prefill worker at DEP8 and 1 decode worker at DEP16) + - spec-decoding: "mtp" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # "Middle of curve" (1 prefill worker at TP8 and 2 decode workers each at DEP8) + - spec-decoding: "mtp" + conc-list: [ 1536, 1024, 512, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + + # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) + - spec-decoding: "mtp" + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=2" + + - spec-decoding: "mtp" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=2" + + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations + # "Top of curve" (2 prefill worker at DEP8 and 1 decode worker at DEP8) + - spec-decoding: "mtp" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + + # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8) + - spec-decoding: "mtp" + conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=2" + + - spec-decoding: "mtp" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=2" dsr1-fp4-mi355x-sglang-disagg: @@ -1155,205 +1057,204 @@ dsr1-fp4-mi355x-sglang-disagg: framework: sglang-disagg multinode: true disagg: true - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - # non-MTP configurations - # 1P1D TP8 - - spec-decoding: "none" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP8 - - spec-decoding: "none" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP8 - - spec-decoding: "none" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP4 - - spec-decoding: "none" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # non-MTP configurations + # 1P1D TP8 + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP4 + - spec-decoding: "none" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" - # 1*DEP4+ 1*DEP8 - - spec-decoding: "none" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - - isl: 8192 - osl: 1024 - search-space: - # non-MTP configurations - # 1P1D pure TP8 - - spec-decoding: "none" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP8 - - spec-decoding: "none" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP8 - - spec-decoding: "none" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP4 - - spec-decoding: "none" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 4*DEP4 + 1*DEP8 - - spec-decoding: "none" - conc-list: [ 1024, 2048, 4096 ] - prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=4" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" + # 1*DEP4+ 1*DEP8 + - spec-decoding: "none" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + - isl: 8192 + osl: 1024 + search-space: + # non-MTP configurations + # 1P1D pure TP8 + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP4 + - spec-decoding: "none" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 4*DEP4 + 1*DEP8 + - spec-decoding: "none" + conc-list: [ 1024, 2048, 4096 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" dsr1-fp4-mi355x-sglang-disagg-mtp: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 @@ -1364,299 +1265,203 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: framework: sglang-disagg multinode: true disagg: true - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - # MTP configurations - # 1P1D TP8 - - spec-decoding: "mtp" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - # 1P2D TP4 - - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - # 1*DEP4+ 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations - # 1P1D pure TP8 - - spec-decoding: "mtp" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=3" - - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - # 1P2D TP4 - - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - # 4*DEP4 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 1024, 2048, 4096 ] - prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=4" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - -dsv4-fp8-mi355x-sglang: - image: rocm/sgl-dev:deepseek-v4-mi35x - model: sgl-project/DeepSeek-V4-Pro-FP8 - model-prefix: dsv4 - runner: mi355x - precision: fp8 - framework: sglang - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - -# DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the -# amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the -# image tag, so bumping sglang is just an image tag bump here. Sweeps -# DP-attention on/off and EP=8. -dsv4-fp4-mi355x-sglang: - image: rocm/sgl-dev:rocm720-mi35x-a8410de-20260502-DSv4 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: mi355x - precision: fp4 - framework: sglang - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, dp-attn: true, conc-start: 16, conc-end: 256 } - - { tp: 8, dp-attn: false, conc-start: 1 , conc-end: 16 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, dp-attn: true, conc-start: 16, conc-end: 128 } - - { tp: 8, dp-attn: false, conc-start: 1, conc-end: 16 } - -# vLLM with AITER MLA decode for DSv4 on MI355X (vllm-project/vllm#40889, -# stacked on #40871). Uses the ATOM MI355X image (ROCm 7.2.2, aiter with -# MLA decode, MI355X GPU detection); vLLM is rebuilt from the PR branch -# at runtime by benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh at a -# pinned SHA. Once both PRs merge into a release, switch to a vLLM ROCm -# MI355X image and remove the build step. -dsv4-fp8-mi355x-vllm: - image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: mi355x - precision: fp8 - framework: vllm - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 1, conc-end: 1 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 1, conc-end: 1 } - -# Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650). -# PR1 of the ATOM DSv4 series still uses torch sparse-attention fallbacks -# that OOM once warmup/prefill batches multiple requests; keep CONC=1 until -# the AITER sparse-attention kernel / multi-request path lands upstream. -# --enforce-eager and ATOM_USE_TRITON_MOE=1 are required on gfx950. Image is -# the standard atom0.1.2.post MI355X base (matching qwen3.5-fp8-mi355x-atom); -# the DSv4 PR is overlaid at runtime by dsv4_fp4_mi355x_atom.sh at a pinned SHA. -dsv4-fp4-mi355x-atom: - image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: mi355x - precision: fp4 - framework: atom - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations + # 1P1D TP8 + - spec-decoding: "mtp" + conc-list: [ 1, 2, 4, 8 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=3" + + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # 1P2D TP4 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # 1*DEP4+ 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + + + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations + # 1P1D pure TP8 + - spec-decoding: "mtp" + conc-list: [ 1, 2, 4, 8 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + + + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=3" + + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # 1P2D TP4 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # 4*DEP4 + 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 1024, 2048, 4096 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" diff --git a/benchmarks/single_node/kimik2.5_int4_mi300x_mtp.sh b/benchmarks/single_node/kimik2.5_int4_mi300x_mtp.sh new file mode 100644 index 000000000..c32ed1ca4 --- /dev/null +++ b/benchmarks/single_node/kimik2.5_int4_mi300x_mtp.sh @@ -0,0 +1,86 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + MAX_MODEL_LEN \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +hf download "$MODEL" + +# Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+ +if [ -n "$ROCR_VISIBLE_DEVICES" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor + +echo "Ensuring host benchmark dependencies (aiohttp/transformers/numpy/tqdm/huggingface-hub/tiktoken)..." +python3 -m pip install --no-cache-dir aiohttp transformers numpy tqdm huggingface-hub tiktoken + + + +set -x +export VLLM_USE_V2_MODEL_RUNNER=1 +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +export VLLM_ROCM_USE_AITER_RMSNORM=0 +export VLLM_SPEC_CONFIG="${VLLM_SPEC_CONFIG:-{\"model\": \"nvidia/Kimi-K2.5-Thinking-Eagle3\", \"method\": \"eagle3\", \"num_speculative_tokens\": 3}}" + +vllm serve $MODEL --port $PORT \ +--tensor-parallel-size=$TP \ +--gpu-memory-utilization 0.9 \ +--max-model-len $MAX_MODEL_LEN \ +--block-size=64 \ +--trust-remote-code \ +--no-enable-prefix-caching \ +--max-num-seqs 256 \ +--speculative-config "$VLLM_SPEC_CONFIG" \ +--mm-encoder-tp-mode data > $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --trust-remote-code + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +# Stop GPU monitoring +stop_gpu_monitor +set +x + diff --git a/benchmarks/single_node/kimik2.5_int4_mi325x_mtp.sh b/benchmarks/single_node/kimik2.5_int4_mi325x_mtp.sh new file mode 100644 index 000000000..c32ed1ca4 --- /dev/null +++ b/benchmarks/single_node/kimik2.5_int4_mi325x_mtp.sh @@ -0,0 +1,86 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + MAX_MODEL_LEN \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +hf download "$MODEL" + +# Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+ +if [ -n "$ROCR_VISIBLE_DEVICES" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor + +echo "Ensuring host benchmark dependencies (aiohttp/transformers/numpy/tqdm/huggingface-hub/tiktoken)..." +python3 -m pip install --no-cache-dir aiohttp transformers numpy tqdm huggingface-hub tiktoken + + + +set -x +export VLLM_USE_V2_MODEL_RUNNER=1 +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +export VLLM_ROCM_USE_AITER_RMSNORM=0 +export VLLM_SPEC_CONFIG="${VLLM_SPEC_CONFIG:-{\"model\": \"nvidia/Kimi-K2.5-Thinking-Eagle3\", \"method\": \"eagle3\", \"num_speculative_tokens\": 3}}" + +vllm serve $MODEL --port $PORT \ +--tensor-parallel-size=$TP \ +--gpu-memory-utilization 0.9 \ +--max-model-len $MAX_MODEL_LEN \ +--block-size=64 \ +--trust-remote-code \ +--no-enable-prefix-caching \ +--max-num-seqs 256 \ +--speculative-config "$VLLM_SPEC_CONFIG" \ +--mm-encoder-tp-mode data > $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --trust-remote-code + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +# Stop GPU monitoring +stop_gpu_monitor +set +x + diff --git a/benchmarks/single_node/kimik2.5_int4_mi355x_mtp.sh b/benchmarks/single_node/kimik2.5_int4_mi355x_mtp.sh new file mode 100644 index 000000000..c32ed1ca4 --- /dev/null +++ b/benchmarks/single_node/kimik2.5_int4_mi355x_mtp.sh @@ -0,0 +1,86 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + MAX_MODEL_LEN \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +hf download "$MODEL" + +# Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+ +if [ -n "$ROCR_VISIBLE_DEVICES" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor + +echo "Ensuring host benchmark dependencies (aiohttp/transformers/numpy/tqdm/huggingface-hub/tiktoken)..." +python3 -m pip install --no-cache-dir aiohttp transformers numpy tqdm huggingface-hub tiktoken + + + +set -x +export VLLM_USE_V2_MODEL_RUNNER=1 +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +export VLLM_ROCM_USE_AITER_RMSNORM=0 +export VLLM_SPEC_CONFIG="${VLLM_SPEC_CONFIG:-{\"model\": \"nvidia/Kimi-K2.5-Thinking-Eagle3\", \"method\": \"eagle3\", \"num_speculative_tokens\": 3}}" + +vllm serve $MODEL --port $PORT \ +--tensor-parallel-size=$TP \ +--gpu-memory-utilization 0.9 \ +--max-model-len $MAX_MODEL_LEN \ +--block-size=64 \ +--trust-remote-code \ +--no-enable-prefix-caching \ +--max-num-seqs 256 \ +--speculative-config "$VLLM_SPEC_CONFIG" \ +--mm-encoder-tp-mode data > $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --trust-remote-code + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +# Stop GPU monitoring +stop_gpu_monitor +set +x + From f696ca8a44de252bc2480ceb1cf550c341c3ea22 Mon Sep 17 00:00:00 2001 From: haic0 Date: Tue, 5 May 2026 22:11:13 -0500 Subject: [PATCH 2/4] Update kimi k2.5 mtp support by using modelrunnerV2 and vllm v0.20.1 Signed-off-by: haic0 --- .github/configs/amd-master.yaml | 2365 +++++++++++++++++-------------- 1 file changed, 1307 insertions(+), 1058 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index b8b84d535..d5ee5d664 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -6,16 +6,21 @@ dsr1-fp4-mi355x-sglang: precision: fp4 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 16, 32, 64, 128, 256] } dsr1-fp4-mi355x-atom: image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x @@ -25,17 +30,18 @@ dsr1-fp4-mi355x-atom: precision: fp4 framework: atom multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } dsr1-fp4-mi355x-atom-mtp: image: rocm/atom:rocm7.2.0-ubuntu24.04-pytorch2.9-atom0.1.1 @@ -46,17 +52,18 @@ dsr1-fp4-mi355x-atom-mtp: # WIP framework (no customers yet) framework: atom multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - #- { tp: 4, conc-start: 32, conc-end: 256, spec-decoding: mtp } - - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + #- { tp: 4, conc-start: 32, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } dsr1-fp8-mi300x-sglang: image: lmsysorg/sglang:v0.5.9-rocm700-mi30x @@ -66,15 +73,16 @@ dsr1-fp8-mi300x-sglang: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } dsr1-fp8-mi325x-sglang: image: lmsysorg/sglang:v0.5.9-rocm700-mi30x @@ -84,15 +92,16 @@ dsr1-fp8-mi325x-sglang: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } dsr1-fp8-mi355x-sglang: image: lmsysorg/sglang:v0.5.9-rocm700-mi35x @@ -102,16 +111,17 @@ dsr1-fp8-mi355x-sglang: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, conc-start: 32, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, conc-start: 32, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-bf16-mi355x-sglang: image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 @@ -121,15 +131,16 @@ qwen3.5-bf16-mi355x-sglang: precision: bf16 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } qwen3.5-bf16-mi355x-sglang-mtp: image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 @@ -139,15 +150,16 @@ qwen3.5-bf16-mi355x-sglang-mtp: precision: bf16 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } qwen3.5-bf16-mi300x-sglang: image: lmsysorg/sglang:v0.5.10-rocm720-mi30x @@ -157,15 +169,16 @@ qwen3.5-bf16-mi300x-sglang: precision: bf16 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-bf16-mi325x-sglang: image: lmsysorg/sglang:v0.5.10-rocm720-mi30x @@ -175,15 +188,16 @@ qwen3.5-bf16-mi325x-sglang: precision: bf16 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-fp8-mi325x-sglang: image: lmsysorg/sglang:v0.5.10-rocm720-mi30x @@ -193,15 +207,16 @@ qwen3.5-fp8-mi325x-sglang: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-fp8-mi355x-sglang: image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414 @@ -211,18 +226,19 @@ qwen3.5-fp8-mi355x-sglang: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - - { tp: 8, ep: 8, conc-start: 64, conc-end: 256 } - - { tp: 2, ep: 2, conc-start: 128, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, ep: 2, conc-start: 4, conc-end: 32 } - - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, conc-start: 64, conc-end: 256 } + - { tp: 2, ep: 2, conc-start: 128, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, ep: 2, conc-start: 4, conc-end: 32 } + - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 } qwen3.5-fp8-mi355x-sglang-mtp: image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414 @@ -232,18 +248,63 @@ qwen3.5-fp8-mi355x-sglang-mtp: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp } - - { tp: 8, ep: 8, conc-start: 64, conc-end: 256, spec-decoding: mtp } - - { tp: 2, ep: 2, conc-start: 128, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, ep: 2, conc-start: 4, conc-end: 32, spec-decoding: mtp } - - { tp: 4, ep: 1, conc-start: 32, conc-end: 256, spec-decoding: mtp } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp } + - { tp: 8, ep: 8, conc-start: 64, conc-end: 256, spec-decoding: mtp } + - { tp: 2, ep: 2, conc-start: 128, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, ep: 2, conc-start: 4, conc-end: 32, spec-decoding: mtp } + - { tp: 4, ep: 1, conc-start: 32, conc-end: 256, spec-decoding: mtp } + +qwen3.5-fp8-mi355x-atom: + image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi355x + precision: fp8 + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } + +qwen3.5-fp8-mi355x-atom-mtp: + image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi355x + precision: fp8 + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } qwen3.5-fp4-mi355x-sglang: image: rocm/sgl-dev:v0.5.10rc0-rocm720-mi35x-20260413 @@ -253,17 +314,39 @@ qwen3.5-fp4-mi355x-sglang: precision: fp4 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 16 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 16 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 16 } + +qwen3.5-fp4-mi355x-atom: + image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post + model: amd/Qwen3.5-397B-A17B-MXFP4 + model-prefix: qwen3.5 + runner: mi355x + precision: fp4 + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 16 } qwen3.5-fp8-mi300x-sglang: image: lmsysorg/sglang:v0.5.10-rocm720-mi30x @@ -273,15 +356,16 @@ qwen3.5-fp8-mi300x-sglang: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } glm5-fp8-mi355x-sglang: image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260413 @@ -291,51 +375,58 @@ glm5-fp8-mi355x-sglang: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } glm5-fp8-mi355x-sglang-mtp: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260413 + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: mi355x precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } glm5-fp8-mi355x-atom: - image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2.post + image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: mi355x precision: fp8 framework: atom multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 256 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 256 } + - { tp: 8, conc-start: 4, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 256 } + - { tp: 8, conc-start: 4, conc-end: 256 } glm5.1-fp4-mi355x-sglang: image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 @@ -345,17 +436,37 @@ glm5.1-fp4-mi355x-sglang: precision: fp4 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 16 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 16 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 16 } + +glm5.1-fp4-mi355x-atom: + image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post + model: amd/GLM-5.1-MXFP4 + model-prefix: glm5.1 + runner: mi355x + precision: fp4 + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 256 } kimik2.5-int4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.18.0 @@ -365,15 +476,16 @@ kimik2.5-int4-mi355x-vllm: precision: int4 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } kimik2.5-int4-mi325x-vllm: image: vllm/vllm-openai-rocm:v0.18.0 @@ -383,15 +495,16 @@ kimik2.5-int4-mi325x-vllm: precision: int4 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } kimik2.5-int4-mi300x-vllm: image: vllm/vllm-openai-rocm:v0.18.0 @@ -401,15 +514,16 @@ kimik2.5-int4-mi300x-vllm: precision: int4 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } kimik2.5-int4-mi355x-vllm-mtp: image: vllm/vllm-openai-rocm:v0.20.1 @@ -473,17 +587,18 @@ kimik2.5-fp4-mi355x-vllm: precision: fp4 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } kimik2.5-fp4-mi355x-atom: image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 @@ -493,17 +608,18 @@ kimik2.5-fp4-mi355x-atom: precision: fp4 framework: atom multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } minimaxm2.5-fp8-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.19.0 @@ -513,41 +629,66 @@ minimaxm2.5-fp8-mi355x-vllm: precision: fp8 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, ep: 2, conc-start: 2, conc-end: 512 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 256 } - - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, ep: 2, conc-start: 2, conc-end: 256 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 512 } - - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, ep: 2, conc-start: 2, conc-end: 512 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, ep: 2, conc-start: 2, conc-end: 256 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 512 } + - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 } minimaxm2.5-fp8-mi355x-atom: - image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 + image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi355x precision: fp8 framework: atom multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 32, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 32, conc-end: 256 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 256 } + +minimaxm2.5-fp4-mi355x-atom: + image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post + model: amd/MiniMax-M2.5-MXFP4 + model-prefix: minimaxm2.5 + runner: mi355x + precision: fp4 + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 1024 } + - { tp: 2, conc-start: 4, conc-end: 1024 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 1024 } + - { tp: 2, conc-start: 4, conc-end: 1024 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 16 } minimaxm2.5-fp4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.19.1 @@ -557,19 +698,20 @@ minimaxm2.5-fp4-mi355x-vllm: precision: fp4 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 32 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 32 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 32 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 32 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } minimaxm2.5-fp8-mi300x-vllm: image: vllm/vllm-openai-rocm:v0.16.0 @@ -579,17 +721,18 @@ minimaxm2.5-fp8-mi300x-vllm: precision: fp8 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } minimaxm2.5-fp8-mi325x-vllm: image: vllm/vllm-openai-rocm:v0.18.0 @@ -599,17 +742,18 @@ minimaxm2.5-fp8-mi325x-vllm: precision: fp8 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 512 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 512 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 } gptoss-fp4-mi300x-vllm: image: vllm/vllm-openai-rocm:v0.17.0 @@ -619,21 +763,22 @@ gptoss-fp4-mi300x-vllm: precision: fp4 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 64, conc-end: 256 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 1, conc-end: 16 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 1, conc-end: 16 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, conc-start: 64, conc-end: 256 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 1, conc-end: 16 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 1, conc-end: 16 } gptoss-fp4-mi325x-vllm: image: vllm/vllm-openai-rocm:v0.17.0 @@ -643,21 +788,22 @@ gptoss-fp4-mi325x-vllm: precision: fp4 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 8 } - - { tp: 4, conc-start: 4, conc-end: 8 } - - { tp: 8, conc-start: 4, conc-end: 16 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 8 } + - { tp: 4, conc-start: 4, conc-end: 8 } + - { tp: 8, conc-start: 4, conc-end: 16 } gptoss-fp4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.17.0 @@ -667,39 +813,41 @@ gptoss-fp4-mi355x-vllm: precision: fp4 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 8 } - - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 4 } - - { tp: 8, conc-start: 4, conc-end: 8 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 8 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 4 } + - { tp: 8, conc-start: 4, conc-end: 8 } gptoss-fp4-mi355x-atom: - image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x + image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post model: openai/gpt-oss-120b model-prefix: gptoss runner: mi355x precision: fp4 framework: atom multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 16, conc-end: 128 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 16 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, conc-start: 16, conc-end: 256 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 16 } dsr1-fp8-mi355x-atom: image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x @@ -710,15 +858,16 @@ dsr1-fp8-mi355x-atom: # WIP framework (no customers yet) framework: atom multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 128 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 128 } dsr1-fp8-mi355x-atom-mtp: image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 @@ -728,15 +877,16 @@ dsr1-fp8-mi355x-atom-mtp: precision: fp8 framework: atom multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } dsr1-fp8-mi355x-sglang-disagg: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 @@ -747,150 +897,151 @@ dsr1-fp8-mi355x-sglang-disagg: framework: sglang-disagg multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # non-MTP configurations - # "Top of curve" (1 prefill workers each at DEP8 and 1 decode workers at DEP16) - - spec-decoding: "none" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # "Middle of curve" (1 prefill workers each at TP8 and 2 decode workers at DEP8) - - spec-decoding: "none" - conc-list: [ 1536, 1024, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - - # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - - spec-decoding: "none" - conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - - spec-decoding: "none" - conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - - isl: 8192 - osl: 1024 - search-space: - # non-MTP configurations - # "Top of curve" (2 prefill worker at DEP8 and 1 decode worker at DEP8) - - spec-decoding: "none" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8) - - spec-decoding: "none" - conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - - spec-decoding: "none" - conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # non-MTP configurations + # "Top of curve" (1 prefill workers each at DEP8 and 1 decode workers at DEP16) + - spec-decoding: "none" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # "Middle of curve" (1 prefill workers each at TP8 and 2 decode workers at DEP8) + - spec-decoding: "none" + conc-list: [ 1536, 1024, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + + # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) + - spec-decoding: "none" + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + - spec-decoding: "none" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + - isl: 8192 + osl: 1024 + search-space: + # non-MTP configurations + # "Top of curve" (2 prefill worker at DEP8 and 1 decode worker at DEP8) + - spec-decoding: "none" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8) + - spec-decoding: "none" + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + - spec-decoding: "none" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" dsr1-fp8-mi355x-sglang-disagg-mtp: @@ -902,150 +1053,151 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: framework: sglang-disagg multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # MTP configurations - # "Top of curve" (1 prefill worker at DEP8 and 1 decode worker at DEP16) - - spec-decoding: "mtp" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - # "Middle of curve" (1 prefill worker at TP8 and 2 decode workers each at DEP8) - - spec-decoding: "mtp" - conc-list: [ 1536, 1024, 512, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - - # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - - spec-decoding: "mtp" - conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=2" - - - spec-decoding: "mtp" - conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=2" - - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations - # "Top of curve" (2 prefill worker at DEP8 and 1 decode worker at DEP8) - - spec-decoding: "mtp" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8) - - spec-decoding: "mtp" - conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=2" - - - spec-decoding: "mtp" - conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=2" + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations + # "Top of curve" (1 prefill worker at DEP8 and 1 decode worker at DEP16) + - spec-decoding: "mtp" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # "Middle of curve" (1 prefill worker at TP8 and 2 decode workers each at DEP8) + - spec-decoding: "mtp" + conc-list: [ 1536, 1024, 512, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + + # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) + - spec-decoding: "mtp" + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=2" + + - spec-decoding: "mtp" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=2" + + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations + # "Top of curve" (2 prefill worker at DEP8 and 1 decode worker at DEP8) + - spec-decoding: "mtp" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + + # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8) + - spec-decoding: "mtp" + conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=2" + + - spec-decoding: "mtp" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=2" dsr1-fp4-mi355x-sglang-disagg: @@ -1057,204 +1209,205 @@ dsr1-fp4-mi355x-sglang-disagg: framework: sglang-disagg multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # non-MTP configurations - # 1P1D TP8 - - spec-decoding: "none" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP8 - - spec-decoding: "none" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP8 - - spec-decoding: "none" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP4 - - spec-decoding: "none" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # non-MTP configurations + # 1P1D TP8 + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP4 + - spec-decoding: "none" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" - # 1*DEP4+ 1*DEP8 - - spec-decoding: "none" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - - isl: 8192 - osl: 1024 - search-space: - # non-MTP configurations - # 1P1D pure TP8 - - spec-decoding: "none" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP8 - - spec-decoding: "none" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP8 - - spec-decoding: "none" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP4 - - spec-decoding: "none" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 4*DEP4 + 1*DEP8 - - spec-decoding: "none" - conc-list: [ 1024, 2048, 4096 ] - prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=4" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" + # 1*DEP4+ 1*DEP8 + - spec-decoding: "none" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + - isl: 8192 + osl: 1024 + search-space: + # non-MTP configurations + # 1P1D pure TP8 + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP4 + - spec-decoding: "none" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 4*DEP4 + 1*DEP8 + - spec-decoding: "none" + conc-list: [ 1024, 2048, 4096 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" dsr1-fp4-mi355x-sglang-disagg-mtp: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 @@ -1265,203 +1418,299 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: framework: sglang-disagg multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # MTP configurations - # 1P1D TP8 - - spec-decoding: "mtp" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - # 1P2D TP4 - - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - # 1*DEP4+ 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations - # 1P1D pure TP8 - - spec-decoding: "mtp" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=3" - - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - # 1P2D TP4 - - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - # 4*DEP4 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 1024, 2048, 4096 ] - prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=4" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations + # 1P1D TP8 + - spec-decoding: "mtp" + conc-list: [ 1, 2, 4, 8 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=3" + + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # 1P2D TP4 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # 1*DEP4+ 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + + + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations + # 1P1D pure TP8 + - spec-decoding: "mtp" + conc-list: [ 1, 2, 4, 8 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + + + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=3" + + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # 1P2D TP4 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # 4*DEP4 + 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 1024, 2048, 4096 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + +dsv4-fp8-mi355x-sglang: + image: rocm/sgl-dev:deepseek-v4-mi35x + model: sgl-project/DeepSeek-V4-Pro-FP8 + model-prefix: dsv4 + runner: mi355x + precision: fp8 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +# DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the +# amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the +# image tag, so bumping sglang is just an image tag bump here. Sweeps +# DP-attention on/off and EP=8. +dsv4-fp4-mi355x-sglang: + image: rocm/sgl-dev:rocm720-mi35x-a8410de-20260502-DSv4 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x + precision: fp4 + framework: sglang + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, dp-attn: true, conc-start: 16, conc-end: 256 } + - { tp: 8, dp-attn: false, conc-start: 1 , conc-end: 16 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, dp-attn: true, conc-start: 16, conc-end: 128 } + - { tp: 8, dp-attn: false, conc-start: 1, conc-end: 16 } + +# vLLM with AITER MLA decode for DSv4 on MI355X (vllm-project/vllm#40889, +# stacked on #40871). Uses the ATOM MI355X image (ROCm 7.2.2, aiter with +# MLA decode, MI355X GPU detection); vLLM is rebuilt from the PR branch +# at runtime by benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh at a +# pinned SHA. Once both PRs merge into a release, switch to a vLLM ROCm +# MI355X image and remove the build step. +dsv4-fp8-mi355x-vllm: + image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x + precision: fp8 + framework: vllm + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 1, conc-end: 1 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 1, conc-end: 1 } + +# Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650). +# PR1 of the ATOM DSv4 series still uses torch sparse-attention fallbacks +# that OOM once warmup/prefill batches multiple requests; keep CONC=1 until +# the AITER sparse-attention kernel / multi-request path lands upstream. +# --enforce-eager and ATOM_USE_TRITON_MOE=1 are required on gfx950. Image is +# the standard atom0.1.2.post MI355X base (matching qwen3.5-fp8-mi355x-atom); +# the DSv4 PR is overlaid at runtime by dsv4_fp4_mi355x_atom.sh at a pinned SHA. +dsv4-fp4-mi355x-atom: + image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x + precision: fp4 + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } From 512552dd0775361f6a75eb082dc1d790ea7108bf Mon Sep 17 00:00:00 2001 From: haic0 Date: Tue, 5 May 2026 22:21:07 -0500 Subject: [PATCH 3/4] Update kimi k2.5 mtp support by using modelrunnerV2 and vllm v0.20.1 Signed-off-by: haic0 --- .github/configs/amd-master.yaml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index d5ee5d664..7c54c5009 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -533,7 +533,8 @@ kimik2.5-int4-mi355x-vllm-mtp: precision: int4 framework: vllm multinode: false - seq-len-configs: + scenarios: + fixed-seq-len: - isl: 1024 osl: 1024 search-space: @@ -551,7 +552,8 @@ kimik2.5-int4-mi300x-vllm-mtp: precision: int4 framework: vllm multinode: false - seq-len-configs: + scenarios: + fixed-seq-len: - isl: 1024 osl: 1024 search-space: @@ -569,7 +571,8 @@ kimik2.5-int4-mi325x-vllm-mtp: precision: int4 framework: vllm multinode: false - seq-len-configs: + scenarios: + fixed-seq-len: - isl: 1024 osl: 1024 search-space: From 48b30857db1f07f926e73ef636e7966d43aedc13 Mon Sep 17 00:00:00 2001 From: haic0 Date: Tue, 5 May 2026 22:39:05 -0500 Subject: [PATCH 4/4] Update kimi k2.5 mtp support by using modelrunnerV2 and vllm v0.20.1 Signed-off-by: haic0 --- .github/configs/amd-master.yaml | 50 ++++++++++++++++----------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 7c54c5009..7c666fe6d 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -535,14 +535,14 @@ kimik2.5-int4-mi355x-vllm-mtp: multinode: false scenarios: fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } kimik2.5-int4-mi300x-vllm-mtp: image: vllm/vllm-openai-rocm:v0.20.1 @@ -554,14 +554,14 @@ kimik2.5-int4-mi300x-vllm-mtp: multinode: false scenarios: fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } kimik2.5-int4-mi325x-vllm-mtp: image: vllm/vllm-openai-rocm:v0.20.1 @@ -573,15 +573,15 @@ kimik2.5-int4-mi325x-vllm-mtp: multinode: false scenarios: fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } - + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + kimik2.5-fp4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.18.0 model: amd/Kimi-K2.5-MXFP4