From f67aea9a8cbbb145904ad87b2f06e5890d7dd886 Mon Sep 17 00:00:00 2001 From: Andy Luo Date: Sat, 2 May 2026 12:09:51 -0700 Subject: [PATCH 1/4] feat: add vLLM + LMCache CPU offloading for MiniMax-M2.5 agentic benchmark on AMD GPUs Add `offloading: lmcache` as a new KV cache offloading option for the agentic trace replay benchmark on MI300X/MI325X/MI355X. LMCache offloads cold KV cache pages to CPU DRAM via LMCacheConnectorV1, enabling larger working sets than HBM-only prefix caching. - Add benchmark scripts for MiniMax-M2.5 FP8 on MI300X/MI325X/MI355X - Add install_lmcache_hip() helper (PyPI wheel is CUDA-only, must build from source) - Extend offloading Literal to include "lmcache" in validation - Add agentic-coding scenarios to AMD master config with lmcache/none sweeps - Add 21 new tests for agentic + LMCache validation - Bump MiniMax vLLM images to v0.19.1 Smoke-tested on MI300X (TP=2) and MI355X (TP=4) with MiniMax-M2.5. Co-Authored-By: Claude Opus 4.6 --- .github/configs/amd-master.yaml | 27 +- benchmarks/benchmark_lib.sh | 13 + .../agentic/minimaxm2.5_fp8_mi300x.sh | 122 ++++++++ .../agentic/minimaxm2.5_fp8_mi325x.sh | 122 ++++++++ .../agentic/minimaxm2.5_fp8_mi355x.sh | 122 ++++++++ utils/matrix_logic/test_validation.py | 280 ++++++++++++++++++ utils/matrix_logic/validation.py | 4 +- 7 files changed, 685 insertions(+), 5 deletions(-) create mode 100755 benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh create mode 100755 benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh create mode 100755 benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 949a8a106..41894d75d 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -568,7 +568,7 @@ kimik2.5-fp4-mi355x-atom: - { tp: 4, conc-start: 4, conc-end: 128 } minimaxm2.5-fp8-mi355x-vllm: - image: vllm/vllm-openai-rocm:v0.19.0 + image: vllm/vllm-openai-rocm:v0.19.1 model:
MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi355x @@ -589,6 +589,13 @@ minimaxm2.5-fp8-mi355x-vllm: - { tp: 2, ep: 2, conc-start: 2, conc-end: 256 } - { tp: 4, ep: 4, conc-start: 4, conc-end: 512 } - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 } + agentic-coding: + - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [2, 4, 8, 16, 32] } + - { tp: 4, offloading: lmcache, conc-list: [2, 4, 8, 16, 32] } + - { tp: 8, offloading: none, conc-list: [2, 4, 8, 16, 32] } + - { tp: 8, offloading: lmcache, conc-list: [2, 4, 8, 16, 32] } minimaxm2.5-fp8-mi355x-atom: image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 @@ -662,7 +669,7 @@ minimaxm2.5-fp4-mi355x-vllm: - { tp: 4, conc-start: 4, conc-end: 64 } minimaxm2.5-fp8-mi300x-vllm: - image: vllm/vllm-openai-rocm:v0.16.0 + image: vllm/vllm-openai-rocm:v0.19.1 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi300x @@ -681,9 +688,16 @@ minimaxm2.5-fp8-mi300x-vllm: search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } + agentic-coding: + - duration: 1800 + search-space: + - { tp: 2, offloading: none, conc-list: [2, 4, 8, 16] } + - { tp: 2, offloading: lmcache, conc-list: [2, 4, 8, 16] } + - { tp: 8, offloading: none, conc-list: [2, 4, 8, 16, 32] } + - { tp: 8, offloading: lmcache, conc-list: [2, 4, 8, 16, 32] } minimaxm2.5-fp8-mi325x-vllm: - image: vllm/vllm-openai-rocm:v0.18.0 + image: vllm/vllm-openai-rocm:v0.19.1 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi325x @@ -702,6 +716,13 @@ minimaxm2.5-fp8-mi325x-vllm: search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 } + agentic-coding: + - duration: 1800 + search-space: + - { tp: 2, offloading: none, conc-list: [2, 4, 8, 16] } + - { tp: 2, offloading: lmcache, conc-list: [2, 4, 8, 16] } + - { tp: 8, offloading: none, conc-list: [2, 4, 8, 16, 32] } + - { tp: 8, offloading: lmcache, conc-list: [2, 4, 8, 16, 32] 
} gptoss-fp4-mi300x-vllm: image: vllm/vllm-openai-rocm:v0.17.0 diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 4c0c8642e..75c72316f 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -902,6 +902,19 @@ resolve_trace_source() { hf download --repo-type dataset "$dataset" } +install_lmcache_hip() { + # LMCache PyPI wheel ships CUDA-only c_ops.so; must build from source for ROCm. + # `pip install lmcache` ignores BUILD_WITH_HIP and installs the pre-built CUDA wheel. + # We must clone and build with --no-build-isolation to get the HIP c_ops.so. + local lmcache_dir + lmcache_dir="$(mktemp -d)/LMCache" + echo "Building LMCache from source with HIP support..." + git clone --depth 1 https://github.com/LMCache/LMCache.git "$lmcache_dir" + SETUPTOOLS_SCM_PRETEND_VERSION=0.4.4 BUILD_WITH_HIP=1 \ + agentic_pip_install -e "$lmcache_dir" --no-build-isolation + echo "LMCache HIP build complete." +} + install_agentic_deps() { agentic_pip_install --quiet urllib3 requests 2>/dev/null || true agentic_pip_install -q -r "$AGENTIC_DIR/requirements.txt" diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh new file mode 100755 index 000000000..9512c5a63 --- /dev/null +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh @@ -0,0 +1,122 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for MiniMax-M2.5 FP8 on MI300X using vLLM. +# Supports LMCache CPU DRAM offloading for KV cache. +# +# Required env vars: +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR + +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +MAX_DELAY=${MAX_DELAY:-60} +ADVANCE_MIN=${ADVANCE_MIN:-0.0} +ADVANCE_MAX=${ADVANCE_MAX:-0.7} +# Agentic matrix entries don't set max-model-len, so the workflow passes 0. 
+# ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly. +if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then + MAX_MODEL_LEN=131072 +fi + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +rocm-smi + +# If the machine runs a MEC FW older than 177, RCCL cannot reclaim some memory. +# See https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates +version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` +if [[ "$version" == "" || $version -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +# Ray compatibility in vLLM 0.14+ needs HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES +if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + +export AMDGCN_USE_BUFFER_OPS=0 +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +export PYTHONNOUSERSITE=1 + +# ---- Resolve traces and install deps ---------------------------------------- +resolve_trace_source +install_agentic_deps + +# ---- Server config ---------------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +OFFLOAD_ARGS="" +PREFIX_CACHE_FLAG="--no-enable-prefix-caching" + +case "$OFFLOADING" in + none) + ;; + cpu) + OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" + ;; + lmcache) + # LMCache CPU DRAM offloading via LMCacheConnectorV1. + # Critical: PYTHONHASHSEED=0 is mandatory for cache key consistency + # across TP workers. Without it, hit rate is 0%. + install_lmcache_hip + export PYTHONHASHSEED=0 + export LMCACHE_LOCAL_CPU=true + export LMCACHE_CHUNK_SIZE=256 + # LMCache reuses vLLM's prefix cache hash function, so prefix caching + # must be enabled (unlike native CPU offloading). 
+ PREFIX_CACHE_FLAG="--enable-prefix-caching" + OFFLOAD_ARGS="--kv-transfer-config {\"kv_connector\":\"LMCacheConnectorV1\",\"kv_role\":\"kv_both\"}" + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache)" >&2 + exit 1 + ;; +esac + +echo "Starting vllm server..." + +vllm serve $MODEL \ +--host 0.0.0.0 \ +--port $PORT \ +--trust-remote-code \ +--tool-call-parser minimax_m2 \ +--reasoning-parser minimax_m2 \ +--enable-auto-tool-choice \ +--attention-backend ROCM_AITER_UNIFIED_ATTN \ +--tensor-parallel-size=$TP \ +--gpu-memory-utilization 0.85 \ +--max-model-len $MAX_MODEL_LEN \ +--max-num-seqs $CONC \ +--block-size=64 \ +--kv-cache-dtype fp8 \ +$PREFIX_CACHE_FLAG \ +$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# ---- Run benchmark ---------------------------------------------------------- +build_replay_cmd "$RESULT_DIR" + +echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true +set +x + +write_agentic_result_json "$RESULT_DIR" + +# ---- Post-processing -------------------------------------------------------- +python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh new file mode 100755 index 000000000..8e2306721 --- /dev/null +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh @@ -0,0 +1,122 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for MiniMax-M2.5 FP8 on MI325X using vLLM. +# Supports LMCache CPU DRAM offloading for KV cache. 
+# +# Required env vars: +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR + +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +MAX_DELAY=${MAX_DELAY:-60} +ADVANCE_MIN=${ADVANCE_MIN:-0.0} +ADVANCE_MAX=${ADVANCE_MAX:-0.7} +# Agentic matrix entries don't set max-model-len, so the workflow passes 0. +# ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly. +if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then + MAX_MODEL_LEN=131072 +fi + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +rocm-smi + +# If the machine runs a MEC FW older than 177, RCCL cannot reclaim some memory. +# See https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates +version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` +if [[ "$version" == "" || $version -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +# Ray compatibility in vLLM 0.14+ needs HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES +if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + +export AMDGCN_USE_BUFFER_OPS=0 +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +export PYTHONNOUSERSITE=1 + +# ---- Resolve traces and install deps ---------------------------------------- +resolve_trace_source +install_agentic_deps + +# ---- Server config ---------------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +OFFLOAD_ARGS="" +PREFIX_CACHE_FLAG="--no-enable-prefix-caching" + +case "$OFFLOADING" in + none) + ;; + cpu) + OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" + ;; + lmcache) + # LMCache CPU DRAM offloading 
via LMCacheConnectorV1. + # Critical: PYTHONHASHSEED=0 is mandatory for cache key consistency + # across TP workers. Without it, hit rate is 0%. + install_lmcache_hip + export PYTHONHASHSEED=0 + export LMCACHE_LOCAL_CPU=true + export LMCACHE_CHUNK_SIZE=256 + # LMCache reuses vLLM's prefix cache hash function, so prefix caching + # must be enabled (unlike native CPU offloading). + PREFIX_CACHE_FLAG="--enable-prefix-caching" + OFFLOAD_ARGS="--kv-transfer-config {\"kv_connector\":\"LMCacheConnectorV1\",\"kv_role\":\"kv_both\"}" + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache)" >&2 + exit 1 + ;; +esac + +echo "Starting vllm server..." + +vllm serve $MODEL \ +--host 0.0.0.0 \ +--port $PORT \ +--trust-remote-code \ +--tool-call-parser minimax_m2 \ +--reasoning-parser minimax_m2 \ +--enable-auto-tool-choice \ +--attention-backend ROCM_AITER_UNIFIED_ATTN \ +--tensor-parallel-size=$TP \ +--gpu-memory-utilization 0.85 \ +--max-model-len $MAX_MODEL_LEN \ +--max-num-seqs $CONC \ +--block-size=64 \ +--kv-cache-dtype fp8 \ +$PREFIX_CACHE_FLAG \ +$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! 
+echo "Server PID: $SERVER_PID" + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# ---- Run benchmark ---------------------------------------------------------- +build_replay_cmd "$RESULT_DIR" + +echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true +set +x + +write_agentic_result_json "$RESULT_DIR" + +# ---- Post-processing -------------------------------------------------------- +python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh new file mode 100755 index 000000000..9304ae5c9 --- /dev/null +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh @@ -0,0 +1,122 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for MiniMax-M2.5 FP8 on MI355X using vLLM. +# Supports LMCache CPU DRAM offloading for KV cache. +# +# Required env vars: +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR + +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +MAX_DELAY=${MAX_DELAY:-60} +ADVANCE_MIN=${ADVANCE_MIN:-0.0} +ADVANCE_MAX=${ADVANCE_MAX:-0.7} +# Agentic matrix entries don't set max-model-len, so the workflow passes 0. +# ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly. +if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then + MAX_MODEL_LEN=131072 +fi + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +rocm-smi + +# If the machine runs a MEC FW older than 177, RCCL cannot reclaim some memory. 
+# See https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates +version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` +if [[ "$version" == "" || $version -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +# Ray compatibility in vLLM 0.14+ needs HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES +if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + +export AMDGCN_USE_BUFFER_OPS=0 +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +export PYTHONNOUSERSITE=1 + +# ---- Resolve traces and install deps ---------------------------------------- +resolve_trace_source +install_agentic_deps + +# ---- Server config ---------------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +OFFLOAD_ARGS="" +PREFIX_CACHE_FLAG="--no-enable-prefix-caching" + +case "$OFFLOADING" in + none) + ;; + cpu) + OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" + ;; + lmcache) + # LMCache CPU DRAM offloading via LMCacheConnectorV1. + # Critical: PYTHONHASHSEED=0 is mandatory for cache key consistency + # across TP workers. Without it, hit rate is 0%. + install_lmcache_hip + export PYTHONHASHSEED=0 + export LMCACHE_LOCAL_CPU=true + export LMCACHE_CHUNK_SIZE=256 + # LMCache reuses vLLM's prefix cache hash function, so prefix caching + # must be enabled (unlike native CPU offloading). + PREFIX_CACHE_FLAG="--enable-prefix-caching" + OFFLOAD_ARGS="--kv-transfer-config {\"kv_connector\":\"LMCacheConnectorV1\",\"kv_role\":\"kv_both\"}" + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache)" >&2 + exit 1 + ;; +esac + +echo "Starting vllm server..." 
+ +vllm serve $MODEL \ +--host 0.0.0.0 \ +--port $PORT \ +--trust-remote-code \ +--tool-call-parser minimax_m2 \ +--reasoning-parser minimax_m2 \ +--enable-auto-tool-choice \ +--attention-backend ROCM_AITER_UNIFIED_ATTN \ +--tensor-parallel-size=$TP \ +--gpu-memory-utilization 0.85 \ +--max-model-len $MAX_MODEL_LEN \ +--max-num-seqs $CONC \ +--block-size=64 \ +--kv-cache-dtype fp8 \ +$PREFIX_CACHE_FLAG \ +$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# ---- Run benchmark ---------------------------------------------------------- +build_replay_cmd "$RESULT_DIR" + +echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true +set +x + +write_agentic_result_json "$RESULT_DIR" + +# ---- Post-processing -------------------------------------------------------- +python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true diff --git a/utils/matrix_logic/test_validation.py b/utils/matrix_logic/test_validation.py index 1274fd86a..284937815 100644 --- a/utils/matrix_logic/test_validation.py +++ b/utils/matrix_logic/test_validation.py @@ -11,7 +11,12 @@ MultiNodeSeqLenConfig, SingleNodeMasterConfigEntry, MultiNodeMasterConfigEntry, + SingleNodeAgenticMatrixEntry, + AgenticCodingSearchSpaceEntry, + AgenticCodingConfig, + SingleNodeScenarios, validate_matrix_entry, + validate_agentic_matrix_entry, validate_master_config, validate_runner_config, load_config_files, @@ -875,3 +880,278 @@ def test_validation_runs_by_default(self, tmp_path): with pytest.raises(ValueError) as exc_info: load_runner_file(str(runner_file)) assert "must be a list" in str(exc_info.value) + + +# ============================================================================= +# Test AgenticCodingSearchSpaceEntry +# 
============================================================================= + +class TestAgenticCodingSearchSpaceEntry: + """Tests for AgenticCodingSearchSpaceEntry model.""" + + def test_valid_with_offloading_none(self): + """Valid entry with offloading=none should pass.""" + entry = AgenticCodingSearchSpaceEntry(**{ + "tp": 8, + "offloading": "none", + "conc-list": [2, 4, 8, 16], + }) + assert entry.tp == 8 + assert entry.offloading == "none" + assert entry.conc_list == [2, 4, 8, 16] + + def test_valid_with_offloading_cpu(self): + """Valid entry with offloading=cpu should pass.""" + entry = AgenticCodingSearchSpaceEntry(**{ + "tp": 4, + "offloading": "cpu", + "conc-list": [4, 8], + }) + assert entry.offloading == "cpu" + + def test_valid_with_offloading_lmcache(self): + """Valid entry with offloading=lmcache should pass.""" + entry = AgenticCodingSearchSpaceEntry(**{ + "tp": 2, + "offloading": "lmcache", + "conc-list": [2, 4, 8, 16], + }) + assert entry.offloading == "lmcache" + assert entry.tp == 2 + + def test_valid_with_offloading_ssd(self): + """Valid entry with offloading=ssd should pass.""" + entry = AgenticCodingSearchSpaceEntry(**{ + "tp": 8, + "offloading": "ssd", + "conc-start": 4, + "conc-end": 32, + }) + assert entry.offloading == "ssd" + + def test_invalid_offloading_value(self): + """Invalid offloading value should fail.""" + with pytest.raises(Exception): + AgenticCodingSearchSpaceEntry(**{ + "tp": 8, + "offloading": "invalid", + "conc-list": [4], + }) + + def test_offloading_defaults_to_none(self): + """Offloading should default to none.""" + entry = AgenticCodingSearchSpaceEntry(**{ + "tp": 8, + "conc-list": [4, 8], + }) + assert entry.offloading == "none" + + def test_must_specify_tp_or_prefill_decode(self): + """Must specify either tp or both prefill and decode.""" + with pytest.raises(Exception) as exc_info: + AgenticCodingSearchSpaceEntry(**{ + "offloading": "lmcache", + "conc-list": [4], + }) + assert "must specify either tp" in 
str(exc_info.value).lower() + + def test_cannot_mix_tp_and_prefill(self): + """Cannot specify both tp and prefill/decode.""" + with pytest.raises(Exception): + AgenticCodingSearchSpaceEntry(**{ + "tp": 8, + "prefill": { + "num-worker": 1, "tp": 4, "ep": 4, "dp-attn": False, + }, + "decode": { + "num-worker": 1, "tp": 8, "ep": 8, "dp-attn": False, + }, + "conc-list": [4], + }) + + +# ============================================================================= +# Test SingleNodeAgenticMatrixEntry +# ============================================================================= + +class TestSingleNodeAgenticMatrixEntry: + """Tests for SingleNodeAgenticMatrixEntry model.""" + + @pytest.fixture + def valid_agentic_entry(self): + return { + "image": "vllm/vllm-openai-rocm:v0.19.1", + "model": "MiniMaxAI/MiniMax-M2.5", + "model-prefix": "minimaxm2.5", + "precision": "fp8", + "framework": "vllm", + "runner": "mi300x", + "tp": 2, + "ep": 1, + "dp-attn": False, + "conc": 8, + "offloading": "lmcache", + "duration": 1800, + "exp-name": "minimaxm2.5_tp2_conc8_offloadlmcache", + "scenario-type": "agentic-coding", + } + + def test_valid_lmcache_entry(self, valid_agentic_entry): + """Valid agentic entry with lmcache offloading should pass.""" + entry = SingleNodeAgenticMatrixEntry(**valid_agentic_entry) + assert entry.offloading == "lmcache" + assert entry.tp == 2 + assert entry.conc == 8 + assert entry.scenario_type == "agentic-coding" + + def test_valid_none_offloading(self, valid_agentic_entry): + """Valid agentic entry with no offloading should pass.""" + valid_agentic_entry["offloading"] = "none" + entry = SingleNodeAgenticMatrixEntry(**valid_agentic_entry) + assert entry.offloading == "none" + + def test_valid_cpu_offloading(self, valid_agentic_entry): + """Valid agentic entry with cpu offloading should pass.""" + valid_agentic_entry["offloading"] = "cpu" + entry = SingleNodeAgenticMatrixEntry(**valid_agentic_entry) + assert entry.offloading == "cpu" + + def 
test_invalid_offloading_rejected(self, valid_agentic_entry): + """Invalid offloading value should fail.""" + valid_agentic_entry["offloading"] = "gpu" + with pytest.raises(Exception): + SingleNodeAgenticMatrixEntry(**valid_agentic_entry) + + def test_missing_offloading_fails(self, valid_agentic_entry): + """Missing offloading field should fail (no default on matrix entry).""" + del valid_agentic_entry["offloading"] + with pytest.raises(Exception): + SingleNodeAgenticMatrixEntry(**valid_agentic_entry) + + def test_extra_field_forbidden(self, valid_agentic_entry): + """Extra fields should be rejected.""" + valid_agentic_entry["extra-field"] = "value" + with pytest.raises(Exception): + SingleNodeAgenticMatrixEntry(**valid_agentic_entry) + + def test_validate_agentic_matrix_entry_function(self, valid_agentic_entry): + """validate_agentic_matrix_entry should accept valid entry.""" + result = validate_agentic_matrix_entry(valid_agentic_entry) + assert result == valid_agentic_entry + + def test_validate_agentic_matrix_entry_invalid(self, valid_agentic_entry): + """validate_agentic_matrix_entry should reject invalid entry.""" + del valid_agentic_entry["tp"] + with pytest.raises(ValueError) as exc_info: + validate_agentic_matrix_entry(valid_agentic_entry) + assert "failed validation" in str(exc_info.value) + + +# ============================================================================= +# Test AgenticCodingConfig +# ============================================================================= + +class TestAgenticCodingConfig: + """Tests for AgenticCodingConfig model.""" + + def test_valid_with_lmcache_and_none(self): + """Config with both lmcache and none offloading entries should pass.""" + config = AgenticCodingConfig(**{ + "duration": 1800, + "search-space": [ + {"tp": 2, "offloading": "none", "conc-list": [2, 4, 8]}, + {"tp": 2, "offloading": "lmcache", "conc-list": [2, 4, 8]}, + ], + }) + assert config.duration == 1800 + assert len(config.search_space) == 2 + 
assert config.search_space[0].offloading == "none" + assert config.search_space[1].offloading == "lmcache" + + def test_duration_defaults_to_1800(self): + """Duration should default to 1800.""" + config = AgenticCodingConfig(**{ + "search-space": [ + {"tp": 8, "offloading": "none", "conc-list": [4]}, + ], + }) + assert config.duration == 1800 + + +# ============================================================================= +# Test Master Config with Agentic Scenarios +# ============================================================================= + +class TestMasterConfigWithAgentic: + """Tests for master config entries containing agentic-coding scenarios.""" + + def test_single_node_with_agentic_only(self): + """Single node config with only agentic-coding scenario should pass.""" + config = SingleNodeMasterConfigEntry(**{ + "image": "vllm/vllm-openai-rocm:v0.19.1", + "model": "MiniMaxAI/MiniMax-M2.5", + "model-prefix": "minimaxm2.5", + "precision": "fp8", + "framework": "vllm", + "runner": "mi300x", + "multinode": False, + "scenarios": { + "agentic-coding": [ + { + "duration": 1800, + "search-space": [ + {"tp": 2, "offloading": "lmcache", "conc-list": [2, 4, 8]}, + ], + } + ], + }, + }) + assert config.scenarios.agentic_coding is not None + assert len(config.scenarios.agentic_coding) == 1 + assert config.scenarios.agentic_coding[0].search_space[0].offloading == "lmcache" + + def test_single_node_with_both_scenarios(self): + """Single node config with both fixed-seq-len and agentic-coding should pass.""" + config = SingleNodeMasterConfigEntry(**{ + "image": "vllm/vllm-openai-rocm:v0.19.1", + "model": "MiniMaxAI/MiniMax-M2.5", + "model-prefix": "minimaxm2.5", + "precision": "fp8", + "framework": "vllm", + "runner": "mi300x", + "multinode": False, + "scenarios": { + "fixed-seq-len": [ + { + "isl": 1024, "osl": 1024, + "search-space": [{"tp": 2, "conc-start": 4, "conc-end": 64}], + } + ], + "agentic-coding": [ + { + "duration": 1800, + "search-space": [ + {"tp": 2, 
"offloading": "none", "conc-list": [2, 4, 8]}, + {"tp": 2, "offloading": "lmcache", "conc-list": [2, 4, 8]}, + ], + } + ], + }, + }) + assert config.scenarios.fixed_seq_len is not None + assert config.scenarios.agentic_coding is not None + + def test_scenarios_must_have_at_least_one(self): + """Scenarios must have at least one scenario type.""" + with pytest.raises(Exception) as exc_info: + SingleNodeMasterConfigEntry(**{ + "image": "test", + "model": "test", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "runner": "mi300x", + "multinode": False, + "scenarios": {}, + }) + assert "At least one scenario" in str(exc_info.value) diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index dd245aec7..195311d49 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -156,7 +156,7 @@ class SingleNodeAgenticMatrixEntry(BaseModel): ep: int dp_attn: bool = Field(alias=Fields.DP_ATTN.value) conc: int - offloading: Literal["none", "cpu", "ssd"] = Field(alias=Fields.OFFLOADING.value) + offloading: Literal["none", "cpu", "lmcache", "ssd"] = Field(alias=Fields.OFFLOADING.value) duration: int = Field(default=1800, alias=Fields.DURATION.value) exp_name: str = Field(alias=Fields.EXP_NAME.value) scenario_type: str = Field(alias=Fields.SCENARIO_TYPE.value) @@ -338,7 +338,7 @@ class AgenticCodingSearchSpaceEntry(BaseModel): default="none", alias=Fields.SPEC_DECODING.value) prefill: Optional[WorkerConfig] = None decode: Optional[WorkerConfig] = None - offloading: Literal["none", "cpu", "ssd"] = Field(default="none", alias=Fields.OFFLOADING.value) + offloading: Literal["none", "cpu", "lmcache", "ssd"] = Field(default="none", alias=Fields.OFFLOADING.value) conc_start: Optional[int] = Field(default=None, alias=Fields.CONC_START.value) conc_end: Optional[int] = Field(default=None, alias=Fields.CONC_END.value) conc_list: Optional[List[int]] = Field(default=None, alias=Fields.CONC_LIST.value) From 
603fbb874016ece18f22f3a293e9929b901f5345 Mon Sep 17 00:00:00 2001 From: Andy Luo Date: Sat, 2 May 2026 12:33:40 -0700 Subject: [PATCH 2/4] =?UTF-8?q?fix:=20address=20PR=20review=20=E2=80=94=20?= =?UTF-8?q?rename=20lmcache=20to=20lmcache=5Fcpu,=20clarify=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename offloading value from "lmcache" to "lmcache_cpu" to distinguish from potential future LMCache backends (NVMe, WEKA, etc.) - Clarify test_cannot_mix_tp_and_prefill docstring: this tests existing validation behavior from PR #1201, not a new constraint. Note that a future PR may relax this to allow different prefill/decode TP values. Co-Authored-By: Claude Opus 4.6 --- .github/configs/amd-master.yaml | 12 ++++---- .../agentic/minimaxm2.5_fp8_mi300x.sh | 4 +-- .../agentic/minimaxm2.5_fp8_mi325x.sh | 4 +-- .../agentic/minimaxm2.5_fp8_mi355x.sh | 4 +-- utils/matrix_logic/test_validation.py | 28 ++++++++++--------- utils/matrix_logic/validation.py | 4 +-- 6 files changed, 29 insertions(+), 27 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 41894d75d..296e1c052 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -593,9 +593,9 @@ minimaxm2.5-fp8-mi355x-vllm: - duration: 1800 search-space: - { tp: 4, offloading: none, conc-list: [2, 4, 8, 16, 32] } - - { tp: 4, offloading: lmcache, conc-list: [2, 4, 8, 16, 32] } + - { tp: 4, offloading: lmcache_cpu, conc-list: [2, 4, 8, 16, 32] } - { tp: 8, offloading: none, conc-list: [2, 4, 8, 16, 32] } - - { tp: 8, offloading: lmcache, conc-list: [2, 4, 8, 16, 32] } + - { tp: 8, offloading: lmcache_cpu, conc-list: [2, 4, 8, 16, 32] } minimaxm2.5-fp8-mi355x-atom: image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 @@ -692,9 +692,9 @@ minimaxm2.5-fp8-mi300x-vllm: - duration: 1800 search-space: - { tp: 2, offloading: none, conc-list: [2, 4, 8, 16] } - - { tp: 2, offloading: lmcache, conc-list: 
[2, 4, 8, 16] } + - { tp: 2, offloading: lmcache_cpu, conc-list: [2, 4, 8, 16] } - { tp: 8, offloading: none, conc-list: [2, 4, 8, 16, 32] } - - { tp: 8, offloading: lmcache, conc-list: [2, 4, 8, 16, 32] } + - { tp: 8, offloading: lmcache_cpu, conc-list: [2, 4, 8, 16, 32] } minimaxm2.5-fp8-mi325x-vllm: image: vllm/vllm-openai-rocm:v0.19.1 @@ -720,9 +720,9 @@ minimaxm2.5-fp8-mi325x-vllm: - duration: 1800 search-space: - { tp: 2, offloading: none, conc-list: [2, 4, 8, 16] } - - { tp: 2, offloading: lmcache, conc-list: [2, 4, 8, 16] } + - { tp: 2, offloading: lmcache_cpu, conc-list: [2, 4, 8, 16] } - { tp: 8, offloading: none, conc-list: [2, 4, 8, 16, 32] } - - { tp: 8, offloading: lmcache, conc-list: [2, 4, 8, 16, 32] } + - { tp: 8, offloading: lmcache_cpu, conc-list: [2, 4, 8, 16, 32] } gptoss-fp4-mi300x-vllm: image: vllm/vllm-openai-rocm:v0.17.0 diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh index 9512c5a63..649a29d77 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh @@ -64,7 +64,7 @@ case "$OFFLOADING" in cpu) OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" ;; - lmcache) + lmcache_cpu) # LMCache CPU DRAM offloading via LMCacheConnectorV1. # Critical: PYTHONHASHSEED=0 is mandatory for cache key consistency # across TP workers. Without it, hit rate is 0%. 
@@ -78,7 +78,7 @@ case "$OFFLOADING" in OFFLOAD_ARGS="--kv-transfer-config {\"kv_connector\":\"LMCacheConnectorV1\",\"kv_role\":\"kv_both\"}" ;; *) - echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache)" >&2 + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache_cpu)" >&2 exit 1 ;; esac diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh index 8e2306721..8724f1afc 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh @@ -64,7 +64,7 @@ case "$OFFLOADING" in cpu) OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" ;; - lmcache) + lmcache_cpu) # LMCache CPU DRAM offloading via LMCacheConnectorV1. # Critical: PYTHONHASHSEED=0 is mandatory for cache key consistency # across TP workers. Without it, hit rate is 0%. @@ -78,7 +78,7 @@ case "$OFFLOADING" in OFFLOAD_ARGS="--kv-transfer-config {\"kv_connector\":\"LMCacheConnectorV1\",\"kv_role\":\"kv_both\"}" ;; *) - echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache)" >&2 + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache_cpu)" >&2 exit 1 ;; esac diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh index 9304ae5c9..eee3a7f0f 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh @@ -64,7 +64,7 @@ case "$OFFLOADING" in cpu) OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" ;; - lmcache) + lmcache_cpu) # LMCache CPU DRAM offloading via LMCacheConnectorV1. 
# Critical: PYTHONHASHSEED=0 is mandatory for cache key consistency # across TP workers. Without it, hit rate is 0%. @@ -78,7 +78,7 @@ case "$OFFLOADING" in OFFLOAD_ARGS="--kv-transfer-config {\"kv_connector\":\"LMCacheConnectorV1\",\"kv_role\":\"kv_both\"}" ;; *) - echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache)" >&2 + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache_cpu)" >&2 exit 1 ;; esac diff --git a/utils/matrix_logic/test_validation.py b/utils/matrix_logic/test_validation.py index 284937815..d7bbe9b00 100644 --- a/utils/matrix_logic/test_validation.py +++ b/utils/matrix_logic/test_validation.py @@ -910,13 +910,13 @@ def test_valid_with_offloading_cpu(self): assert entry.offloading == "cpu" def test_valid_with_offloading_lmcache(self): - """Valid entry with offloading=lmcache should pass.""" + """Valid entry with offloading=lmcache_cpu should pass.""" entry = AgenticCodingSearchSpaceEntry(**{ "tp": 2, - "offloading": "lmcache", + "offloading": "lmcache_cpu", "conc-list": [2, 4, 8, 16], }) - assert entry.offloading == "lmcache" + assert entry.offloading == "lmcache_cpu" assert entry.tp == 2 def test_valid_with_offloading_ssd(self): @@ -950,13 +950,15 @@ def test_must_specify_tp_or_prefill_decode(self): """Must specify either tp or both prefill and decode.""" with pytest.raises(Exception) as exc_info: AgenticCodingSearchSpaceEntry(**{ - "offloading": "lmcache", + "offloading": "lmcache_cpu", "conc-list": [4], }) assert "must specify either tp" in str(exc_info.value).lower() def test_cannot_mix_tp_and_prefill(self): - """Cannot specify both tp and prefill/decode.""" + """Current validation rejects both tp and prefill/decode in the same entry. + Note: this tests existing behavior from PR #1201. 
A future PR may relax + this constraint to allow different prefill/decode TP values.""" with pytest.raises(Exception): AgenticCodingSearchSpaceEntry(**{ "tp": 8, @@ -990,16 +992,16 @@ def valid_agentic_entry(self): "ep": 1, "dp-attn": False, "conc": 8, - "offloading": "lmcache", + "offloading": "lmcache_cpu", "duration": 1800, - "exp-name": "minimaxm2.5_tp2_conc8_offloadlmcache", + "exp-name": "minimaxm2.5_tp2_conc8_offloadlmcache_cpu", "scenario-type": "agentic-coding", } def test_valid_lmcache_entry(self, valid_agentic_entry): """Valid agentic entry with lmcache offloading should pass.""" entry = SingleNodeAgenticMatrixEntry(**valid_agentic_entry) - assert entry.offloading == "lmcache" + assert entry.offloading == "lmcache_cpu" assert entry.tp == 2 assert entry.conc == 8 assert entry.scenario_type == "agentic-coding" @@ -1060,13 +1062,13 @@ def test_valid_with_lmcache_and_none(self): "duration": 1800, "search-space": [ {"tp": 2, "offloading": "none", "conc-list": [2, 4, 8]}, - {"tp": 2, "offloading": "lmcache", "conc-list": [2, 4, 8]}, + {"tp": 2, "offloading": "lmcache_cpu", "conc-list": [2, 4, 8]}, ], }) assert config.duration == 1800 assert len(config.search_space) == 2 assert config.search_space[0].offloading == "none" - assert config.search_space[1].offloading == "lmcache" + assert config.search_space[1].offloading == "lmcache_cpu" def test_duration_defaults_to_1800(self): """Duration should default to 1800.""" @@ -1100,7 +1102,7 @@ def test_single_node_with_agentic_only(self): { "duration": 1800, "search-space": [ - {"tp": 2, "offloading": "lmcache", "conc-list": [2, 4, 8]}, + {"tp": 2, "offloading": "lmcache_cpu", "conc-list": [2, 4, 8]}, ], } ], @@ -1108,7 +1110,7 @@ def test_single_node_with_agentic_only(self): }) assert config.scenarios.agentic_coding is not None assert len(config.scenarios.agentic_coding) == 1 - assert config.scenarios.agentic_coding[0].search_space[0].offloading == "lmcache" + assert 
config.scenarios.agentic_coding[0].search_space[0].offloading == "lmcache_cpu" def test_single_node_with_both_scenarios(self): """Single node config with both fixed-seq-len and agentic-coding should pass.""" @@ -1132,7 +1134,7 @@ def test_single_node_with_both_scenarios(self): "duration": 1800, "search-space": [ {"tp": 2, "offloading": "none", "conc-list": [2, 4, 8]}, - {"tp": 2, "offloading": "lmcache", "conc-list": [2, 4, 8]}, + {"tp": 2, "offloading": "lmcache_cpu", "conc-list": [2, 4, 8]}, ], } ], diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index 195311d49..ed925a432 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -156,7 +156,7 @@ class SingleNodeAgenticMatrixEntry(BaseModel): ep: int dp_attn: bool = Field(alias=Fields.DP_ATTN.value) conc: int - offloading: Literal["none", "cpu", "lmcache", "ssd"] = Field(alias=Fields.OFFLOADING.value) + offloading: Literal["none", "cpu", "lmcache_cpu", "ssd"] = Field(alias=Fields.OFFLOADING.value) duration: int = Field(default=1800, alias=Fields.DURATION.value) exp_name: str = Field(alias=Fields.EXP_NAME.value) scenario_type: str = Field(alias=Fields.SCENARIO_TYPE.value) @@ -338,7 +338,7 @@ class AgenticCodingSearchSpaceEntry(BaseModel): default="none", alias=Fields.SPEC_DECODING.value) prefill: Optional[WorkerConfig] = None decode: Optional[WorkerConfig] = None - offloading: Literal["none", "cpu", "lmcache", "ssd"] = Field(default="none", alias=Fields.OFFLOADING.value) + offloading: Literal["none", "cpu", "lmcache_cpu", "ssd"] = Field(default="none", alias=Fields.OFFLOADING.value) conc_start: Optional[int] = Field(default=None, alias=Fields.CONC_START.value) conc_end: Optional[int] = Field(default=None, alias=Fields.CONC_END.value) conc_list: Optional[List[int]] = Field(default=None, alias=Fields.CONC_LIST.value) From e7ab020c947845df1e28c3aae570e1d3c97d6042 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> 
Date: Sat, 2 May 2026 21:03:25 +0000 Subject: [PATCH 3/4] fix: relax validate_topology_fields to allow disaggregated prefill/decode TP vLLM and SGLang support different prefill vs decode TP on most models. The previous validation rejected entries specifying both tp and prefill/decode configs. Now tp can coexist with prefill/decode for disaggregated serving, while still requiring both prefill and decode if either is specified. Co-authored-by: functionstackx --- utils/matrix_logic/test_validation.py | 36 ++++++++++++++++++++++----- utils/matrix_logic/validation.py | 16 ++++++------ 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/utils/matrix_logic/test_validation.py b/utils/matrix_logic/test_validation.py index d7bbe9b00..ab3c4b51e 100644 --- a/utils/matrix_logic/test_validation.py +++ b/utils/matrix_logic/test_validation.py @@ -953,23 +953,47 @@ def test_must_specify_tp_or_prefill_decode(self): "offloading": "lmcache_cpu", "conc-list": [4], }) - assert "must specify either tp" in str(exc_info.value).lower() + assert "must specify at least tp" in str(exc_info.value).lower() - def test_cannot_mix_tp_and_prefill(self): - """Current validation rejects both tp and prefill/decode in the same entry. - Note: this tests existing behavior from PR #1201. 
A future PR may relax - this constraint to allow different prefill/decode TP values.""" - with pytest.raises(Exception): + def test_tp_with_prefill_decode_allowed(self): + """tp can coexist with prefill/decode for disaggregated serving.""" + entry = AgenticCodingSearchSpaceEntry(**{ + "tp": 8, + "prefill": { + "num-worker": 1, "tp": 4, "ep": 4, "dp-attn": False, + }, + "decode": { + "num-worker": 1, "tp": 8, "ep": 8, "dp-attn": False, + }, + "conc-list": [4], + }) + assert entry.tp == 8 + assert entry.prefill.tp == 4 + assert entry.decode.tp == 8 + + def test_prefill_without_decode_rejected(self): + """Specifying only prefill without decode should fail.""" + with pytest.raises(Exception) as exc_info: AgenticCodingSearchSpaceEntry(**{ "tp": 8, "prefill": { "num-worker": 1, "tp": 4, "ep": 4, "dp-attn": False, }, + "conc-list": [4], + }) + assert "both prefill and decode" in str(exc_info.value).lower() + + def test_decode_without_prefill_rejected(self): + """Specifying only decode without prefill should fail.""" + with pytest.raises(Exception) as exc_info: + AgenticCodingSearchSpaceEntry(**{ + "tp": 8, "decode": { "num-worker": 1, "tp": 8, "ep": 8, "dp-attn": False, }, "conc-list": [4], }) + assert "both prefill and decode" in str(exc_info.value).lower() # ============================================================================= diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index ed925a432..385e7c75b 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -349,15 +349,13 @@ def validate_conc_fields(self): @model_validator(mode='after') def validate_topology_fields(self): - has_single_node = self.tp is not None - has_any_multinode_field = self.prefill is not None or self.decode is not None - has_complete_multinode = self.prefill is not None and self.decode is not None - if has_single_node: - valid = not has_any_multinode_field - else: - valid = has_complete_multinode - if not valid: - raise 
ValueError("Agentic search-space entries must specify either tp or both prefill and decode") + has_tp = self.tp is not None + has_prefill = self.prefill is not None + has_decode = self.decode is not None + if has_prefill != has_decode: + raise ValueError("Agentic search-space entries must specify both prefill and decode, not just one") + if not has_tp and not has_prefill: + raise ValueError("Agentic search-space entries must specify at least tp or both prefill and decode") return self From 696a8040228f78c3190ab1293ef7c0dc42fcd82e Mon Sep 17 00:00:00 2001 From: Andy Luo Date: Mon, 4 May 2026 11:27:03 -0700 Subject: [PATCH 4/4] fix: size LMCache CPU pool per-worker to avoid OOM LMCACHE_MAX_LOCAL_CPU_SIZE is per TP worker, not total. Using TOTAL_CPU_DRAM_GB directly would allocate TOTAL_CPU_DRAM_GB * TP_SIZE of pinned memory (e.g. 600 GB * 8 = 4.8 TB at TP=8), causing OOM. Divide by TP so the aggregate matches the intended total budget. Co-Authored-By: Claude Opus 4.6 --- benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh | 1 + benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh | 1 + benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh | 1 + 3 files changed, 3 insertions(+) diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh index 649a29d77..47d17137a 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh @@ -72,6 +72,7 @@ case "$OFFLOADING" in export PYTHONHASHSEED=0 export LMCACHE_LOCAL_CPU=true export LMCACHE_CHUNK_SIZE=256 + export LMCACHE_MAX_LOCAL_CPU_SIZE=$((TOTAL_CPU_DRAM_GB / TP)) # LMCache reuses vLLM's prefix cache hash function, so prefix caching # must be enabled (unlike native CPU offloading). 
PREFIX_CACHE_FLAG="--enable-prefix-caching" diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh index 8724f1afc..6090a4408 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh @@ -72,6 +72,7 @@ case "$OFFLOADING" in export PYTHONHASHSEED=0 export LMCACHE_LOCAL_CPU=true export LMCACHE_CHUNK_SIZE=256 + export LMCACHE_MAX_LOCAL_CPU_SIZE=$((TOTAL_CPU_DRAM_GB / TP)) # LMCache reuses vLLM's prefix cache hash function, so prefix caching # must be enabled (unlike native CPU offloading). PREFIX_CACHE_FLAG="--enable-prefix-caching" diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh index eee3a7f0f..dea4dec32 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh @@ -72,6 +72,7 @@ case "$OFFLOADING" in export PYTHONHASHSEED=0 export LMCACHE_LOCAL_CPU=true export LMCACHE_CHUNK_SIZE=256 + export LMCACHE_MAX_LOCAL_CPU_SIZE=$((TOTAL_CPU_DRAM_GB / TP)) # LMCache reuses vLLM's prefix cache hash function, so prefix caching # must be enabled (unlike native CPU offloading). PREFIX_CACHE_FLAG="--enable-prefix-caching"