From f67aea9a8cbbb145904ad87b2f06e5890d7dd886 Mon Sep 17 00:00:00 2001 From: Andy Luo Date: Sat, 2 May 2026 12:09:51 -0700 Subject: [PATCH 1/4] feat: add vLLM + LMCache CPU offloading for MiniMax-M2.5 agentic benchmark on AMD GPUs Add `offloading: lmcache` as a new KV cache offloading option for the agentic trace replay benchmark on MI300X/MI325X/MI355X. LMCache offloads cold KV cache pages to CPU DRAM via LMCacheConnectorV1, enabling larger working sets than HBM-only prefix caching. - Add benchmark scripts for MiniMax-M2.5 FP8 on MI300X/MI325X/MI355X - Add install_lmcache_hip() helper (PyPI wheel is CUDA-only, must build from source) - Extend offloading Literal to include "lmcache" in validation - Add agentic-coding scenarios to AMD master config with lmcache/none sweeps - Add 21 new tests for agentic + LMCache validation - Bump MiniMax vLLM images to v0.19.1 Smoke-tested on MI300X (TP=2) and MI355X (TP=4) with MiniMax-M2.5. Co-Authored-By: Claude Opus 4.6 --- .github/configs/amd-master.yaml | 27 +- benchmarks/benchmark_lib.sh | 13 + .../agentic/minimaxm2.5_fp8_mi300x.sh | 122 ++++++++ .../agentic/minimaxm2.5_fp8_mi325x.sh | 122 ++++++++ .../agentic/minimaxm2.5_fp8_mi355x.sh | 122 ++++++++ utils/matrix_logic/test_validation.py | 280 ++++++++++++++++++ utils/matrix_logic/validation.py | 4 +- 7 files changed, 685 insertions(+), 5 deletions(-) create mode 100755 benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh create mode 100755 benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh create mode 100755 benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 949a8a106..41894d75d 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -568,7 +568,7 @@ kimik2.5-fp4-mi355x-atom: - { tp: 4, conc-start: 4, conc-end: 128 } minimaxm2.5-fp8-mi355x-vllm: - image: vllm/vllm-openai-rocm:v0.19.0 + image: vllm/vllm-openai-rocm:v0.19.1 model:
MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi355x @@ -589,6 +589,13 @@ minimaxm2.5-fp8-mi355x-vllm: - { tp: 2, ep: 2, conc-start: 2, conc-end: 256 } - { tp: 4, ep: 4, conc-start: 4, conc-end: 512 } - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 } + agentic-coding: + - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [2, 4, 8, 16, 32] } + - { tp: 4, offloading: lmcache, conc-list: [2, 4, 8, 16, 32] } + - { tp: 8, offloading: none, conc-list: [2, 4, 8, 16, 32] } + - { tp: 8, offloading: lmcache, conc-list: [2, 4, 8, 16, 32] } minimaxm2.5-fp8-mi355x-atom: image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 @@ -662,7 +669,7 @@ minimaxm2.5-fp4-mi355x-vllm: - { tp: 4, conc-start: 4, conc-end: 64 } minimaxm2.5-fp8-mi300x-vllm: - image: vllm/vllm-openai-rocm:v0.16.0 + image: vllm/vllm-openai-rocm:v0.19.1 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi300x @@ -681,9 +688,16 @@ minimaxm2.5-fp8-mi300x-vllm: search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } + agentic-coding: + - duration: 1800 + search-space: + - { tp: 2, offloading: none, conc-list: [2, 4, 8, 16] } + - { tp: 2, offloading: lmcache, conc-list: [2, 4, 8, 16] } + - { tp: 8, offloading: none, conc-list: [2, 4, 8, 16, 32] } + - { tp: 8, offloading: lmcache, conc-list: [2, 4, 8, 16, 32] } minimaxm2.5-fp8-mi325x-vllm: - image: vllm/vllm-openai-rocm:v0.18.0 + image: vllm/vllm-openai-rocm:v0.19.1 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi325x @@ -702,6 +716,13 @@ minimaxm2.5-fp8-mi325x-vllm: search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 } + agentic-coding: + - duration: 1800 + search-space: + - { tp: 2, offloading: none, conc-list: [2, 4, 8, 16] } + - { tp: 2, offloading: lmcache, conc-list: [2, 4, 8, 16] } + - { tp: 8, offloading: none, conc-list: [2, 4, 8, 16, 32] } + - { tp: 8, offloading: lmcache, conc-list: [2, 4, 8, 16, 32] 
} gptoss-fp4-mi300x-vllm: image: vllm/vllm-openai-rocm:v0.17.0 diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 4c0c8642e..75c72316f 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -902,6 +902,19 @@ resolve_trace_source() { hf download --repo-type dataset "$dataset" } +install_lmcache_hip() { + # LMCache PyPI wheel ships CUDA-only c_ops.so; must build from source for ROCm. + # `pip install lmcache` ignores BUILD_WITH_HIP and installs the pre-built CUDA wheel. + # We must clone and build with --no-build-isolation to get the HIP c_ops.so. + local lmcache_dir + lmcache_dir="$(mktemp -d)/LMCache" + echo "Building LMCache from source with HIP support..." + git clone --depth 1 https://github.com/LMCache/LMCache.git "$lmcache_dir" + SETUPTOOLS_SCM_PRETEND_VERSION=0.4.4 BUILD_WITH_HIP=1 \ + agentic_pip_install -e "$lmcache_dir" --no-build-isolation + echo "LMCache HIP build complete." +} + install_agentic_deps() { agentic_pip_install --quiet urllib3 requests 2>/dev/null || true agentic_pip_install -q -r "$AGENTIC_DIR/requirements.txt" diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh new file mode 100755 index 000000000..9512c5a63 --- /dev/null +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh @@ -0,0 +1,122 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for MiniMax-M2.5 FP8 on MI300X using vLLM. +# Supports LMCache CPU DRAM offloading for KV cache. +# +# Required env vars: +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR + +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +MAX_DELAY=${MAX_DELAY:-60} +ADVANCE_MIN=${ADVANCE_MIN:-0.0} +ADVANCE_MAX=${ADVANCE_MAX:-0.7} +# Agentic matrix entries don't set max-model-len, so the workflow passes 0. 
+# ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly. +if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then + MAX_MODEL_LEN=131072 +fi + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +rocm-smi + +# If the machine runs a MEC FW older than 177, RCCL cannot reclaim some memory. +# See https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates +version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` +if [[ "$version" == "" || $version -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +# Ray compatibility in vLLM 0.14+ needs HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES +if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + +export AMDGCN_USE_BUFFER_OPS=0 +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +export PYTHONNOUSERSITE=1 + +# ---- Resolve traces and install deps ---------------------------------------- +resolve_trace_source +install_agentic_deps + +# ---- Server config ---------------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +OFFLOAD_ARGS="" +PREFIX_CACHE_FLAG="--no-enable-prefix-caching" + +case "$OFFLOADING" in + none) + ;; + cpu) + OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" + ;; + lmcache) + # LMCache CPU DRAM offloading via LMCacheConnectorV1. + # Critical: PYTHONHASHSEED=0 is mandatory for cache key consistency + # across TP workers. Without it, hit rate is 0%. + install_lmcache_hip + export PYTHONHASHSEED=0 + export LMCACHE_LOCAL_CPU=true + export LMCACHE_CHUNK_SIZE=256 + # LMCache reuses vLLM's prefix cache hash function, so prefix caching + # must be enabled (unlike native CPU offloading). 
+ PREFIX_CACHE_FLAG="--enable-prefix-caching" + OFFLOAD_ARGS="--kv-transfer-config {\"kv_connector\":\"LMCacheConnectorV1\",\"kv_role\":\"kv_both\"}" + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache)" >&2 + exit 1 + ;; +esac + +echo "Starting vllm server..." + +vllm serve $MODEL \ +--host 0.0.0.0 \ +--port $PORT \ +--trust-remote-code \ +--tool-call-parser minimax_m2 \ +--reasoning-parser minimax_m2 \ +--enable-auto-tool-choice \ +--attention-backend ROCM_AITER_UNIFIED_ATTN \ +--tensor-parallel-size=$TP \ +--gpu-memory-utilization 0.85 \ +--max-model-len $MAX_MODEL_LEN \ +--max-num-seqs $CONC \ +--block-size=64 \ +--kv-cache-dtype fp8 \ +$PREFIX_CACHE_FLAG \ +$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# ---- Run benchmark ---------------------------------------------------------- +build_replay_cmd "$RESULT_DIR" + +echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true +set +x + +write_agentic_result_json "$RESULT_DIR" + +# ---- Post-processing -------------------------------------------------------- +python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh new file mode 100755 index 000000000..8e2306721 --- /dev/null +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh @@ -0,0 +1,122 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for MiniMax-M2.5 FP8 on MI325X using vLLM. +# Supports LMCache CPU DRAM offloading for KV cache. 
+# +# Required env vars: +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR + +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +MAX_DELAY=${MAX_DELAY:-60} +ADVANCE_MIN=${ADVANCE_MIN:-0.0} +ADVANCE_MAX=${ADVANCE_MAX:-0.7} +# Agentic matrix entries don't set max-model-len, so the workflow passes 0. +# ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly. +if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then + MAX_MODEL_LEN=131072 +fi + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +rocm-smi + +# If the machine runs a MEC FW older than 177, RCCL cannot reclaim some memory. +# See https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates +version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` +if [[ "$version" == "" || $version -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +# Ray compatibility in vLLM 0.14+ needs HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES +if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + +export AMDGCN_USE_BUFFER_OPS=0 +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +export PYTHONNOUSERSITE=1 + +# ---- Resolve traces and install deps ---------------------------------------- +resolve_trace_source +install_agentic_deps + +# ---- Server config ---------------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +OFFLOAD_ARGS="" +PREFIX_CACHE_FLAG="--no-enable-prefix-caching" + +case "$OFFLOADING" in + none) + ;; + cpu) + OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" + ;; + lmcache) + # LMCache CPU DRAM offloading 
via LMCacheConnectorV1. + # Critical: PYTHONHASHSEED=0 is mandatory for cache key consistency + # across TP workers. Without it, hit rate is 0%. + install_lmcache_hip + export PYTHONHASHSEED=0 + export LMCACHE_LOCAL_CPU=true + export LMCACHE_CHUNK_SIZE=256 + # LMCache reuses vLLM's prefix cache hash function, so prefix caching + # must be enabled (unlike native CPU offloading). + PREFIX_CACHE_FLAG="--enable-prefix-caching" + OFFLOAD_ARGS="--kv-transfer-config {\"kv_connector\":\"LMCacheConnectorV1\",\"kv_role\":\"kv_both\"}" + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache)" >&2 + exit 1 + ;; +esac + +echo "Starting vllm server..." + +vllm serve $MODEL \ +--host 0.0.0.0 \ +--port $PORT \ +--trust-remote-code \ +--tool-call-parser minimax_m2 \ +--reasoning-parser minimax_m2 \ +--enable-auto-tool-choice \ +--attention-backend ROCM_AITER_UNIFIED_ATTN \ +--tensor-parallel-size=$TP \ +--gpu-memory-utilization 0.85 \ +--max-model-len $MAX_MODEL_LEN \ +--max-num-seqs $CONC \ +--block-size=64 \ +--kv-cache-dtype fp8 \ +$PREFIX_CACHE_FLAG \ +$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! 
+echo "Server PID: $SERVER_PID" + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# ---- Run benchmark ---------------------------------------------------------- +build_replay_cmd "$RESULT_DIR" + +echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true +set +x + +write_agentic_result_json "$RESULT_DIR" + +# ---- Post-processing -------------------------------------------------------- +python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh new file mode 100755 index 000000000..9304ae5c9 --- /dev/null +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh @@ -0,0 +1,122 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for MiniMax-M2.5 FP8 on MI355X using vLLM. +# Supports LMCache CPU DRAM offloading for KV cache. +# +# Required env vars: +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR + +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +MAX_DELAY=${MAX_DELAY:-60} +ADVANCE_MIN=${ADVANCE_MIN:-0.0} +ADVANCE_MAX=${ADVANCE_MAX:-0.7} +# Agentic matrix entries don't set max-model-len, so the workflow passes 0. +# ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly. +if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then + MAX_MODEL_LEN=131072 +fi + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +rocm-smi + +# If the machine runs a MEC FW older than 177, RCCL cannot reclaim some memory. 
+# See https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates +version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` +if [[ "$version" == "" || $version -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +# Ray compatibility in vLLM 0.14+ needs HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES +if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + +export AMDGCN_USE_BUFFER_OPS=0 +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +export PYTHONNOUSERSITE=1 + +# ---- Resolve traces and install deps ---------------------------------------- +resolve_trace_source +install_agentic_deps + +# ---- Server config ---------------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +OFFLOAD_ARGS="" +PREFIX_CACHE_FLAG="--no-enable-prefix-caching" + +case "$OFFLOADING" in + none) + ;; + cpu) + OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" + ;; + lmcache) + # LMCache CPU DRAM offloading via LMCacheConnectorV1. + # Critical: PYTHONHASHSEED=0 is mandatory for cache key consistency + # across TP workers. Without it, hit rate is 0%. + install_lmcache_hip + export PYTHONHASHSEED=0 + export LMCACHE_LOCAL_CPU=true + export LMCACHE_CHUNK_SIZE=256 + # LMCache reuses vLLM's prefix cache hash function, so prefix caching + # must be enabled (unlike native CPU offloading). + PREFIX_CACHE_FLAG="--enable-prefix-caching" + OFFLOAD_ARGS="--kv-transfer-config {\"kv_connector\":\"LMCacheConnectorV1\",\"kv_role\":\"kv_both\"}" + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache)" >&2 + exit 1 + ;; +esac + +echo "Starting vllm server..." 
+ +vllm serve $MODEL \ +--host 0.0.0.0 \ +--port $PORT \ +--trust-remote-code \ +--tool-call-parser minimax_m2 \ +--reasoning-parser minimax_m2 \ +--enable-auto-tool-choice \ +--attention-backend ROCM_AITER_UNIFIED_ATTN \ +--tensor-parallel-size=$TP \ +--gpu-memory-utilization 0.85 \ +--max-model-len $MAX_MODEL_LEN \ +--max-num-seqs $CONC \ +--block-size=64 \ +--kv-cache-dtype fp8 \ +$PREFIX_CACHE_FLAG \ +$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# ---- Run benchmark ---------------------------------------------------------- +build_replay_cmd "$RESULT_DIR" + +echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true +set +x + +write_agentic_result_json "$RESULT_DIR" + +# ---- Post-processing -------------------------------------------------------- +python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true diff --git a/utils/matrix_logic/test_validation.py b/utils/matrix_logic/test_validation.py index 1274fd86a..284937815 100644 --- a/utils/matrix_logic/test_validation.py +++ b/utils/matrix_logic/test_validation.py @@ -11,7 +11,12 @@ MultiNodeSeqLenConfig, SingleNodeMasterConfigEntry, MultiNodeMasterConfigEntry, + SingleNodeAgenticMatrixEntry, + AgenticCodingSearchSpaceEntry, + AgenticCodingConfig, + SingleNodeScenarios, validate_matrix_entry, + validate_agentic_matrix_entry, validate_master_config, validate_runner_config, load_config_files, @@ -875,3 +880,278 @@ def test_validation_runs_by_default(self, tmp_path): with pytest.raises(ValueError) as exc_info: load_runner_file(str(runner_file)) assert "must be a list" in str(exc_info.value) + + +# ============================================================================= +# Test AgenticCodingSearchSpaceEntry +# 
============================================================================= + +class TestAgenticCodingSearchSpaceEntry: + """Tests for AgenticCodingSearchSpaceEntry model.""" + + def test_valid_with_offloading_none(self): + """Valid entry with offloading=none should pass.""" + entry = AgenticCodingSearchSpaceEntry(**{ + "tp": 8, + "offloading": "none", + "conc-list": [2, 4, 8, 16], + }) + assert entry.tp == 8 + assert entry.offloading == "none" + assert entry.conc_list == [2, 4, 8, 16] + + def test_valid_with_offloading_cpu(self): + """Valid entry with offloading=cpu should pass.""" + entry = AgenticCodingSearchSpaceEntry(**{ + "tp": 4, + "offloading": "cpu", + "conc-list": [4, 8], + }) + assert entry.offloading == "cpu" + + def test_valid_with_offloading_lmcache(self): + """Valid entry with offloading=lmcache should pass.""" + entry = AgenticCodingSearchSpaceEntry(**{ + "tp": 2, + "offloading": "lmcache", + "conc-list": [2, 4, 8, 16], + }) + assert entry.offloading == "lmcache" + assert entry.tp == 2 + + def test_valid_with_offloading_ssd(self): + """Valid entry with offloading=ssd should pass.""" + entry = AgenticCodingSearchSpaceEntry(**{ + "tp": 8, + "offloading": "ssd", + "conc-start": 4, + "conc-end": 32, + }) + assert entry.offloading == "ssd" + + def test_invalid_offloading_value(self): + """Invalid offloading value should fail.""" + with pytest.raises(Exception): + AgenticCodingSearchSpaceEntry(**{ + "tp": 8, + "offloading": "invalid", + "conc-list": [4], + }) + + def test_offloading_defaults_to_none(self): + """Offloading should default to none.""" + entry = AgenticCodingSearchSpaceEntry(**{ + "tp": 8, + "conc-list": [4, 8], + }) + assert entry.offloading == "none" + + def test_must_specify_tp_or_prefill_decode(self): + """Must specify either tp or both prefill and decode.""" + with pytest.raises(Exception) as exc_info: + AgenticCodingSearchSpaceEntry(**{ + "offloading": "lmcache", + "conc-list": [4], + }) + assert "must specify either tp" in 
str(exc_info.value).lower() + + def test_cannot_mix_tp_and_prefill(self): + """Cannot specify both tp and prefill/decode.""" + with pytest.raises(Exception): + AgenticCodingSearchSpaceEntry(**{ + "tp": 8, + "prefill": { + "num-worker": 1, "tp": 4, "ep": 4, "dp-attn": False, + }, + "decode": { + "num-worker": 1, "tp": 8, "ep": 8, "dp-attn": False, + }, + "conc-list": [4], + }) + + +# ============================================================================= +# Test SingleNodeAgenticMatrixEntry +# ============================================================================= + +class TestSingleNodeAgenticMatrixEntry: + """Tests for SingleNodeAgenticMatrixEntry model.""" + + @pytest.fixture + def valid_agentic_entry(self): + return { + "image": "vllm/vllm-openai-rocm:v0.19.1", + "model": "MiniMaxAI/MiniMax-M2.5", + "model-prefix": "minimaxm2.5", + "precision": "fp8", + "framework": "vllm", + "runner": "mi300x", + "tp": 2, + "ep": 1, + "dp-attn": False, + "conc": 8, + "offloading": "lmcache", + "duration": 1800, + "exp-name": "minimaxm2.5_tp2_conc8_offloadlmcache", + "scenario-type": "agentic-coding", + } + + def test_valid_lmcache_entry(self, valid_agentic_entry): + """Valid agentic entry with lmcache offloading should pass.""" + entry = SingleNodeAgenticMatrixEntry(**valid_agentic_entry) + assert entry.offloading == "lmcache" + assert entry.tp == 2 + assert entry.conc == 8 + assert entry.scenario_type == "agentic-coding" + + def test_valid_none_offloading(self, valid_agentic_entry): + """Valid agentic entry with no offloading should pass.""" + valid_agentic_entry["offloading"] = "none" + entry = SingleNodeAgenticMatrixEntry(**valid_agentic_entry) + assert entry.offloading == "none" + + def test_valid_cpu_offloading(self, valid_agentic_entry): + """Valid agentic entry with cpu offloading should pass.""" + valid_agentic_entry["offloading"] = "cpu" + entry = SingleNodeAgenticMatrixEntry(**valid_agentic_entry) + assert entry.offloading == "cpu" + + def 
test_invalid_offloading_rejected(self, valid_agentic_entry): + """Invalid offloading value should fail.""" + valid_agentic_entry["offloading"] = "gpu" + with pytest.raises(Exception): + SingleNodeAgenticMatrixEntry(**valid_agentic_entry) + + def test_missing_offloading_fails(self, valid_agentic_entry): + """Missing offloading field should fail (no default on matrix entry).""" + del valid_agentic_entry["offloading"] + with pytest.raises(Exception): + SingleNodeAgenticMatrixEntry(**valid_agentic_entry) + + def test_extra_field_forbidden(self, valid_agentic_entry): + """Extra fields should be rejected.""" + valid_agentic_entry["extra-field"] = "value" + with pytest.raises(Exception): + SingleNodeAgenticMatrixEntry(**valid_agentic_entry) + + def test_validate_agentic_matrix_entry_function(self, valid_agentic_entry): + """validate_agentic_matrix_entry should accept valid entry.""" + result = validate_agentic_matrix_entry(valid_agentic_entry) + assert result == valid_agentic_entry + + def test_validate_agentic_matrix_entry_invalid(self, valid_agentic_entry): + """validate_agentic_matrix_entry should reject invalid entry.""" + del valid_agentic_entry["tp"] + with pytest.raises(ValueError) as exc_info: + validate_agentic_matrix_entry(valid_agentic_entry) + assert "failed validation" in str(exc_info.value) + + +# ============================================================================= +# Test AgenticCodingConfig +# ============================================================================= + +class TestAgenticCodingConfig: + """Tests for AgenticCodingConfig model.""" + + def test_valid_with_lmcache_and_none(self): + """Config with both lmcache and none offloading entries should pass.""" + config = AgenticCodingConfig(**{ + "duration": 1800, + "search-space": [ + {"tp": 2, "offloading": "none", "conc-list": [2, 4, 8]}, + {"tp": 2, "offloading": "lmcache", "conc-list": [2, 4, 8]}, + ], + }) + assert config.duration == 1800 + assert len(config.search_space) == 2 + 
assert config.search_space[0].offloading == "none" + assert config.search_space[1].offloading == "lmcache" + + def test_duration_defaults_to_1800(self): + """Duration should default to 1800.""" + config = AgenticCodingConfig(**{ + "search-space": [ + {"tp": 8, "offloading": "none", "conc-list": [4]}, + ], + }) + assert config.duration == 1800 + + +# ============================================================================= +# Test Master Config with Agentic Scenarios +# ============================================================================= + +class TestMasterConfigWithAgentic: + """Tests for master config entries containing agentic-coding scenarios.""" + + def test_single_node_with_agentic_only(self): + """Single node config with only agentic-coding scenario should pass.""" + config = SingleNodeMasterConfigEntry(**{ + "image": "vllm/vllm-openai-rocm:v0.19.1", + "model": "MiniMaxAI/MiniMax-M2.5", + "model-prefix": "minimaxm2.5", + "precision": "fp8", + "framework": "vllm", + "runner": "mi300x", + "multinode": False, + "scenarios": { + "agentic-coding": [ + { + "duration": 1800, + "search-space": [ + {"tp": 2, "offloading": "lmcache", "conc-list": [2, 4, 8]}, + ], + } + ], + }, + }) + assert config.scenarios.agentic_coding is not None + assert len(config.scenarios.agentic_coding) == 1 + assert config.scenarios.agentic_coding[0].search_space[0].offloading == "lmcache" + + def test_single_node_with_both_scenarios(self): + """Single node config with both fixed-seq-len and agentic-coding should pass.""" + config = SingleNodeMasterConfigEntry(**{ + "image": "vllm/vllm-openai-rocm:v0.19.1", + "model": "MiniMaxAI/MiniMax-M2.5", + "model-prefix": "minimaxm2.5", + "precision": "fp8", + "framework": "vllm", + "runner": "mi300x", + "multinode": False, + "scenarios": { + "fixed-seq-len": [ + { + "isl": 1024, "osl": 1024, + "search-space": [{"tp": 2, "conc-start": 4, "conc-end": 64}], + } + ], + "agentic-coding": [ + { + "duration": 1800, + "search-space": [ + {"tp": 2, 
"offloading": "none", "conc-list": [2, 4, 8]}, + {"tp": 2, "offloading": "lmcache", "conc-list": [2, 4, 8]}, + ], + } + ], + }, + }) + assert config.scenarios.fixed_seq_len is not None + assert config.scenarios.agentic_coding is not None + + def test_scenarios_must_have_at_least_one(self): + """Scenarios must have at least one scenario type.""" + with pytest.raises(Exception) as exc_info: + SingleNodeMasterConfigEntry(**{ + "image": "test", + "model": "test", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "runner": "mi300x", + "multinode": False, + "scenarios": {}, + }) + assert "At least one scenario" in str(exc_info.value) diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index dd245aec7..195311d49 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -156,7 +156,7 @@ class SingleNodeAgenticMatrixEntry(BaseModel): ep: int dp_attn: bool = Field(alias=Fields.DP_ATTN.value) conc: int - offloading: Literal["none", "cpu", "ssd"] = Field(alias=Fields.OFFLOADING.value) + offloading: Literal["none", "cpu", "lmcache", "ssd"] = Field(alias=Fields.OFFLOADING.value) duration: int = Field(default=1800, alias=Fields.DURATION.value) exp_name: str = Field(alias=Fields.EXP_NAME.value) scenario_type: str = Field(alias=Fields.SCENARIO_TYPE.value) @@ -338,7 +338,7 @@ class AgenticCodingSearchSpaceEntry(BaseModel): default="none", alias=Fields.SPEC_DECODING.value) prefill: Optional[WorkerConfig] = None decode: Optional[WorkerConfig] = None - offloading: Literal["none", "cpu", "ssd"] = Field(default="none", alias=Fields.OFFLOADING.value) + offloading: Literal["none", "cpu", "lmcache", "ssd"] = Field(default="none", alias=Fields.OFFLOADING.value) conc_start: Optional[int] = Field(default=None, alias=Fields.CONC_START.value) conc_end: Optional[int] = Field(default=None, alias=Fields.CONC_END.value) conc_list: Optional[List[int]] = Field(default=None, alias=Fields.CONC_LIST.value) From 
603fbb874016ece18f22f3a293e9929b901f5345 Mon Sep 17 00:00:00 2001 From: Andy Luo Date: Sat, 2 May 2026 12:33:40 -0700 Subject: [PATCH 2/4] =?UTF-8?q?fix:=20address=20PR=20review=20=E2=80=94=20?= =?UTF-8?q?rename=20lmcache=20to=20lmcache=5Fcpu,=20clarify=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename offloading value from "lmcache" to "lmcache_cpu" to distinguish from potential future LMCache backends (NVMe, WEKA, etc.) - Clarify test_cannot_mix_tp_and_prefill docstring: this tests existing validation behavior from PR #1201, not a new constraint. Note that a future PR may relax this to allow different prefill/decode TP values. Co-Authored-By: Claude Opus 4.6 --- .github/configs/amd-master.yaml | 12 ++++---- .../agentic/minimaxm2.5_fp8_mi300x.sh | 4 +-- .../agentic/minimaxm2.5_fp8_mi325x.sh | 4 +-- .../agentic/minimaxm2.5_fp8_mi355x.sh | 4 +-- utils/matrix_logic/test_validation.py | 28 ++++++++++--------- utils/matrix_logic/validation.py | 4 +-- 6 files changed, 29 insertions(+), 27 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 41894d75d..296e1c052 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -593,9 +593,9 @@ minimaxm2.5-fp8-mi355x-vllm: - duration: 1800 search-space: - { tp: 4, offloading: none, conc-list: [2, 4, 8, 16, 32] } - - { tp: 4, offloading: lmcache, conc-list: [2, 4, 8, 16, 32] } + - { tp: 4, offloading: lmcache_cpu, conc-list: [2, 4, 8, 16, 32] } - { tp: 8, offloading: none, conc-list: [2, 4, 8, 16, 32] } - - { tp: 8, offloading: lmcache, conc-list: [2, 4, 8, 16, 32] } + - { tp: 8, offloading: lmcache_cpu, conc-list: [2, 4, 8, 16, 32] } minimaxm2.5-fp8-mi355x-atom: image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 @@ -692,9 +692,9 @@ minimaxm2.5-fp8-mi300x-vllm: - duration: 1800 search-space: - { tp: 2, offloading: none, conc-list: [2, 4, 8, 16] } - - { tp: 2, offloading: lmcache, conc-list: 
[2, 4, 8, 16] } + - { tp: 2, offloading: lmcache_cpu, conc-list: [2, 4, 8, 16] } - { tp: 8, offloading: none, conc-list: [2, 4, 8, 16, 32] } - - { tp: 8, offloading: lmcache, conc-list: [2, 4, 8, 16, 32] } + - { tp: 8, offloading: lmcache_cpu, conc-list: [2, 4, 8, 16, 32] } minimaxm2.5-fp8-mi325x-vllm: image: vllm/vllm-openai-rocm:v0.19.1 @@ -720,9 +720,9 @@ minimaxm2.5-fp8-mi325x-vllm: - duration: 1800 search-space: - { tp: 2, offloading: none, conc-list: [2, 4, 8, 16] } - - { tp: 2, offloading: lmcache, conc-list: [2, 4, 8, 16] } + - { tp: 2, offloading: lmcache_cpu, conc-list: [2, 4, 8, 16] } - { tp: 8, offloading: none, conc-list: [2, 4, 8, 16, 32] } - - { tp: 8, offloading: lmcache, conc-list: [2, 4, 8, 16, 32] } + - { tp: 8, offloading: lmcache_cpu, conc-list: [2, 4, 8, 16, 32] } gptoss-fp4-mi300x-vllm: image: vllm/vllm-openai-rocm:v0.17.0 diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh index 9512c5a63..649a29d77 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh @@ -64,7 +64,7 @@ case "$OFFLOADING" in cpu) OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" ;; - lmcache) + lmcache_cpu) # LMCache CPU DRAM offloading via LMCacheConnectorV1. # Critical: PYTHONHASHSEED=0 is mandatory for cache key consistency # across TP workers. Without it, hit rate is 0%. 
@@ -78,7 +78,7 @@ case "$OFFLOADING" in OFFLOAD_ARGS="--kv-transfer-config {\"kv_connector\":\"LMCacheConnectorV1\",\"kv_role\":\"kv_both\"}" ;; *) - echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache)" >&2 + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache_cpu)" >&2 exit 1 ;; esac diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh index 8e2306721..8724f1afc 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh @@ -64,7 +64,7 @@ case "$OFFLOADING" in cpu) OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" ;; - lmcache) + lmcache_cpu) # LMCache CPU DRAM offloading via LMCacheConnectorV1. # Critical: PYTHONHASHSEED=0 is mandatory for cache key consistency # across TP workers. Without it, hit rate is 0%. @@ -78,7 +78,7 @@ case "$OFFLOADING" in OFFLOAD_ARGS="--kv-transfer-config {\"kv_connector\":\"LMCacheConnectorV1\",\"kv_role\":\"kv_both\"}" ;; *) - echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache)" >&2 + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache_cpu)" >&2 exit 1 ;; esac diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh index 9304ae5c9..eee3a7f0f 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh @@ -64,7 +64,7 @@ case "$OFFLOADING" in cpu) OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" ;; - lmcache) + lmcache_cpu) # LMCache CPU DRAM offloading via LMCacheConnectorV1. 
# Critical: PYTHONHASHSEED=0 is mandatory for cache key consistency # across TP workers. Without it, hit rate is 0%. @@ -78,7 +78,7 @@ case "$OFFLOADING" in OFFLOAD_ARGS="--kv-transfer-config {\"kv_connector\":\"LMCacheConnectorV1\",\"kv_role\":\"kv_both\"}" ;; *) - echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache)" >&2 + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache_cpu)" >&2 exit 1 ;; esac diff --git a/utils/matrix_logic/test_validation.py b/utils/matrix_logic/test_validation.py index 284937815..d7bbe9b00 100644 --- a/utils/matrix_logic/test_validation.py +++ b/utils/matrix_logic/test_validation.py @@ -910,13 +910,13 @@ def test_valid_with_offloading_cpu(self): assert entry.offloading == "cpu" def test_valid_with_offloading_lmcache(self): - """Valid entry with offloading=lmcache should pass.""" + """Valid entry with offloading=lmcache_cpu should pass.""" entry = AgenticCodingSearchSpaceEntry(**{ "tp": 2, - "offloading": "lmcache", + "offloading": "lmcache_cpu", "conc-list": [2, 4, 8, 16], }) - assert entry.offloading == "lmcache" + assert entry.offloading == "lmcache_cpu" assert entry.tp == 2 def test_valid_with_offloading_ssd(self): @@ -950,13 +950,15 @@ def test_must_specify_tp_or_prefill_decode(self): """Must specify either tp or both prefill and decode.""" with pytest.raises(Exception) as exc_info: AgenticCodingSearchSpaceEntry(**{ - "offloading": "lmcache", + "offloading": "lmcache_cpu", "conc-list": [4], }) assert "must specify either tp" in str(exc_info.value).lower() def test_cannot_mix_tp_and_prefill(self): - """Cannot specify both tp and prefill/decode.""" + """Current validation rejects both tp and prefill/decode in the same entry. + Note: this tests existing behavior from PR #1201. 
A future PR may relax + this constraint to allow different prefill/decode TP values.""" with pytest.raises(Exception): AgenticCodingSearchSpaceEntry(**{ "tp": 8, @@ -990,16 +992,16 @@ def valid_agentic_entry(self): "ep": 1, "dp-attn": False, "conc": 8, - "offloading": "lmcache", + "offloading": "lmcache_cpu", "duration": 1800, - "exp-name": "minimaxm2.5_tp2_conc8_offloadlmcache", + "exp-name": "minimaxm2.5_tp2_conc8_offloadlmcache_cpu", "scenario-type": "agentic-coding", } def test_valid_lmcache_entry(self, valid_agentic_entry): """Valid agentic entry with lmcache offloading should pass.""" entry = SingleNodeAgenticMatrixEntry(**valid_agentic_entry) - assert entry.offloading == "lmcache" + assert entry.offloading == "lmcache_cpu" assert entry.tp == 2 assert entry.conc == 8 assert entry.scenario_type == "agentic-coding" @@ -1060,13 +1062,13 @@ def test_valid_with_lmcache_and_none(self): "duration": 1800, "search-space": [ {"tp": 2, "offloading": "none", "conc-list": [2, 4, 8]}, - {"tp": 2, "offloading": "lmcache", "conc-list": [2, 4, 8]}, + {"tp": 2, "offloading": "lmcache_cpu", "conc-list": [2, 4, 8]}, ], }) assert config.duration == 1800 assert len(config.search_space) == 2 assert config.search_space[0].offloading == "none" - assert config.search_space[1].offloading == "lmcache" + assert config.search_space[1].offloading == "lmcache_cpu" def test_duration_defaults_to_1800(self): """Duration should default to 1800.""" @@ -1100,7 +1102,7 @@ def test_single_node_with_agentic_only(self): { "duration": 1800, "search-space": [ - {"tp": 2, "offloading": "lmcache", "conc-list": [2, 4, 8]}, + {"tp": 2, "offloading": "lmcache_cpu", "conc-list": [2, 4, 8]}, ], } ], @@ -1108,7 +1110,7 @@ def test_single_node_with_agentic_only(self): }) assert config.scenarios.agentic_coding is not None assert len(config.scenarios.agentic_coding) == 1 - assert config.scenarios.agentic_coding[0].search_space[0].offloading == "lmcache" + assert 
config.scenarios.agentic_coding[0].search_space[0].offloading == "lmcache_cpu" def test_single_node_with_both_scenarios(self): """Single node config with both fixed-seq-len and agentic-coding should pass.""" @@ -1132,7 +1134,7 @@ def test_single_node_with_both_scenarios(self): "duration": 1800, "search-space": [ {"tp": 2, "offloading": "none", "conc-list": [2, 4, 8]}, - {"tp": 2, "offloading": "lmcache", "conc-list": [2, 4, 8]}, + {"tp": 2, "offloading": "lmcache_cpu", "conc-list": [2, 4, 8]}, ], } ], diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index 195311d49..ed925a432 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -156,7 +156,7 @@ class SingleNodeAgenticMatrixEntry(BaseModel): ep: int dp_attn: bool = Field(alias=Fields.DP_ATTN.value) conc: int - offloading: Literal["none", "cpu", "lmcache", "ssd"] = Field(alias=Fields.OFFLOADING.value) + offloading: Literal["none", "cpu", "lmcache_cpu", "ssd"] = Field(alias=Fields.OFFLOADING.value) duration: int = Field(default=1800, alias=Fields.DURATION.value) exp_name: str = Field(alias=Fields.EXP_NAME.value) scenario_type: str = Field(alias=Fields.SCENARIO_TYPE.value) @@ -338,7 +338,7 @@ class AgenticCodingSearchSpaceEntry(BaseModel): default="none", alias=Fields.SPEC_DECODING.value) prefill: Optional[WorkerConfig] = None decode: Optional[WorkerConfig] = None - offloading: Literal["none", "cpu", "lmcache", "ssd"] = Field(default="none", alias=Fields.OFFLOADING.value) + offloading: Literal["none", "cpu", "lmcache_cpu", "ssd"] = Field(default="none", alias=Fields.OFFLOADING.value) conc_start: Optional[int] = Field(default=None, alias=Fields.CONC_START.value) conc_end: Optional[int] = Field(default=None, alias=Fields.CONC_END.value) conc_list: Optional[List[int]] = Field(default=None, alias=Fields.CONC_LIST.value) From e7ab020c947845df1e28c3aae570e1d3c97d6042 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> 
Date: Sat, 2 May 2026 21:03:25 +0000 Subject: [PATCH 3/4] fix: relax validate_topology_fields to allow disaggregated prefill/decode TP vLLM and SGLang support different prefill vs decode TP on most models. The previous validation rejected entries specifying both tp and prefill/decode configs. Now tp can coexist with prefill/decode for disaggregated serving, while still requiring both prefill and decode if either is specified. Co-authored-by: functionstackx --- utils/matrix_logic/test_validation.py | 36 ++++++++++++++++++++++----- utils/matrix_logic/validation.py | 16 ++++++------ 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/utils/matrix_logic/test_validation.py b/utils/matrix_logic/test_validation.py index d7bbe9b00..ab3c4b51e 100644 --- a/utils/matrix_logic/test_validation.py +++ b/utils/matrix_logic/test_validation.py @@ -953,23 +953,47 @@ def test_must_specify_tp_or_prefill_decode(self): "offloading": "lmcache_cpu", "conc-list": [4], }) - assert "must specify either tp" in str(exc_info.value).lower() + assert "must specify at least tp" in str(exc_info.value).lower() - def test_cannot_mix_tp_and_prefill(self): - """Current validation rejects both tp and prefill/decode in the same entry. - Note: this tests existing behavior from PR #1201. 
A future PR may relax - this constraint to allow different prefill/decode TP values.""" - with pytest.raises(Exception): + def test_tp_with_prefill_decode_allowed(self): + """tp can coexist with prefill/decode for disaggregated serving.""" + entry = AgenticCodingSearchSpaceEntry(**{ + "tp": 8, + "prefill": { + "num-worker": 1, "tp": 4, "ep": 4, "dp-attn": False, + }, + "decode": { + "num-worker": 1, "tp": 8, "ep": 8, "dp-attn": False, + }, + "conc-list": [4], + }) + assert entry.tp == 8 + assert entry.prefill.tp == 4 + assert entry.decode.tp == 8 + + def test_prefill_without_decode_rejected(self): + """Specifying only prefill without decode should fail.""" + with pytest.raises(Exception) as exc_info: AgenticCodingSearchSpaceEntry(**{ "tp": 8, "prefill": { "num-worker": 1, "tp": 4, "ep": 4, "dp-attn": False, }, + "conc-list": [4], + }) + assert "both prefill and decode" in str(exc_info.value).lower() + + def test_decode_without_prefill_rejected(self): + """Specifying only decode without prefill should fail.""" + with pytest.raises(Exception) as exc_info: + AgenticCodingSearchSpaceEntry(**{ + "tp": 8, "decode": { "num-worker": 1, "tp": 8, "ep": 8, "dp-attn": False, }, "conc-list": [4], }) + assert "both prefill and decode" in str(exc_info.value).lower() # ============================================================================= diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index ed925a432..385e7c75b 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -349,15 +349,13 @@ def validate_conc_fields(self): @model_validator(mode='after') def validate_topology_fields(self): - has_single_node = self.tp is not None - has_any_multinode_field = self.prefill is not None or self.decode is not None - has_complete_multinode = self.prefill is not None and self.decode is not None - if has_single_node: - valid = not has_any_multinode_field - else: - valid = has_complete_multinode - if not valid: - raise 
ValueError("Agentic search-space entries must specify either tp or both prefill and decode") + has_tp = self.tp is not None + has_prefill = self.prefill is not None + has_decode = self.decode is not None + if has_prefill != has_decode: + raise ValueError("Agentic search-space entries must specify both prefill and decode, not just one") + if not has_tp and not has_prefill: + raise ValueError("Agentic search-space entries must specify at least tp or both prefill and decode") return self From 696a8040228f78c3190ab1293ef7c0dc42fcd82e Mon Sep 17 00:00:00 2001 From: Andy Luo Date: Mon, 4 May 2026 11:27:03 -0700 Subject: [PATCH 4/4] fix: size LMCache CPU pool per-worker to avoid OOM LMCACHE_MAX_LOCAL_CPU_SIZE is per TP worker, not total. Using TOTAL_CPU_DRAM_GB directly would allocate TOTAL_CPU_DRAM_GB * TP_SIZE of pinned memory (e.g. 600 GB * 8 = 4.8 TB at TP=8), causing OOM. Divide by TP so the aggregate matches the intended total budget. Co-Authored-By: Claude Opus 4.6 --- benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh | 1 + benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh | 1 + benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh | 1 + 3 files changed, 3 insertions(+) diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh index 649a29d77..47d17137a 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh @@ -72,6 +72,7 @@ case "$OFFLOADING" in export PYTHONHASHSEED=0 export LMCACHE_LOCAL_CPU=true export LMCACHE_CHUNK_SIZE=256 + export LMCACHE_MAX_LOCAL_CPU_SIZE=$((TOTAL_CPU_DRAM_GB / TP)) # LMCache reuses vLLM's prefix cache hash function, so prefix caching # must be enabled (unlike native CPU offloading). 
PREFIX_CACHE_FLAG="--enable-prefix-caching" diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh index 8724f1afc..6090a4408 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh @@ -72,6 +72,7 @@ case "$OFFLOADING" in export PYTHONHASHSEED=0 export LMCACHE_LOCAL_CPU=true export LMCACHE_CHUNK_SIZE=256 + export LMCACHE_MAX_LOCAL_CPU_SIZE=$((TOTAL_CPU_DRAM_GB / TP)) # LMCache reuses vLLM's prefix cache hash function, so prefix caching # must be enabled (unlike native CPU offloading). PREFIX_CACHE_FLAG="--enable-prefix-caching" diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh index eee3a7f0f..dea4dec32 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh @@ -72,6 +72,7 @@ case "$OFFLOADING" in export PYTHONHASHSEED=0 export LMCACHE_LOCAL_CPU=true export LMCACHE_CHUNK_SIZE=256 + export LMCACHE_MAX_LOCAL_CPU_SIZE=$((TOTAL_CPU_DRAM_GB / TP)) # LMCache reuses vLLM's prefix cache hash function, so prefix caching # must be enabled (unlike native CPU offloading). PREFIX_CACHE_FLAG="--enable-prefix-caching"