diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 949a8a106..296e1c052 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -568,7 +568,7 @@ kimik2.5-fp4-mi355x-atom: - { tp: 4, conc-start: 4, conc-end: 128 } minimaxm2.5-fp8-mi355x-vllm: - image: vllm/vllm-openai-rocm:v0.19.0 + image: vllm/vllm-openai-rocm:v0.19.1 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi355x @@ -589,6 +589,13 @@ minimaxm2.5-fp8-mi355x-vllm: - { tp: 2, ep: 2, conc-start: 2, conc-end: 256 } - { tp: 4, ep: 4, conc-start: 4, conc-end: 512 } - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 } + agentic-coding: + - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [2, 4, 8, 16, 32] } + - { tp: 4, offloading: lmcache_cpu, conc-list: [2, 4, 8, 16, 32] } + - { tp: 8, offloading: none, conc-list: [2, 4, 8, 16, 32] } + - { tp: 8, offloading: lmcache_cpu, conc-list: [2, 4, 8, 16, 32] } minimaxm2.5-fp8-mi355x-atom: image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 @@ -662,7 +669,7 @@ minimaxm2.5-fp4-mi355x-vllm: - { tp: 4, conc-start: 4, conc-end: 64 } minimaxm2.5-fp8-mi300x-vllm: - image: vllm/vllm-openai-rocm:v0.16.0 + image: vllm/vllm-openai-rocm:v0.19.1 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi300x @@ -681,9 +688,16 @@ minimaxm2.5-fp8-mi300x-vllm: search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } + agentic-coding: + - duration: 1800 + search-space: + - { tp: 2, offloading: none, conc-list: [2, 4, 8, 16] } + - { tp: 2, offloading: lmcache_cpu, conc-list: [2, 4, 8, 16] } + - { tp: 8, offloading: none, conc-list: [2, 4, 8, 16, 32] } + - { tp: 8, offloading: lmcache_cpu, conc-list: [2, 4, 8, 16, 32] } minimaxm2.5-fp8-mi325x-vllm: - image: vllm/vllm-openai-rocm:v0.18.0 + image: vllm/vllm-openai-rocm:v0.19.1 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi325x @@ -702,6 +716,13 @@ minimaxm2.5-fp8-mi325x-vllm: search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 } + agentic-coding: + - duration: 1800 + search-space: + - { tp: 2, offloading: none, conc-list: [2, 4, 8, 16] } + - { tp: 2, offloading: lmcache_cpu, conc-list: [2, 4, 8, 16] } + - { tp: 8, offloading: none, conc-list: [2, 4, 8, 16, 32] } + - { tp: 8, offloading: lmcache_cpu, conc-list: [2, 4, 8, 16, 32] } gptoss-fp4-mi300x-vllm: image: vllm/vllm-openai-rocm:v0.17.0 diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 4c0c8642e..75c72316f 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -902,6 +902,19 @@ resolve_trace_source() { hf download --repo-type dataset "$dataset" } +install_lmcache_hip() { + # LMCache PyPI wheel ships CUDA-only c_ops.so; must build from source for ROCm. + # `pip install lmcache` ignores BUILD_WITH_HIP and installs the pre-built CUDA wheel. + # We must clone and build with --no-build-isolation to get the HIP c_ops.so. + local lmcache_dir + lmcache_dir="$(mktemp -d)/LMCache" + echo "Building LMCache from source with HIP support..." + git clone --depth 1 https://github.com/LMCache/LMCache.git "$lmcache_dir" + SETUPTOOLS_SCM_PRETEND_VERSION=0.4.4 BUILD_WITH_HIP=1 \ + agentic_pip_install -e "$lmcache_dir" --no-build-isolation + echo "LMCache HIP build complete." 
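+  # Optional sanity check (the module path is an assumption; adjust if LMCache
+  # relocates its extension): `python -c "import lmcache.c_ops"` should now pick up
+  # the freshly built HIP extension instead of the CUDA-only wheel's c_ops.so.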
+} + install_agentic_deps() { agentic_pip_install --quiet urllib3 requests 2>/dev/null || true agentic_pip_install -q -r "$AGENTIC_DIR/requirements.txt" diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh new file mode 100755 index 000000000..47d17137a --- /dev/null +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh @@ -0,0 +1,123 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for MiniMax-M2.5 FP8 on MI300X using vLLM. +# Supports LMCache CPU DRAM offloading for KV cache. +# +# Required env vars: +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR + +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +MAX_DELAY=${MAX_DELAY:-60} +ADVANCE_MIN=${ADVANCE_MIN:-0.0} +ADVANCE_MAX=${ADVANCE_MAX:-0.7} +# Agentic matrix entries don't set max-model-len, so the workflow passes 0. +# ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly. +if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then + MAX_MODEL_LEN=131072 +fi + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +rocm-smi + +# If the machine runs a MEC FW older than 177, RCCL cannot reclaim some memory. +# See https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates +version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` +if [[ "$version" == "" || $version -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +# Ray compatibility in vLLM 0.14+ needs HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES +if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + +export AMDGCN_USE_BUFFER_OPS=0 +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +export PYTHONNOUSERSITE=1 + +# ---- Resolve traces and install deps ---------------------------------------- +resolve_trace_source +install_agentic_deps + +# ---- Server config ---------------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +OFFLOAD_ARGS="" +PREFIX_CACHE_FLAG="--no-enable-prefix-caching" + +case "$OFFLOADING" in + none) + ;; + cpu) + OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" + ;; + lmcache_cpu) + # LMCache CPU DRAM offloading via LMCacheConnectorV1. + # Critical: PYTHONHASHSEED=0 is mandatory for cache key consistency + # across TP workers. Without it, hit rate is 0%. + install_lmcache_hip + export PYTHONHASHSEED=0 + export LMCACHE_LOCAL_CPU=true + export LMCACHE_CHUNK_SIZE=256 + export LMCACHE_MAX_LOCAL_CPU_SIZE=$((TOTAL_CPU_DRAM_GB / TP)) + # LMCache reuses vLLM's prefix cache hash function, so prefix caching + # must be enabled (unlike native CPU offloading). + PREFIX_CACHE_FLAG="--enable-prefix-caching" + OFFLOAD_ARGS="--kv-transfer-config {\"kv_connector\":\"LMCacheConnectorV1\",\"kv_role\":\"kv_both\"}" + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache_cpu)" >&2 + exit 1 + ;; +esac + +echo "Starting vllm server..." 
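+# $PREFIX_CACHE_FLAG and $OFFLOAD_ARGS are expanded unquoted below so they word-split
+# into separate CLI arguments; the kv-transfer-config JSON is written without spaces,
+# so it survives the split as a single argument.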
+ +vllm serve $MODEL \ +--host 0.0.0.0 \ +--port $PORT \ +--trust-remote-code \ +--tool-call-parser minimax_m2 \ +--reasoning-parser minimax_m2 \ +--enable-auto-tool-choice \ +--attention-backend ROCM_AITER_UNIFIED_ATTN \ +--tensor-parallel-size=$TP \ +--gpu-memory-utilization 0.85 \ +--max-model-len $MAX_MODEL_LEN \ +--max-num-seqs $CONC \ +--block-size=64 \ +--kv-cache-dtype fp8 \ +$PREFIX_CACHE_FLAG \ +$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# ---- Run benchmark ---------------------------------------------------------- +build_replay_cmd "$RESULT_DIR" + +echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true +set +x + +write_agentic_result_json "$RESULT_DIR" + +# ---- Post-processing -------------------------------------------------------- +python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh new file mode 100755 index 000000000..6090a4408 --- /dev/null +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh @@ -0,0 +1,123 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for MiniMax-M2.5 FP8 on MI325X using vLLM. +# Supports LMCache CPU DRAM offloading for KV cache. +# +# Required env vars: +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR + +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +MAX_DELAY=${MAX_DELAY:-60} +ADVANCE_MIN=${ADVANCE_MIN:-0.0} +ADVANCE_MAX=${ADVANCE_MAX:-0.7} +# Agentic matrix entries don't set max-model-len, so the workflow passes 0. +# ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly. +if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then + MAX_MODEL_LEN=131072 +fi + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +rocm-smi + +# If the machine runs a MEC FW older than 177, RCCL cannot reclaim some memory. 
+# See https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates +version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` +if [[ "$version" == "" || $version -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +# Ray compatibility in vLLM 0.14+ needs HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES +if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + +export AMDGCN_USE_BUFFER_OPS=0 +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +export PYTHONNOUSERSITE=1 + +# ---- Resolve traces and install deps ---------------------------------------- +resolve_trace_source +install_agentic_deps + +# ---- Server config ---------------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +OFFLOAD_ARGS="" +PREFIX_CACHE_FLAG="--no-enable-prefix-caching" + +case "$OFFLOADING" in + none) + ;; + cpu) + OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" + ;; + lmcache_cpu) + # LMCache CPU DRAM offloading via LMCacheConnectorV1. + # Critical: PYTHONHASHSEED=0 is mandatory for cache key consistency + # across TP workers. Without it, hit rate is 0%. + install_lmcache_hip + export PYTHONHASHSEED=0 + export LMCACHE_LOCAL_CPU=true + export LMCACHE_CHUNK_SIZE=256 + export LMCACHE_MAX_LOCAL_CPU_SIZE=$((TOTAL_CPU_DRAM_GB / TP)) + # LMCache reuses vLLM's prefix cache hash function, so prefix caching + # must be enabled (unlike native CPU offloading). + PREFIX_CACHE_FLAG="--enable-prefix-caching" + OFFLOAD_ARGS="--kv-transfer-config {\"kv_connector\":\"LMCacheConnectorV1\",\"kv_role\":\"kv_both\"}" + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache_cpu)" >&2 + exit 1 + ;; +esac + +echo "Starting vllm server..." + +vllm serve $MODEL \ +--host 0.0.0.0 \ +--port $PORT \ +--trust-remote-code \ +--tool-call-parser minimax_m2 \ +--reasoning-parser minimax_m2 \ +--enable-auto-tool-choice \ +--attention-backend ROCM_AITER_UNIFIED_ATTN \ +--tensor-parallel-size=$TP \ +--gpu-memory-utilization 0.85 \ +--max-model-len $MAX_MODEL_LEN \ +--max-num-seqs $CONC \ +--block-size=64 \ +--kv-cache-dtype fp8 \ +$PREFIX_CACHE_FLAG \ +$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# ---- Run benchmark ---------------------------------------------------------- +build_replay_cmd "$RESULT_DIR" + +echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true +set +x + +write_agentic_result_json "$RESULT_DIR" + +# ---- Post-processing -------------------------------------------------------- +python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh new file mode 100755 index 000000000..dea4dec32 --- /dev/null +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh @@ -0,0 +1,123 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for MiniMax-M2.5 FP8 on MI355X using vLLM. +# Supports LMCache CPU DRAM offloading for KV cache. 
+# +# Required env vars: +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR + +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +MAX_DELAY=${MAX_DELAY:-60} +ADVANCE_MIN=${ADVANCE_MIN:-0.0} +ADVANCE_MAX=${ADVANCE_MAX:-0.7} +# Agentic matrix entries don't set max-model-len, so the workflow passes 0. +# ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly. +if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then + MAX_MODEL_LEN=131072 +fi + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +rocm-smi + +# If the machine runs a MEC FW older than 177, RCCL cannot reclaim some memory. +# See https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates +version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` +if [[ "$version" == "" || $version -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +# Ray compatibility in vLLM 0.14+ needs HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES +if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + +export AMDGCN_USE_BUFFER_OPS=0 +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +export PYTHONNOUSERSITE=1 + +# ---- Resolve traces and install deps ---------------------------------------- +resolve_trace_source +install_agentic_deps + +# ---- Server config ---------------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +OFFLOAD_ARGS="" +PREFIX_CACHE_FLAG="--no-enable-prefix-caching" + +case "$OFFLOADING" in + none) + ;; + cpu) + OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" + ;; + lmcache_cpu) + # LMCache CPU DRAM offloading via LMCacheConnectorV1. + # Critical: PYTHONHASHSEED=0 is mandatory for cache key consistency + # across TP workers. Without it, hit rate is 0%. + install_lmcache_hip + export PYTHONHASHSEED=0 + export LMCACHE_LOCAL_CPU=true + export LMCACHE_CHUNK_SIZE=256 + export LMCACHE_MAX_LOCAL_CPU_SIZE=$((TOTAL_CPU_DRAM_GB / TP)) + # LMCache reuses vLLM's prefix cache hash function, so prefix caching + # must be enabled (unlike native CPU offloading). + PREFIX_CACHE_FLAG="--enable-prefix-caching" + OFFLOAD_ARGS="--kv-transfer-config {\"kv_connector\":\"LMCacheConnectorV1\",\"kv_role\":\"kv_both\"}" + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache_cpu)" >&2 + exit 1 + ;; +esac + +echo "Starting vllm server..." + +vllm serve $MODEL \ +--host 0.0.0.0 \ +--port $PORT \ +--trust-remote-code \ +--tool-call-parser minimax_m2 \ +--reasoning-parser minimax_m2 \ +--enable-auto-tool-choice \ +--attention-backend ROCM_AITER_UNIFIED_ATTN \ +--tensor-parallel-size=$TP \ +--gpu-memory-utilization 0.85 \ +--max-model-len $MAX_MODEL_LEN \ +--max-num-seqs $CONC \ +--block-size=64 \ +--kv-cache-dtype fp8 \ +$PREFIX_CACHE_FLAG \ +$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! 
+echo "Server PID: $SERVER_PID" + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# ---- Run benchmark ---------------------------------------------------------- +build_replay_cmd "$RESULT_DIR" + +echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true +set +x + +write_agentic_result_json "$RESULT_DIR" + +# ---- Post-processing -------------------------------------------------------- +python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true diff --git a/utils/matrix_logic/test_validation.py b/utils/matrix_logic/test_validation.py index 1274fd86a..ab3c4b51e 100644 --- a/utils/matrix_logic/test_validation.py +++ b/utils/matrix_logic/test_validation.py @@ -11,7 +11,12 @@ MultiNodeSeqLenConfig, SingleNodeMasterConfigEntry, MultiNodeMasterConfigEntry, + SingleNodeAgenticMatrixEntry, + AgenticCodingSearchSpaceEntry, + AgenticCodingConfig, + SingleNodeScenarios, validate_matrix_entry, + validate_agentic_matrix_entry, validate_master_config, validate_runner_config, load_config_files, @@ -875,3 +880,304 @@ def test_validation_runs_by_default(self, tmp_path): with pytest.raises(ValueError) as exc_info: load_runner_file(str(runner_file)) assert "must be a list" in str(exc_info.value) + + +# ============================================================================= +# Test AgenticCodingSearchSpaceEntry +# ============================================================================= + +class TestAgenticCodingSearchSpaceEntry: + """Tests for AgenticCodingSearchSpaceEntry model.""" + + def test_valid_with_offloading_none(self): + """Valid entry with offloading=none should pass.""" + entry = AgenticCodingSearchSpaceEntry(**{ + "tp": 8, + "offloading": "none", + "conc-list": [2, 4, 8, 16], + }) + assert entry.tp == 8 + assert entry.offloading == "none" + assert entry.conc_list == [2, 4, 8, 16] + + def test_valid_with_offloading_cpu(self): + """Valid entry with offloading=cpu should pass.""" + entry = AgenticCodingSearchSpaceEntry(**{ + "tp": 4, + "offloading": "cpu", + "conc-list": [4, 8], + }) + assert entry.offloading == "cpu" + + def test_valid_with_offloading_lmcache(self): + """Valid entry with offloading=lmcache_cpu should pass.""" + entry = AgenticCodingSearchSpaceEntry(**{ + "tp": 2, + "offloading": "lmcache_cpu", + "conc-list": [2, 4, 8, 16], + }) + assert entry.offloading == "lmcache_cpu" + assert entry.tp == 2 + + def test_valid_with_offloading_ssd(self): + """Valid entry with offloading=ssd should pass.""" + entry = AgenticCodingSearchSpaceEntry(**{ + "tp": 8, + "offloading": "ssd", + "conc-start": 4, + "conc-end": 32, + }) + assert entry.offloading == "ssd" + + def test_invalid_offloading_value(self): + """Invalid offloading value should fail.""" + with pytest.raises(Exception): + AgenticCodingSearchSpaceEntry(**{ + "tp": 8, + "offloading": "invalid", + "conc-list": [4], + }) + + def test_offloading_defaults_to_none(self): + """Offloading should default to none.""" + entry = AgenticCodingSearchSpaceEntry(**{ + "tp": 8, + "conc-list": [4, 8], + }) + assert entry.offloading == "none" + + def test_must_specify_tp_or_prefill_decode(self): + """Must specify either tp or both prefill and decode.""" + with pytest.raises(Exception) as exc_info: + AgenticCodingSearchSpaceEntry(**{ + "offloading": "lmcache_cpu", + "conc-list": [4], + }) + assert "must specify at least tp" in str(exc_info.value).lower() + + def 
test_tp_with_prefill_decode_allowed(self): + """tp can coexist with prefill/decode for disaggregated serving.""" + entry = AgenticCodingSearchSpaceEntry(**{ + "tp": 8, + "prefill": { + "num-worker": 1, "tp": 4, "ep": 4, "dp-attn": False, + }, + "decode": { + "num-worker": 1, "tp": 8, "ep": 8, "dp-attn": False, + }, + "conc-list": [4], + }) + assert entry.tp == 8 + assert entry.prefill.tp == 4 + assert entry.decode.tp == 8 + + def test_prefill_without_decode_rejected(self): + """Specifying only prefill without decode should fail.""" + with pytest.raises(Exception) as exc_info: + AgenticCodingSearchSpaceEntry(**{ + "tp": 8, + "prefill": { + "num-worker": 1, "tp": 4, "ep": 4, "dp-attn": False, + }, + "conc-list": [4], + }) + assert "both prefill and decode" in str(exc_info.value).lower() + + def test_decode_without_prefill_rejected(self): + """Specifying only decode without prefill should fail.""" + with pytest.raises(Exception) as exc_info: + AgenticCodingSearchSpaceEntry(**{ + "tp": 8, + "decode": { + "num-worker": 1, "tp": 8, "ep": 8, "dp-attn": False, + }, + "conc-list": [4], + }) + assert "both prefill and decode" in str(exc_info.value).lower() + + +# ============================================================================= +# Test SingleNodeAgenticMatrixEntry +# ============================================================================= + +class TestSingleNodeAgenticMatrixEntry: + """Tests for SingleNodeAgenticMatrixEntry model.""" + + @pytest.fixture + def valid_agentic_entry(self): + return { + "image": "vllm/vllm-openai-rocm:v0.19.1", + "model": "MiniMaxAI/MiniMax-M2.5", + "model-prefix": "minimaxm2.5", + "precision": "fp8", + "framework": "vllm", + "runner": "mi300x", + "tp": 2, + "ep": 1, + "dp-attn": False, + "conc": 8, + "offloading": "lmcache_cpu", + "duration": 1800, + "exp-name": "minimaxm2.5_tp2_conc8_offloadlmcache_cpu", + "scenario-type": "agentic-coding", + } + + def test_valid_lmcache_entry(self, valid_agentic_entry): + """Valid agentic entry with lmcache offloading should pass.""" + entry = SingleNodeAgenticMatrixEntry(**valid_agentic_entry) + assert entry.offloading == "lmcache_cpu" + assert entry.tp == 2 + assert entry.conc == 8 + assert entry.scenario_type == "agentic-coding" + + def test_valid_none_offloading(self, valid_agentic_entry): + """Valid agentic entry with no offloading should pass.""" + valid_agentic_entry["offloading"] = "none" + entry = SingleNodeAgenticMatrixEntry(**valid_agentic_entry) + assert entry.offloading == "none" + + def test_valid_cpu_offloading(self, valid_agentic_entry): + """Valid agentic entry with cpu offloading should pass.""" + valid_agentic_entry["offloading"] = "cpu" + entry = SingleNodeAgenticMatrixEntry(**valid_agentic_entry) + assert entry.offloading == "cpu" + + def test_invalid_offloading_rejected(self, valid_agentic_entry): + """Invalid offloading value should fail.""" + valid_agentic_entry["offloading"] = "gpu" + with pytest.raises(Exception): + SingleNodeAgenticMatrixEntry(**valid_agentic_entry) + + def test_missing_offloading_fails(self, valid_agentic_entry): + """Missing offloading field should fail (no default on matrix entry).""" + del valid_agentic_entry["offloading"] + with pytest.raises(Exception): + SingleNodeAgenticMatrixEntry(**valid_agentic_entry) + + def test_extra_field_forbidden(self, valid_agentic_entry): + """Extra fields should be rejected.""" + valid_agentic_entry["extra-field"] = "value" + with pytest.raises(Exception): + SingleNodeAgenticMatrixEntry(**valid_agentic_entry) + + def 
test_validate_agentic_matrix_entry_function(self, valid_agentic_entry): + """validate_agentic_matrix_entry should accept valid entry.""" + result = validate_agentic_matrix_entry(valid_agentic_entry) + assert result == valid_agentic_entry + + def test_validate_agentic_matrix_entry_invalid(self, valid_agentic_entry): + """validate_agentic_matrix_entry should reject invalid entry.""" + del valid_agentic_entry["tp"] + with pytest.raises(ValueError) as exc_info: + validate_agentic_matrix_entry(valid_agentic_entry) + assert "failed validation" in str(exc_info.value) + + +# ============================================================================= +# Test AgenticCodingConfig +# ============================================================================= + +class TestAgenticCodingConfig: + """Tests for AgenticCodingConfig model.""" + + def test_valid_with_lmcache_and_none(self): + """Config with both lmcache and none offloading entries should pass.""" + config = AgenticCodingConfig(**{ + "duration": 1800, + "search-space": [ + {"tp": 2, "offloading": "none", "conc-list": [2, 4, 8]}, + {"tp": 2, "offloading": "lmcache_cpu", "conc-list": [2, 4, 8]}, + ], + }) + assert config.duration == 1800 + assert len(config.search_space) == 2 + assert config.search_space[0].offloading == "none" + assert config.search_space[1].offloading == "lmcache_cpu" + + def test_duration_defaults_to_1800(self): + """Duration should default to 1800.""" + config = AgenticCodingConfig(**{ + "search-space": [ + {"tp": 8, "offloading": "none", "conc-list": [4]}, + ], + }) + assert config.duration == 1800 + + +# ============================================================================= +# Test Master Config with Agentic Scenarios +# ============================================================================= + +class TestMasterConfigWithAgentic: + """Tests for master config entries containing agentic-coding scenarios.""" + + def test_single_node_with_agentic_only(self): + """Single node config with only agentic-coding scenario should pass.""" + config = SingleNodeMasterConfigEntry(**{ + "image": "vllm/vllm-openai-rocm:v0.19.1", + "model": "MiniMaxAI/MiniMax-M2.5", + "model-prefix": "minimaxm2.5", + "precision": "fp8", + "framework": "vllm", + "runner": "mi300x", + "multinode": False, + "scenarios": { + "agentic-coding": [ + { + "duration": 1800, + "search-space": [ + {"tp": 2, "offloading": "lmcache_cpu", "conc-list": [2, 4, 8]}, + ], + } + ], + }, + }) + assert config.scenarios.agentic_coding is not None + assert len(config.scenarios.agentic_coding) == 1 + assert config.scenarios.agentic_coding[0].search_space[0].offloading == "lmcache_cpu" + + def test_single_node_with_both_scenarios(self): + """Single node config with both fixed-seq-len and agentic-coding should pass.""" + config = SingleNodeMasterConfigEntry(**{ + "image": "vllm/vllm-openai-rocm:v0.19.1", + "model": "MiniMaxAI/MiniMax-M2.5", + "model-prefix": "minimaxm2.5", + "precision": "fp8", + "framework": "vllm", + "runner": "mi300x", + "multinode": False, + "scenarios": { + "fixed-seq-len": [ + { + "isl": 1024, "osl": 1024, + "search-space": [{"tp": 2, "conc-start": 4, "conc-end": 64}], + } + ], + "agentic-coding": [ + { + "duration": 1800, + "search-space": [ + {"tp": 2, "offloading": "none", "conc-list": [2, 4, 8]}, + {"tp": 2, "offloading": "lmcache_cpu", "conc-list": [2, 4, 8]}, + ], + } + ], + }, + }) + assert config.scenarios.fixed_seq_len is not None + assert config.scenarios.agentic_coding is not None + + def test_scenarios_must_have_at_least_one(self): + 
"""Scenarios must have at least one scenario type.""" + with pytest.raises(Exception) as exc_info: + SingleNodeMasterConfigEntry(**{ + "image": "test", + "model": "test", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "runner": "mi300x", + "multinode": False, + "scenarios": {}, + }) + assert "At least one scenario" in str(exc_info.value) diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index dd245aec7..385e7c75b 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -156,7 +156,7 @@ class SingleNodeAgenticMatrixEntry(BaseModel): ep: int dp_attn: bool = Field(alias=Fields.DP_ATTN.value) conc: int - offloading: Literal["none", "cpu", "ssd"] = Field(alias=Fields.OFFLOADING.value) + offloading: Literal["none", "cpu", "lmcache_cpu", "ssd"] = Field(alias=Fields.OFFLOADING.value) duration: int = Field(default=1800, alias=Fields.DURATION.value) exp_name: str = Field(alias=Fields.EXP_NAME.value) scenario_type: str = Field(alias=Fields.SCENARIO_TYPE.value) @@ -338,7 +338,7 @@ class AgenticCodingSearchSpaceEntry(BaseModel): default="none", alias=Fields.SPEC_DECODING.value) prefill: Optional[WorkerConfig] = None decode: Optional[WorkerConfig] = None - offloading: Literal["none", "cpu", "ssd"] = Field(default="none", alias=Fields.OFFLOADING.value) + offloading: Literal["none", "cpu", "lmcache_cpu", "ssd"] = Field(default="none", alias=Fields.OFFLOADING.value) conc_start: Optional[int] = Field(default=None, alias=Fields.CONC_START.value) conc_end: Optional[int] = Field(default=None, alias=Fields.CONC_END.value) conc_list: Optional[List[int]] = Field(default=None, alias=Fields.CONC_LIST.value) @@ -349,15 +349,13 @@ def validate_conc_fields(self): @model_validator(mode='after') def validate_topology_fields(self): - has_single_node = self.tp is not None - has_any_multinode_field = self.prefill is not None or self.decode is not None - has_complete_multinode = self.prefill is not None and self.decode is not None - if has_single_node: - valid = not has_any_multinode_field - else: - valid = has_complete_multinode - if not valid: - raise ValueError("Agentic search-space entries must specify either tp or both prefill and decode") + has_tp = self.tp is not None + has_prefill = self.prefill is not None + has_decode = self.decode is not None + if has_prefill != has_decode: + raise ValueError("Agentic search-space entries must specify both prefill and decode, not just one") + if not has_tp and not has_prefill: + raise ValueError("Agentic search-space entries must specify at least tp or both prefill and decode") return self