From 06ee98804c7b6f522425a2f42688b9eccf316a5a Mon Sep 17 00:00:00 2001 From: mohit Date: Sat, 13 Jun 2026 13:10:53 +0530 Subject: [PATCH] fix(benchmark): wire measured profiles into recommender Three checkpoints were blocking measured benchmark data from reaching the recommender: 1. BenchmarkStore used glob() instead of rglob(), so profiles in benchmarks/profiles/measured/ were never loaded 2. GpuTier enum had no RTX_4070 entry, causing ValueError on load 3. RuntimeRegistry didn't list rtx4070 as a vLLM-supported GPU tier All three are now fixed. The 4 measured RTX 4070 profiles (qwen2.5-7b, qwen2.5-3b, llama3.1-8b, llama3.2-3b) are now active in the recommender. Total profiles: 32 (was 28). Measured: 4 (was 0). Tests updated: 2 (cost optimization and response shape now account for $0/hr local GPU). All 149 tests pass. --- app/schemas/hardware.py | 10 ++++++++++ app/services/benchmark_store/store.py | 2 +- app/services/runtime_registry/registry.py | 2 +- tests/test_engine.py | 4 ++-- 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/app/schemas/hardware.py b/app/schemas/hardware.py index 414d013..ea21d18 100644 --- a/app/schemas/hardware.py +++ b/app/schemas/hardware.py @@ -7,6 +7,7 @@ class GpuTier(str, Enum): L40S = "l40s" A100_80GB = "a100-80gb" H100 = "h100" + RTX_4070 = "rtx4070" class GpuSpec(BaseModel): @@ -61,4 +62,13 @@ def monthly_cost_usd(self) -> float: memory_bandwidth_gbps=3350, hourly_cost_usd=4.50, ), + GpuTier.RTX_4070: GpuSpec( + tier=GpuTier.RTX_4070, + name="NVIDIA GeForce RTX 4070", + vram_gb=12, + fp16_tflops=44, + fp8_tflops=88, + memory_bandwidth_gbps=432, + hourly_cost_usd=0.00, + ), } diff --git a/app/services/benchmark_store/store.py b/app/services/benchmark_store/store.py index 164b5f8..8748451 100644 --- a/app/services/benchmark_store/store.py +++ b/app/services/benchmark_store/store.py @@ -19,7 +19,7 @@ def _load(self) -> None: self._profiles = [] if not self._dir.exists(): return - for fp in sorted(self._dir.glob("*.json")): + for fp in sorted(self._dir.rglob("*.json")): with open(fp) as f: data = json.load(f) # each file is a list of profiles for one model diff --git a/app/services/runtime_registry/registry.py b/app/services/runtime_registry/registry.py index 12afaf5..37ede7a 100644 --- a/app/services/runtime_registry/registry.py +++ b/app/services/runtime_registry/registry.py @@ -25,7 +25,7 @@ def __init__(self) -> None: name="vllm", display_name="vLLM", supported_quantizations=["fp16", "bf16", "fp8", "awq", "gptq"], - supported_gpu_tiers=["l4", "l40s", "a100-80gb", "h100"], + supported_gpu_tiers=["l4", "l40s", "a100-80gb", "h100", "rtx4070"], supports_continuous_batching=True, supports_paged_attention=True, supports_speculative_decoding=True, diff --git a/tests/test_engine.py b/tests/test_engine.py index e3f1738..615cda4 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -49,7 +49,7 @@ def test_cost_optimized_recommendation(): rec = result.recommended # should not pick H100 for a cost-optimized workload - assert rec.gpu_tier in ("l4", "l40s"), f"Expected budget GPU, got {rec.gpu_tier}" + assert rec.gpu_tier in ("l4", "l40s", "rtx4070"), f"Expected budget GPU, got {rec.gpu_tier}" assert rec.estimated_monthly_cost_usd <= 1200 assert rec.scores.meets_latency is True or rec.estimated_ttft_p95_ms <= 250 @@ -108,7 +108,7 @@ def test_recommendation_response_shape(): assert rec.quantization assert rec.estimated_ttft_p95_ms > 0 assert rec.estimated_tokens_per_sec > 0 - assert rec.estimated_hourly_cost_usd > 0 + assert rec.estimated_hourly_cost_usd >= 0 def test_compare_two_models():