From 06ee98804c7b6f522425a2f42688b9eccf316a5a Mon Sep 17 00:00:00 2001
From: mohit <mohit.gyanchandani09@gmail.com>
Date: Sat, 13 Jun 2026 13:10:53 +0530
Subject: [PATCH] fix(benchmark): wire measured profiles into recommender

Three issues were preventing measured benchmark data from reaching the
recommender:

1. BenchmarkStore used the non-recursive glob() instead of rglob(), so
   profiles in the benchmarks/profiles/measured/ subdirectory were never
   loaded
2. GpuTier enum had no RTX_4070 entry, causing ValueError on load
3. RuntimeRegistry didn't list rtx4070 as a vLLM-supported GPU tier

All three are now fixed. The 4 measured RTX 4070 profiles (qwen2.5-7b,
qwen2.5-3b, llama3.1-8b, llama3.2-3b) are now active in the recommender.

Total profiles: 32 (was 28). Measured: 4 (was 0).
Tests updated: 2 (the cost-optimization and response-shape tests now
account for the $0/hr local GPU).

All 149 tests pass.
---
 app/schemas/hardware.py                   | 10 ++++++++++
 app/services/benchmark_store/store.py     |  2 +-
 app/services/runtime_registry/registry.py |  2 +-
 tests/test_engine.py                      |  4 ++--
 4 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/app/schemas/hardware.py b/app/schemas/hardware.py
index 414d013..ea21d18 100644
--- a/app/schemas/hardware.py
+++ b/app/schemas/hardware.py
@@ -7,6 +7,7 @@ class GpuTier(str, Enum):
     L40S = "l40s"
     A100_80GB = "a100-80gb"
     H100 = "h100"
+    RTX_4070 = "rtx4070"
 
 
 class GpuSpec(BaseModel):
@@ -61,4 +62,13 @@ def monthly_cost_usd(self) -> float:
         memory_bandwidth_gbps=3350,
         hourly_cost_usd=4.50,
     ),
+    GpuTier.RTX_4070: GpuSpec(
+        tier=GpuTier.RTX_4070,
+        name="NVIDIA GeForce RTX 4070",
+        vram_gb=12,
+        fp16_tflops=44,
+        fp8_tflops=88,
+        memory_bandwidth_gbps=432,
+        hourly_cost_usd=0.00,
+    ),
 }
diff --git a/app/services/benchmark_store/store.py b/app/services/benchmark_store/store.py
index 164b5f8..8748451 100644
--- a/app/services/benchmark_store/store.py
+++ b/app/services/benchmark_store/store.py
@@ -19,7 +19,7 @@ def _load(self) -> None:
         self._profiles = []
         if not self._dir.exists():
             return
-        for fp in sorted(self._dir.glob("*.json")):
+        for fp in sorted(self._dir.rglob("*.json")):
             with open(fp) as f:
                 data = json.load(f)
             # each file is a list of profiles for one model
diff --git a/app/services/runtime_registry/registry.py b/app/services/runtime_registry/registry.py
index 12afaf5..37ede7a 100644
--- a/app/services/runtime_registry/registry.py
+++ b/app/services/runtime_registry/registry.py
@@ -25,7 +25,7 @@ def __init__(self) -> None:
                 name="vllm",
                 display_name="vLLM",
                 supported_quantizations=["fp16", "bf16", "fp8", "awq", "gptq"],
-                supported_gpu_tiers=["l4", "l40s", "a100-80gb", "h100"],
+                supported_gpu_tiers=["l4", "l40s", "a100-80gb", "h100", "rtx4070"],
                 supports_continuous_batching=True,
                 supports_paged_attention=True,
                 supports_speculative_decoding=True,
diff --git a/tests/test_engine.py b/tests/test_engine.py
index e3f1738..615cda4 100644
--- a/tests/test_engine.py
+++ b/tests/test_engine.py
@@ -49,7 +49,7 @@ def test_cost_optimized_recommendation():
 
     rec = result.recommended
     # should not pick H100 for a cost-optimized workload
-    assert rec.gpu_tier in ("l4", "l40s"), f"Expected budget GPU, got {rec.gpu_tier}"
+    assert rec.gpu_tier in ("l4", "l40s", "rtx4070"), f"Expected budget GPU, got {rec.gpu_tier}"
     assert rec.estimated_monthly_cost_usd <= 1200
     assert rec.scores.meets_latency is True or rec.estimated_ttft_p95_ms <= 250
 
@@ -108,7 +108,7 @@ def test_recommendation_response_shape():
     assert rec.quantization
     assert rec.estimated_ttft_p95_ms > 0
     assert rec.estimated_tokens_per_sec > 0
-    assert rec.estimated_hourly_cost_usd > 0
+    assert rec.estimated_hourly_cost_usd >= 0
 
 
 def test_compare_two_models():