tensormux · Mog9 · Jun 13, 2026
diff --git a/app/schemas/hardware.py b/app/schemas/hardware.py
@@ -7,6 +7,7 @@ class GpuTier(str, Enum):
     L40S = "l40s"
     A100_80GB = "a100-80gb"
     H100 = "h100"
+    RTX_4070 = "rtx4070"
 
 
 class GpuSpec(BaseModel):
@@ -61,4 +62,13 @@ def monthly_cost_usd(self) -> float:
         memory_bandwidth_gbps=3350,
         hourly_cost_usd=4.50,
     ),
+    GpuTier.RTX_4070: GpuSpec(
+        tier=GpuTier.RTX_4070,
+        name="NVIDIA GeForce RTX 4070",
+        vram_gb=12,
+        fp16_tflops=44,
+        fp8_tflops=88,
+        memory_bandwidth_gbps=432,
+        hourly_cost_usd=0.00,
+    ),
 }
diff --git a/app/services/benchmark_store/store.py b/app/services/benchmark_store/store.py
@@ -19,7 +19,7 @@ def _load(self) -> None:
         self._profiles = []
         if not self._dir.exists():
             return
-        for fp in sorted(self._dir.glob("*.json")):
+        for fp in sorted(self._dir.rglob("*.json")):
             with open(fp) as f:
                 data = json.load(f)
             # each file is a list of profiles for one model

diff --git a/app/services/runtime_registry/registry.py b/app/services/runtime_registry/registry.py
@@ -25,7 +25,7 @@ def __init__(self) -> None:
                 name="vllm",
                 display_name="vLLM",
                 supported_quantizations=["fp16", "bf16", "fp8", "awq", "gptq"],
-                supported_gpu_tiers=["l4", "l40s", "a100-80gb", "h100"],
+                supported_gpu_tiers=["l4", "l40s", "a100-80gb", "h100", "rtx4070"],
                 supports_continuous_batching=True,
                 supports_paged_attention=True,
                 supports_speculative_decoding=True,

diff --git a/tests/test_engine.py b/tests/test_engine.py
@@ -49,7 +49,7 @@ def test_cost_optimized_recommendation():
 
     rec = result.recommended
     # should not pick H100 for a cost-optimized workload
-    assert rec.gpu_tier in ("l4", "l40s"), f"Expected budget GPU, got {rec.gpu_tier}"
+    assert rec.gpu_tier in ("l4", "l40s", "rtx4070"), f"Expected budget GPU, got {rec.gpu_tier}"
     assert rec.estimated_monthly_cost_usd <= 1200
     assert rec.scores.meets_latency is True or rec.estimated_ttft_p95_ms <= 250
 
@@ -108,7 +108,7 @@ def test_recommendation_response_shape():
     assert rec.quantization
     assert rec.estimated_ttft_p95_ms > 0
     assert rec.estimated_tokens_per_sec > 0
-    assert rec.estimated_hourly_cost_usd > 0
+    assert rec.estimated_hourly_cost_usd >= 0
 
 
 def test_compare_two_models():