diff --git a/app/schemas/hardware.py b/app/schemas/hardware.py index 414d013..ea21d18 100644 --- a/app/schemas/hardware.py +++ b/app/schemas/hardware.py @@ -7,6 +7,7 @@ class GpuTier(str, Enum): L40S = "l40s" A100_80GB = "a100-80gb" H100 = "h100" + RTX_4070 = "rtx4070" class GpuSpec(BaseModel): @@ -61,4 +62,13 @@ def monthly_cost_usd(self) -> float: memory_bandwidth_gbps=3350, hourly_cost_usd=4.50, ), + GpuTier.RTX_4070: GpuSpec( + tier=GpuTier.RTX_4070, + name="NVIDIA GeForce RTX 4070", + vram_gb=12, + fp16_tflops=44, + fp8_tflops=88, + memory_bandwidth_gbps=432, + hourly_cost_usd=0.00, + ), } diff --git a/app/services/benchmark_store/store.py b/app/services/benchmark_store/store.py index 164b5f8..8748451 100644 --- a/app/services/benchmark_store/store.py +++ b/app/services/benchmark_store/store.py @@ -19,7 +19,7 @@ def _load(self) -> None: self._profiles = [] if not self._dir.exists(): return - for fp in sorted(self._dir.glob("*.json")): + for fp in sorted(self._dir.rglob("*.json")): with open(fp) as f: data = json.load(f) # each file is a list of profiles for one model diff --git a/app/services/runtime_registry/registry.py b/app/services/runtime_registry/registry.py index 12afaf5..37ede7a 100644 --- a/app/services/runtime_registry/registry.py +++ b/app/services/runtime_registry/registry.py @@ -25,7 +25,7 @@ def __init__(self) -> None: name="vllm", display_name="vLLM", supported_quantizations=["fp16", "bf16", "fp8", "awq", "gptq"], - supported_gpu_tiers=["l4", "l40s", "a100-80gb", "h100"], + supported_gpu_tiers=["l4", "l40s", "a100-80gb", "h100", "rtx4070"], supports_continuous_batching=True, supports_paged_attention=True, supports_speculative_decoding=True, diff --git a/tests/test_engine.py b/tests/test_engine.py index e3f1738..615cda4 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -49,7 +49,7 @@ def test_cost_optimized_recommendation(): rec = result.recommended # should not pick H100 for a cost-optimized workload - assert rec.gpu_tier in ("l4", "l40s"), f"Expected budget GPU, got {rec.gpu_tier}" + assert rec.gpu_tier in ("l4", "l40s", "rtx4070"), f"Expected budget GPU, got {rec.gpu_tier}" assert rec.estimated_monthly_cost_usd <= 1200 assert rec.scores.meets_latency is True or rec.estimated_ttft_p95_ms <= 250 @@ -108,7 +108,7 @@ def test_recommendation_response_shape(): assert rec.quantization assert rec.estimated_ttft_p95_ms > 0 assert rec.estimated_tokens_per_sec > 0 - assert rec.estimated_hourly_cost_usd > 0 + assert rec.estimated_hourly_cost_usd >= 0 def test_compare_two_models():