Skip to content

Commit 3dbbfed

Browse files
authored
Add experiment timing and remove _usd (#158)
1 parent ca994a0 commit 3dbbfed

File tree

5 files changed

+37
-24
lines changed

5 files changed

+37
-24
lines changed

eval_protocol/models.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -519,11 +519,11 @@ class EvalMetadata(BaseModel):
519519
class CostMetrics(BaseModel):
520520
"""Cost metrics for LLM API calls."""
521521

522-
input_cost_usd: Optional[float] = Field(None, description="Cost in USD for input tokens.")
522+
input_cost: Optional[float] = Field(None, description="Cost in USD for input tokens.")
523523

524-
output_cost_usd: Optional[float] = Field(None, description="Cost in USD for output tokens.")
524+
output_cost: Optional[float] = Field(None, description="Cost in USD for output tokens.")
525525

526-
total_cost_usd: Optional[float] = Field(None, description="Total cost in USD for the API call.")
526+
total_cost: Optional[float] = Field(None, description="Total cost in USD for the API call.")
527527

528528

529529
class ExecutionMetadata(BaseModel):
@@ -560,6 +560,11 @@ class ExecutionMetadata(BaseModel):
560560
description="Processing duration in seconds for this evaluation row. Note that if it gets retried, this will be the duration of the last attempt.",
561561
)
562562

563+
experiment_duration_seconds: Optional[float] = Field(
564+
default=None,
565+
description="Processing duration in seconds for an entire experiment. Note that this includes time it took for retries.",
566+
)
567+
563568

564569
class EvaluationRow(BaseModel):
565570
"""

eval_protocol/pytest/evaluation_test.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import inspect
33
import os
44
import sys
5+
import time
56
from collections import defaultdict
67
from typing import Any, Callable
78
from typing_extensions import Unpack
@@ -212,6 +213,7 @@ async def wrapper_body(**kwargs: Unpack[ParameterizedTestKwargs]) -> None:
212213
all_results: list[list[EvaluationRow]] = [[] for _ in range(num_runs)]
213214

214215
experiment_id = generate_id()
216+
experiment_start_time = time.perf_counter()
215217

216218
def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bool) -> None:
217219
log_eval_status_and_rows(eval_metadata, rows, status, passed, active_logger)
@@ -506,6 +508,8 @@ async def execute_run_with_progress(run_idx: int, config):
506508
tasks.append(asyncio.create_task(execute_run_with_progress(run_idx, config)))
507509
await asyncio.gather(*tasks) # pyright: ignore[reportUnknownArgumentType]
508510

511+
experiment_duration_seconds = time.perf_counter() - experiment_start_time
512+
509513
# for groupwise mode, the result contains eval output from multiple completion_params, we need to differentiate them
510514
# rollout_id is used to differentiate the result from different completion_params
511515
if mode == "groupwise":
@@ -526,6 +530,7 @@ async def execute_run_with_progress(run_idx: int, config):
526530
original_completion_params[rollout_id], # pyright: ignore[reportArgumentType]
527531
test_func.__name__,
528532
num_runs,
533+
experiment_duration_seconds,
529534
)
530535
else:
531536
postprocess(
@@ -537,6 +542,7 @@ async def execute_run_with_progress(run_idx: int, config):
537542
completion_params, # pyright: ignore[reportArgumentType]
538543
test_func.__name__,
539544
num_runs,
545+
experiment_duration_seconds,
540546
)
541547

542548
except AssertionError:

eval_protocol/pytest/evaluation_test_postprocess.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ def postprocess(
2323
completion_params: CompletionParams,
2424
test_func_name: str,
2525
num_runs: int,
26+
experiment_duration_seconds: float,
2627
):
2728
scores = [
2829
sum([r.evaluation_result.score for r in result if r.evaluation_result]) / len(result) for result in all_results
@@ -68,6 +69,7 @@ def postprocess(
6869
if r.evaluation_result is not None:
6970
r.evaluation_result.agg_score = agg_score
7071
r.evaluation_result.standard_error = standard_error
72+
r.execution_metadata.experiment_duration_seconds = experiment_duration_seconds
7173
active_logger.log(r)
7274

7375
# Optional: print and/or persist a summary artifact for CI

eval_protocol/pytest/utils.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -307,9 +307,9 @@ def add_cost_metrics(row: EvaluationRow) -> None:
307307
# Can't calculate cost without usage stats or model info
308308
if not row.execution_metadata.usage or not row.input_metadata.completion_params:
309309
row.execution_metadata.cost_metrics = CostMetrics(
310-
input_cost_usd=0.0,
311-
output_cost_usd=0.0,
312-
total_cost_usd=0.0,
310+
input_cost=0.0,
311+
output_cost=0.0,
312+
total_cost=0.0,
313313
)
314314
return
315315

@@ -348,7 +348,7 @@ def add_cost_metrics(row: EvaluationRow) -> None:
348348

349349
# Set all cost metrics on the row
350350
row.execution_metadata.cost_metrics = CostMetrics(
351-
input_cost_usd=input_cost,
352-
output_cost_usd=output_cost,
353-
total_cost_usd=total_cost,
351+
input_cost=input_cost,
352+
output_cost=output_cost,
353+
total_cost=total_cost,
354354
)

tests/pytest/test_execution_metadata.py

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@ def test_single_model_with_provider(self):
2323
add_cost_metrics(row)
2424

2525
assert row.execution_metadata.cost_metrics is not None
26-
assert row.execution_metadata.cost_metrics.input_cost_usd is not None
27-
assert row.execution_metadata.cost_metrics.output_cost_usd is not None
28-
assert row.execution_metadata.cost_metrics.total_cost_usd is not None
26+
assert row.execution_metadata.cost_metrics.input_cost is not None
27+
assert row.execution_metadata.cost_metrics.output_cost is not None
28+
assert row.execution_metadata.cost_metrics.total_cost is not None
2929

3030
@pytest.mark.skip(reason="Revisit when we figure out how to get cost metrics for multi-agent Pydantic.")
3131
def test_pydantic_ai_multi_agent_model_dict(self):
@@ -54,9 +54,9 @@ def test_pydantic_ai_multi_agent_model_dict(self):
5454
add_cost_metrics(row)
5555

5656
assert row.execution_metadata.cost_metrics is not None
57-
assert row.execution_metadata.cost_metrics.input_cost_usd is not None
58-
assert row.execution_metadata.cost_metrics.output_cost_usd is not None
59-
assert row.execution_metadata.cost_metrics.total_cost_usd is not None
57+
assert row.execution_metadata.cost_metrics.input_cost is not None
58+
assert row.execution_metadata.cost_metrics.output_cost is not None
59+
assert row.execution_metadata.cost_metrics.total_cost is not None
6060

6161
def test_no_usage_stats(self):
6262
"""Test case with no usage statistics."""
@@ -69,9 +69,9 @@ def test_no_usage_stats(self):
6969
add_cost_metrics(row)
7070

7171
assert row.execution_metadata.cost_metrics is not None
72-
assert row.execution_metadata.cost_metrics.input_cost_usd == 0.0
73-
assert row.execution_metadata.cost_metrics.output_cost_usd == 0.0
74-
assert row.execution_metadata.cost_metrics.total_cost_usd == 0.0
72+
assert row.execution_metadata.cost_metrics.input_cost == 0.0
73+
assert row.execution_metadata.cost_metrics.output_cost == 0.0
74+
assert row.execution_metadata.cost_metrics.total_cost == 0.0
7575

7676
def test_no_completion_params(self):
7777
"""Test case with empty completion parameters."""
@@ -86,9 +86,9 @@ def test_no_completion_params(self):
8686
add_cost_metrics(row)
8787

8888
assert row.execution_metadata.cost_metrics is not None
89-
assert row.execution_metadata.cost_metrics.input_cost_usd == 0.0
90-
assert row.execution_metadata.cost_metrics.output_cost_usd == 0.0
91-
assert row.execution_metadata.cost_metrics.total_cost_usd == 0.0
89+
assert row.execution_metadata.cost_metrics.input_cost == 0.0
90+
assert row.execution_metadata.cost_metrics.output_cost == 0.0
91+
assert row.execution_metadata.cost_metrics.total_cost == 0.0
9292

9393
def test_zero_tokens(self):
9494
"""Test case with zero token usage."""
@@ -103,9 +103,9 @@ def test_zero_tokens(self):
103103
add_cost_metrics(row)
104104

105105
assert row.execution_metadata.cost_metrics is not None
106-
assert row.execution_metadata.cost_metrics.input_cost_usd == 0.0
107-
assert row.execution_metadata.cost_metrics.output_cost_usd == 0.0
108-
assert row.execution_metadata.cost_metrics.total_cost_usd == 0.0
106+
assert row.execution_metadata.cost_metrics.input_cost == 0.0
107+
assert row.execution_metadata.cost_metrics.output_cost == 0.0
108+
assert row.execution_metadata.cost_metrics.total_cost == 0.0
109109

110110
def test_provider_mapping_variations(self):
111111
"""Test different provider mappings."""

0 commit comments

Comments
 (0)