Skip to content

Commit ba8ed43

Browse files
committed
Add timing metrics
1 parent 649c6d1 commit ba8ed43

File tree

7 files changed

+192
-0
lines changed

7 files changed

+192
-0
lines changed

eval_protocol/mcp/execution/manager.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ def execute_rollouts(
100100
async def _execute_with_semaphore(idx):
101101
async with semaphore:
102102
evaluation_row: EvaluationRow = evaluation_rows[idx]
103+
row_start_time = time.perf_counter()
103104

104105
trajectory = await self._execute_rollout(
105106
envs, policy, idx, steps, openai_logger, recording_mode, playback_mode, start_time, evaluation_row
@@ -152,6 +153,8 @@ async def _execute_with_semaphore(idx):
152153
else:
153154
evaluation_row.rollout_status = Status.rollout_running()
154155

156+
evaluation_row.execution_metadata.duration_seconds = time.perf_counter() - row_start_time
157+
155158
return evaluation_row
156159

157160
# Create all tasks

eval_protocol/models.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -555,6 +555,11 @@ class ExecutionMetadata(BaseModel):
555555

556556
cost_metrics: Optional[CostMetrics] = Field(default=None, description="Cost breakdown for LLM API calls.")
557557

558+
duration_seconds: Optional[float] = Field(
559+
default=None,
560+
description="Processing duration in seconds for this evaluation row. Note that if it gets retried, this will be the duration of the last attempt.",
561+
)
562+
558563

559564
class EvaluationRow(BaseModel):
560565
"""

eval_protocol/pytest/default_agent_rollout_processor.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import json
33
import logging
44
import os
5+
import time
56
from typing import Any, AsyncIterator, List, Optional, Union, Dict
67

78
from mcp.types import CallToolResult, TextContent
@@ -240,6 +241,8 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->
240241

241242
async def process_row(row: EvaluationRow) -> EvaluationRow:
242243
"""Process a single row with agent rollout."""
244+
start_time = time.perf_counter()
245+
243246
agent = Agent(
244247
model=row.input_metadata.completion_params["model"],
245248
row=row,
@@ -256,6 +259,8 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
256259
total_tokens=agent.usage["total_tokens"],
257260
)
258261

262+
agent.evaluation_row.execution_metadata.duration_seconds = time.perf_counter() - start_time
263+
259264
return agent.evaluation_row
260265
finally:
261266
if agent.mcp_client:

eval_protocol/pytest/default_langchain_rollout_processor.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import asyncio
2+
import time
23
from typing import List
34

45
try:
@@ -31,6 +32,8 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig):
3132
tasks: List[asyncio.Task] = []
3233

3334
async def _process_row(row: EvaluationRow) -> EvaluationRow:
35+
start_time = time.perf_counter()
36+
3437
# Build LC messages from EP row
3538
try:
3639
from langchain_core.messages import HumanMessage
@@ -121,6 +124,9 @@ def _serialize_message(msg: BaseMessage) -> Message:
121124
return Message(role=getattr(msg, "type", "assistant"), content=str(content))
122125

123126
row.messages = [_serialize_message(m) for m in result_messages]
127+
128+
row.execution_metadata.duration_seconds = time.perf_counter() - start_time
129+
124130
return row
125131

126132
for r in rows:

eval_protocol/pytest/default_pydantic_ai_rollout_processor.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import asyncio
44
import logging
5+
import time
56
import types
67
from pydantic_ai.models import Model
78
from typing_extensions import override
@@ -85,6 +86,8 @@ def __call__(self, rows: list[EvaluationRow], config: RolloutProcessorConfig) ->
8586

8687
async def process_row(row: EvaluationRow) -> EvaluationRow:
8788
"""Process a single row with agent rollout."""
89+
start_time = time.perf_counter()
90+
8891
model_messages = [self.convert_ep_message_to_pyd_message(m, row) for m in row.messages]
8992
response = await agent_instance.run(
9093
message_history=model_messages, model=model, usage_limits=config.kwargs.get("usage_limits")
@@ -99,6 +102,8 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
99102
# total_tokens=usage_info.total_tokens or 0,
100103
# )
101104

105+
row.execution_metadata.duration_seconds = time.perf_counter() - start_time
106+
102107
return row
103108

104109
async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow:

eval_protocol/pytest/default_single_turn_rollout_process.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->
2525

2626
async def process_row(row: EvaluationRow) -> EvaluationRow:
2727
"""Process a single row asynchronously."""
28+
start_time = time.perf_counter()
29+
2830
if len(row.messages) == 0:
2931
raise ValueError("Messages is empty. Please provide a non-empty dataset")
3032

@@ -116,6 +118,9 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
116118
)
117119

118120
row.messages = messages
121+
122+
row.execution_metadata.duration_seconds = time.perf_counter() - start_time
123+
119124
default_logger.log(row)
120125
return row
121126

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
import pytest
2+
from openai.types import CompletionUsage
3+
4+
from eval_protocol.models import EvaluationRow, ExecutionMetadata, InputMetadata, CostMetrics, Message
5+
from eval_protocol.pytest.utils import add_cost_metrics
6+
7+
8+
class TestExecutionMetadata:
    """Tests for execution metadata: cost breakdowns, usage statistics, and timing."""

    @staticmethod
    def _make_row(completion_params, usage):
        """Build an EvaluationRow with no messages and the given params/usage."""
        return EvaluationRow(
            messages=[],
            input_metadata=InputMetadata(completion_params=completion_params),
            execution_metadata=ExecutionMetadata(usage=usage),
        )

    @staticmethod
    def _assert_costs_present(row):
        """Assert that every cost component was populated."""
        metrics = row.execution_metadata.cost_metrics
        assert metrics is not None
        assert metrics.input_cost_usd is not None
        assert metrics.output_cost_usd is not None
        assert metrics.total_cost_usd is not None

    @staticmethod
    def _assert_costs_zero(row):
        """Assert that every cost component is exactly zero."""
        metrics = row.execution_metadata.cost_metrics
        assert metrics is not None
        assert metrics.input_cost_usd == 0.0
        assert metrics.output_cost_usd == 0.0
        assert metrics.total_cost_usd == 0.0

    def test_single_model_with_provider(self):
        """Normal case: a single model string plus an explicit provider."""
        row = self._make_row(
            {"model": "accounts/fireworks/models/gpt-oss-120b", "provider": "fireworks"},
            CompletionUsage(prompt_tokens=100, completion_tokens=50, total_tokens=150),
        )

        add_cost_metrics(row)

        self._assert_costs_present(row)

    @pytest.mark.skip(reason="Revisit when we figure out how to get cost metrics for multi-agent Pydantic.")
    def test_pydantic_ai_multi_agent_model_dict(self):
        """Pydantic AI multi-agent case: a nested dictionary holding several models."""
        row = self._make_row(
            {
                "model": {
                    "joke_generation_model": {
                        "model": "accounts/fireworks/models/kimi-k2-instruct",
                        "provider": "fireworks",
                    },
                    "joke_selection_model": {
                        "model": "accounts/fireworks/models/deepseek-v3p1",
                        "provider": "fireworks",
                    },
                }
            },
            CompletionUsage(prompt_tokens=200, completion_tokens=75, total_tokens=275),
        )

        add_cost_metrics(row)

        self._assert_costs_present(row)

    def test_no_usage_stats(self):
        """Missing usage statistics should yield zero cost."""
        row = self._make_row({"model": "gpt-3.5-turbo", "provider": "openai"}, None)

        add_cost_metrics(row)

        self._assert_costs_zero(row)

    def test_no_completion_params(self):
        """Empty completion parameters should yield zero cost."""
        row = self._make_row(
            {},
            CompletionUsage(prompt_tokens=100, completion_tokens=50, total_tokens=150),
        )

        add_cost_metrics(row)

        self._assert_costs_zero(row)

    def test_zero_tokens(self):
        """Zero token usage should yield zero cost."""
        row = self._make_row(
            {"model": "gpt-3.5-turbo", "provider": "openai"},
            CompletionUsage(prompt_tokens=0, completion_tokens=0, total_tokens=0),
        )

        add_cost_metrics(row)

        self._assert_costs_zero(row)

    def test_provider_mapping_variations(self):
        """Different provider mappings should all resolve without raising."""
        cases = [
            ("openai", "gpt-3.5-turbo", "gpt-3.5-turbo"),  # No prefix - known model
            (
                "fireworks",
                "accounts/fireworks/models/llama-v2-7b-chat",
                "fireworks_ai/accounts/fireworks/models/llama-v2-7b-chat",
            ),
            ("unknown_provider", "gpt-3.5-turbo", "gpt-3.5-turbo"),  # Fallback to original - use known model
        ]

        for provider, model, expected_model_id in cases:
            row = self._make_row(
                {"model": model, "provider": provider},
                CompletionUsage(prompt_tokens=10, completion_tokens=5, total_tokens=15),
            )

            add_cost_metrics(row)

            # Should not raise an error and should set cost metrics
            assert row.execution_metadata.cost_metrics is not None

    def test_model_without_provider(self):
        """A bare model string with no provider field still resolves a cost."""
        row = self._make_row(
            {"model": "gpt-3.5-turbo"},  # No provider field
            CompletionUsage(prompt_tokens=50, completion_tokens=25, total_tokens=75),
        )

        add_cost_metrics(row)

        assert row.execution_metadata.cost_metrics is not None
        # Should still work for OpenAI models even without explicit provider

    def test_execution_metadata_timing_field(self):
        """The new duration_seconds field defaults to None and accepts a float."""
        metadata = ExecutionMetadata()

        # Check field exists and defaults to None
        assert hasattr(metadata, "duration_seconds")
        assert metadata.duration_seconds is None

        # Check it can be set
        metadata.duration_seconds = 1.234
        assert metadata.duration_seconds == 1.234

0 commit comments

Comments (0)