Skip to content

Commit 9171bc5

Browse files
committed
Add rollout start time and latency display
1 parent da7fc9d commit 9171bc5

14 files changed

+81
-1
lines changed

eval_protocol/integrations/tinker_rollout_processor.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
import logging
33
import os
44
import time
5+
from datetime import datetime, timezone
56
import traceback
67
from typing import Any, Dict, List, Optional, Union
78

@@ -76,6 +77,8 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->
7677
"""Generate rollout tasks using Tinker."""
7778

7879
async def process_row(row: EvaluationRow) -> EvaluationRow:
80+
if row.execution_metadata.rollout_start_time is None:
81+
row.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
7982
start_time = time.perf_counter()
8083

8184
if not row.messages:

eval_protocol/mcp/execution/manager.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111
import os
1212
import threading
1313
import time
14+
from datetime import datetime, timezone
1415
from dataclasses import asdict
1516
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union, cast
1617

@@ -97,6 +98,8 @@ def execute_rollouts(
9798
async def _execute_with_semaphore(idx):
9899
async with semaphore:
99100
evaluation_row: EvaluationRow = evaluation_rows[idx]
101+
if evaluation_row.execution_metadata.rollout_start_time is None:
102+
evaluation_row.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
100103
row_start_time = time.perf_counter()
101104

102105
trajectory = await self._execute_rollout(

eval_protocol/models.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -793,6 +793,11 @@ class CostMetrics(BaseModel):
793793
class ExecutionMetadata(BaseModel):
794794
"""Metadata about the execution of the evaluation."""
795795

796+
rollout_start_time: Optional[datetime] = Field(
797+
default=None,
798+
description="UTC timestamp when the rollout started.",
799+
)
800+
796801
invocation_id: Optional[str] = Field(
797802
default_factory=generate_id,
798803
description="The ID of the invocation that this row belongs to.",

eval_protocol/pytest/default_agent_rollout_processor.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
import logging
44
import os
55
import time
6+
from datetime import datetime, timezone
67
from typing import Any, AsyncIterator, List, Optional, Union, Dict
78

89
from mcp.types import CallToolResult, TextContent
@@ -249,6 +250,8 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->
249250

250251
async def process_row(row: EvaluationRow) -> EvaluationRow:
251252
"""Process a single row with agent rollout."""
253+
if row.execution_metadata.rollout_start_time is None:
254+
row.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
252255
start_time = time.perf_counter()
253256

254257
agent = Agent(

eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
import os
55
import tempfile
66
import time
7+
from datetime import datetime, timezone
78
from typing import Any, Callable, Dict, List, Optional
89

910
from pydantic import BaseModel, Field
@@ -66,7 +67,8 @@ def __call__(
6667

6768
async def process_row(row: EvaluationRow) -> EvaluationRow:
6869
"""Process a single row with complete sandbox lifecycle"""
69-
70+
if row.execution_metadata.rollout_start_time is None:
71+
row.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
7072
start_time = time.perf_counter()
7173
agent: Agent | None = None
7274
temp_config_path: str | None = None

eval_protocol/pytest/default_pydantic_ai_rollout_processor.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
from collections.abc import Callable
55
import logging
66
import time
7+
from datetime import datetime, timezone
78
from pydantic_ai.toolsets import FunctionToolset
89
from pydantic_ai.usage import UsageLimits
910
from typing_extensions import override
@@ -50,6 +51,8 @@ def __call__(self, rows: list[EvaluationRow], config: RolloutProcessorConfig) ->
5051

5152
async def process_row(row: EvaluationRow) -> EvaluationRow:
5253
"""Process a single row with agent rollout."""
54+
if row.execution_metadata.rollout_start_time is None:
55+
row.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
5356
start_time = time.perf_counter()
5457

5558
tools = []

eval_protocol/pytest/default_single_turn_rollout_process.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
import logging
44
import os
55
import time
6+
from datetime import datetime, timezone
67
from dataclasses import asdict, is_dataclass
78
from types import SimpleNamespace
89
from typing import Any, List
@@ -62,6 +63,8 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->
6263

6364
async def process_row(row: EvaluationRow) -> EvaluationRow:
6465
"""Process a single row asynchronously."""
66+
if row.execution_metadata.rollout_start_time is None:
67+
row.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
6568
start_time = time.perf_counter()
6669

6770
if len(row.messages) == 0:

eval_protocol/pytest/github_action_rollout_processor.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -67,6 +67,8 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->
6767
max_pages = (num_rows + 99) // 100 # Round up pages
6868

6969
async def _process_row(row: EvaluationRow) -> EvaluationRow:
70+
if row.execution_metadata.rollout_start_time is None:
71+
row.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
7072
start_time = time.perf_counter()
7173

7274
if row.execution_metadata.invocation_id is None:

eval_protocol/pytest/openenv_rollout_processor.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@
1515
import asyncio
1616
import logging
1717
import time
18+
from datetime import datetime, timezone
1819
from itertools import count
1920
from typing import List, Any, Dict, Callable, Generic, TypeVar, Optional, Type
2021

@@ -167,6 +168,8 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->
167168

168169
async def process_row(row: EvaluationRow) -> EvaluationRow:
169170
"""Process a single row with OpenEnv rollout."""
171+
if row.execution_metadata.rollout_start_time is None:
172+
row.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
170173
start_time = time.perf_counter()
171174

172175
logger.info("[OpenEnvRolloutProcessor] Starting rollout for row")

eval_protocol/pytest/priority_scheduler.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
import logging
44
import os
55
import time
6+
from datetime import datetime, timezone
67
from collections import defaultdict
78
from dataclasses import dataclass, field
89
from typing import Any, List, Dict, Optional, Union
@@ -293,6 +294,8 @@ async def _run_eval(rows_to_eval: Union[EvaluationRow, List[EvaluationRow]]):
293294

294295
# 3. Execute the rollout
295296
result_row: Optional[EvaluationRow] = None
297+
if row_copy.execution_metadata.rollout_start_time is None:
298+
row_copy.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
296299
start_time = time.perf_counter()
297300
try:
298301
async for result in rollout_processor_with_retry(

0 commit comments

Comments
 (0)