Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion eval_protocol/mcp/execution/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ async def _execute_with_semaphore(idx):

evaluation_row.messages = messages
evaluation_row.tools = shared_tool_schema
evaluation_row.usage = CompletionUsage(**trajectory.usage)
evaluation_row.execution_metadata.usage = CompletionUsage(**trajectory.usage)
evaluation_row.input_metadata.completion_params = {
"model": policy.model_id,
"temperature": getattr(policy, "temperature", None),
Expand Down
21 changes: 16 additions & 5 deletions eval_protocol/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,6 +462,16 @@ class EvalMetadata(BaseModel):
passed: Optional[bool] = Field(None, description="Whether the evaluation passed based on the threshold")


class CostMetrics(BaseModel):
    """Cost metrics for LLM API calls."""

    # USD cost attributed to the prompt-side (input) tokens of one call.
    input_cost_usd: Optional[float] = Field(
        default=None, description="Cost in USD for input tokens."
    )
    # USD cost attributed to the completion-side (output) tokens of one call.
    output_cost_usd: Optional[float] = Field(
        default=None, description="Cost in USD for output tokens."
    )
    # Sum of input and output costs for the call.
    total_cost_usd: Optional[float] = Field(
        default=None, description="Total cost in USD for the API call."
    )


class ExecutionMetadata(BaseModel):
"""Metadata about the execution of the evaluation."""

Expand All @@ -485,6 +495,12 @@ class ExecutionMetadata(BaseModel):
description=("The ID of the run that this row belongs to."),
)

usage: Optional[CompletionUsage] = Field(
default=None, description="Token usage statistics from LLM calls during execution."
)

cost_metrics: Optional[CostMetrics] = Field(default=None, description="Cost breakdown for LLM API calls.")


class EvaluationRow(BaseModel):
"""
Expand Down Expand Up @@ -530,11 +546,6 @@ class EvaluationRow(BaseModel):
description="Metadata about the execution of the evaluation.",
)

# LLM usage statistics
usage: Optional[CompletionUsage] = Field(
default=None, description="Token usage statistics from LLM calls during execution."
)

created_at: datetime = Field(default_factory=datetime.now, description="The timestamp when the row was created.")

eval_metadata: Optional[EvalMetadata] = Field(
Expand Down
10 changes: 8 additions & 2 deletions eval_protocol/pytest/evaluation_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@
from eval_protocol.pytest.utils import (
AggregationMethod,
aggregate,
calculate_cost_metrics_for_row,
create_dynamically_parameterized_wrapper,
deep_update_dict,
extract_effort_tag,
generate_parameter_combinations,
log_eval_status_and_rows,
Expand Down Expand Up @@ -633,7 +633,11 @@ async def _execute_eval_with_semaphore(**inner_kwargs):
processed_dataset=inner_kwargs["rows"],
evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {},
)
if results is None or not isinstance(results, list):
if (
results is None
or not isinstance(results, list)
or not all(isinstance(r, EvaluationRow) for r in results)
):
raise ValueError(
f"Test function {test_func.__name__} did not return a list of EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
)
Expand Down Expand Up @@ -724,6 +728,8 @@ async def _collect_result(config, lst):
all_results[i] = results

for r in results:
calculate_cost_metrics_for_row(r)
print(r.execution_metadata.cost_metrics)
if r.eval_metadata is not None:
if r.rollout_status.is_error():
r.eval_metadata.status = Status.error(
Expand Down
34 changes: 32 additions & 2 deletions eval_protocol/pytest/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
import os
import re
from dataclasses import replace
from typing import Any, Callable, Dict, List, Literal, Optional, Union
from typing import Any, Callable, List, Literal, Optional, Union

from litellm import cost_per_token

from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
from eval_protocol.models import EvalMetadata, EvaluationRow, Status
from eval_protocol.models import CostMetrics, EvalMetadata, EvaluationRow, Status
from eval_protocol.pytest.rollout_processor import RolloutProcessor
from eval_protocol.pytest.types import (
CompletionParams,
Expand Down Expand Up @@ -435,3 +437,31 @@ def extract_effort_tag(params: dict) -> Optional[str]:
except Exception:
return None
return None


def calculate_cost_metrics_for_row(row: EvaluationRow) -> None:
    """Compute and attach USD cost metrics to ``row`` from its token usage.

    Reads the token counts recorded in ``row.execution_metadata.usage`` and the
    model id from ``row.input_metadata.completion_params``, prices them with
    ``litellm.cost_per_token``, and stores the breakdown in
    ``row.execution_metadata.cost_metrics``. No-op when usage is absent.

    Args:
        row: The evaluation row to annotate in place.

    Returns:
        None. The row is mutated; nothing is returned.
    """
    usage = row.execution_metadata.usage
    if not usage:
        # Nothing to price without token counts.
        return

    completion_params = row.input_metadata.completion_params
    model_id = (
        completion_params.get("model", "unknown") if completion_params else "unknown"
    )

    input_tokens = usage.prompt_tokens or 0
    output_tokens = usage.completion_tokens or 0

    try:
        input_cost, output_cost = cost_per_token(
            model=model_id, prompt_tokens=input_tokens, completion_tokens=output_tokens
        )
    except Exception:
        # cost_per_token raises for models missing from litellm's pricing map
        # (including the "unknown" fallback id). Cost accounting is deliberately
        # best-effort: leave cost_metrics unset rather than aborting the run.
        return

    row.execution_metadata.cost_metrics = CostMetrics(
        input_cost_usd=input_cost,
        output_cost_usd=output_cost,
        total_cost_usd=input_cost + output_cost,
    )
Loading