Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,6 @@ jobs:
- name: Install tau2 for testing
run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main

- name: Ruff format (check)
run: uv run ruff format --check .

- name: Ruff lint
run: uv run ruff check .

Expand Down
1,005 changes: 1,000 additions & 5 deletions development/gsm8k_sample.jsonl

Large diffs are not rendered by default.

20 changes: 18 additions & 2 deletions eval_protocol/pytest/default_single_turn_rollout_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,16 @@
class SingleTurnRolloutProcessor(RolloutProcessor):
"""Single turn rollout processor for direct LLM calls."""

def __init__(self, *, drop_trailing_assistant_messages: bool = True) -> None:
    """Configure how input conversations are prepared for the model call.

    Args:
        drop_trailing_assistant_messages: When True (the default), any
            assistant messages at the tail of the input conversation are
            stripped before the request is sent. Useful for datasets that
            carry previous assistant turns where the model should respond
            to the latest user query instead.
    """
    self.drop_trailing_assistant_messages = drop_trailing_assistant_messages

def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[asyncio.Task[EvaluationRow]]:
"""Generate single turn rollout tasks and return them for external handling."""
# Do not modify global LiteLLM cache. Disable caching per-request instead.
Expand All @@ -32,7 +42,13 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
if len(row.messages) == 0:
raise ValueError("Messages is empty. Please provide a non-empty dataset")

messages_payload = [message.model_dump() for message in row.messages]
# Optionally drop trailing assistant messages for single-turn prompts
messages_for_request: List[Message] = list(row.messages)
if self.drop_trailing_assistant_messages:
while messages_for_request and messages_for_request[-1].role == "assistant":
messages_for_request.pop()

messages_payload = [message.model_dump() for message in messages_for_request]
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Empty Payload After Message Filtering

Missing validation after filtering trailing assistant messages. If all messages in row.messages are assistant messages and drop_trailing_assistant_messages=True, the messages_for_request list becomes empty, resulting in an empty messages_payload being sent to the LLM API. This will fail with an API error rather than being caught by the existing validation on line 42-43. A check should be added after the filtering loop (lines 47-49) to ensure messages_for_request is not empty before proceeding.

Fix in Cursor Fix in Web


request_params = {"messages": messages_payload, **config.completion_params}
# Ensure caching is disabled only for this request (review feedback)
Expand Down Expand Up @@ -114,7 +130,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
except Exception:
pass

messages = list(row.messages) + [
messages = list(messages_for_request) + [
Message(
role="assistant",
content=assistant_content,
Expand Down
1 change: 1 addition & 0 deletions eval_protocol/pytest/exception_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import requests
import httpx


# Default exceptions that should be retried with backoff
DEFAULT_RETRYABLE_EXCEPTIONS: Set[Type[Exception]] = {
# Standard library exceptions
Expand Down
1 change: 1 addition & 0 deletions tests/pytest/gsm8k/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
eval-protocol
66 changes: 66 additions & 0 deletions tests/pytest/gsm8k/test_pytest_math_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import re
from eval_protocol.models import EvaluateResult, EvaluationRow, MetricResult, Message
from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test
from typing import List, Dict, Any, Optional


def extract_answer_digits(ground_truth: str) -> Optional[str]:
    """Extract the first run of digits inside an ``<answer>...</answer>`` span.

    Args:
        ground_truth: Text expected to contain ``<answer>`` / ``</answer>`` tags.

    Returns:
        The first contiguous group of digits found between the tags, or
        ``None`` when the tags are absent or no digits appear — honoring the
        ``Optional[str]`` contract instead of raising ``IndexError`` (missing
        tags) or ``AttributeError`` (no digit match) on malformed input.
    """
    if not ground_truth:
        return None
    parts = ground_truth.split("<answer>")
    if len(parts) < 2:
        # No opening tag: treat as malformed rather than crash on parts[1].
        return None
    answer_string = parts[1].split("</answer>")[0]
    match = re.search(r"(\d+)", answer_string)
    return match.group(1) if match else None
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Function Fails to Handle Malformed Inputs Gracefully

The extract_answer_digits function can crash with an IndexError if <answer> tags are missing, or an AttributeError if no digits are found. Its Optional[str] return type implies it should return None for malformed inputs, rather than raising exceptions.

Fix in Cursor Fix in Web



# NOTE(review): no dataset_adapter is supplied, so the JSONL rows must already
# match EvaluationRow's fields (e.g. a "messages" list). If the dataset still
# uses "user_query"/"ground_truth_for_eval", a dataset_adapter= is required
# here as in the previous version of this test — confirm the dataset format.
@evaluation_test(
    input_dataset=["development/gsm8k_sample.jsonl"],
    completion_params=[{"temperature": 0.0, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
    max_dataset_rows=5,
    passed_threshold=0.0,
    rollout_processor=SingleTurnRolloutProcessor(),
    mode="pointwise",
    evaluation_test_kwargs=[
        {"math_reward_kwargs": {"tolerance": 0.001, "absolute_tolerance": 1e-8, "require_units": False}}
    ],
)
def test_math_dataset(row: EvaluationRow, **kwargs) -> EvaluationRow:
    """
    Evaluate a math rollout by exact match on the digits inside <answer> tags.

    Args:
        row: EvaluationRow containing the conversation messages and ground truth
        **kwargs: Additional parameters (like math_reward_kwargs)

    Returns:
        EvaluationRow with the evaluation result populated
    """
    #### Get predicted answer value
    # The model's reply is appended as the LAST message by the rollout
    # processor. Hard-coding row.messages[2] breaks when the processor drops
    # trailing assistant turns or the dataset has a different message count,
    # so index from the end instead.
    prediction = extract_answer_digits(str(row.messages[-1].content))
    gt = extract_answer_digits(str(row.ground_truth))

    #### Get score: exact digit match scores 1; missing tags get a distinct
    #### reason so failures are diagnosable.
    if prediction is None or gt is None:
        score = 0
        reason = "Missing answer tags in prediction or ground truth."

    elif gt == prediction:
        score = 1
        reason = "Model answer is correct."

    else:
        score = 0
        reason = "Model answer is not correct."

    reason += f" Prediction: {prediction}, Ground Truth: {gt}"

    evaluation_result = EvaluateResult(
        score=score,  # Required: The final evaluation score
        is_score_valid=True,  # Optional: Whether the score is valid, true by default
        reason=reason,  # Optional: The reason for the score
    )
    row.evaluation_result = evaluation_result
    return row
71 changes: 0 additions & 71 deletions tests/pytest/test_pytest_math_example.py

This file was deleted.

2 changes: 0 additions & 2 deletions tests/pytest/test_pytest_math_format_length.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,10 @@
from eval_protocol.rewards.length import count_tokens
from eval_protocol.rewards.math import math_reward
from examples.math_with_format_and_length.main import check_think_answer_format
from tests.pytest.helper.gsm8k_to_evaluation_row import gsm8k_to_evaluation_row


@evaluation_test(
input_dataset=["development/gsm8k_sample.jsonl"],
dataset_adapter=gsm8k_to_evaluation_row,
completion_params=[{"temperature": 0.0, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
max_dataset_rows=5,
passed_threshold=0.0,
Expand Down
2 changes: 0 additions & 2 deletions tests/pytest/test_pytest_word_count_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,10 @@

from eval_protocol.models import EvaluateResult, EvaluationRow, MetricResult
from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test
from tests.pytest.helper.word_count_to_evaluation_row import word_count_to_evaluation_row


@evaluation_test(
input_dataset=["development/gsm8k_sample.jsonl"],
dataset_adapter=word_count_to_evaluation_row,
completion_params=[{"temperature": 0.0, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
max_dataset_rows=5,
passed_threshold=0.3, # Reasonable threshold for word count evaluation
Expand Down
118 changes: 118 additions & 0 deletions tests/pytest/test_single_turn_rollout_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import asyncio
from types import SimpleNamespace

import pytest

from eval_protocol.models import EvaluationRow, Message
from eval_protocol.pytest import SingleTurnRolloutProcessor


class _DummyConfig:
    """Minimal stand-in for the rollout processor config used by these tests."""

    def __init__(self):
        # Params are forwarded verbatim to the (stubbed) completion call.
        params = {"model": "fake-model", "temperature": 0}
        self.completion_params = params
        # Generous limit; each test only runs a single rollout.
        self.semaphore = asyncio.Semaphore(10)


@pytest.mark.asyncio
async def test_single_turn_drops_trailing_assistant_by_default(monkeypatch):
    """By default the processor trims trailing assistant turns before sending."""
    # Patch module-level symbols in the processor module so the test does not
    # depend on real litellm types or a live model endpoint.
    import eval_protocol.pytest.default_single_turn_rollout_process as mod

    class StubChoices:
        pass

    class StubModelResponse:
        def __init__(self, text: str):
            # Emulate OpenAI-like response.choices[0].message fields.
            choice = StubChoices()
            choice.message = SimpleNamespace(content=text, tool_calls=None)
            self.choices = [choice]
            # Minimal usage payload.
            self.usage = SimpleNamespace(prompt_tokens=1, completion_tokens=1, total_tokens=2)

    # Capture the messages payload passed to the LLM call.
    seen = {}

    async def fake_acompletion(**kwargs):
        # Verify that the trailing assistant was dropped before sending.
        msgs = kwargs.get("messages", [])
        assert msgs, "Expected non-empty messages payload"
        seen["messages"] = msgs
        assert msgs[-1]["role"] != "assistant", "Trailing assistant should be dropped by default"
        return StubModelResponse(text="4")

    monkeypatch.setattr(mod, "ModelResponse", StubModelResponse, raising=True)
    monkeypatch.setattr(mod, "Choices", StubChoices, raising=True)
    monkeypatch.setattr(mod, "acompletion", fake_acompletion, raising=True)

    # Dataset row ending in an assistant message.
    row = EvaluationRow(
        messages=[
            Message(role="user", content="What is 2+2?"),
            Message(role="assistant", content="Old response"),
        ]
    )

    # Act: default processor drops the trailing assistant turn.
    tasks = SingleTurnRolloutProcessor()([row], _DummyConfig())
    out = await tasks[0]

    # Assert: request was trimmed and the new reply was appended once.
    sent = seen["messages"]
    assert len(sent) == 1
    assert sent[0]["role"] == "user"
    assert out.messages[-1].role == "assistant"
    assert out.messages[-1].content == "4"
    assert [m.role for m in out.messages] == ["user", "assistant"]


@pytest.mark.asyncio
async def test_single_turn_keeps_trailing_assistant_when_disabled(monkeypatch):
    """Opting out preserves trailing assistant turns in the request payload."""
    import eval_protocol.pytest.default_single_turn_rollout_process as mod

    class StubChoices:
        pass

    class StubModelResponse:
        def __init__(self, text: str):
            choice = StubChoices()
            choice.message = SimpleNamespace(content=text, tool_calls=None)
            self.choices = [choice]
            self.usage = SimpleNamespace(prompt_tokens=1, completion_tokens=1, total_tokens=2)

    seen = {}

    async def fake_acompletion(**kwargs):
        msgs = kwargs.get("messages", [])
        seen["messages"] = msgs
        # With opt-out, trailing assistant is preserved
        assert msgs[-1]["role"] == "assistant"
        return StubModelResponse(text="Hello again")

    # Patch the processor module's symbols to avoid real litellm dependencies.
    for attr, stub in (
        ("ModelResponse", StubModelResponse),
        ("Choices", StubChoices),
        ("acompletion", fake_acompletion),
    ):
        monkeypatch.setattr(mod, attr, stub, raising=True)

    row = EvaluationRow(
        messages=[
            Message(role="user", content="Say hi"),
            Message(role="assistant", content="Hi!"),
        ]
    )

    # Act with the drop behavior explicitly disabled.
    processor = SingleTurnRolloutProcessor(drop_trailing_assistant_messages=False)
    out = await processor([row], _DummyConfig())[0]

    # Assert: both original messages were sent, plus the new assistant reply.
    assert [m["role"] for m in seen["messages"]] == ["user", "assistant"]
    assert [m.role for m in out.messages] == ["user", "assistant", "assistant"]
    assert out.messages[-1].content == "Hello again"
Loading