Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"python.testing.pytestArgs": ["tests", "examples", "-s", "--tb=short"],
"python.testing.pytestArgs": ["-c", "pytest.ini"],
"python.testing.autoTestDiscoverOnSaveEnabled": true,
"python.defaultInterpreterPath": "./.venv/bin/python",
"python.testing.cwd": "${workspaceFolder}",
Expand Down
13 changes: 12 additions & 1 deletion eval_protocol/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,26 @@
_FIREWORKS_AVAILABLE = False
# Import submodules to make them available via eval_protocol.rewards, etc.
from . import mcp, rewards
from .models import EvaluateResult, Message, MetricResult
from .models import EvaluateResult, Message, MetricResult, EvaluationRow
from .playback_policy import PlaybackPolicyBase
from .resources import create_llm_resource
from .reward_function import RewardFunction
from .typed_interface import reward_function
from .quickstart import aha_judge, split_multi_turn_rows
from .pytest import evaluation_test, SingleTurnRolloutProcessor
from .adapters import OpenAIResponsesAdapter, LangfuseAdapter, BraintrustAdapter, LangSmithAdapter

warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")

__all__ = [
"aha_judge",
"split_multi_turn_rows",
"evaluation_test",
"SingleTurnRolloutProcessor",
"OpenAIResponsesAdapter",
"LangfuseAdapter",
"BraintrustAdapter",
"LangSmithAdapter",
# Core interfaces
"Message",
"MetricResult",
Expand Down
14 changes: 14 additions & 0 deletions eval_protocol/adapters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,17 @@
__all__.extend(["create_trl_adapter"])
except ImportError:
pass

try:
from .openai_responses import OpenAIResponsesAdapter

__all__.extend(["OpenAIResponsesAdapter"])
except ImportError:
pass

try:
from .langsmith import LangSmithAdapter

__all__.extend(["LangSmithAdapter"])
except ImportError:
pass
2 changes: 1 addition & 1 deletion eval_protocol/adapters/langfuse.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def __call__(


def convert_trace_to_evaluation_row(
trace: TraceWithFullDetails, include_tool_calls: bool = True, span_name: Optional[str] = None
trace: "TraceWithFullDetails", include_tool_calls: bool = True, span_name: Optional[str] = None
) -> Optional[EvaluationRow]:
"""Convert a Langfuse trace to EvaluationRow format.

Expand Down
20 changes: 11 additions & 9 deletions eval_protocol/quickstart/llm_judge_braintrust.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,22 @@
from eval_protocol.adapters.braintrust import create_braintrust_adapter
from eval_protocol.quickstart import aha_judge

adapter = create_braintrust_adapter()
# adapter = create_braintrust_adapter()


@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
@pytest.mark.asyncio
@evaluation_test(
input_rows=[
adapter.get_evaluation_rows(
btql_query=f"""
select: *
from: project_logs('{os.getenv("BRAINTRUST_PROJECT_ID")}') traces
filter: is_root = true
limit: 10
"""
)
# adapter.get_evaluation_rows(
# btql_query=f"""
# select: *
# from: project_logs('{os.getenv("BRAINTRUST_PROJECT_ID")}') traces
# filter: is_root = true
# limit: 10
# """
# )
[]
],
completion_params=[
{"model": "gpt-4.1"},
Expand Down
2 changes: 2 additions & 0 deletions eval_protocol/quickstart/llm_judge_langfuse.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

from datetime import datetime
import os

import pytest

Expand All @@ -17,6 +18,7 @@
adapter = create_langfuse_adapter()


@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
@pytest.mark.asyncio
@evaluation_test(
input_rows=[
Expand Down
21 changes: 13 additions & 8 deletions eval_protocol/quickstart/llm_judge_openai_responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,23 +20,28 @@

import pytest

from eval_protocol.models import EvaluationRow
from eval_protocol.pytest import evaluation_test
from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
from eval_protocol.quickstart import aha_judge, split_multi_turn_rows
from eval_protocol.adapters.openai_responses import OpenAIResponsesAdapter
from eval_protocol import (
evaluation_test,
aha_judge,
split_multi_turn_rows,
EvaluationRow,
SingleTurnRolloutProcessor,
OpenAIResponsesAdapter,
)

adapter = OpenAIResponsesAdapter()
input_rows = adapter.get_evaluation_rows(
response_ids=[
"resp_0e1b7db5d96e92470068c99506443c819e9305e92915d2405f",
"resp_05639dcaca074fbc0068c9946593b481908cac70075926d85c",
# "resp_05639dcaca074fbc0068c9946593b481908cac70075926d85c",
# "resp_0c96a910416e87aa0068c994d0b34c81a3bda0eddf22445aec",
# "resp_0efe023280e986f90068c994b85e088190bc8d8263fa603e02",
]
)


@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") # pyright: ignore[reportAttributeAccessIssue]
@pytest.mark.asyncio # pyright: ignore[reportAttributeAccessIssue]
@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
@pytest.mark.asyncio
@evaluation_test(
input_rows=[input_rows],
completion_params=[
Expand Down
9 changes: 0 additions & 9 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -152,15 +152,6 @@ langgraph_tools = [
"langchain-fireworks>=0.3.0",
]

[tool.pytest.ini_options]
addopts = "-q"
testpaths = [
"examples",
]
plugins = [
"eval_protocol.pytest.plugin",
]

[project.scripts]
fireworks-reward = "eval_protocol.cli:main"
eval-protocol = "eval_protocol.cli:main"
Expand Down
8 changes: 5 additions & 3 deletions pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@ markers =
asyncio
asyncio_mode = auto
asyncio_default_fixture_loop_scope = function
testpaths = tests
python_files = test_*.py
testpaths = tests ./eval_protocol/quickstart
python_files = test_*.py llm_judge_*.py
plugins =
eval_protocol.pytest.plugin
python_classes = Test*
python_functions = test_*
# Configure stdout/stderr capture for debugging
addopts = -s --tb=short
addopts = -s --tb=short -q
# Alternative: disable capture completely for debugging
# addopts = -s --tb=short --capture=no
filterwarnings =
Expand Down
Loading