Skip to content

Commit 5c9194b

Browse files
author
Dylan Huang
authored
Responses API (part 3) (#183)
* cleanup + add more responses conversations
* Refactor testing configuration and clean up project files
  - Removed pytest configuration from pyproject.toml.
  - Updated pytest.ini to include additional test paths and file patterns.
  - Adjusted VSCode settings to use pytest.ini for test arguments.
  - Minor code adjustments in langfuse.py and llm_judge_openai_responses.py for consistency and clarity.
* remove the pyright decorators
* remove examples
* comment out for now
1 parent bcda711 commit 5c9194b

File tree

9 files changed

+59
-32
lines changed

9 files changed

+59
-32
lines changed

.vscode/settings.json

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -1,7 +1,7 @@
11
{
22
"python.testing.unittestEnabled": false,
33
"python.testing.pytestEnabled": true,
4-
"python.testing.pytestArgs": ["tests", "examples", "-s", "--tb=short"],
4+
"python.testing.pytestArgs": ["-c", "pytest.ini"],
55
"python.testing.autoTestDiscoverOnSaveEnabled": true,
66
"python.defaultInterpreterPath": "./.venv/bin/python",
77
"python.testing.cwd": "${workspaceFolder}",

eval_protocol/__init__.py

Lines changed: 12 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -32,15 +32,26 @@
3232
_FIREWORKS_AVAILABLE = False
3333
# Import submodules to make them available via eval_protocol.rewards, etc.
3434
from . import mcp, rewards
35-
from .models import EvaluateResult, Message, MetricResult
35+
from .models import EvaluateResult, Message, MetricResult, EvaluationRow
3636
from .playback_policy import PlaybackPolicyBase
3737
from .resources import create_llm_resource
3838
from .reward_function import RewardFunction
3939
from .typed_interface import reward_function
40+
from .quickstart import aha_judge, split_multi_turn_rows
41+
from .pytest import evaluation_test, SingleTurnRolloutProcessor
42+
from .adapters import OpenAIResponsesAdapter, LangfuseAdapter, BraintrustAdapter, LangSmithAdapter
4043

4144
warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")
4245

4346
__all__ = [
47+
"aha_judge",
48+
"split_multi_turn_rows",
49+
"evaluation_test",
50+
"SingleTurnRolloutProcessor",
51+
"OpenAIResponsesAdapter",
52+
"LangfuseAdapter",
53+
"BraintrustAdapter",
54+
"LangSmithAdapter",
4455
# Core interfaces
4556
"Message",
4657
"MetricResult",

eval_protocol/adapters/__init__.py

Lines changed: 14 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -73,3 +73,17 @@
7373
__all__.extend(["create_trl_adapter"])
7474
except ImportError:
7575
pass
76+
77+
try:
78+
from .openai_responses import OpenAIResponsesAdapter
79+
80+
__all__.extend(["OpenAIResponsesAdapter"])
81+
except ImportError:
82+
pass
83+
84+
try:
85+
from .langsmith import LangSmithAdapter
86+
87+
__all__.extend(["LangSmithAdapter"])
88+
except ImportError:
89+
pass

eval_protocol/adapters/langfuse.py

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -56,7 +56,7 @@ def __call__(
5656

5757

5858
def convert_trace_to_evaluation_row(
59-
trace: TraceWithFullDetails, include_tool_calls: bool = True, span_name: Optional[str] = None
59+
trace: "TraceWithFullDetails", include_tool_calls: bool = True, span_name: Optional[str] = None
6060
) -> Optional[EvaluationRow]:
6161
"""Convert a Langfuse trace to EvaluationRow format.
6262

eval_protocol/quickstart/llm_judge_braintrust.py

Lines changed: 11 additions & 9 deletions
Original file line number · Diff line number · Diff line change
@@ -13,20 +13,22 @@
1313
from eval_protocol.adapters.braintrust import create_braintrust_adapter
1414
from eval_protocol.quickstart import aha_judge
1515

16-
adapter = create_braintrust_adapter()
16+
# adapter = create_braintrust_adapter()
1717

1818

19+
@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
1920
@pytest.mark.asyncio
2021
@evaluation_test(
2122
input_rows=[
22-
adapter.get_evaluation_rows(
23-
btql_query=f"""
24-
select: *
25-
from: project_logs('{os.getenv("BRAINTRUST_PROJECT_ID")}') traces
26-
filter: is_root = true
27-
limit: 10
28-
"""
29-
)
23+
# adapter.get_evaluation_rows(
24+
# btql_query=f"""
25+
# select: *
26+
# from: project_logs('{os.getenv("BRAINTRUST_PROJECT_ID")}') traces
27+
# filter: is_root = true
28+
# limit: 10
29+
# """
30+
# )
31+
[]
3032
],
3133
completion_params=[
3234
{"model": "gpt-4.1"},

eval_protocol/quickstart/llm_judge_langfuse.py

Lines changed: 2 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -3,6 +3,7 @@
33
"""
44

55
from datetime import datetime
6+
import os
67

78
import pytest
89

@@ -17,6 +18,7 @@
1718
adapter = create_langfuse_adapter()
1819

1920

21+
@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
2022
@pytest.mark.asyncio
2123
@evaluation_test(
2224
input_rows=[

eval_protocol/quickstart/llm_judge_openai_responses.py

Lines changed: 13 additions & 8 deletions
Original file line number · Diff line number · Diff line change
@@ -20,23 +20,28 @@
2020

2121
import pytest
2222

23-
from eval_protocol.models import EvaluationRow
24-
from eval_protocol.pytest import evaluation_test
25-
from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
26-
from eval_protocol.quickstart import aha_judge, split_multi_turn_rows
27-
from eval_protocol.adapters.openai_responses import OpenAIResponsesAdapter
23+
from eval_protocol import (
24+
evaluation_test,
25+
aha_judge,
26+
split_multi_turn_rows,
27+
EvaluationRow,
28+
SingleTurnRolloutProcessor,
29+
OpenAIResponsesAdapter,
30+
)
2831

2932
adapter = OpenAIResponsesAdapter()
3033
input_rows = adapter.get_evaluation_rows(
3134
response_ids=[
3235
"resp_0e1b7db5d96e92470068c99506443c819e9305e92915d2405f",
33-
"resp_05639dcaca074fbc0068c9946593b481908cac70075926d85c",
36+
# "resp_05639dcaca074fbc0068c9946593b481908cac70075926d85c",
37+
# "resp_0c96a910416e87aa0068c994d0b34c81a3bda0eddf22445aec",
38+
# "resp_0efe023280e986f90068c994b85e088190bc8d8263fa603e02",
3439
]
3540
)
3641

3742

38-
@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") # pyright: ignore[reportAttributeAccessIssue]
39-
@pytest.mark.asyncio # pyright: ignore[reportAttributeAccessIssue]
43+
@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
44+
@pytest.mark.asyncio
4045
@evaluation_test(
4146
input_rows=[input_rows],
4247
completion_params=[

pyproject.toml

Lines changed: 0 additions & 9 deletions
Original file line number · Diff line number · Diff line change
@@ -152,15 +152,6 @@ langgraph_tools = [
152152
"langchain-fireworks>=0.3.0",
153153
]
154154

155-
[tool.pytest.ini_options]
156-
addopts = "-q"
157-
testpaths = [
158-
"examples",
159-
]
160-
plugins = [
161-
"eval_protocol.pytest.plugin",
162-
]
163-
164155
[project.scripts]
165156
fireworks-reward = "eval_protocol.cli:main"
166157
eval-protocol = "eval_protocol.cli:main"

pytest.ini

Lines changed: 5 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -3,12 +3,14 @@ markers =
33
asyncio
44
asyncio_mode = auto
55
asyncio_default_fixture_loop_scope = function
6-
testpaths = tests
7-
python_files = test_*.py
6+
testpaths = tests ./eval_protocol/quickstart
7+
python_files = test_*.py llm_judge_*.py
8+
plugins =
9+
eval_protocol.pytest.plugin
810
python_classes = Test*
911
python_functions = test_*
1012
# Configure stdout/stderr capture for debugging
11-
addopts = -s --tb=short
13+
addopts = -s --tb=short -q
1214
# Alternative: disable capture completely for debugging
1315
# addopts = -s --tb=short --capture=no
1416
filterwarnings =

0 commit comments

Comments (0)