
Commit 6e9f7af

Author: Dylan Huang
cleanup + add more responses conversations
1 parent bcda711 commit 6e9f7af

6 files changed: +44 additions, -11 deletions


eval_protocol/__init__.py

Lines changed: 12 additions & 1 deletion
@@ -32,15 +32,26 @@
 _FIREWORKS_AVAILABLE = False
 # Import submodules to make them available via eval_protocol.rewards, etc.
 from . import mcp, rewards
-from .models import EvaluateResult, Message, MetricResult
+from .models import EvaluateResult, Message, MetricResult, EvaluationRow
 from .playback_policy import PlaybackPolicyBase
 from .resources import create_llm_resource
 from .reward_function import RewardFunction
 from .typed_interface import reward_function
+from .quickstart import aha_judge, split_multi_turn_rows
+from .pytest import evaluation_test, SingleTurnRolloutProcessor
+from .adapters import OpenAIResponsesAdapter, LangfuseAdapter, BraintrustAdapter, LangSmithAdapter
 
 warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")
 
 __all__ = [
+    "aha_judge",
+    "split_multi_turn_rows",
+    "evaluation_test",
+    "SingleTurnRolloutProcessor",
+    "OpenAIResponsesAdapter",
+    "LangfuseAdapter",
+    "BraintrustAdapter",
+    "LangSmithAdapter",
     # Core interfaces
     "Message",
     "MetricResult",

eval_protocol/adapters/__init__.py

Lines changed: 14 additions & 0 deletions
@@ -73,3 +73,17 @@
     __all__.extend(["create_trl_adapter"])
 except ImportError:
     pass
+
+try:
+    from .openai_responses import OpenAIResponsesAdapter
+
+    __all__.extend(["OpenAIResponsesAdapter"])
+except ImportError:
+    pass
+
+try:
+    from .langsmith import LangSmithAdapter
+
+    __all__.extend(["LangSmithAdapter"])
+except ImportError:
+    pass
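Both new adapters follow the existing pattern in this module: each one is imported inside a try/except ImportError and only added to __all__ when its SDK extra is installed, so importing eval_protocol.adapters never fails just because an optional dependency is missing. A small sketch (not part of the commit) of how downstream code might probe for an optional adapter before using it:

import eval_protocol.adapters as adapters

# LangSmithAdapter is only exported when the langsmith extra imported cleanly.
if hasattr(adapters, "LangSmithAdapter"):
    from eval_protocol.adapters import LangSmithAdapter  # safe to use from here
else:
    print("langsmith extra not installed; LangSmithAdapter is unavailable")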

eval_protocol/quickstart/llm_judge_braintrust.py

Lines changed: 2 additions & 1 deletion
@@ -16,7 +16,8 @@
 adapter = create_braintrust_adapter()
 
 
-@pytest.mark.asyncio
+@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")  # pyright: ignore[reportAttributeAccessIssue]
+@pytest.mark.asyncio  # pyright: ignore[reportAttributeAccessIssue]
 @evaluation_test(
     input_rows=[
         adapter.get_evaluation_rows(
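The same CI guard is applied to the Langfuse and LangSmith quickstarts below. A self-contained sketch of the pattern, using only the standard os and pytest APIs (the test body here is hypothetical):

import os

import pytest


@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
def test_needs_live_credentials():
    # Hypothetical test body: it would call an external tracing service when
    # run locally; CI runners that set CI=true skip it instead.
    assert True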

eval_protocol/quickstart/llm_judge_langfuse.py

Lines changed: 3 additions & 1 deletion
@@ -3,6 +3,7 @@
 """
 
 from datetime import datetime
+import os
 
 import pytest
 
@@ -17,7 +18,8 @@
 adapter = create_langfuse_adapter()
 
 
-@pytest.mark.asyncio
+@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")  # pyright: ignore[reportAttributeAccessIssue]
+@pytest.mark.asyncio  # pyright: ignore[reportAttributeAccessIssue]
 @evaluation_test(
     input_rows=[
         adapter.get_evaluation_rows(

eval_protocol/quickstart/llm_judge_langsmith.py

Lines changed: 2 additions & 2 deletions
@@ -56,8 +56,8 @@ def fetch_langsmith_traces_as_evaluation_rows(
     return []
 
 
-@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
-@pytest.mark.asyncio
+@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")  # pyright: ignore[reportAttributeAccessIssue]
+@pytest.mark.asyncio  # pyright: ignore[reportAttributeAccessIssue]
 @evaluation_test(
     input_rows=[fetch_langsmith_traces_as_evaluation_rows()],
     completion_params=[

eval_protocol/quickstart/llm_judge_openai_responses.py

Lines changed: 11 additions & 6 deletions
@@ -20,17 +20,22 @@
 
 import pytest
 
-from eval_protocol.models import EvaluationRow
-from eval_protocol.pytest import evaluation_test
-from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
-from eval_protocol.quickstart import aha_judge, split_multi_turn_rows
-from eval_protocol.adapters.openai_responses import OpenAIResponsesAdapter
+from eval_protocol import (
+    evaluation_test,
+    aha_judge,
+    split_multi_turn_rows,
+    EvaluationRow,
+    SingleTurnRolloutProcessor,
+    OpenAIResponsesAdapter,
+)
 
 adapter = OpenAIResponsesAdapter()
 input_rows = adapter.get_evaluation_rows(
     response_ids=[
         "resp_0e1b7db5d96e92470068c99506443c819e9305e92915d2405f",
-        "resp_05639dcaca074fbc0068c9946593b481908cac70075926d85c",
+        # "resp_05639dcaca074fbc0068c9946593b481908cac70075926d85c",
+        # "resp_0c96a910416e87aa0068c994d0b34c81a3bda0eddf22445aec",
+        # "resp_0efe023280e986f90068c994b85e088190bc8d8263fa603e02",
     ]
 )
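Only the first response id is fetched for now; the additional conversations stay commented out. A sketch, with placeholder ids, of how more conversations would be pulled in (get_evaluation_rows is the only adapter method assumed here, and it appears in the diff above):

from eval_protocol import OpenAIResponsesAdapter

adapter = OpenAIResponsesAdapter()
# The ids below are placeholders for illustration, not real response ids.
rows = adapter.get_evaluation_rows(
    response_ids=[
        "resp_placeholder_conversation_1",
        "resp_placeholder_conversation_2",
    ]
)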
3641
