From 6e9f7af22ec526bcb60d3bc552bb70e3576bd8ff Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Wed, 17 Sep 2025 10:48:03 -0700 Subject: [PATCH 1/5] cleanup + add more responses conversations --- eval_protocol/__init__.py | 13 ++++++++++++- eval_protocol/adapters/__init__.py | 14 ++++++++++++++ .../quickstart/llm_judge_braintrust.py | 3 ++- eval_protocol/quickstart/llm_judge_langfuse.py | 4 +++- eval_protocol/quickstart/llm_judge_langsmith.py | 4 ++-- .../quickstart/llm_judge_openai_responses.py | 17 +++++++++++------ 6 files changed, 44 insertions(+), 11 deletions(-) diff --git a/eval_protocol/__init__.py b/eval_protocol/__init__.py index f39369b2..9f873fe3 100644 --- a/eval_protocol/__init__.py +++ b/eval_protocol/__init__.py @@ -32,15 +32,26 @@ _FIREWORKS_AVAILABLE = False # Import submodules to make them available via eval_protocol.rewards, etc. from . import mcp, rewards -from .models import EvaluateResult, Message, MetricResult +from .models import EvaluateResult, Message, MetricResult, EvaluationRow from .playback_policy import PlaybackPolicyBase from .resources import create_llm_resource from .reward_function import RewardFunction from .typed_interface import reward_function +from .quickstart import aha_judge, split_multi_turn_rows +from .pytest import evaluation_test, SingleTurnRolloutProcessor +from .adapters import OpenAIResponsesAdapter, LangfuseAdapter, BraintrustAdapter, LangSmithAdapter warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol") __all__ = [ + "aha_judge", + "split_multi_turn_rows", + "evaluation_test", + "SingleTurnRolloutProcessor", + "OpenAIResponsesAdapter", + "LangfuseAdapter", + "BraintrustAdapter", + "LangSmithAdapter", # Core interfaces "Message", "MetricResult", diff --git a/eval_protocol/adapters/__init__.py b/eval_protocol/adapters/__init__.py index dd906568..d338b6c2 100644 --- a/eval_protocol/adapters/__init__.py +++ b/eval_protocol/adapters/__init__.py @@ -73,3 +73,17 @@ __all__.extend(["create_trl_adapter"]) except ImportError: pass + +try: + from .openai_responses import OpenAIResponsesAdapter + + __all__.extend(["OpenAIResponsesAdapter"]) +except ImportError: + pass + +try: + from .langsmith import LangSmithAdapter + + __all__.extend(["LangSmithAdapter"]) +except ImportError: + pass diff --git a/eval_protocol/quickstart/llm_judge_braintrust.py b/eval_protocol/quickstart/llm_judge_braintrust.py index 71b51b0a..3ff1dfba 100644 --- a/eval_protocol/quickstart/llm_judge_braintrust.py +++ b/eval_protocol/quickstart/llm_judge_braintrust.py @@ -16,7 +16,8 @@ adapter = create_braintrust_adapter() -@pytest.mark.asyncio +@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") # pyright: ignore[reportAttributeAccessIssue] +@pytest.mark.asyncio # pyright: ignore[reportAttributeAccessIssue] @evaluation_test( input_rows=[ adapter.get_evaluation_rows( diff --git a/eval_protocol/quickstart/llm_judge_langfuse.py b/eval_protocol/quickstart/llm_judge_langfuse.py index 08f72c67..1318dffa 100644 --- a/eval_protocol/quickstart/llm_judge_langfuse.py +++ b/eval_protocol/quickstart/llm_judge_langfuse.py @@ -3,6 +3,7 @@ """ from datetime import datetime +import os import pytest @@ -17,7 +18,8 @@ adapter = create_langfuse_adapter() -@pytest.mark.asyncio +@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") # pyright: ignore[reportAttributeAccessIssue] +@pytest.mark.asyncio # pyright: ignore[reportAttributeAccessIssue] @evaluation_test( input_rows=[ adapter.get_evaluation_rows( diff --git a/eval_protocol/quickstart/llm_judge_langsmith.py b/eval_protocol/quickstart/llm_judge_langsmith.py index 16a287a9..e682ee4c 100644 --- a/eval_protocol/quickstart/llm_judge_langsmith.py +++ b/eval_protocol/quickstart/llm_judge_langsmith.py @@ -56,8 +56,8 @@ def fetch_langsmith_traces_as_evaluation_rows( return [] -@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") -@pytest.mark.asyncio +@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") # pyright: ignore[reportAttributeAccessIssue] +@pytest.mark.asyncio # pyright: ignore[reportAttributeAccessIssue] @evaluation_test( input_rows=[fetch_langsmith_traces_as_evaluation_rows()], completion_params=[ diff --git a/eval_protocol/quickstart/llm_judge_openai_responses.py b/eval_protocol/quickstart/llm_judge_openai_responses.py index 056abfc3..e1aef5f2 100644 --- a/eval_protocol/quickstart/llm_judge_openai_responses.py +++ b/eval_protocol/quickstart/llm_judge_openai_responses.py @@ -20,17 +20,22 @@ import pytest -from eval_protocol.models import EvaluationRow -from eval_protocol.pytest import evaluation_test -from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor -from eval_protocol.quickstart import aha_judge, split_multi_turn_rows -from eval_protocol.adapters.openai_responses import OpenAIResponsesAdapter +from eval_protocol import ( + evaluation_test, + aha_judge, + split_multi_turn_rows, + EvaluationRow, + SingleTurnRolloutProcessor, + OpenAIResponsesAdapter, +) adapter = OpenAIResponsesAdapter() input_rows = adapter.get_evaluation_rows( response_ids=[ "resp_0e1b7db5d96e92470068c99506443c819e9305e92915d2405f", - "resp_05639dcaca074fbc0068c9946593b481908cac70075926d85c", + # "resp_05639dcaca074fbc0068c9946593b481908cac70075926d85c", + # "resp_0c96a910416e87aa0068c994d0b34c81a3bda0eddf22445aec", + # "resp_0efe023280e986f90068c994b85e088190bc8d8263fa603e02", ] ) From 247ec0a4e03093bb4277ceac3b88ab5e3935a715 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Wed, 17 Sep 2025 11:08:45 -0700 Subject: [PATCH 2/5] Refactor testing configuration and clean up project files - Removed pytest configuration from pyproject.toml. - Updated pytest.ini to include additional test paths and file patterns. - Adjusted VSCode settings to use pytest.ini for test arguments. - Minor code adjustments in langfuse.py and llm_judge_openai_responses.py for consistency and clarity. --- .vscode/settings.json | 2 +- eval_protocol/adapters/langfuse.py | 2 +- eval_protocol/quickstart/llm_judge_openai_responses.py | 4 ++-- pyproject.toml | 9 --------- pytest.ini | 8 +++++--- 5 files changed, 9 insertions(+), 16 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 13b13c52..2c4a8fff 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,7 +1,7 @@ { "python.testing.unittestEnabled": false, "python.testing.pytestEnabled": true, - "python.testing.pytestArgs": ["tests", "examples", "-s", "--tb=short"], + "python.testing.pytestArgs": ["-c", "pytest.ini"], "python.testing.autoTestDiscoverOnSaveEnabled": true, "python.defaultInterpreterPath": "./.venv/bin/python", "python.testing.cwd": "${workspaceFolder}", diff --git a/eval_protocol/adapters/langfuse.py b/eval_protocol/adapters/langfuse.py index 6d057372..44c43fe2 100644 --- a/eval_protocol/adapters/langfuse.py +++ b/eval_protocol/adapters/langfuse.py @@ -56,7 +56,7 @@ def __call__( def convert_trace_to_evaluation_row( - trace: TraceWithFullDetails, include_tool_calls: bool = True, span_name: Optional[str] = None + trace: "TraceWithFullDetails", include_tool_calls: bool = True, span_name: Optional[str] = None ) -> Optional[EvaluationRow]: """Convert a Langfuse trace to EvaluationRow format. diff --git a/eval_protocol/quickstart/llm_judge_openai_responses.py b/eval_protocol/quickstart/llm_judge_openai_responses.py index e1aef5f2..5d8cb983 100644 --- a/eval_protocol/quickstart/llm_judge_openai_responses.py +++ b/eval_protocol/quickstart/llm_judge_openai_responses.py @@ -40,8 +40,8 @@ ) -@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") # pyright: ignore[reportAttributeAccessIssue] -@pytest.mark.asyncio # pyright: ignore[reportAttributeAccessIssue] +@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") +@pytest.mark.asyncio @evaluation_test( input_rows=[input_rows], completion_params=[ diff --git a/pyproject.toml b/pyproject.toml index 581e880b..664dc857 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -152,15 +152,6 @@ langgraph_tools = [ "langchain-fireworks>=0.3.0", ] -[tool.pytest.ini_options] -addopts = "-q" -testpaths = [ - "examples", -] -plugins = [ - "eval_protocol.pytest.plugin", -] - [project.scripts] fireworks-reward = "eval_protocol.cli:main" eval-protocol = "eval_protocol.cli:main" diff --git a/pytest.ini b/pytest.ini index 785d4d7e..935d364c 100644 --- a/pytest.ini +++ b/pytest.ini @@ -3,12 +3,14 @@ markers = asyncio asyncio_mode = auto asyncio_default_fixture_loop_scope = function -testpaths = tests -python_files = test_*.py +testpaths = tests examples ./eval_protocol/quickstart +python_files = test_*.py llm_judge_*.py +plugins = + eval_protocol.pytest.plugin python_classes = Test* python_functions = test_* # Configure stdout/stderr capture for debugging -addopts = -s --tb=short +addopts = -s --tb=short -q # Alternative: disable capture completely for debugging # addopts = -s --tb=short --capture=no filterwarnings = From 0760c855e2f9b99a307895fd7ace663096a6aaa3 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Wed, 17 Sep 2025 11:44:08 -0700 Subject: [PATCH 3/5] remove the pyright decorators --- eval_protocol/quickstart/llm_judge_braintrust.py | 4 ++-- eval_protocol/quickstart/llm_judge_langfuse.py | 4 ++-- eval_protocol/quickstart/llm_judge_langsmith.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/eval_protocol/quickstart/llm_judge_braintrust.py b/eval_protocol/quickstart/llm_judge_braintrust.py index 3ff1dfba..2f9c9ad7 100644 --- a/eval_protocol/quickstart/llm_judge_braintrust.py +++ b/eval_protocol/quickstart/llm_judge_braintrust.py @@ -16,8 +16,8 @@ adapter = create_braintrust_adapter() -@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") # pyright: ignore[reportAttributeAccessIssue] -@pytest.mark.asyncio # pyright: ignore[reportAttributeAccessIssue] +@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") +@pytest.mark.asyncio @evaluation_test( input_rows=[ adapter.get_evaluation_rows( diff --git a/eval_protocol/quickstart/llm_judge_langfuse.py b/eval_protocol/quickstart/llm_judge_langfuse.py index 1318dffa..5154ac8e 100644 --- a/eval_protocol/quickstart/llm_judge_langfuse.py +++ b/eval_protocol/quickstart/llm_judge_langfuse.py @@ -18,8 +18,8 @@ adapter = create_langfuse_adapter() -@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") # pyright: ignore[reportAttributeAccessIssue] -@pytest.mark.asyncio # pyright: ignore[reportAttributeAccessIssue] +@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") +@pytest.mark.asyncio @evaluation_test( input_rows=[ adapter.get_evaluation_rows( diff --git a/eval_protocol/quickstart/llm_judge_langsmith.py b/eval_protocol/quickstart/llm_judge_langsmith.py index e682ee4c..16a287a9 100644 --- a/eval_protocol/quickstart/llm_judge_langsmith.py +++ b/eval_protocol/quickstart/llm_judge_langsmith.py @@ -56,8 +56,8 @@ def fetch_langsmith_traces_as_evaluation_rows( return [] -@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") # pyright: ignore[reportAttributeAccessIssue] -@pytest.mark.asyncio # pyright: ignore[reportAttributeAccessIssue] +@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") +@pytest.mark.asyncio @evaluation_test( input_rows=[fetch_langsmith_traces_as_evaluation_rows()], completion_params=[ From 4a3b22be7dc0bfbb9b65fd01214905ccf53b8f83 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Wed, 17 Sep 2025 11:55:52 -0700 Subject: [PATCH 4/5] remove examples --- pytest.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytest.ini b/pytest.ini index 935d364c..cd7f77df 100644 --- a/pytest.ini +++ b/pytest.ini @@ -3,7 +3,7 @@ markers = asyncio asyncio_mode = auto asyncio_default_fixture_loop_scope = function -testpaths = tests examples ./eval_protocol/quickstart +testpaths = tests ./eval_protocol/quickstart python_files = test_*.py llm_judge_*.py plugins = eval_protocol.pytest.plugin From e6cdd2c7939b12f9a0293cd86fdeb07e7a006bba Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Wed, 17 Sep 2025 12:10:36 -0700 Subject: [PATCH 5/5] comment out for now --- .../quickstart/llm_judge_braintrust.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/eval_protocol/quickstart/llm_judge_braintrust.py b/eval_protocol/quickstart/llm_judge_braintrust.py index 2f9c9ad7..b31966e1 100644 --- a/eval_protocol/quickstart/llm_judge_braintrust.py +++ b/eval_protocol/quickstart/llm_judge_braintrust.py @@ -13,21 +13,22 @@ from eval_protocol.adapters.braintrust import create_braintrust_adapter from eval_protocol.quickstart import aha_judge -adapter = create_braintrust_adapter() +# adapter = create_braintrust_adapter() @pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") @pytest.mark.asyncio @evaluation_test( input_rows=[ - adapter.get_evaluation_rows( - btql_query=f""" -select: * -from: project_logs('{os.getenv("BRAINTRUST_PROJECT_ID")}') traces -filter: is_root = true -limit: 10 -""" - ) + # adapter.get_evaluation_rows( + # btql_query=f""" + # select: * + # from: project_logs('{os.getenv("BRAINTRUST_PROJECT_ID")}') traces + # filter: is_root = true + # limit: 10 + # """ + # ) + [] ], completion_params=[ {"model": "gpt-4.1"},