diff --git a/.vscode/settings.json b/.vscode/settings.json index 13b13c52..2c4a8fff 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,7 +1,7 @@ { "python.testing.unittestEnabled": false, "python.testing.pytestEnabled": true, - "python.testing.pytestArgs": ["tests", "examples", "-s", "--tb=short"], + "python.testing.pytestArgs": ["-c", "pytest.ini"], "python.testing.autoTestDiscoverOnSaveEnabled": true, "python.defaultInterpreterPath": "./.venv/bin/python", "python.testing.cwd": "${workspaceFolder}", diff --git a/eval_protocol/__init__.py b/eval_protocol/__init__.py index f39369b2..9f873fe3 100644 --- a/eval_protocol/__init__.py +++ b/eval_protocol/__init__.py @@ -32,15 +32,26 @@ _FIREWORKS_AVAILABLE = False # Import submodules to make them available via eval_protocol.rewards, etc. from . import mcp, rewards -from .models import EvaluateResult, Message, MetricResult +from .models import EvaluateResult, Message, MetricResult, EvaluationRow from .playback_policy import PlaybackPolicyBase from .resources import create_llm_resource from .reward_function import RewardFunction from .typed_interface import reward_function +from .quickstart import aha_judge, split_multi_turn_rows +from .pytest import evaluation_test, SingleTurnRolloutProcessor +from .adapters import OpenAIResponsesAdapter, LangfuseAdapter, BraintrustAdapter, LangSmithAdapter warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol") __all__ = [ + "aha_judge", + "split_multi_turn_rows", + "evaluation_test", + "SingleTurnRolloutProcessor", + "OpenAIResponsesAdapter", + "LangfuseAdapter", + "BraintrustAdapter", + "LangSmithAdapter", # Core interfaces "Message", "MetricResult", diff --git a/eval_protocol/adapters/__init__.py b/eval_protocol/adapters/__init__.py index dd906568..d338b6c2 100644 --- a/eval_protocol/adapters/__init__.py +++ b/eval_protocol/adapters/__init__.py @@ -73,3 +73,17 @@ __all__.extend(["create_trl_adapter"]) except ImportError: pass + +try: + from .openai_responses import OpenAIResponsesAdapter + + __all__.extend(["OpenAIResponsesAdapter"]) +except ImportError: + pass + +try: + from .langsmith import LangSmithAdapter + + __all__.extend(["LangSmithAdapter"]) +except ImportError: + pass diff --git a/eval_protocol/adapters/langfuse.py b/eval_protocol/adapters/langfuse.py index 6d057372..44c43fe2 100644 --- a/eval_protocol/adapters/langfuse.py +++ b/eval_protocol/adapters/langfuse.py @@ -56,7 +56,7 @@ def __call__( def convert_trace_to_evaluation_row( - trace: TraceWithFullDetails, include_tool_calls: bool = True, span_name: Optional[str] = None + trace: "TraceWithFullDetails", include_tool_calls: bool = True, span_name: Optional[str] = None ) -> Optional[EvaluationRow]: """Convert a Langfuse trace to EvaluationRow format. diff --git a/eval_protocol/quickstart/llm_judge_braintrust.py b/eval_protocol/quickstart/llm_judge_braintrust.py index 71b51b0a..b31966e1 100644 --- a/eval_protocol/quickstart/llm_judge_braintrust.py +++ b/eval_protocol/quickstart/llm_judge_braintrust.py @@ -13,20 +13,22 @@ from eval_protocol.adapters.braintrust import create_braintrust_adapter from eval_protocol.quickstart import aha_judge -adapter = create_braintrust_adapter() +# adapter = create_braintrust_adapter() +@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") @pytest.mark.asyncio @evaluation_test( input_rows=[ - adapter.get_evaluation_rows( - btql_query=f""" -select: * -from: project_logs('{os.getenv("BRAINTRUST_PROJECT_ID")}') traces -filter: is_root = true -limit: 10 -""" - ) + # adapter.get_evaluation_rows( + # btql_query=f""" + # select: * + # from: project_logs('{os.getenv("BRAINTRUST_PROJECT_ID")}') traces + # filter: is_root = true + # limit: 10 + # """ + # ) + [] ], completion_params=[ {"model": "gpt-4.1"}, diff --git a/eval_protocol/quickstart/llm_judge_langfuse.py b/eval_protocol/quickstart/llm_judge_langfuse.py index 08f72c67..5154ac8e 100644 --- a/eval_protocol/quickstart/llm_judge_langfuse.py +++ b/eval_protocol/quickstart/llm_judge_langfuse.py @@ -3,6 +3,7 @@ """ from datetime import datetime +import os import pytest @@ -17,6 +18,7 @@ adapter = create_langfuse_adapter() +@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") @pytest.mark.asyncio @evaluation_test( input_rows=[ diff --git a/eval_protocol/quickstart/llm_judge_openai_responses.py b/eval_protocol/quickstart/llm_judge_openai_responses.py index 056abfc3..5d8cb983 100644 --- a/eval_protocol/quickstart/llm_judge_openai_responses.py +++ b/eval_protocol/quickstart/llm_judge_openai_responses.py @@ -20,23 +20,28 @@ import pytest -from eval_protocol.models import EvaluationRow -from eval_protocol.pytest import evaluation_test -from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor -from eval_protocol.quickstart import aha_judge, split_multi_turn_rows -from eval_protocol.adapters.openai_responses import OpenAIResponsesAdapter +from eval_protocol import ( + evaluation_test, + aha_judge, + split_multi_turn_rows, + EvaluationRow, + SingleTurnRolloutProcessor, + OpenAIResponsesAdapter, +) adapter = OpenAIResponsesAdapter() input_rows = adapter.get_evaluation_rows( response_ids=[ "resp_0e1b7db5d96e92470068c99506443c819e9305e92915d2405f", - "resp_05639dcaca074fbc0068c9946593b481908cac70075926d85c", + # "resp_05639dcaca074fbc0068c9946593b481908cac70075926d85c", + # "resp_0c96a910416e87aa0068c994d0b34c81a3bda0eddf22445aec", + # "resp_0efe023280e986f90068c994b85e088190bc8d8263fa603e02", ] ) -@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") # pyright: ignore[reportAttributeAccessIssue] -@pytest.mark.asyncio # pyright: ignore[reportAttributeAccessIssue] +@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") +@pytest.mark.asyncio @evaluation_test( input_rows=[input_rows], completion_params=[ diff --git a/pyproject.toml b/pyproject.toml index 581e880b..664dc857 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -152,15 +152,6 @@ langgraph_tools = [ "langchain-fireworks>=0.3.0", ] -[tool.pytest.ini_options] -addopts = "-q" -testpaths = [ - "examples", -] -plugins = [ - "eval_protocol.pytest.plugin", -] - [project.scripts] fireworks-reward = "eval_protocol.cli:main" eval-protocol = "eval_protocol.cli:main" diff --git a/pytest.ini b/pytest.ini index 785d4d7e..cd7f77df 100644 --- a/pytest.ini +++ b/pytest.ini @@ -3,12 +3,14 @@ markers = asyncio asyncio_mode = auto asyncio_default_fixture_loop_scope = function -testpaths = tests -python_files = test_*.py +testpaths = tests ./eval_protocol/quickstart +python_files = test_*.py llm_judge_*.py +plugins = + eval_protocol.pytest.plugin python_classes = Test* python_functions = test_* # Configure stdout/stderr capture for debugging -addopts = -s --tb=short +addopts = -s --tb=short -q # Alternative: disable capture completely for debugging # addopts = -s --tb=short --capture=no filterwarnings =