Skip to content

Commit 5c9194b

Browse files
author
Dylan Huang
authored
Responses API (part 3) (#183)
* cleanup + add more responses conversations
* Refactor testing configuration and clean up project files
  - Removed pytest configuration from pyproject.toml.
  - Updated pytest.ini to include additional test paths and file patterns.
  - Adjusted VSCode settings to use pytest.ini for test arguments.
  - Minor code adjustments in langfuse.py and llm_judge_openai_responses.py for consistency and clarity.
* remove the pyright decorators
* remove examples
* comment out for now
1 parent bcda711 commit 5c9194b

File tree

9 files changed

+59
-32
lines changed

9 files changed

+59
-32
lines changed

.vscode/settings.json

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -1,7 +1,7 @@
11
{
22
"python.testing.unittestEnabled": false,
33
"python.testing.pytestEnabled": true,
4-
"python.testing.pytestArgs": ["tests", "examples", "-s", "--tb=short"],
4+
"python.testing.pytestArgs": ["-c", "pytest.ini"],
55
"python.testing.autoTestDiscoverOnSaveEnabled": true,
66
"python.defaultInterpreterPath": "./.venv/bin/python",
77
"python.testing.cwd": "${workspaceFolder}",

eval_protocol/__init__.py

Lines changed: 12 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -32,15 +32,26 @@
3232
_FIREWORKS_AVAILABLE = False
3333
# Import submodules to make them available via eval_protocol.rewards, etc.
3434
from . import mcp, rewards
35-
from .models import EvaluateResult, Message, MetricResult
35+
from .models import EvaluateResult, Message, MetricResult, EvaluationRow
3636
from .playback_policy import PlaybackPolicyBase
3737
from .resources import create_llm_resource
3838
from .reward_function import RewardFunction
3939
from .typed_interface import reward_function
40+
from .quickstart import aha_judge, split_multi_turn_rows
41+
from .pytest import evaluation_test, SingleTurnRolloutProcessor
42+
from .adapters import OpenAIResponsesAdapter, LangfuseAdapter, BraintrustAdapter, LangSmithAdapter
4043

4144
warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")
4245

4346
__all__ = [
47+
"aha_judge",
48+
"split_multi_turn_rows",
49+
"evaluation_test",
50+
"SingleTurnRolloutProcessor",
51+
"OpenAIResponsesAdapter",
52+
"LangfuseAdapter",
53+
"BraintrustAdapter",
54+
"LangSmithAdapter",
4455
# Core interfaces
4556
"Message",
4657
"MetricResult",

eval_protocol/adapters/__init__.py

Lines changed: 14 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -73,3 +73,17 @@
7373
__all__.extend(["create_trl_adapter"])
7474
except ImportError:
7575
pass
76+
77+
try:
78+
from .openai_responses import OpenAIResponsesAdapter
79+
80+
__all__.extend(["OpenAIResponsesAdapter"])
81+
except ImportError:
82+
pass
83+
84+
try:
85+
from .langsmith import LangSmithAdapter
86+
87+
__all__.extend(["LangSmithAdapter"])
88+
except ImportError:
89+
pass

eval_protocol/adapters/langfuse.py

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -56,7 +56,7 @@ def __call__(
5656

5757

5858
def convert_trace_to_evaluation_row(
59-
trace: TraceWithFullDetails, include_tool_calls: bool = True, span_name: Optional[str] = None
59+
trace: "TraceWithFullDetails", include_tool_calls: bool = True, span_name: Optional[str] = None
6060
) -> Optional[EvaluationRow]:
6161
"""Convert a Langfuse trace to EvaluationRow format.
6262

eval_protocol/quickstart/llm_judge_braintrust.py

Lines changed: 11 additions & 9 deletions
Original file line number · Diff line number · Diff line change
@@ -13,20 +13,22 @@
1313
from eval_protocol.adapters.braintrust import create_braintrust_adapter
1414
from eval_protocol.quickstart import aha_judge
1515

16-
adapter = create_braintrust_adapter()
16+
# adapter = create_braintrust_adapter()
1717

1818

19+
@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
1920
@pytest.mark.asyncio
2021
@evaluation_test(
2122
input_rows=[
22-
adapter.get_evaluation_rows(
23-
btql_query=f"""
24-
select: *
25-
from: project_logs('{os.getenv("BRAINTRUST_PROJECT_ID")}') traces
26-
filter: is_root = true
27-
limit: 10
28-
"""
29-
)
23+
# adapter.get_evaluation_rows(
24+
# btql_query=f"""
25+
# select: *
26+
# from: project_logs('{os.getenv("BRAINTRUST_PROJECT_ID")}') traces
27+
# filter: is_root = true
28+
# limit: 10
29+
# """
30+
# )
31+
[]
3032
],
3133
completion_params=[
3234
{"model": "gpt-4.1"},

eval_protocol/quickstart/llm_judge_langfuse.py

Lines changed: 2 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -3,6 +3,7 @@
33
"""
44

55
from datetime import datetime
6+
import os
67

78
import pytest
89

@@ -17,6 +18,7 @@
1718
adapter = create_langfuse_adapter()
1819

1920

21+
@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
2022
@pytest.mark.asyncio
2123
@evaluation_test(
2224
input_rows=[

eval_protocol/quickstart/llm_judge_openai_responses.py

Lines changed: 13 additions & 8 deletions
Original file line number · Diff line number · Diff line change
@@ -20,23 +20,28 @@
2020

2121
import pytest
2222

23-
from eval_protocol.models import EvaluationRow
24-
from eval_protocol.pytest import evaluation_test
25-
from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor
26-
from eval_protocol.quickstart import aha_judge, split_multi_turn_rows
27-
from eval_protocol.adapters.openai_responses import OpenAIResponsesAdapter
23+
from eval_protocol import (
24+
evaluation_test,
25+
aha_judge,
26+
split_multi_turn_rows,
27+
EvaluationRow,
28+
SingleTurnRolloutProcessor,
29+
OpenAIResponsesAdapter,
30+
)
2831

2932
adapter = OpenAIResponsesAdapter()
3033
input_rows = adapter.get_evaluation_rows(
3134
response_ids=[
3235
"resp_0e1b7db5d96e92470068c99506443c819e9305e92915d2405f",
33-
"resp_05639dcaca074fbc0068c9946593b481908cac70075926d85c",
36+
# "resp_05639dcaca074fbc0068c9946593b481908cac70075926d85c",
37+
# "resp_0c96a910416e87aa0068c994d0b34c81a3bda0eddf22445aec",
38+
# "resp_0efe023280e986f90068c994b85e088190bc8d8263fa603e02",
3439
]
3540
)
3641

3742

38-
@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") # pyright: ignore[reportAttributeAccessIssue]
39-
@pytest.mark.asyncio # pyright: ignore[reportAttributeAccessIssue]
43+
@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
44+
@pytest.mark.asyncio
4045
@evaluation_test(
4146
input_rows=[input_rows],
4247
completion_params=[

pyproject.toml

Lines changed: 0 additions & 9 deletions
Original file line number · Diff line number · Diff line change
@@ -152,15 +152,6 @@ langgraph_tools = [
152152
"langchain-fireworks>=0.3.0",
153153
]
154154

155-
[tool.pytest.ini_options]
156-
addopts = "-q"
157-
testpaths = [
158-
"examples",
159-
]
160-
plugins = [
161-
"eval_protocol.pytest.plugin",
162-
]
163-
164155
[project.scripts]
165156
fireworks-reward = "eval_protocol.cli:main"
166157
eval-protocol = "eval_protocol.cli:main"

pytest.ini

Lines changed: 5 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -3,12 +3,14 @@ markers =
33
asyncio
44
asyncio_mode = auto
55
asyncio_default_fixture_loop_scope = function
6-
testpaths = tests
7-
python_files = test_*.py
6+
testpaths = tests ./eval_protocol/quickstart
7+
python_files = test_*.py llm_judge_*.py
8+
plugins =
9+
eval_protocol.pytest.plugin
810
python_classes = Test*
911
python_functions = test_*
1012
# Configure stdout/stderr capture for debugging
11-
addopts = -s --tb=short
13+
addopts = -s --tb=short -q
1214
# Alternative: disable capture completely for debugging
1315
# addopts = -s --tb=short --capture=no
1416
filterwarnings =

0 commit comments

Comments (0)