Merge branch 'derekx/quick-start' of https://github.com/eval-protocol/python-sdk into derekx/quick-start

xzrderek · xzrderek · commit 24a6ba3ab996 · 2025-09-10T18:11:08.000-07:00
diff --git a/tests/chinook/pydantic/agent.py b/tests/chinook/pydantic/agent.py
@@ -27,7 +27,7 @@ def setup_agent(orchestrator_agent_model: Model):
     """
 
     agent = Agent(
-        system_prompt=SYSTEM_PROMPT,
+        instructions=SYSTEM_PROMPT,
         model=orchestrator_agent_model,
         instrument=True,
     )
diff --git a/tests/chinook/pydantic/test_pydantic_chinook.py b/tests/chinook/pydantic/test_pydantic_chinook.py
@@ -61,6 +61,9 @@ async def test_simple_query(row: EvaluationRow) -> EvaluationRow:
     assert hasattr(row, "tools"), "Row missing 'tools' attribute"
     assert row.tools == expected_tools, f"Tools validation failed. Expected: {expected_tools}, Got: {row.tools}"
 
+    # assert that there is a system message
+    assert row.messages[0].role == "system"
+
     last_assistant_message = row.last_assistant_message()
     if last_assistant_message is None:
         row.evaluation_result = EvaluateResult(
diff --git a/tests/chinook/pydantic/test_pydantic_complex_queries_responses.py b/tests/chinook/pydantic/test_pydantic_complex_queries_responses.py
@@ -0,0 +1,47 @@
+from collections.abc import Awaitable, Callable
+import os
+from typing_extensions import cast
+from pydantic_ai import Agent
+from pydantic_ai.models.openai import OpenAIResponsesModel, OpenAIResponsesModelSettings
+import pytest
+
+from eval_protocol.models import EvaluationRow
+from eval_protocol.pytest import evaluation_test
+from eval_protocol.pytest.types import RolloutProcessorConfig
+from tests.chinook.dataset import collect_dataset
+from tests.chinook.pydantic.agent import setup_agent
+from tests.pytest.test_pydantic_agent import PydanticAgentRolloutProcessor
+
+# IMPORTANT: import must be renamed to something without the "test_" prefix to
+# avoid pytest discovering the import as a test
+from tests.chinook.pydantic.test_pydantic_complex_queries import test_pydantic_complex_queries as eval
+
+
+def agent_factory(config: RolloutProcessorConfig) -> Agent:
+    """Build the agent via ``setup_agent`` on an ``OpenAIResponsesModel`` for ``config.completion_params["model"]``."""
+    model_name = config.completion_params["model"]
+    model = OpenAIResponsesModel(model_name)
+    return setup_agent(model)
+
+
+@pytest.mark.skipif(
+    os.environ.get("CI") == "true",
+    reason="This was only run locally to generate traces in Responses API",
+)
+@pytest.mark.asyncio
+@evaluation_test(
+    input_rows=[collect_dataset()],
+    completion_params=[{"model": "gpt-4o"}],
+    rollout_processor=PydanticAgentRolloutProcessor(agent_factory),
+)
+async def test_pydantic_complex_queries_responses(row: EvaluationRow) -> EvaluationRow:
+    """
+    Evaluation of complex queries for the Chinook database using PydanticAI,
+    configured with ``PydanticAgentRolloutProcessor(agent_factory)``.
+
+    The row is handed to the imported ``test_pydantic_complex_queries``
+    evaluation (aliased as ``eval``) and its awaited result is returned as-is.
+    Skipped when the ``CI`` environment variable equals ``"true"``.
+    """
+    delegate = cast(Callable[[EvaluationRow], Awaitable[EvaluationRow]], eval)
+    return await delegate(row)

Original file line number	Diff line number	Diff line change
`@@ -27,7 +27,7 @@ def setup_agent(orchestrator_agent_model: Model):`
`27`	`27`	`"""`
`28`	`28`
`29`	`29`	`agent = Agent(`
`30`		`- system_prompt=SYSTEM_PROMPT,`
	`30`	`+ instructions=SYSTEM_PROMPT,`
`31`	`31`	`model=orchestrator_agent_model,`
`32`	`32`	`instrument=True,`
`33`	`33`	`)`