From 2d8944f8a8627ab2a1f7305eea1b91f77d3ee4b8 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Wed, 27 Aug 2025 14:58:13 -0700 Subject: [PATCH 1/7] save --- tests/pytest/test_pydantic_multi_agent.py | 26 +++++++++++++++++------ 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/tests/pytest/test_pydantic_multi_agent.py b/tests/pytest/test_pydantic_multi_agent.py index 56338a63..d3306abd 100644 --- a/tests/pytest/test_pydantic_multi_agent.py +++ b/tests/pytest/test_pydantic_multi_agent.py @@ -49,14 +49,26 @@ async def joke_factory(ctx: RunContext[None], count: int) -> list[str]: @evaluation_test( input_messages=[Message(role="user", content="Tell me a joke.")], completion_params=[ + # single agent { - "model": { - "joke_generation_model": { - "model": "accounts/fireworks/models/kimi-k2-instruct", - "provider": "fireworks", - }, - "joke_selection_model": {"model": "accounts/fireworks/models/deepseek-v3p1", "provider": "fireworks"}, - } + "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct", + }, + # multi-agent + { + "joke_generation_model": { + "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct", + }, + "joke_selection_model": { + "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", + }, + }, + { + "joke_generation_model": { + "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct", + }, + "joke_selection_model": { + "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct", + }, }, ], rollout_processor=PydanticAgentRolloutProcessor(), From f17169edf8031e215439c128edab812db5cd815a Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Wed, 27 Aug 2025 16:33:52 -0700 Subject: [PATCH 2/7] TODO: refactor rolloutprocessor to not use __call__ --- .../default_pydantic_ai_rollout_processor.py | 65 ++++++++++++++----- eval_protocol/pytest/evaluation_test.py | 5 -- tests/pytest/test_pydantic_multi_agent.py | 23 ++----- 3 files changed, 52 insertions(+), 41 deletions(-) diff --git 
a/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py b/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py index b134199d..d9773ca4 100644 --- a/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +++ b/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py @@ -1,9 +1,8 @@ import asyncio import logging import types -from typing import List +from typing import List, Literal -from attr import dataclass from openai.types.chat.chat_completion_assistant_message_param import ChatCompletionAssistantMessageParam from eval_protocol.models import EvaluationRow, Message @@ -18,6 +17,7 @@ from pydantic_ai.messages import ModelMessage from pydantic_ai._utils import generate_tool_call_id from pydantic_ai import Agent +from pydantic_ai.usage import UsageLimits from pydantic_ai.messages import ( ModelRequest, SystemPromptPart, @@ -25,7 +25,7 @@ UserPromptPart, ) from pydantic_ai.providers.openai import OpenAIProvider -from typing_extensions import TypedDict +from typing_extensions import Callable logger = logging.getLogger(__name__) @@ -34,9 +34,33 @@ class PydanticAgentRolloutProcessor(RolloutProcessor): """Rollout processor for Pydantic AI agents. 
Mainly converts EvaluationRow.messages to and from Pydantic AI ModelMessage format.""" - def __init__(self): + def __init__(self, setup_agent: Callable[..., Agent], usage_limits: UsageLimits = None): # dummy model used for its helper functions for processing messages self.util = OpenAIModel("dummy-model", provider=OpenAIProvider(api_key="dummy")) + self.setup_agent = setup_agent + self.usage_limits = usage_limits + + def _map_litellm_to_pydantic_ai( + self, model_name: str + ) -> Literal[ + "openai", + "deepseek", + "azure", + "openrouter", + "grok", + "fireworks", + "together", + ]: + mapping = { + "fireworks_ai": "fireworks", + "together_ai": "together", + "xai": "grok", + "azure_ai": "azure", + } + provider = model_name.split("/")[0] + if provider in mapping: + provider = mapping[provider] + return provider # type: ignore def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[asyncio.Task[EvaluationRow]]: """Create agent rollout tasks and return them for external handling.""" @@ -60,20 +84,28 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> raise ValueError( "completion_params['model'] must be a dict mapping agent argument names to model config dicts (with 'model' and 'provider' keys)" ) - kwargs = {} - for k, v in config.completion_params["model"].items(): - if v["model"] and v["model"].startswith("anthropic:"): - kwargs[k] = AnthropicModel( - v["model"].removeprefix("anthropic:"), + kwargs: dict[str, OpenAIModel | GoogleModel | AnthropicModel] = {} + for agent, model_config in config.completion_params["model"].items(): + if "model" not in model_config: + raise ValueError(f"model_config for agent {agent} must contain a 'model' key") + model_name = model_config["model"] + if model_name.startswith("anthropic/"): + kwargs[agent] = AnthropicModel( + model_name.removeprefix("anthropic/"), + ) + elif model_name.startswith("google/"): + kwargs[agent] = GoogleModel( + model_name.removeprefix("google/"), 
) - elif v["model"] and v["model"].startswith("google:"): - kwargs[k] = GoogleModel( - v["model"].removeprefix("google:"), + elif model_name.startswith("gemini/"): + kwargs[agent] = GoogleModel( + model_name.removeprefix("gemini/"), ) else: - kwargs[k] = OpenAIModel( - v["model"], - provider=v["provider"], + provider = self._map_litellm_to_pydantic_ai(model_name) + kwargs[agent] = OpenAIModel( + model_name.removeprefix(f"{provider}/"), + provider=provider, ) agent = setup_agent(**kwargs) model = None @@ -144,5 +176,4 @@ def convert_ep_message_to_pyd_message(self, message: Message, row: EvaluationRow ) ] ) - else: - raise ValueError(f"Unknown role: {message.role}") + raise ValueError(f"Unknown role: {message.role}") diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 47d98eb6..fa2c607a 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -544,11 +544,6 @@ def _log_eval_error(status: Status, rows: Optional[List[EvaluationRow]] | None, row.input_metadata.row_id = generate_id(seed=0, index=index) completion_params = kwargs["completion_params"] - if completion_params and ("model" not in completion_params or not completion_params["model"]): - raise ValueError( - "No model provided. Please provide a model in the completion parameters object." 
- ) - # Create eval metadata with test function info and current commit hash eval_metadata = EvalMetadata( name=test_func.__name__, diff --git a/tests/pytest/test_pydantic_multi_agent.py b/tests/pytest/test_pydantic_multi_agent.py index d3306abd..91158853 100644 --- a/tests/pytest/test_pydantic_multi_agent.py +++ b/tests/pytest/test_pydantic_multi_agent.py @@ -47,12 +47,8 @@ async def joke_factory(ctx: RunContext[None], count: int) -> list[str]: @pytest.mark.asyncio @evaluation_test( - input_messages=[Message(role="user", content="Tell me a joke.")], + input_messages=[[Message(role="user", content="Tell me a joke.")]], completion_params=[ - # single agent - { - "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct", - }, # multi-agent { "joke_generation_model": { @@ -62,21 +58,10 @@ async def joke_factory(ctx: RunContext[None], count: int) -> list[str]: "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", }, }, - { - "joke_generation_model": { - "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct", - }, - "joke_selection_model": { - "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct", - }, - }, ], - rollout_processor=PydanticAgentRolloutProcessor(), - rollout_processor_kwargs={ - "agent": setup_agent, - # PydanticAgentRolloutProcessor will pass usage_limits into the "run" call - "usage_limits": UsageLimits(request_limit=5, total_tokens_limit=1000), - }, + rollout_processor=PydanticAgentRolloutProcessor.__init__( + setup_agent, UsageLimits(request_limit=5, total_tokens_limit=1000) + ), mode="pointwise", ) async def test_pydantic_multi_agent(row: EvaluationRow) -> EvaluationRow: From 944998df65bef43d1f1c694b2c342407ca180e15 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Wed, 27 Aug 2025 18:30:27 -0700 Subject: [PATCH 3/7] save --- .vscode/settings.json | 3 +- .../default_pydantic_ai_rollout_processor.py | 72 ++++++++----------- pyproject.toml | 1 + tests/chinook/test_pydantic_chinook.py | 6 +- 
tests/pytest/test_pydantic_agent.py | 5 +- tests/pytest/test_pydantic_multi_agent.py | 2 +- uv.lock | 28 ++++++++ 7 files changed, 64 insertions(+), 53 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 6ec04673..cf61abce 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -5,5 +5,6 @@ "python.testing.autoTestDiscoverOnSaveEnabled": true, "python.defaultInterpreterPath": "./.venv/bin/python", "python.testing.cwd": "${workspaceFolder}", - "editor.defaultFormatter": "ms-python.black-formatter" + "editor.defaultFormatter": "ms-python.black-formatter", + "cursorpyright.analysis.typeCheckingMode": "recommended" } diff --git a/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py b/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py index d9773ca4..c5b58c9a 100644 --- a/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +++ b/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py @@ -25,7 +25,7 @@ UserPromptPart, ) from pydantic_ai.providers.openai import OpenAIProvider -from typing_extensions import Callable +from typing import Callable, Union logger = logging.getLogger(__name__) @@ -34,7 +34,7 @@ class PydanticAgentRolloutProcessor(RolloutProcessor): """Rollout processor for Pydantic AI agents. 
Mainly converts EvaluationRow.messages to and from Pydantic AI ModelMessage format.""" - def __init__(self, setup_agent: Callable[..., Agent], usage_limits: UsageLimits = None): + def __init__(self, setup_agent: Union[Callable[..., Agent], Agent], usage_limits: UsageLimits = None): # dummy model used for its helper functions for processing messages self.util = OpenAIModel("dummy-model", provider=OpenAIProvider(api_key="dummy")) self.setup_agent = setup_agent @@ -58,9 +58,29 @@ def _map_litellm_to_pydantic_ai( "azure_ai": "azure", } provider = model_name.split("/")[0] + model_name = model_name.removeprefix(f"{provider}/") if provider in mapping: provider = mapping[provider] - return provider # type: ignore + return provider, model_name + + def _map_litellm_to_pydantic_ai_model(self, model_name: str) -> Union[OpenAIModel, GoogleModel, AnthropicModel]: + if model_name.startswith("anthropic/"): + return AnthropicModel( + model_name.removeprefix("anthropic/"), + ) + elif model_name.startswith("google/"): + return GoogleModel( + model_name.removeprefix("google/"), + ) + elif model_name.startswith("gemini/"): + return GoogleModel( + model_name.removeprefix("gemini/"), + ) + provider, model_name = self._map_litellm_to_pydantic_ai(model_name) + return OpenAIModel( + model_name, + provider=provider, + ) def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[asyncio.Task[EvaluationRow]]: """Create agent rollout tasks and return them for external handling.""" @@ -68,53 +88,17 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> max_concurrent = getattr(config, "max_concurrent_rollouts", 8) or 8 semaphore = asyncio.Semaphore(max_concurrent) - # validate that the "agent" field is present with a valid Pydantic AI Agent instance in the completion_params dict - if "agent" not in config.kwargs: - raise ValueError("kwargs must contain an 'agent' field with a valid Pydantic AI Agent instance") - if not 
isinstance(config.kwargs["agent"], Agent) and not isinstance( - config.kwargs["agent"], types.FunctionType - ): - raise ValueError( - "kwargs['agent'] must be a valid Pydantic AI Agent instance or a function that returns an Agent" - ) - - if isinstance(config.kwargs["agent"], types.FunctionType): - setup_agent = config.kwargs["agent"] - if not isinstance(config.completion_params["model"], dict): - raise ValueError( - "completion_params['model'] must be a dict mapping agent argument names to model config dicts (with 'model' and 'provider' keys)" - ) + if isinstance(self.setup_agent, types.FunctionType): kwargs: dict[str, OpenAIModel | GoogleModel | AnthropicModel] = {} for agent, model_config in config.completion_params["model"].items(): if "model" not in model_config: raise ValueError(f"model_config for agent {agent} must contain a 'model' key") - model_name = model_config["model"] - if model_name.startswith("anthropic/"): - kwargs[agent] = AnthropicModel( - model_name.removeprefix("anthropic/"), - ) - elif model_name.startswith("google/"): - kwargs[agent] = GoogleModel( - model_name.removeprefix("google/"), - ) - elif model_name.startswith("gemini/"): - kwargs[agent] = GoogleModel( - model_name.removeprefix("gemini/"), - ) - else: - provider = self._map_litellm_to_pydantic_ai(model_name) - kwargs[agent] = OpenAIModel( - model_name.removeprefix(f"{provider}/"), - provider=provider, - ) - agent = setup_agent(**kwargs) + kwargs[agent] = self._map_litellm_to_pydantic_ai_model(model_config["model"]) + agent = self.setup_agent(**kwargs) model = None else: - agent = config.kwargs["agent"] - model = OpenAIModel( - config.completion_params["model"], - provider=config.completion_params["provider"], - ) + agent = self.setup_agent + model = self._map_litellm_to_pydantic_ai_model(config.completion_params["model"]) async def process_row(row: EvaluationRow) -> EvaluationRow: """Process a single row with agent rollout.""" diff --git a/pyproject.toml b/pyproject.toml index 
d660bb03..51e8b85c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -174,6 +174,7 @@ tau2 = { git = "https://github.com/sierra-research/tau2-bench.git" } [dependency-groups] dev = [ + "basedpyright>=1.31.3", "fastapi[standard]>=0.116.1", "fastmcp>=2.10.6", "haikus==0.3.8", diff --git a/tests/chinook/test_pydantic_chinook.py b/tests/chinook/test_pydantic_chinook.py index 9d4c128c..32a953a9 100644 --- a/tests/chinook/test_pydantic_chinook.py +++ b/tests/chinook/test_pydantic_chinook.py @@ -32,8 +32,7 @@ } }, ], - rollout_processor=PydanticAgentRolloutProcessor(), - rollout_processor_kwargs={"agent": setup_agent}, + rollout_processor=PydanticAgentRolloutProcessor(setup_agent), mode="pointwise", ) async def test_simple_query(row: EvaluationRow) -> EvaluationRow: @@ -96,8 +95,7 @@ class Response(BaseModel): } }, ], - rollout_processor=PydanticAgentRolloutProcessor(), - rollout_processor_kwargs={"agent": setup_agent}, + rollout_processor=PydanticAgentRolloutProcessor(setup_agent), mode="pointwise", ) async def test_complex_queries(row: EvaluationRow) -> EvaluationRow: diff --git a/tests/pytest/test_pydantic_agent.py b/tests/pytest/test_pydantic_agent.py index bea37b6e..10d7eb5c 100644 --- a/tests/pytest/test_pydantic_agent.py +++ b/tests/pytest/test_pydantic_agent.py @@ -14,10 +14,9 @@ @evaluation_test( input_messages=[Message(role="user", content="Hello, how are you?")], completion_params=[ - {"model": "accounts/fireworks/models/gpt-oss-120b", "provider": "fireworks"}, + {"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}, ], - rollout_processor=PydanticAgentRolloutProcessor(), - rollout_processor_kwargs={"agent": agent}, + rollout_processor=PydanticAgentRolloutProcessor(agent), mode="pointwise", ) async def test_pydantic_agent(row: EvaluationRow) -> EvaluationRow: diff --git a/tests/pytest/test_pydantic_multi_agent.py b/tests/pytest/test_pydantic_multi_agent.py index 91158853..8715baee 100644 --- a/tests/pytest/test_pydantic_multi_agent.py +++ 
b/tests/pytest/test_pydantic_multi_agent.py @@ -59,7 +59,7 @@ async def joke_factory(ctx: RunContext[None], count: int) -> list[str]: }, }, ], - rollout_processor=PydanticAgentRolloutProcessor.__init__( + rollout_processor=PydanticAgentRolloutProcessor( setup_agent, UsageLimits(request_limit=5, total_tokens_limit=1000) ), mode="pointwise", diff --git a/uv.lock b/uv.lock index cfaf01ef..77b473d1 100644 --- a/uv.lock +++ b/uv.lock @@ -383,6 +383,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b9/fa/123043af240e49752f1c4bd24da5053b6bd00cad78c2be53c0d1e8b975bc/backports.tarfile-1.2.0-py3-none-any.whl", hash = "sha256:77e284d754527b01fb1e6fa8a1afe577858ebe4e9dad8919e34c862cb399bc34", size = 30181, upload-time = "2024-05-28T17:01:53.112Z" }, ] +[[package]] +name = "basedpyright" +version = "1.31.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nodejs-wheel-binaries" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/64/3e/e5cd03d33a6ddd341427a0fe2fb27944ae11973069a8b880dad99102361b/basedpyright-1.31.3.tar.gz", hash = "sha256:c77bff2dc7df4fe09c0ee198589d8d24faaf8bfd883ee9e0af770b1a275a58f8", size = 22481852, upload-time = "2025-08-20T15:08:25.131Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/e5/edf168b8dd936bb82a97ebb76e7295c94a4f9d1c2e8e8a04696ef2b3a524/basedpyright-1.31.3-py3-none-any.whl", hash = "sha256:bdb0b5a9abe287a023d330fc71eaed181aaffd48f1dec59567f912cf716f38ff", size = 11722347, upload-time = "2025-08-20T15:08:20.528Z" }, +] + [[package]] name = "beautifulsoup4" version = "4.13.4" @@ -1262,6 +1274,7 @@ trl = [ [package.dev-dependencies] dev = [ + { name = "basedpyright" }, { name = "fastapi", extra = ["standard"] }, { name = "fastmcp" }, { name = "haikus" }, @@ -1353,6 +1366,7 @@ provides-extras = ["dev", "trl", "openevals", "fireworks", "box2d", "langfuse", [package.metadata.requires-dev] dev = [ + { name = "basedpyright", specifier = ">=1.31.3" }, { name = "fastapi", 
extras = ["standard"], specifier = ">=0.116.1" }, { name = "fastmcp", specifier = ">=2.10.6" }, { name = "haikus", specifier = "==0.3.8" }, @@ -3494,6 +3508,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314, upload-time = "2024-06-04T18:44:08.352Z" }, ] +[[package]] +name = "nodejs-wheel-binaries" +version = "22.18.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/6d/773e09de4a052cc75c129c3766a3cf77c36bff8504a38693b735f4a1eb55/nodejs_wheel_binaries-22.18.0-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:53b04495857755c5d5658f7ac969d84f25898fe0b0c1bdc41172e5e0ac6105ca", size = 50873051, upload-time = "2025-08-01T11:10:29.475Z" }, + { url = "https://files.pythonhosted.org/packages/ae/fc/3d6fd4ad5d26c9acd46052190d6a8895dc5050297b03d9cce03def53df0d/nodejs_wheel_binaries-22.18.0-py2.py3-none-macosx_11_0_x86_64.whl", hash = "sha256:bd4d016257d4dfe604ed526c19bd4695fdc4f4cc32e8afc4738111447aa96d03", size = 51814481, upload-time = "2025-08-01T11:10:33.086Z" }, + { url = "https://files.pythonhosted.org/packages/10/f9/7be44809a861605f844077f9e731a117b669d5ca6846a7820e7dd82c9fad/nodejs_wheel_binaries-22.18.0-py2.py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3b125f94f3f5e8ab9560d3bd637497f02e45470aeea74cf6fe60afe751cfa5f", size = 57804907, upload-time = "2025-08-01T11:10:36.83Z" }, + { url = "https://files.pythonhosted.org/packages/e9/67/563e74a0dff653ec7ddee63dc49b3f37a20df39f23675cfc801d7e8e4bb7/nodejs_wheel_binaries-22.18.0-py2.py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78bbb81b6e67c15f04e2a9c6c220d7615fb46ae8f1ad388df0d66abac6bed5f8", size = 58335587, upload-time = "2025-08-01T11:10:40.716Z" }, + { url = 
"https://files.pythonhosted.org/packages/b6/b1/ec45fefef60223dd40e7953e2ff087964e200d6ec2d04eae0171d6428679/nodejs_wheel_binaries-22.18.0-py2.py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:f5d3ea8b7f957ae16b73241451f6ce831d6478156f363cce75c7ea71cbe6c6f7", size = 59662356, upload-time = "2025-08-01T11:10:44.795Z" }, + { url = "https://files.pythonhosted.org/packages/a2/ed/6de2c73499eebf49d0d20e0704f64566029a3441c48cd4f655d49befd28b/nodejs_wheel_binaries-22.18.0-py2.py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:bcda35b07677039670102a6f9b78c2313fd526111d407cb7ffc2a4c243a48ef9", size = 60706806, upload-time = "2025-08-01T11:10:48.985Z" }, + { url = "https://files.pythonhosted.org/packages/2b/f5/487434b1792c4f28c63876e4a896f2b6e953e2dc1f0b3940e912bd087755/nodejs_wheel_binaries-22.18.0-py2.py3-none-win_amd64.whl", hash = "sha256:0f55e72733f1df2f542dce07f35145ac2e125408b5e2051cac08e5320e41b4d1", size = 39998139, upload-time = "2025-08-01T11:10:52.676Z" }, +] + [[package]] name = "notebook" version = "7.4.4" From 5aac93b3b50726acc1960136ff857765a8451e59 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Thu, 28 Aug 2025 10:51:54 -0700 Subject: [PATCH 4/7] save --- .vscode/settings.json | 4 +- .../default_pydantic_ai_rollout_processor.py | 40 ++++++++----------- pyproject.toml | 32 +-------------- uv.lock | 15 ------- 4 files changed, 19 insertions(+), 72 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index cf61abce..60f7540b 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -5,6 +5,6 @@ "python.testing.autoTestDiscoverOnSaveEnabled": true, "python.defaultInterpreterPath": "./.venv/bin/python", "python.testing.cwd": "${workspaceFolder}", - "editor.defaultFormatter": "ms-python.black-formatter", - "cursorpyright.analysis.typeCheckingMode": "recommended" + "python.analysis.typeCheckingMode": "strict", + "python.analysis.diagnosticMode": "workspace" } diff --git a/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py 
b/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py index c5b58c9a..8529cbdc 100644 --- a/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +++ b/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py @@ -2,30 +2,32 @@ import logging import types from typing import List, Literal +from typing import Callable, Union -from openai.types.chat.chat_completion_assistant_message_param import ChatCompletionAssistantMessageParam - -from eval_protocol.models import EvaluationRow, Message -from eval_protocol.pytest.rollout_processor import RolloutProcessor -from eval_protocol.pytest.types import RolloutProcessorConfig from openai.types.chat import ChatCompletion, ChatCompletionMessageParam from openai.types.chat.chat_completion import Choice as ChatCompletionChoice -from pydantic_ai.models.anthropic import AnthropicModel -from pydantic_ai.models.openai import OpenAIModel -from pydantic_ai.models.google import GoogleModel +from openai.types.chat.chat_completion_assistant_message_param import ( + ChatCompletionAssistantMessageParam, +) from pydantic import TypeAdapter -from pydantic_ai.messages import ModelMessage -from pydantic_ai._utils import generate_tool_call_id from pydantic_ai import Agent -from pydantic_ai.usage import UsageLimits +from pydantic_ai._utils import generate_tool_call_id +from pydantic_ai.messages import ModelMessage from pydantic_ai.messages import ( ModelRequest, SystemPromptPart, ToolReturnPart, UserPromptPart, ) +from pydantic_ai.models.anthropic import AnthropicModel +from pydantic_ai.models.google import GoogleModel +from pydantic_ai.models.openai import OpenAIModel from pydantic_ai.providers.openai import OpenAIProvider -from typing import Callable, Union +from pydantic_ai.usage import UsageLimits + +from eval_protocol.models import EvaluationRow, Message +from eval_protocol.pytest.rollout_processor import RolloutProcessor +from eval_protocol.pytest.types import RolloutProcessorConfig logger = logging.getLogger(__name__) 
@@ -40,17 +42,7 @@ def __init__(self, setup_agent: Union[Callable[..., Agent], Agent], usage_limits self.setup_agent = setup_agent self.usage_limits = usage_limits - def _map_litellm_to_pydantic_ai( - self, model_name: str - ) -> Literal[ - "openai", - "deepseek", - "azure", - "openrouter", - "grok", - "fireworks", - "together", - ]: + def _map_litellm_to_pydantic_ai(self, model_name: str) -> tuple[str, str]: mapping = { "fireworks_ai": "fireworks", "together_ai": "together", @@ -61,7 +53,7 @@ def _map_litellm_to_pydantic_ai( "azure_ai": "azure", } provider = model_name.split("/")[0] model_name = model_name.removeprefix(f"{provider}/") if provider in mapping: provider = mapping[provider] - return provider, model_name + return provider, model_name def _map_litellm_to_pydantic_ai_model(self, model_name: str) -> Union[OpenAIModel, GoogleModel, AnthropicModel]: if model_name.startswith("anthropic/"): diff --git a/pyproject.toml b/pyproject.toml index 51e8b85c..c612c7bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,7 +65,6 @@ dev = [ "pytest-httpserver", "werkzeug>=2.0.0", "ruff>=0.5.0", - "pyright>=1.1.365", "transformers>=4.0.0", "types-setuptools", "types-requests", @@ -205,38 +204,9 @@ known-first-party = ["eval_protocol"] combine-as-imports = true [tool.pyright] -typeCheckingMode = "basic" +typeCheckingMode = "recommended" pythonVersion = "3.10" -reportMissingImports = "none" -reportMissingTypeStubs = "none" -reportMissingModuleSource = "none" include = ["eval_protocol", "examples", "tests"] exclude = ["vite-app", "vendor"] # Ignore diagnostics for vendored generator code ignore = ["versioneer.py"] -# Relax noisy diagnostics commonly triggered in tests and dynamic libs -reportAttributeAccessIssue = "none" -reportCallIssue = "none" -reportUnknownMemberType = "none" -reportUnknownVariableType = "none" -reportPossiblyUnboundVariable = "none" -# Additional suppressions per request -reportOptionalMemberAccess = "none" -reportIndexIssue = "none" -reportReturnType = "none" -reportOptionalCall = "none" -reportGeneralTypeIssues = "none"
-reportOperatorIssue = "none" -reportOptionalSubscript = "none" -reportUnsupportedDunderAll = "none" -reportOptionalContextManager = "none" -reportInvalidTypeForm = "none" -reportRedeclaration = "none" -reportUndefinedVariable = "none" -reportPrivateImportUsage = "none" -reportOptionalIterable = "none" -# Make incompatibilities and argument types warnings instead of errors for now -# and suppress warnings output entirely -reportIncompatibleVariableOverride = "none" -reportArgumentType = "none" -reportAssignmentType = "none" diff --git a/uv.lock b/uv.lock index 77b473d1..2f053930 100644 --- a/uv.lock +++ b/uv.lock @@ -1228,7 +1228,6 @@ dev = [ { name = "openai" }, { name = "pip" }, { name = "pre-commit" }, - { name = "pyright" }, { name = "pytest-cov" }, { name = "pytest-httpserver" }, { name = "pytest-xdist" }, @@ -1331,7 +1330,6 @@ requires-dist = [ { name = "psycopg2-binary", marker = "extra == 'chinook'", specifier = ">=2.9.10" }, { name = "pydantic", specifier = ">=2.0.0" }, { name = "pydantic-ai", marker = "extra == 'pydantic'" }, - { name = "pyright", marker = "extra == 'dev'", specifier = ">=1.1.365" }, { name = "pytest", specifier = ">=6.0.0" }, { name = "pytest-asyncio", specifier = ">=0.21.0" }, { name = "pytest-cov", marker = "extra == 'dev'" }, @@ -5054,19 +5052,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bd/24/12818598c362d7f300f18e74db45963dbcb85150324092410c8b49405e42/pyproject_hooks-1.2.0-py3-none-any.whl", hash = "sha256:9e5c6bfa8dcc30091c74b0cf803c81fdd29d94f01992a7707bc97babb1141913", size = 10216, upload-time = "2024-09-29T09:24:11.978Z" }, ] -[[package]] -name = "pyright" -version = "1.1.403" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "nodeenv" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/fe/f6/35f885264ff08c960b23d1542038d8da86971c5d8c955cfab195a4f672d7/pyright-1.1.403.tar.gz", hash = 
"sha256:3ab69b9f41c67fb5bbb4d7a36243256f0d549ed3608678d381d5f51863921104", size = 3913526, upload-time = "2025-07-09T07:15:52.882Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/49/b6/b04e5c2f41a5ccad74a1a4759da41adb20b4bc9d59a5e08d29ba60084d07/pyright-1.1.403-py3-none-any.whl", hash = "sha256:c0eeca5aa76cbef3fcc271259bbd785753c7ad7bcac99a9162b4c4c7daed23b3", size = 5684504, upload-time = "2025-07-09T07:15:50.958Z" }, -] - [[package]] name = "pysocks" version = "1.7.1" From 15a74d8f613407f90a0fd44ae1a0b7a89e13da10 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Mon, 8 Sep 2025 13:14:29 -0700 Subject: [PATCH 5/7] factory pattern works --- .../default_pydantic_ai_rollout_processor.py | 63 +++---------------- .../chinook/pydantic/test_pydantic_chinook.py | 28 ++++----- 2 files changed, 23 insertions(+), 68 deletions(-) diff --git a/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py b/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py index ca8564a8..8709e253 100644 --- a/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +++ b/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py @@ -4,19 +4,13 @@ from collections.abc import Callable import logging import time -import types -from pydantic_ai.models import Model from pydantic_ai.usage import UsageLimits from typing_extensions import override from eval_protocol.models import EvaluationRow, Message -from openai.types import CompletionUsage from eval_protocol.pytest.rollout_processor import RolloutProcessor from eval_protocol.pytest.types import RolloutProcessorConfig from openai.types.chat import ChatCompletion, ChatCompletionMessage, ChatCompletionMessageParam from openai.types.chat.chat_completion import Choice as ChatCompletionChoice -from openai.types.chat.chat_completion_assistant_message_param import ( - ChatCompletionAssistantMessageParam, -) from pydantic import TypeAdapter from pydantic_ai import Agent from pydantic_ai._utils import 
generate_tool_call_id @@ -27,8 +21,6 @@ ToolReturnPart, UserPromptPart, ) -from pydantic_ai.models.anthropic import AnthropicModel -from pydantic_ai.models.google import GoogleModel from pydantic_ai.models.openai import OpenAIModel from pydantic_ai.providers.openai import OpenAIProvider @@ -39,9 +31,12 @@ class PydanticAgentRolloutProcessor(RolloutProcessor): """Rollout processor for Pydantic AI agents. Mainly converts EvaluationRow.messages to and from Pydantic AI ModelMessage format.""" - def __init__(self, setup_agent: Callable[..., Agent] | Agent, usage_limits: UsageLimits | None = None): + def __init__( + self, agent_factory: Callable[[RolloutProcessorConfig], Agent], usage_limits: UsageLimits | None = None + ): # dummy model used for its helper functions for processing messages - self.util: OpenAIModel = OpenAIModel("dummy-model", provider=OpenAIProvider(api_key="dummy")) + self._util: OpenAIModel = OpenAIModel("dummy-model", provider=OpenAIProvider(api_key="dummy")) + self._setup_agent = agent_factory @override def __call__(self, rows: list[EvaluationRow], config: RolloutProcessorConfig) -> list[asyncio.Task[EvaluationRow]]: @@ -49,54 +44,14 @@ def __call__(self, rows: list[EvaluationRow], config: RolloutProcessorConfig) -> semaphore = config.semaphore - # validate that the "agent" field is present with a valid Pydantic AI Agent instance in the completion_params dict - if "agent" not in config.kwargs: - raise ValueError("kwargs must contain an 'agent' field with a valid Pydantic AI Agent instance") - if not isinstance(config.kwargs["agent"], Agent) and not isinstance( - config.kwargs["agent"], types.FunctionType - ): - raise ValueError( - "kwargs['agent'] must be a valid Pydantic AI Agent instance or a function that returns an Agent" - ) - - if isinstance(config.kwargs["agent"], types.FunctionType): - setup_agent = config.kwargs["agent"] - if not isinstance(config.completion_params["model"], dict): - raise ValueError( - "completion_params['model'] must be a 
dict mapping agent argument names to model config dicts (with 'model' and 'provider' keys)" - ) - kwargs: dict[str, Model] = {} - for k, v in config.completion_params["model"].items(): # pyright: ignore[reportUnknownVariableType] - if v["model"] and v["model"].startswith("anthropic:"): # pyright: ignore[reportUnknownMemberType] - kwargs[k] = AnthropicModel( - v["model"].removeprefix("anthropic:"), # pyright: ignore[reportUnknownMemberType, reportUnknownArgumentType] - ) - elif v["model"] and v["model"].startswith("google:"): # pyright: ignore[reportUnknownMemberType] - kwargs[k] = GoogleModel( - v["model"].removeprefix("google:"), # pyright: ignore[reportUnknownMemberType, reportUnknownArgumentType] - ) - else: - kwargs[k] = OpenAIModel( - v["model"], # pyright: ignore[reportUnknownArgumentType] - provider=v["provider"], # pyright: ignore[reportUnknownArgumentType] - ) - agent_instance: Agent = setup_agent(**kwargs) # pyright: ignore[reportAny] - model = None - else: - agent_instance = config.kwargs["agent"] # pyright: ignore[reportAssignmentType] - model = OpenAIModel( - config.completion_params["model"], # pyright: ignore[reportAny] - provider=config.completion_params["provider"], # pyright: ignore[reportAny] - ) + agent = self._setup_agent(config) async def process_row(row: EvaluationRow) -> EvaluationRow: """Process a single row with agent rollout.""" start_time = time.perf_counter() model_messages = [self.convert_ep_message_to_pyd_message(m, row) for m in row.messages] - response = await agent_instance.run( - message_history=model_messages, model=model, usage_limits=config.kwargs.get("usage_limits") - ) + response = await agent.run(message_history=model_messages, usage_limits=config.kwargs.get("usage_limits")) row.messages = await self.convert_pyd_message_to_ep_message(response.all_messages()) # TODO: pydantic ai accumulates usage info across all models in multi-agent setup, so this simple tracking doesn't work for cost. 
to discuss with @dphuang2 when he's back. @@ -121,7 +76,7 @@ async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow: return tasks async def convert_pyd_message_to_ep_message(self, messages: list[ModelMessage]) -> list[Message]: - oai_messages: list[ChatCompletionMessageParam] = await self.util._map_messages(messages) + oai_messages: list[ChatCompletionMessageParam] = await self._util._map_messages(messages) return [Message(**m) for m in oai_messages] # pyright: ignore[reportArgumentType] def convert_ep_message_to_pyd_message(self, message: Message, row: EvaluationRow) -> ModelMessage: @@ -129,7 +84,7 @@ def convert_ep_message_to_pyd_message(self, message: Message, row: EvaluationRow type_adapter = TypeAdapter(ChatCompletionMessage) oai_message = type_adapter.validate_python(message) # Fix: Provide required finish_reason and index, and ensure created is int (timestamp) - return self.util._process_response( + return self._util._process_response( ChatCompletion( choices=[ChatCompletionChoice(message=oai_message, finish_reason="stop", index=0)], object="chat.completion", diff --git a/tests/chinook/pydantic/test_pydantic_chinook.py b/tests/chinook/pydantic/test_pydantic_chinook.py index 7d1b578b..0233a227 100644 --- a/tests/chinook/pydantic/test_pydantic_chinook.py +++ b/tests/chinook/pydantic/test_pydantic_chinook.py @@ -6,6 +6,7 @@ from eval_protocol.pytest import evaluation_test from eval_protocol.pytest.default_pydantic_ai_rollout_processor import PydanticAgentRolloutProcessor +from eval_protocol.pytest.types import RolloutProcessorConfig from tests.chinook.pydantic.agent import setup_agent import os from pydantic_ai.models.openai import OpenAIModel @@ -20,20 +21,23 @@ ) +def agent_factory(config: RolloutProcessorConfig) -> Agent: + model_name = config.completion_params["model"] + provider = config.completion_params["provider"] + model = OpenAIModel(model_name, provider=provider) + return setup_agent(model) + + @pytest.mark.asyncio @evaluation_test( 
input_messages=[[[Message(role="user", content="What is the total number of tracks in the database?")]]], completion_params=[ { - "model": { - "orchestrator_agent_model": { - "model": "accounts/fireworks/models/kimi-k2-instruct", - "provider": "fireworks", - } - } + "model": "accounts/fireworks/models/kimi-k2-instruct", + "provider": "fireworks", }, ], - rollout_processor=PydanticAgentRolloutProcessor(setup_agent), + rollout_processor=PydanticAgentRolloutProcessor(agent_factory), mode="pointwise", ) async def test_simple_query(row: EvaluationRow) -> EvaluationRow: @@ -91,15 +95,11 @@ class Response(BaseModel): input_rows=[collect_dataset()], completion_params=[ { - "model": { - "orchestrator_agent_model": { - "model": "accounts/fireworks/models/kimi-k2-instruct", - "provider": "fireworks", - } - } + "model": "accounts/fireworks/models/kimi-k2-instruct", + "provider": "fireworks", }, ], - rollout_processor=PydanticAgentRolloutProcessor(setup_agent), + rollout_processor=PydanticAgentRolloutProcessor(agent_factory), mode="pointwise", ) async def test_complex_queries(row: EvaluationRow) -> EvaluationRow: From c2b19b70b7051b41bf1a538e62fe9e253678f3f9 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Mon, 8 Sep 2025 13:19:07 -0700 Subject: [PATCH 6/7] refactor test_pydantic_multi_agent to work with factory setup --- tests/pytest/test_pydantic_multi_agent.py | 34 +++++++++++++++-------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/tests/pytest/test_pydantic_multi_agent.py b/tests/pytest/test_pydantic_multi_agent.py index df9feb8d..24be554c 100644 --- a/tests/pytest/test_pydantic_multi_agent.py +++ b/tests/pytest/test_pydantic_multi_agent.py @@ -1,12 +1,13 @@ """ Copied and modified for eval-protocol from https://ai.pydantic.dev/multi-agent-applications/#agent-delegation -To test your Pydantic AI multi-agent application, you can pass a function that -sets up the agents and their tools. The function should accept parameters that -map a model to each agent. 
In completion_params, you can provide mappings of -model to agent based on key. +To test your Pydantic AI multi-agent application, you can pass a factory that +sets up the agent based on the completion_params. The function should accept a +RolloutProcessorConfig. In completion_params, you can provide mappings of model +to agent based on key. """ +from pydantic_ai.models.openai import OpenAIModel import pytest from eval_protocol.models import EvaluationRow, Message @@ -18,6 +19,8 @@ from pydantic_ai.models import Model from pydantic_ai.usage import UsageLimits +from eval_protocol.pytest.types import RolloutProcessorConfig + def setup_agent(joke_generation_model: Model, joke_selection_model: Model) -> Agent: """ @@ -45,22 +48,31 @@ async def joke_factory(ctx: RunContext[None], count: int) -> list[str]: # pyrig return joke_selection_agent +def agent_factory(config: RolloutProcessorConfig) -> Agent: + joke_generation_model = OpenAIModel( + config.completion_params["model"]["joke_generation_model"], provider="fireworks" + ) + joke_selection_model = OpenAIModel(config.completion_params["model"]["joke_selection_model"], provider="fireworks") + return setup_agent( + joke_generation_model, + joke_selection_model, + ) + + @pytest.mark.asyncio @evaluation_test( input_messages=[[[Message(role="user", content="Tell me a joke.")]]], completion_params=[ # multi-agent { - "joke_generation_model": { - "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct", - }, - "joke_selection_model": { - "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", - }, + "model": { + "joke_generation_model": "accounts/fireworks/models/kimi-k2-instruct", + "joke_selection_model": "accounts/fireworks/models/deepseek-v3p1", + } }, ], rollout_processor=PydanticAgentRolloutProcessor( - setup_agent, UsageLimits(request_limit=5, total_tokens_limit=1000) + agent_factory, UsageLimits(request_limit=5, total_tokens_limit=1000) ), mode="pointwise", ) From 
2c52f3ac4d975f81276bc7e726385179cfae2238 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Mon, 8 Sep 2025 13:30:23 -0700 Subject: [PATCH 7/7] fix test_pydantic_agent.py --- .../pytest/default_pydantic_ai_rollout_processor.py | 4 +++- tests/pytest/test_pydantic_agent.py | 13 +++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py b/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py index 8709e253..4c0edfc3 100644 --- a/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +++ b/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py @@ -32,7 +32,9 @@ class PydanticAgentRolloutProcessor(RolloutProcessor): EvaluationRow.messages to and from Pydantic AI ModelMessage format.""" def __init__( - self, agent_factory: Callable[[RolloutProcessorConfig], Agent], usage_limits: UsageLimits | None = None + self, + agent_factory: Callable[[RolloutProcessorConfig], Agent], + usage_limits: UsageLimits | None = None, ): # dummy model used for its helper functions for processing messages self._util: OpenAIModel = OpenAIModel("dummy-model", provider=OpenAIProvider(api_key="dummy")) diff --git a/tests/pytest/test_pydantic_agent.py b/tests/pytest/test_pydantic_agent.py index 1a9b5ecd..d08f74c9 100644 --- a/tests/pytest/test_pydantic_agent.py +++ b/tests/pytest/test_pydantic_agent.py @@ -1,21 +1,26 @@ +from pydantic_ai.agent import Agent +from pydantic_ai.models.openai import OpenAIModel import pytest from eval_protocol.models import EvaluationRow, Message from eval_protocol.pytest import evaluation_test -from pydantic_ai import Agent from eval_protocol.pytest.default_pydantic_ai_rollout_processor import PydanticAgentRolloutProcessor +from eval_protocol.pytest.types import RolloutProcessorConfig -agent = Agent() + +def agent_factory(config: RolloutProcessorConfig) -> Agent: + model = OpenAIModel(config.completion_params["model"], provider="fireworks") + return Agent(model=model) 
@pytest.mark.asyncio @evaluation_test( input_messages=[[[Message(role="user", content="Hello, how are you?")]]], completion_params=[ - {"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}, + {"model": "accounts/fireworks/models/gpt-oss-120b"}, ], - rollout_processor=PydanticAgentRolloutProcessor(agent), + rollout_processor=PydanticAgentRolloutProcessor(agent_factory), mode="pointwise", ) async def test_pydantic_agent(row: EvaluationRow) -> EvaluationRow: