diff --git a/docs/guides/configuration.md b/docs/guides/configuration.md index fd660da0..2fa99046 100644 --- a/docs/guides/configuration.md +++ b/docs/guides/configuration.md @@ -609,19 +609,14 @@ Add model configuration to your user config file (`~/.mxcp/config.yml`): ```yaml models: - default: "claude-4-sonnet" # Default model to use for evals + default: "claude-3-5-sonnet-20240620" # Default model to use for evals (update to a valid ID) models: - claude-4-opus: - type: "claude" + claude-3-5-sonnet-20240620: # Anthropic Messages model ID; ensure your account has access + type: "anthropic" api_key: "${ANTHROPIC_API_KEY}" # Environment variable containing API key - timeout: 60 # Request timeout in seconds + timeout: 30 # Request timeout in seconds max_retries: 3 # Number of retries on failure - claude-4-sonnet: - type: "claude" - api_key: "${ANTHROPIC_API_KEY}" - timeout: 30 - gpt-4o: type: "openai" api_key: "${OPENAI_API_KEY}" @@ -638,11 +633,48 @@ models: - **default**: The model to use when not specified in eval suite or CLI - **models**: Dictionary of model configurations - - **type**: Either "claude" or "openai" +  - **type**: Either "anthropic" or "openai" - **api_key**: API key (you can use environment variables references) - - **base_url**: Custom API endpoint (optional, for OpenAI-compatible services) - - **timeout**: Request timeout in seconds - - **max_retries**: Number of retries on failure +  - **base_url**: Custom API endpoint (optional, for OpenAI-compatible services) +  - **timeout**: Request timeout in seconds +  - **max_retries**: Number of retries on failure +  - **options**: Extra provider-specific options forwarded to the model (e.g. 
`thinking: false`) + +Example with mixed providers and options: + +```yaml +models: + default: "gpt-4o" + models: + gpt-4o: + type: "openai" + api_key: "${OPENAI_API_KEY}" + timeout: 45 + options: + reasoning: "fast" + claude-3-5-sonnet-20240620: + type: "anthropic" + api_key: "${ANTHROPIC_API_KEY}" + timeout: 30 + options: + thinking: false +--- +# Using OpenAI Responses API with reasoning +# Set api: responses to route through the Responses endpoint (e.g., for reasoning) +models: + default: "gpt-5" + models: + gpt-5: + type: "openai" + api_key: "${OPENAI_API_KEY}" + options: + api: "responses" # Choices: responses (for OpenAI Responses API), chat (default) + # Provider-specific fields must be prefixed: + # - body: goes into the request body + # - header: goes into request headers + body:reasoning: + effort: "medium" # Passed via extra_body to the provider +``` For more information on using evals, see the [LLM Evaluation section](quality.md#llm-evaluation-evals) in the Quality & Testing Guide. diff --git a/docs/guides/quality.md b/docs/guides/quality.md index 9b732c4f..d52a8c46 100644 --- a/docs/guides/quality.md +++ b/docs/guides/quality.md @@ -805,7 +805,7 @@ Create eval files with the suffix `-evals.yml` or `.evals.yml`: mxcp: 1 suite: customer_analysis description: "Test LLM's ability to analyze customer data" -model: claude-3-opus # Optional: specify model for this suite +model: claude-3-5-sonnet-20240620 # Optional: specify model for this suite (ensure valid ID) tests: - name: churn_risk_assessment @@ -819,6 +819,7 @@ tests: - tool: get_churn_score args: customer_id: "ABC" + expected_answer: "The customer is high risk of churn" answer_contains: - "risk" - "churn" @@ -880,6 +881,115 @@ answer_not_contains: - "unauthorized" ``` +#### `expected_answer` +Checks the model's final answer against an expected answer using the LLM as a grader. The grader +returns `correct`, `wrong`, or `partially correct` plus a short comment. 
+ +```yaml +expected_answer: "The customer is high risk of churn" +``` + +### Complete Eval Example + +```yaml +# faq-evals.yml +mxcp: 1 +suite: faq_checks +description: "Make sure the assistant answers FAQs accurately and uses tools when needed" +model: gpt-4o + +tests: + - name: tool_usage_for_price_lookup + prompt: "What's the current price for SKU-1234?" + assertions: + must_call: + - tool: get_product_price + args: + sku: "SKU-1234" + answer_contains: + - "price" + + - name: expected_answer_grading + prompt: "What are your support hours?" + assertions: + expected_answer: "Our support team is available Monday to Friday, 9am-5pm local time." + answer_contains: + - "Monday" + - "Friday" +``` + +### Customizing the System Prompt + +Each eval suite can override the default LLM instructions to better match your domain or desired behavior. Add a `system_prompt` field at the suite level—if it is omitted, MXCP falls back to the built-in prompt that encourages concise, tool-aware answers. + +```yaml +mxcp: 1 +suite: relationship_navigation +description: "Ensure the assistant navigates relationships carefully" +model: gpt-4o +system_prompt: | + You are a Vertec specialist. Always explain which tool you used. + If a tool fails, read the error carefully before trying again. + +tests: + - name: compare_owners + prompt: "Are the owners of Project A and Project B the same?" 
+ assertions: + must_call: + - tool: sql_search_objects + args: + object_type: "Project" +``` + +### Model Configuration Example + +Add models to your user config (`~/.mxcp/config.yml`) so evals know which providers to call: + +```yaml +models: + default: "claude-3-5-sonnet-20240620" + models: + claude-3-5-sonnet-20240620: + type: "anthropic" + api_key: "${ANTHROPIC_API_KEY}" + timeout: 30 + gpt-4o: + type: "openai" + api_key: "${OPENAI_API_KEY}" + base_url: "https://api.openai.com/v1" + timeout: 45 + options: + reasoning: "fast" # forwarded to the provider as-is +--- +# Example eval suite (a separate `-evals.yml` file): use a faster model just for grading expected answers +mxcp: 1 +suite: faq_checks +model: gpt-4o # primary model used to answer +expected_answer_model: gpt-4o-mini # model used only for grading expected answers +tests: + - name: expected_answer_grading + prompt: "What are your support hours?" + assertions: + expected_answer: "Our support team is available Monday to Friday, 9am-5pm local time." + # expected_answer_model is useful when: + # - Your main model is slow/expensive, but grading can use a lighter model + # - You want deterministic, faster grading for many evals +--- +# OpenAI Responses API example (reasoning) +models: + default: "gpt-5" + models: + gpt-5: + type: "openai" + api_key: "${OPENAI_API_KEY}" + options: + api: "responses" # Choices: responses (Responses API) or chat (default) + # Provider-specific fields must use prefixes: + # body: for request body, header: for headers + body:reasoning: + effort: "medium" +``` + ### Running Evals ```bash @@ -910,7 +1020,7 @@ models: default: claude-3-opus models: claude-3-opus: - type: claude + type: anthropic api_key: ${ANTHROPIC_API_KEY} timeout: 60 max_retries: 3 @@ -1037,4 +1147,4 @@ Well-tested endpoints with rich metadata provide: - Faster debugging - Safe AI interactions -Remember: LLMs perform best when they clearly understand what your endpoints do, how to use them, and what to expect in return! 
\ No newline at end of file +Remember: LLMs perform best when they clearly understand what your endpoints do, how to use them, and what to expect in return! diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 7ef974a0..cf67984a 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -624,4 +624,4 @@ The following environment variables can be used to configure MXCP: - `MXCP_TELEMETRY_TRACING_CONSOLE`: Enable console trace export for debugging (`true`/`false`) - `MXCP_TELEMETRY_METRICS_INTERVAL`: Metrics export interval in seconds (default: `60`) -For more details on environment variables and their usage, see the [Configuration Guide](../guides/configuration.md) and [Observability Guide](../guides/observability.md). \ No newline at end of file +For more details on environment variables and their usage, see the [Configuration Guide](../guides/configuration.md) and [Observability Guide](../guides/observability.md). diff --git a/pyproject.toml b/pyproject.toml index aa7450bf..01f020e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,7 @@ dependencies = [ "fastapi>=0.110.0", # FastAPI for admin API "uvicorn[standard]>=0.27.0", # ASGI server for admin API "psutil>=5.9.0", # System metrics for admin API + "pydantic-ai-slim[anthropic,openai]>=1.25.0", ] [project.scripts] diff --git a/src/mxcp/sdk/evals/__init__.py b/src/mxcp/sdk/evals/__init__.py index 9f8a91b0..9045c433 100644 --- a/src/mxcp/sdk/evals/__init__.py +++ b/src/mxcp/sdk/evals/__init__.py @@ -9,15 +9,13 @@ - Tool definition types for describing available tools to the LLM """ -from ._types import ClaudeConfig, ModelConfigType, OpenAIConfig, ParameterDefinition, ToolDefinition -from .executor import LLMExecutor, ToolExecutor +from ._types import ParameterDefinition, ToolDefinition +from .executor import LLMExecutor, ProviderConfig, ToolExecutor __all__ = [ "LLMExecutor", "ToolExecutor", "ToolDefinition", "ParameterDefinition", - "ModelConfigType", - "ClaudeConfig", - 
"OpenAIConfig", + "ProviderConfig", ] diff --git a/src/mxcp/sdk/evals/_types.py b/src/mxcp/sdk/evals/_types.py index 6b9359b7..60bab68e 100644 --- a/src/mxcp/sdk/evals/_types.py +++ b/src/mxcp/sdk/evals/_types.py @@ -1,54 +1,16 @@ """Types for MXCP SDK Evals module. -This module contains type definitions for LLM models, tool definitions, -and other data structures used in the evaluation framework. +This module contains type definitions for tool definitions and +other data structures used in the evaluation framework. """ -from abc import ABC, abstractmethod from dataclasses import dataclass, field from typing import Any from mxcp.sdk.validator import TypeSchemaModel - -# LLM Model configuration types -@dataclass -class ModelConfig(ABC): - """Base class for LLM model configurations.""" - - name: str - api_key: str - - @abstractmethod - def get_type(self) -> str: - """Get the type identifier for this model.""" - pass - - -@dataclass -class ClaudeConfig(ModelConfig): - """Configuration for Claude models.""" - - base_url: str = "https://api.anthropic.com" - timeout: int = 30 - - def get_type(self) -> str: - return "claude" - - -@dataclass -class OpenAIConfig(ModelConfig): - """Configuration for OpenAI models.""" - - base_url: str = "https://api.openai.com/v1" - timeout: int = 30 - - def get_type(self) -> str: - return "openai" - - -# Union type for all supported model configurations -ModelConfigType = ClaudeConfig | OpenAIConfig +# Type alias for JSON Schema representation +JsonSchema = dict[str, Any] @dataclass @@ -60,6 +22,8 @@ class ParameterDefinition: description: str = "" default: Any | None = None required: bool = True + schema: JsonSchema | None = None + """Optional JSON Schema for complex parameter validation.""" @dataclass diff --git a/src/mxcp/sdk/evals/executor.py b/src/mxcp/sdk/evals/executor.py index 55a68451..8de4e02e 100644 --- a/src/mxcp/sdk/evals/executor.py +++ b/src/mxcp/sdk/evals/executor.py @@ -1,367 +1,423 @@ -"""Core LLM executor for MXCP SDK 
Evals. +"""Agent-style LLM executor for MXCP evals.""" -This module provides the main LLMExecutor class that handles LLM orchestration -and tool calling, with tool execution delegated to external implementations. -""" +from __future__ import annotations -import json import logging -import re -from typing import Any, Protocol, cast - -import httpx +from collections.abc import Callable +from dataclasses import dataclass, field +from typing import Any, Protocol + +from pydantic import BaseModel, Field, create_model +from pydantic_ai import Agent, ModelSettings, RunContext +from pydantic_ai.exceptions import ModelRetry, UnexpectedModelBehavior, UsageLimitExceeded +from pydantic_ai.models.anthropic import AnthropicModel +from pydantic_ai.models.openai import OpenAIChatModel, OpenAIResponsesModel +from pydantic_ai.providers.anthropic import AnthropicProvider +from pydantic_ai.providers.openai import OpenAIProvider +from pydantic_ai.tools import Tool +from pydantic_ai.tools import ToolDefinition as AgentToolDefinition from mxcp.sdk.auth import UserContextModel -from ._types import ModelConfigType, ToolDefinition +from ._types import ToolDefinition + +# Agent/tool retry configuration +DEFAULT_AGENT_RETRIES = 30 + +# Type alias for model references (either a model object or a string identifier) +ModelReference = OpenAIChatModel | OpenAIResponsesModel | AnthropicModel | str logger = logging.getLogger(__name__) class ToolExecutor(Protocol): - """Protocol for tool execution strategies. - - Different contexts can implement this protocol to provide their own - tool execution logic (e.g., using ExecutionEngine, HTTP APIs, mocks, etc.). - """ + """Protocol for tool execution strategies.""" async def execute_tool( self, tool_name: str, arguments: dict[str, Any], user_context: UserContextModel | None = None, - ) -> Any: - """Execute a tool and return the result. + ) -> Any: ... 
+ + +@dataclass +class ToolCallRecord: + id: str | None + tool: str + arguments: dict[str, Any] + result: Any | None = None + error: Any | None = None + - Args: - tool_name: Name of the tool to execute - arguments: Arguments to pass to the tool - user_context: Optional user context for execution +@dataclass +class AgentResult: + answer: str + tool_calls: list[ToolCallRecord] = field(default_factory=list) + error: str | None = None # Execution error if agent failed to produce an answer - Returns: - Result of tool execution - Raises: - Exception: If tool execution fails - """ - ... +class ProviderConfig(BaseModel): + api_key: str | None = None + base_url: str | None = None + timeout: int | None = None + model_config = {"extra": "forbid"} + + +class GradeResult(BaseModel): + result: str = Field(default="unknown") + comment: str = Field(default="") + reasoning: str = Field(default="") class LLMExecutor: - """Core LLM executor focused on LLM orchestration and tool calling. - - This class handles: - - LLM API interactions (Claude, OpenAI, etc.) - - Tool call extraction from LLM responses - - Multi-turn conversations with tool results - - Prompt formatting for different model types - - Tool execution is delegated to an external ToolExecutor implementation, - making this class highly testable and reusable across different contexts. - - Example usage: - >>> # Create tool definitions (metadata only) - >>> tools = [ - ... ToolDefinition( - ... name="get_weather", - ... description="Get current weather for a location", - ... parameters=[ - ... ParameterDefinition(name="location", type="string", description="City name") - ... ] - ... ) - ... ] - >>> - >>> # Create model config - >>> model = ClaudeConfig(name="claude-3-haiku", api_key="...") - >>> - >>> # Create tool executor (implemented by context) - >>> tool_executor = MyToolExecutor(...) 
- >>> - >>> # Create LLM executor - >>> executor = LLMExecutor(model, tools, tool_executor) - >>> - >>> # Execute a prompt - >>> response, tool_calls = await executor.execute_prompt( - ... "What's the weather in Paris?", - ... user_context=user_context - ... ) - """ + """Pydantic-based agent loop with tool support.""" def __init__( self, - model_config: ModelConfigType, + model_name: str, + model_type: str, + model_settings: ModelSettings, available_tools: list[ToolDefinition], tool_executor: ToolExecutor, + provider_config: ProviderConfig | None = None, + system_prompt: str | None = None, + agent_retries: int = DEFAULT_AGENT_RETRIES, ): - """Initialize LLM executor. - - Args: - model_config: Configuration for the LLM model (Claude, OpenAI, etc.) - available_tools: List of tool definitions available to the LLM - tool_executor: Implementation for executing tools - """ - self.model_config = model_config self.available_tools = available_tools self.tool_executor = tool_executor + self.model_name = model_name + self.model_type = model_type + self.provider_config = provider_config or ProviderConfig() + self._agent_cls: Callable[..., Any] = Agent + self._model_settings = model_settings + self._tool_models = self._build_tool_models(available_tools) + self._tool_schemas: dict[str, dict[str, Any]] = {} + self.system_prompt = system_prompt or self._build_system_prompt(available_tools) + self._agent_retries = max(1, agent_retries) + self._model_reference = self._build_model_reference() logger.info( - f"LLM executor initialized with model: {model_config.name} ({model_config.get_type()})" + "LLM executor initialized with model %s (%s) and %d tools", + self.model_name, + self.model_type, + len(available_tools), ) - logger.info(f"Available tools: {len(available_tools)}") - - def _format_tools_for_prompt(self) -> str: - """Format all available tools for inclusion in the prompt.""" - if not self.available_tools: - return "No tools available." 
- - tool_sections = [] - for tool in self.available_tools: - tool_sections.append(tool.to_prompt_format()) - - return "=== AVAILABLE TOOLS ===\n\n" + "\n\n".join(tool_sections) - - def _get_model_prompt( - self, user_prompt: str, conversation_history: list[dict[str, str]] | None = None - ) -> str: - """Get model-specific prompt format""" - available_tools = self._format_tools_for_prompt() - model_type = self.model_config.get_type() - - if model_type == "claude": - return self._get_claude_prompt(user_prompt, available_tools, conversation_history) - elif model_type == "openai": - return self._get_openai_prompt(user_prompt, available_tools, conversation_history) - else: - return self._get_default_prompt(user_prompt, available_tools, conversation_history) - - def _get_claude_prompt( - self, - user_prompt: str, - available_tools: str, - conversation_history: list[dict[str, str]] | None = None, - ) -> str: - """Claude-specific prompt format""" - system_prompt = f"""You are a helpful assistant with access to the following tools: - -{available_tools} - -To use a tool, respond with a JSON object: -{{"tool": "tool_name", "arguments": {{"param": "value"}}}} - -For multiple tool calls, use an array: -[{{"tool": "tool1", "arguments": {{}}}}, {{"tool": "tool2", "arguments": {{}}}}] - -Only output JSON when calling tools. 
Otherwise respond with regular text.""" - - messages = [] - if conversation_history: - for msg in conversation_history: - messages.append(f"{msg['role']}: {msg['content']}") - messages.append(f"Human: {user_prompt}") - - return system_prompt + "\n\n" + "\n\n".join(messages) - - def _get_openai_prompt( - self, - user_prompt: str, - available_tools: str, - conversation_history: list[dict[str, str]] | None = None, - ) -> str: - """OpenAI-specific prompt format""" - system_prompt = f"""You are a helpful assistant with access to the following tools: - -{available_tools} - -To use a tool, respond with a JSON object: -{{"tool": "tool_name", "arguments": {{"param": "value"}}}} - -For multiple tool calls, use an array: -[{{"tool": "tool1", "arguments": {{}}}}, {{"tool": "tool2", "arguments": {{}}}}] - -Only output JSON when calling tools. Otherwise respond with regular text.""" - - messages = [] - if conversation_history: - for msg in conversation_history: - messages.append(f"{msg['role']}: {msg['content']}") - messages.append(f"User: {user_prompt}") - - return system_prompt + "\n\n" + "\n\n".join(messages) - - def _get_default_prompt( - self, - user_prompt: str, - available_tools: str, - conversation_history: list[dict[str, str]] | None = None, - ) -> str: - """Default prompt format""" - return self._get_claude_prompt(user_prompt, available_tools, conversation_history) async def execute_prompt( - self, prompt: str, user_context: UserContextModel | None = None - ) -> tuple[str, list[dict[str, Any]]]: - """Execute a prompt and return the response and tool calls made. 
+ self, prompt: str, user_context: UserContextModel | None = None, max_turns: int = 20 + ) -> AgentResult: + """Run the agent loop for a prompt using pydantic-ai Agent.""" + history: list[ToolCallRecord] = [] + # Local callable mapping for this execution (passed to agent factory for testing) + tool_callables: dict[str, Callable[..., Any]] = {} + + def _make_tool(tool_def: ToolDefinition) -> Tool: + args_model = self._tool_models.get(tool_def.name) + schema = self._tool_schemas.get(tool_def.name) + if schema is None: + schema = ( + args_model.model_json_schema() + if args_model + else {"type": "object", "properties": {}, "required": []} + ) + self._tool_schemas[tool_def.name] = schema + + async def _fn(**kwargs: Any) -> Any: + if max_turns is not None and len(history) >= max_turns: + error_msg = f"Maximum tool calls exceeded ({max_turns})" + history.append( + ToolCallRecord( + id=None, tool=tool_def.name, arguments=kwargs, error=error_msg + ) + ) + raise RuntimeError(error_msg) - Args: - prompt: The user prompt to execute - user_context: Optional user context for tool execution + record = ToolCallRecord(id=None, tool=tool_def.name, arguments=kwargs) + try: + validated = ( + args_model.model_validate(kwargs).model_dump() if args_model else kwargs + ) + record.arguments = validated + result = await self.tool_executor.execute_tool( + tool_def.name, validated, user_context + ) + record.result = result + return result + except ModelRetry as exc: + error_response = self._build_tool_error_response(tool_def.name, exc.message) + record.error = error_response + raise + except Exception as exc: # noqa: BLE001 + error_response = self._build_tool_error_response(tool_def.name, str(exc)) + record.error = error_response + retry_message = self._format_tool_retry_message(error_response) + raise ModelRetry(retry_message) from exc + finally: + history.append(record) + + async def _prepare( + _ctx: RunContext[Any], _tool_def: AgentToolDefinition + ) -> AgentToolDefinition: + return 
AgentToolDefinition( + name=tool_def.name, + description=tool_def.description, + parameters_json_schema=schema, + strict=True, + ) + + tool = Tool( + _fn, + name=tool_def.name, + description=tool_def.description, + prepare=_prepare, + ) + tool_callables[tool_def.name] = _fn + return tool - Returns: - Tuple of (final_response, list_of_tool_calls_made) - """ - conversation_history: list[dict[str, Any]] = [] - tool_calls_made: list[dict[str, Any]] = [] - max_iterations = 10 # Prevent infinite loops + agent_tools = [_make_tool(t) for t in self.available_tools] - for _iteration in range(max_iterations): - # Get model-specific prompt - full_prompt = self._get_model_prompt(prompt, conversation_history) + # Build agent kwargs - only pass _tool_callables for test agents (not real pydantic-ai Agent) + agent_kwargs: dict[str, Any] = { + "model": self._model_reference, + "instructions": self.system_prompt, + "tools": agent_tools, + "retries": self._agent_retries, + } + if self._agent_cls is not Agent: + # Test agent factory - pass tool callables for invocation + agent_kwargs["_tool_callables"] = tool_callables - # Call the LLM - response = await self._call_llm(full_prompt) + agent = self._agent_cls(**agent_kwargs) - # Check if response contains tool calls - tool_calls = self._extract_tool_calls(response) + try: + agent_run = await agent.run( + prompt, deps=user_context, model_settings=self._model_settings + ) - if not tool_calls: - # No more tool calls, return final response - return response, tool_calls_made + answer = getattr(agent_run, "output", "") - # Execute tool calls - tool_results = [] - for tool_call in tool_calls: - tool_calls_made.append(tool_call) + # Log detailed info about the result + logger.debug( + "Agent completed: answer_length=%d, tool_calls=%d, raw_output_type=%s", + len(str(answer)) if answer else 0, + len(history), + type(answer).__name__, + ) - try: - tool_name = tool_call["tool"] - arguments = tool_call.get("arguments", {}) + if not answer: + 
logger.warning( + "Agent returned empty output after %d tool calls. " + "Check conversation history above for details.", + len(history), + ) + return AgentResult(answer=str(answer), tool_calls=history) + + except UnexpectedModelBehavior as exc: + error_msg = f"Agent exhausted retries ({self._agent_retries}): {exc}" + logger.error( + "Agent failed after exhausting retries (retries=%d, tool_calls=%d): %s", + self._agent_retries, + len(history), + exc, + ) + if logger.isEnabledFor(logging.DEBUG): + logger.debug("Tool call history on failure: %s", [tc.tool for tc in history]) + return AgentResult(answer="", tool_calls=history, error=error_msg) + except UsageLimitExceeded as exc: + error_msg = f"Usage limit exceeded: {exc}" + logger.error("Agent hit usage limit after %d tool calls: %s", len(history), exc) + return AgentResult(answer="", tool_calls=history, error=error_msg) + except RuntimeError as exc: + error_msg = f"Execution aborted: {exc}" + logger.error("LLM execution aborted after %d tool calls: %s", len(history), exc) + return AgentResult(answer="", tool_calls=history, error=error_msg) + except Exception as exc: + error_msg = f"Unexpected error ({type(exc).__name__}): {exc}" + logger.error( + "Unexpected error during agent execution after %d tool calls: %s: %s", + len(history), + type(exc).__name__, + exc, + ) + return AgentResult(answer="", tool_calls=history, error=error_msg) + + async def evaluate_expected_answer(self, answer: str, expected_answer: str) -> dict[str, str]: + """Ask the model to grade an answer against an expected value.""" + logger.debug( + "Grading answer:\n Candidate: %s\n Expected: %s", + answer[:200] + "..." if len(answer) > 200 else answer, + expected_answer[:200] + "..." 
if len(expected_answer) > 200 else expected_answer, + ) - # Execute the tool using external executor - result = await self.tool_executor.execute_tool( - tool_name, arguments, user_context - ) + grader_system = ( + "You check if the candidate answer CONTAINS the expected information.\n\n" + "GRADING RULES:\n" + "- 'correct': The expected fact(s) appear in the candidate answer. " + "Extra details, context, or longer explanations are FINE and do not affect the grade.\n" + "- 'partially correct': Only use when the expected answer has MULTIPLE facts and some are missing.\n" + "- 'wrong': The expected information is absent, contradicted, or the candidate says it's unavailable.\n\n" + "IMPORTANT: If the expected answer is a single value (e.g., a name, status, role) and that exact value " + "appears anywhere in the candidate answer, grade it as 'correct' regardless of surrounding text.\n\n" + 'Return JSON: {"result": "correct|wrong|partially correct", "comment": "...", "reasoning": "..."}' + ) + grader_prompt = ( + "Compare the candidate answer to the expected answer (semantic match, not exact string).\n" + "Candidate answer:\n" + f"{answer}\n\n" + "Expected answer:\n" + f"{expected_answer}\n\n" + "Respond with JSON like " + '{"result":"correct|wrong|partially correct","comment":"short","reasoning":"short"}' + ) - tool_results.append({"tool": tool_name, "result": result}) + agent = self._agent_cls( + model=self._model_reference, + instructions=grader_system, + tools=(), + output_type=GradeResult, + retries=self._agent_retries, + ) - except Exception as e: - tool_results.append({"tool": tool_call.get("tool", "unknown"), "error": str(e)}) + try: + run = await agent.run(grader_prompt, model_settings=self._model_settings) + out: GradeResult = getattr(run, "output", GradeResult()) + result = out.model_dump() - # Add tool results to conversation - conversation_history.append({"role": "assistant", "content": response}) - conversation_history.append( - {"role": "system", 
"content": f"Tool results: {json.dumps(tool_results)}"} + logger.debug( + "Grading result: %s (comment: %s, reasoning: %s)", + result.get("result", "unknown"), + result.get("comment", ""), + result.get("reasoning", ""), ) - # Continue conversation with tool results - prompt = "Please incorporate the tool results into your response." + return result + except Exception as exc: + logger.error("Grading failed with error: %s: %s", type(exc).__name__, exc) + return {"result": "unknown", "comment": f"Grading error: {exc}", "reasoning": ""} + + def _build_system_prompt(self, tools: list[ToolDefinition]) -> str: + if not tools: + return "You are an AI assistant. If no tools are suitable, answer directly." + + tool_names = ", ".join(tool.name for tool in tools) + return ( + "You are an AI assistant that uses tools to answer questions accurately. " + f"Available tools: {tool_names}.\n\n" + "IMPORTANT GUIDELINES:\n" + "1. If a tool call fails, READ THE ERROR MESSAGE CAREFULLY. " + "It often contains hints about what went wrong and how to fix it.\n" + "2. If you don't know the correct parameters (like field names or schema), " + "look for tools that can help you discover this information first.\n" + "3. Be persistent: try different approaches if one doesn't work.\n" + "4. YOU MUST ALWAYS PROVIDE A FINAL ANSWER. Even if tools fail, " + "provide the best answer you can with the information available, " + "or explain what information you were unable to retrieve." 
+ ) - # If we reach here, we hit the max iterations - return response, tool_calls_made + def _build_tool_models(self, tools: list[ToolDefinition]) -> dict[str, type[BaseModel]]: + models: dict[str, type[BaseModel]] = {} + for tool in tools: + fields: dict[str, Any] = {} + for param in tool.parameters: + py_type = self._map_param_type(param.type) + field_kwargs: dict[str, Any] = {} + if param.description: + field_kwargs["description"] = param.description + + if getattr(param, "schema", None): + schema_extra_raw: dict[str, Any] = param.schema or {} + schema_extra = dict(schema_extra_raw) + schema_extra.pop("type", None) + if schema_extra: + field_kwargs["json_schema_extra"] = schema_extra + + if param.default is not None: + default_value: Any = param.default + elif param.required: + default_value = ... + else: + default_value = None + + field_info = Field(default_value, **field_kwargs) + fields[param.name] = (py_type, field_info) + + models[tool.name] = create_model(f"{tool.name}_Args", **fields) + return models + + def _build_tool_error_response(self, tool_name: str, error_message: str) -> dict[str, Any]: + """Build a structured error response that guides the model to recover.""" + return { + "status": "error", + "tool": tool_name, + "error": error_message, + "suggestion": ( + "This tool call failed. Read the error message carefully - it often " + "contains hints about what went wrong. Consider: (1) calling this tool " + "with corrected arguments, (2) using a different tool to discover the " + "correct parameters first, or (3) trying a different approach." + ), + } + + def _format_tool_retry_message(self, error_response: dict[str, Any]) -> str: + """Convert a structured error response into a message for ModelRetry.""" + tool = error_response.get("tool", "unknown") + error_text = error_response.get("error", "Unknown error") + suggestion = error_response.get("suggestion") + base = f"Tool '{tool}' failed with error: {error_text}" + if suggestion: + return f"{base}. 
{suggestion}" + return base + + def _build_model_reference(self) -> ModelReference: + """Instantiate a model object for providers that support direct configuration.""" + model_type = (self.model_type or "").lower() + provider_kwargs = self._provider_kwargs() - def _extract_tool_calls(self, response: str) -> list[dict[str, Any]]: - """Extract tool calls from LLM response""" try: - # Try to parse as JSON (single tool call) - tool_call = json.loads(response.strip()) - if isinstance(tool_call, dict) and "tool" in tool_call: - return [tool_call] - elif isinstance(tool_call, list): - # Multiple tool calls - return [tc for tc in tool_call if isinstance(tc, dict) and "tool" in tc] - except json.JSONDecodeError: - pass - - # If not pure JSON, look for JSON in the response - - json_pattern = r'\{[^}]*"tool"[^}]*\}' - matches = re.findall(json_pattern, response) - - tool_calls = [] - for match in matches: - try: - tool_call = json.loads(match) - if "tool" in tool_call: - tool_calls.append(tool_call) - except json.JSONDecodeError: - continue - - return tool_calls - - async def _call_llm(self, prompt: str) -> str: - """Call the actual LLM API using the configured model""" - - # Log the full prompt in debug mode - logger.debug(f"=== LLM Request to {self.model_config.name} ===") - logger.debug(f"Full prompt:\n{prompt}") - logger.debug("=== End of prompt ===") - - model_type = self.model_config.get_type() - - if model_type == "claude": - return await self._call_claude(prompt) - elif model_type == "openai": - return await self._call_openai(prompt) - else: - raise ValueError(f"Unknown model type: {model_type}") - - async def _call_claude(self, prompt: str) -> str: - """Call Claude API""" - - async with httpx.AsyncClient() as client: - response = await client.post( - f"{self.model_config.base_url}/v1/messages", - headers={ - "x-api-key": self.model_config.api_key, - "anthropic-version": "2023-06-01", - "content-type": "application/json", - }, - json={ - "model": 
self.model_config.name, - "messages": [{"role": "user", "content": prompt}], - "max_tokens": 4096, - }, - timeout=self.model_config.timeout, - ) - - response.raise_for_status() - data = response.json() - - # Log response in debug mode - logger.debug(f"=== LLM Response from {self.model_config.name} ===") - logger.debug(f"Response: {data['content'][0]['text'][:500]}...") # First 500 chars - logger.debug("=== End of response ===") - - return cast(str, data["content"][0]["text"]) - - async def _call_openai(self, prompt: str) -> str: - """Call OpenAI API""" - - async with httpx.AsyncClient() as client: - response = await client.post( - f"{self.model_config.base_url}/chat/completions", - headers={ - "Authorization": f"Bearer {self.model_config.api_key}", - "Content-Type": "application/json", - }, - json={ - "model": self.model_config.name, - "messages": [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": prompt}, - ], - "max_tokens": 4096, - }, - timeout=self.model_config.timeout, + if model_type in {"openai", "openai-chat"}: + return OpenAIChatModel(self.model_name, provider=OpenAIProvider(**provider_kwargs)) + if model_type == "openai-responses": + return OpenAIResponsesModel( + self.model_name, provider=OpenAIProvider(**provider_kwargs) + ) + if model_type.startswith("anthropic"): + return AnthropicModel( + self.model_name, provider=AnthropicProvider(**provider_kwargs) + ) + except Exception as exc: # noqa: BLE001 + logger.warning( + "Failed to build custom provider for model '%s' (%s): %s. Falling back to string reference.", + self.model_name, + self.model_type, + exc, ) - response.raise_for_status() - data = response.json() - - # Log response in debug mode - logger.debug(f"=== LLM Response from {self.model_config.name} ===") - logger.debug( - f"Response: {data['choices'][0]['message']['content'][:500]}..." 
- ) # First 500 chars - logger.debug("=== End of response ===") - - return cast(str, data["choices"][0]["message"]["content"]) + return f"{self.model_type}:{self.model_name}" + + def _provider_kwargs(self) -> dict[str, Any]: + kwargs: dict[str, Any] = {} + if self.provider_config.base_url: + kwargs["base_url"] = self.provider_config.base_url + if self.provider_config.api_key: + kwargs["api_key"] = self.provider_config.api_key + if self.provider_config.timeout: + kwargs["timeout"] = self.provider_config.timeout + return kwargs + + def _map_param_type(self, param_type: str) -> Any: + """Map simple tool parameter types to Python/Pydantic types.""" + key = param_type.lower() + mapping: dict[tuple[str, ...], Any] = { + ("string", "str", "text"): str, + ("integer", "int"): int, + ("number", "float", "double"): float, + ("boolean", "bool"): bool, + ("object", "map", "dict"): dict[str, Any], + ("array", "list"): list[Any], + } + for aliases, py_type in mapping.items(): + if key in aliases: + return py_type + logger.warning("Unknown tool parameter type '%s'; defaulting to Any", param_type) + return Any diff --git a/src/mxcp/sdk/executor/plugins/python.py b/src/mxcp/sdk/executor/plugins/python.py index 6308cf77..0ee1d1e8 100644 --- a/src/mxcp/sdk/executor/plugins/python.py +++ b/src/mxcp/sdk/executor/plugins/python.py @@ -411,7 +411,7 @@ async def execute( return result except (ImportError, SyntaxError) as e: # These are executor-level errors that should be wrapped - logger.error(f"Python execution failed: {e}") + logger.debug(f"Python execution failed: {e}") # Record error metrics record_counter( "mxcp.python.executions_total", @@ -496,7 +496,7 @@ async def _execute_from_file( return await self._execute_function(func, params, context) except Exception as e: - logger.error(f"Failed to execute file {file_path}: {e}") + logger.debug(f"Failed to execute file {file_path}: {e}") raise async def _execute_inline( @@ -566,7 +566,7 @@ async def _execute_inline( raise ValueError("No 
suitable function found in inline code") except Exception as e: - logger.error(f"Failed to execute inline code: {e}") + logger.debug(f"Failed to execute inline code: {e}") raise async def _execute_function( @@ -606,5 +606,5 @@ def sync_function_wrapper() -> Any: return result except Exception as e: - logger.error(f"Function execution failed: {e}") + logger.debug(f"Function execution failed: {e}") raise diff --git a/src/mxcp/server/core/config/models.py b/src/mxcp/server/core/config/models.py index 98622b4c..e50a84dd 100644 --- a/src/mxcp/server/core/config/models.py +++ b/src/mxcp/server/core/config/models.py @@ -392,11 +392,12 @@ def _apply_defaults(self) -> UserAuthConfigModel: class UserModelConfigModel(BaseModel): model_config = ConfigDict(extra="forbid", frozen=True) - type: Literal["claude", "openai"] + type: Literal["anthropic", "openai"] api_key: str | None = None base_url: str | None = None timeout: int | None = None max_retries: int | None = None + options: dict[str, Any] = Field(default_factory=dict) class UserModelsConfigModel(BaseModel): diff --git a/src/mxcp/server/definitions/endpoints/utils.py b/src/mxcp/server/definitions/endpoints/utils.py index c9ca9cbf..ac2410c2 100644 --- a/src/mxcp/server/definitions/endpoints/utils.py +++ b/src/mxcp/server/definitions/endpoints/utils.py @@ -58,11 +58,7 @@ def get_endpoint_source_code( return source.code if source.file is not None: - source_path = Path(source.file) - if source_path.is_absolute(): - full_path = repo_root / source_path.relative_to("/") - else: - full_path = endpoint_file_path.parent / source_path + full_path = resolve_file_path(source.file, endpoint_file_path, repo_root) return full_path.read_text() raise ValueError("No source code found in endpoint definition") @@ -124,9 +120,24 @@ def resolve_file_path(file_path: str, endpoint_file_path: Path, repo_root: Path) """ source_path = Path(file_path) if source_path.is_absolute(): - return repo_root / source_path.relative_to("/") + return source_path 
+ + repo_candidate = (repo_root / source_path).resolve(strict=False) + + endpoint_path = endpoint_file_path + if not endpoint_path.is_absolute(): + endpoint_path = (repo_root / endpoint_path).resolve(strict=False) else: - return endpoint_file_path.parent / source_path + endpoint_path = endpoint_path.resolve(strict=False) + endpoint_candidate = (endpoint_path.parent / source_path).resolve(strict=False) + + if repo_candidate.exists(): + return repo_candidate + if endpoint_candidate.exists(): + return endpoint_candidate + + # Default to repo-relative path to keep behavior predictable even if file is missing + return repo_candidate def get_endpoint_name_or_uri( @@ -193,14 +204,14 @@ def prepare_source_for_execution( if not tool_def: raise ValueError("No tool definition found") source = tool_def.source - language = tool_def.language + language = (source.language if source else None) or tool_def.language function_name = tool_def.name elif endpoint_type == "resource": resource_def = endpoint_definition.resource if not resource_def: raise ValueError("No resource definition found") source = resource_def.source - language = resource_def.language + language = (source.language if source else None) or resource_def.language else: raise ValueError("Prompts don't have source code") @@ -236,4 +247,4 @@ def prepare_source_for_execution( endpoint_definition, endpoint_type, endpoint_file_path, repo_root ) return (language, source_code) - raise ValueError("No source code or file specified in endpoint definition") + raise ValueError("No source found for endpoint") diff --git a/src/mxcp/server/definitions/evals/models.py b/src/mxcp/server/definitions/evals/models.py index 78e489a0..cbed8dff 100644 --- a/src/mxcp/server/definitions/evals/models.py +++ b/src/mxcp/server/definitions/evals/models.py @@ -1,7 +1,7 @@ from __future__ import annotations from contextlib import suppress -from typing import Any, Literal +from typing import Any from pydantic import BaseModel, ConfigDict, 
field_validator, model_validator @@ -22,6 +22,7 @@ class EvalAssertionsModel(EvalBaseModel): must_not_call: list[str] | None = None answer_contains: list[str] | None = None answer_not_contains: list[str] | None = None + expected_answer: str | None = None class EvalTestModel(EvalBaseModel): @@ -49,7 +50,9 @@ class EvalSuiteModel(EvalBaseModel): mxcp: int = 1 suite: str description: str | None = None - model: Literal["claude-4-opus", "claude-4-sonnet", "gpt-4o", "gpt-4.1"] | None = None + model: str | None = None + expected_answer_model: str | None = None + system_prompt: str | None = None tests: list[EvalTestModel] @field_validator("suite") diff --git a/src/mxcp/server/executor/runners/tool.py b/src/mxcp/server/executor/runners/tool.py index d1f4ae67..cfcd1eec 100644 --- a/src/mxcp/server/executor/runners/tool.py +++ b/src/mxcp/server/executor/runners/tool.py @@ -6,16 +6,25 @@ """ import logging +from dataclasses import dataclass +from pathlib import Path from typing import Any from mxcp.sdk.auth import UserContextModel from mxcp.sdk.executor import ExecutionContext, ExecutionEngine +from mxcp.server.core.config.site_config import find_repo_root from mxcp.server.definitions.endpoints.models import EndpointDefinitionModel -from mxcp.server.definitions.endpoints.utils import detect_language_from_source, extract_source_info +from mxcp.server.definitions.endpoints.utils import prepare_source_for_execution logger = logging.getLogger(__name__) +@dataclass(frozen=True) +class EndpointWithPath: + definition: EndpointDefinitionModel + path: Path + + class EndpointToolExecutor: """Tool executor that executes tools via SDK ExecutionEngine and endpoints. 
@@ -41,7 +50,7 @@ class EndpointToolExecutor: >>> llm_executor = LLMExecutor(model_config, tool_definitions, tool_executor) """ - def __init__(self, engine: ExecutionEngine, endpoints: list[EndpointDefinitionModel]): + def __init__(self, engine: ExecutionEngine, endpoints: list[EndpointWithPath]): """Initialize the endpoint tool executor. Args: @@ -49,15 +58,16 @@ def __init__(self, engine: ExecutionEngine, endpoints: list[EndpointDefinitionMo endpoints: List of endpoint definitions """ self.engine = engine - self.endpoints = endpoints + self.endpoints = [entry.definition for entry in endpoints] # Create lookup map for faster tool resolution - self._tool_map: dict[str, EndpointDefinitionModel] = {} - for endpoint_def in endpoints: + self._tool_map: dict[str, tuple[EndpointDefinitionModel, Path]] = {} + for entry in endpoints: + endpoint_def, path = entry.definition, entry.path if endpoint_def.tool: - self._tool_map[endpoint_def.tool.name] = endpoint_def + self._tool_map[endpoint_def.tool.name] = (endpoint_def, path) elif endpoint_def.resource: - self._tool_map[endpoint_def.resource.uri] = endpoint_def + self._tool_map[endpoint_def.resource.uri] = (endpoint_def, path) logger.info(f"EndpointToolExecutor initialized with {len(endpoints)} endpoints") @@ -82,60 +92,43 @@ async def execute_tool( Exception: If execution fails """ # Find the endpoint - endpoint_def = self._tool_map.get(tool_name) - if not endpoint_def: + entry = self._tool_map.get(tool_name) + if not entry: available_tools = list(self._tool_map.keys()) raise ValueError(f"Tool '{tool_name}' not found. 
Available tools: {available_tools}") + endpoint_def, endpoint_path = entry # Create execution context context = ExecutionContext(user_context=user_context) # Determine the source code and language - source_info = self._get_source_code(endpoint_def, tool_name) - language = self._get_language(endpoint_def, tool_name, source_info) + if endpoint_def.tool: + endpoint_type = "tool" + elif endpoint_def.resource: + endpoint_type = "resource" + else: + raise ValueError(f"Endpoint '{tool_name}' has no tool or resource definition") + + repo_root = find_repo_root() + language, source_payload = prepare_source_for_execution( + endpoint_def, + endpoint_type, + endpoint_path, + repo_root, + include_function_name=True, + ) logger.debug(f"Executing tool '{tool_name}' with language '{language}'") try: # Execute using the SDK engine result = await self.engine.execute( - language=language, source_code=source_info, params=arguments, context=context + language=language, source_code=source_payload, params=arguments, context=context ) logger.debug(f"Tool '{tool_name}' executed successfully") return result except Exception as e: - logger.error(f"Tool '{tool_name}' execution failed: {e}") + logger.debug(f"Tool '{tool_name}' execution failed: {e}") raise - - def _get_source_code(self, endpoint_def: EndpointDefinitionModel, tool_name: str) -> str: - """Extract source code from endpoint definition.""" - # Get the tool or resource definition - source = None - if endpoint_def.tool: - source = endpoint_def.tool.source - elif endpoint_def.resource: - source = endpoint_def.resource.source - - if not source: - raise ValueError(f"No source found for endpoint '{tool_name}'") - - source_type, source_value = extract_source_info(source) - return source_value - - def _get_language( - self, endpoint_def: EndpointDefinitionModel, tool_name: str, source_info: str - ) -> str: - """Determine the programming language for the endpoint.""" - # Get the tool or resource definition - source = None - if 
# Third-party loggers that are clamped to WARNING during eval runs.
_NOISY_EVAL_LOGGERS = (
    "openai",
    "openai._base_client",
    "openai._client",
    "openai._streaming",
    "httpx",
    "httpcore",
    "httpcore.connection",
    "httpcore.connectionpool",
    "urllib3",
    "urllib3.connectionpool",
)

# Matches ANSI CSI escape sequences so styled text can be inspected as plain text.
_ANSI_ESCAPE = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]")

# Cursor to start of previous line, then erase that line.
_ANSI_MOVE_UP_CLEAR = "\033[F\033[2K"

# A message whose visible text starts with one of these is a finished result.
_FINAL_INDICATORS = ("✓", "✗")


class ProgressRenderer:
    """Keeps one status line per key and redraws them in place on a TTY.

    On a non-TTY stream every update is simply echoed as its own line.
    """

    def __init__(self, is_tty: bool = True) -> None:
        # key -> latest in-progress text for that key
        self._lines: dict[str, str] = {}
        # keys in first-seen order; this is the on-screen order
        self._order: list[str] = []
        # number of progress lines currently on screen
        self._lines_printed: int = 0
        self._is_tty = is_tty

    def clear_all(self) -> None:
        """Erase every progress line currently on screen (TTY only)."""
        if not (self._is_tty and self._lines_printed):
            return
        sys.stdout.write(_ANSI_MOVE_UP_CLEAR * self._lines_printed)
        sys.stdout.flush()
        self._lines_printed = 0

    def _render(self) -> None:
        """Write all tracked in-progress lines and remember how many were written."""
        if self._is_tty and self._order:
            sys.stdout.write("".join(self._lines.get(key, "") + "\n" for key in self._order))
            sys.stdout.flush()
            self._lines_printed = len(self._order)

    def _is_final_message(self, msg: str) -> bool:
        """Return True when the visible text of ``msg`` starts with a final indicator."""
        visible = _ANSI_ESCAPE.sub("", msg).lstrip()
        for indicator in _FINAL_INDICATORS:
            if visible.startswith(indicator):
                return True
        return False

    def update(self, key: str, msg: str) -> None:
        """Record ``msg`` for ``key``; final messages are printed permanently."""
        if not self._is_tty:
            click.echo(msg)
            return

        finished = self._is_final_message(msg)
        # Both paths start from a clean slate and redraw afterwards.
        self.clear_all()
        if finished:
            # Stop tracking the key, then emit its result above the live lines.
            self._order = [existing for existing in self._order if existing != key]
            self._lines.pop(key, None)
            click.echo(msg)
        else:
            if key not in self._order:
                self._order.append(key)
            self._lines[key] = msg
        self._render()


def _suppress_noisy_eval_logs(debug: bool) -> None:
    """Clamp chatty third-party loggers to WARNING unless ``debug`` is set."""
    if not debug:
        for logger_name in _NOISY_EVAL_LOGGERS:
            target = logging.getLogger(logger_name)
            target.setLevel(logging.WARNING)
            target.propagate = True
for failure in test.get("failures", []): - output.append(f" {click.style('💡', fg='yellow')} {failure}") + lines = failure.splitlines() + if not lines: + continue + indent = " " * 6 + continuation_indent = indent + " " * 3 + output.append(f"{indent}{click.style('💡', fg='yellow')} {lines[0]}") + for line in lines[1:]: + output.append(f"{continuation_indent}{line}") # Show passed suites passed = [s for s in suites if s.get("status") == "passed"] @@ -268,6 +372,7 @@ def evals( # Configure logging configure_logging_from_config(user_config=user_config, debug=debug) + _suppress_noisy_eval_logs(debug) # Run async implementation run_async_cli( _evals_impl( @@ -339,6 +444,11 @@ async def _evals_impl( # Run evals start_time = time.time() + + # Create progress renderer for TTY-aware output + is_tty = click.get_text_stream("stdout").isatty() + progress = ProgressRenderer(is_tty=is_tty) + if suite_name: results = await run_eval_suite( suite_name, @@ -347,6 +457,7 @@ async def _evals_impl( profile, cli_user_context=cli_user_context, override_model=model, + progress_callback=progress.update, ) else: results = await run_all_evals( @@ -355,6 +466,7 @@ async def _evals_impl( profile, cli_user_context=cli_user_context, override_model=model, + progress_callback=progress.update, ) elapsed_time = time.time() - start_time results["elapsed_time"] = elapsed_time @@ -362,6 +474,7 @@ async def _evals_impl( if json_output: output_result(results, json_output, debug) else: + progress.clear_all() click.echo(format_eval_results(results, debug)) # Exit with error code if any tests failed diff --git a/src/mxcp/server/services/evals/service.py b/src/mxcp/server/services/evals/service.py index 43870b33..0d5b2468 100644 --- a/src/mxcp/server/services/evals/service.py +++ b/src/mxcp/server/services/evals/service.py @@ -1,37 +1,36 @@ import logging import time +from collections.abc import Callable from typing import Any +import click +from pydantic_ai import ModelSettings + from mxcp.sdk.auth import 
UserContextModel -from mxcp.sdk.evals import ( - ClaudeConfig, - LLMExecutor, - ModelConfigType, - OpenAIConfig, - ParameterDefinition, - ToolDefinition, -) +from mxcp.sdk.evals import LLMExecutor, ParameterDefinition, ProviderConfig, ToolDefinition from mxcp.sdk.validator import TypeSchemaModel from mxcp.server.core.config.models import SiteConfigModel, UserConfigModel from mxcp.server.core.config.site_config import find_repo_root from mxcp.server.definitions.endpoints.loader import EndpointLoader -from mxcp.server.definitions.endpoints.models import EndpointDefinitionModel +from mxcp.server.definitions.endpoints.models import ParamDefinitionModel, TypeDefinitionModel from mxcp.server.definitions.evals.loader import discover_eval_files, load_eval_suite from mxcp.server.executor.engine import create_runtime_environment -from mxcp.server.executor.runners.tool import EndpointToolExecutor +from mxcp.server.executor.runners.tool import EndpointToolExecutor, EndpointWithPath logger = logging.getLogger(__name__) -def _create_model_config(model: str, user_config: UserConfigModel) -> ModelConfigType: - """Create a model configuration from user config. +def _create_model_config( + model: str, user_config: UserConfigModel +) -> tuple[str, str, dict[str, Any], ProviderConfig]: + """Create a model configuration tuple from user config. 
Args: model: Model name to use user_config: User configuration containing model settings Returns: - Configured model object + Tuple of (model_name, model_type, options, provider_config) Raises: ValueError: If model is not configured or has invalid type @@ -46,45 +45,171 @@ def _create_model_config(model: str, user_config: UserConfigModel) -> ModelConfi model_type = model_config.type api_key = model_config.api_key + options = dict(model_config.options or {}) + api_mode = options.get("api") or options.get("endpoint") if not api_key: raise ValueError(f"No API key configured for model '{model}'") - if model_type == "claude": - base_url = model_config.base_url or "https://api.anthropic.com" - timeout = model_config.timeout or 30 - return ClaudeConfig(name=model, api_key=api_key, base_url=base_url, timeout=timeout) - elif model_type == "openai": - base_url = model_config.base_url or "https://api.openai.com/v1" - timeout = model_config.timeout or 30 - return OpenAIConfig(name=model, api_key=api_key, base_url=base_url, timeout=timeout) - else: + if model_type not in {"anthropic", "openai"}: raise ValueError(f"Unknown model type: {model_type}") - -def _load_endpoints(site_config: SiteConfigModel) -> list[EndpointDefinitionModel]: + effective_model_type = ( + "openai-responses" if model_type == "openai" and api_mode == "responses" else model_type + ) + + base_url = model_config.base_url + timeout = model_config.timeout + + # Ensure timeout also flows through options if present + if timeout and "timeout" not in options: + options["timeout"] = timeout + + provider_config = ProviderConfig(api_key=api_key, base_url=base_url, timeout=timeout) + + return model, effective_model_type, options, provider_config + + +def _build_model_settings( + model_name: str, model_type: str, model_options: dict[str, Any], allowed_keys: set[str] +) -> ModelSettings: + model_opts = dict(model_options) + model_opts.pop("api", None) + model_opts.pop("endpoint", None) + + recognized_options = {k: v for 
k, v in model_opts.items() if k in allowed_keys} + body_extras: dict[str, Any] = dict(recognized_options.get("extra_body") or {}) + header_extras: dict[str, str] = dict(recognized_options.get("extra_headers") or {}) + ignored: list[str] = [] + + for key, value in model_opts.items(): + if key in allowed_keys: + continue + if key.startswith("body:"): + body_extras[key.split(":", 1)[1]] = value + elif key.startswith("header:"): + header_value: str + if isinstance(value, list): + header_value = ",".join(str(v) for v in value) + else: + header_value = str(value) + header_extras[key.split(":", 1)[1]] = header_value + else: + ignored.append(key) + + if ignored: + logger.warning( + "Ignoring unprefixed model options for model '%s': %s. " + "Use 'body:' or 'header:' prefixes.", + model_name, + sorted(ignored), + ) + + if body_extras: + recognized_options["extra_body"] = body_extras + if header_extras: + recognized_options["extra_headers"] = header_extras + + if "max_tokens" not in recognized_options: + recognized_options["max_tokens"] = 10_000 + + return ModelSettings(**recognized_options) # type: ignore[typeddict-item,no-any-return] + + +def _type_definition_to_schema(type_definition: TypeDefinitionModel) -> dict[str, Any]: + schema: dict[str, Any] = {"type": type_definition.type} + + if type_definition.description: + schema["description"] = type_definition.description + if type_definition.default is not None: + schema["default"] = type_definition.default + if type_definition.enum: + schema["enum"] = list(type_definition.enum) + if type_definition.examples: + schema["examples"] = list(type_definition.examples) + + if type_definition.type == "string": + if type_definition.format: + schema["format"] = type_definition.format + if type_definition.minLength is not None: + schema["minLength"] = type_definition.minLength + if type_definition.maxLength is not None: + schema["maxLength"] = type_definition.maxLength + if type_definition.pattern: + schema["pattern"] = 
type_definition.pattern + elif type_definition.type in {"number", "integer"}: + if type_definition.minimum is not None: + schema["minimum"] = type_definition.minimum + if type_definition.maximum is not None: + schema["maximum"] = type_definition.maximum + if type_definition.exclusiveMinimum is not None: + schema["exclusiveMinimum"] = type_definition.exclusiveMinimum + if type_definition.exclusiveMaximum is not None: + schema["exclusiveMaximum"] = type_definition.exclusiveMaximum + if type_definition.multipleOf is not None: + schema["multipleOf"] = type_definition.multipleOf + elif type_definition.type == "array": + if type_definition.items is not None: + schema["items"] = _type_definition_to_schema(type_definition.items) + else: + schema["items"] = {"type": "string"} + if type_definition.minItems is not None: + schema["minItems"] = type_definition.minItems + if type_definition.maxItems is not None: + schema["maxItems"] = type_definition.maxItems + if type_definition.uniqueItems is not None: + schema["uniqueItems"] = type_definition.uniqueItems + elif type_definition.type == "object": + if type_definition.properties: + schema["properties"] = { + key: _type_definition_to_schema(value) + for key, value in type_definition.properties.items() + } + if type_definition.required: + schema["required"] = list(type_definition.required) + if type_definition.additionalProperties is not None: + schema["additionalProperties"] = type_definition.additionalProperties + + return schema + + +def _parameter_definition_from_model(param: ParamDefinitionModel) -> ParameterDefinition: + has_default = "default" in param.model_fields_set + schema = _type_definition_to_schema(param) + schema.pop("name", None) + return ParameterDefinition( + name=param.name, + type=param.type, + description=param.description or "", + default=param.default if has_default else None, + required=not has_default, + schema=schema or None, + ) + + +def _load_endpoints(site_config: SiteConfigModel) -> 
list[EndpointWithPath]: """Load all available endpoints. Args: site_config: Site configuration for endpoint discovery Returns: - List of endpoint definitions + List of (endpoint definition, file path) """ loader = EndpointLoader(site_config) - endpoints: list[EndpointDefinitionModel] = [] + endpoints: list[EndpointWithPath] = [] discovered = loader.discover_endpoints() - for _path, endpoint_def, error in discovered: + for path, endpoint_def, error in discovered: if error is None and endpoint_def and (endpoint_def.tool or endpoint_def.resource): # Only include endpoints that have a tool or resource definition - endpoints.append(endpoint_def) + endpoints.append(EndpointWithPath(endpoint_def, path)) return endpoints def _convert_endpoints_to_tool_definitions( - endpoints: list[EndpointDefinitionModel], + endpoints: list[EndpointWithPath], ) -> list[ToolDefinition]: """Convert endpoint definitions to ToolDefinition objects for the LLM. @@ -96,22 +221,14 @@ def _convert_endpoints_to_tool_definitions( """ tool_definitions = [] - for endpoint_def in endpoints: + for entry in endpoints: + endpoint_def = entry.definition if endpoint_def.tool: tool = endpoint_def.tool - tool_parameters = [] - for param in tool.parameters or []: - has_default = "default" in param.model_fields_set - tool_parameters.append( - ParameterDefinition( - name=param.name, - type=param.type, - description=param.description or "", - default=param.default if has_default else None, - required=not has_default, - ) - ) + tool_parameters = [ + _parameter_definition_from_model(param) for param in (tool.parameters or []) + ] return_type = None if tool.return_: @@ -138,18 +255,9 @@ def _convert_endpoints_to_tool_definitions( elif endpoint_def.resource: resource = endpoint_def.resource - resource_parameters = [] - for param in resource.parameters or []: - has_default = "default" in param.model_fields_set - resource_parameters.append( - ParameterDefinition( - name=param.name, - type=param.type, - 
description=param.description or "", - default=param.default if has_default else None, - required=not has_default, - ) - ) + resource_parameters = [ + _parameter_definition_from_model(param) for param in (resource.parameters or []) + ] return_type = None if resource.return_: @@ -171,6 +279,132 @@ def _convert_endpoints_to_tool_definitions( return tool_definitions +def _format_expected_answer_failure( + response: str, + expected: str, + grade: str | None, + comment: str | None, + reasoning: str | None, +) -> str: + """Build a multi-line failure detail block for expected-answer grading.""" + lines = [ + f"LLM Answer: {response}", + f"Expected: {expected}", + f"Grade: {grade or 'unknown'}", + f"Comment: {comment or 'n/a'}", + f"Reasoning: {reasoning or 'n/a'}", + ] + return "\n".join(lines) + + +async def _evaluate_test_assertions( + test: Any, + response: str, + tool_calls: list[Any], + execution_error: str | None, + grader: LLMExecutor, +) -> tuple[list[str], dict[str, Any] | None]: + """Evaluate all assertions for a test. 
+ + Returns: + Tuple of (failures list, expected_answer_evaluation dict or None) + """ + failures: list[str] = [] + evaluation: dict[str, Any] | None = None + assertions = test.assertions + + # If the agent failed to execute, report it clearly + if execution_error: + failures.append(f"Agent execution failed: {execution_error}") + + for call in tool_calls: + if call.error: + logger.debug( + "Tool '%s' failed during test '%s': %s", + call.tool, + test.name, + call.error, + ) + + # Check must_call assertions + if assertions.must_call: + for expected_call in assertions.must_call: + expected_tool = expected_call.tool + expected_args = expected_call.args or {} + + found = False + for call in tool_calls: + if call.tool == expected_tool: + actual_args = call.arguments or {} + if all(actual_args.get(k) == v for k, v in expected_args.items()): + found = True + break + + if not found: + failures.append( + f"Expected call to '{expected_tool}' with args {expected_args} not found" + ) + + # Check must_not_call assertions + if assertions.must_not_call: + for forbidden_tool in assertions.must_not_call: + if any(call.tool == forbidden_tool for call in tool_calls): + failures.append(f"Tool '{forbidden_tool}' was called but should not have been") + + # Check answer_contains assertions + if assertions.answer_contains: + for expected_text in assertions.answer_contains: + if expected_text.lower() not in response.lower(): + failures.append(f"Expected text '{expected_text}' not found in response") + + # Check answer_not_contains assertions + if assertions.answer_not_contains: + for forbidden_text in assertions.answer_not_contains: + if forbidden_text.lower() in response.lower(): + failures.append(f"Forbidden text '{forbidden_text}' found in response") + + if assertions.expected_answer: + logger.debug( + "Evaluating expected_answer assertion for test '%s': response_length=%d, expected='%s'", + test.name, + len(response), + ( + assertions.expected_answer[:100] + "..." 
+ if len(assertions.expected_answer) > 100 + else assertions.expected_answer + ), + ) + evaluation = await grader.evaluate_expected_answer(response, assertions.expected_answer) + grade = (evaluation.get("result") or "").lower() + comment = evaluation.get("comment") or "Model answer did not match expected" + reasoning = evaluation.get("reasoning") or "" + + logger.debug( + "Expected answer evaluation for '%s': grade=%s, response='%s'", + test.name, + grade, + response[:150] + "..." if len(response) > 150 else response, + ) + + detail = _format_expected_answer_failure( + response, + assertions.expected_answer, + grade or "unknown", + comment, + reasoning, + ) + if grade != "correct": + logger.info( + "Test '%s' failed expected_answer check: grade=%s, comment=%s", + test.name, + grade, + comment, + ) + failures.append(detail) + + return failures, evaluation + + async def run_eval_suite( suite_name: str, user_config: UserConfigModel, @@ -178,6 +412,8 @@ async def run_eval_suite( profile: str | None, cli_user_context: UserContextModel | None = None, override_model: str | None = None, + progress_callback: Callable[[str, str], None] | None = None, + expected_answer_model: str | None = None, ) -> dict[str, Any]: """Run a specific eval suite by name. @@ -210,9 +446,16 @@ async def run_eval_suite( "error": "No model specified. 
Set 'model' in eval suite or configure a default model.", "suite": suite_name, } + grading_model = expected_answer_model or getattr(eval_suite, "expected_answer_model", None) # Create model configuration - model_config = _create_model_config(model, user_config) + model_name, model_type, model_options, provider_config = _create_model_config( + model, user_config + ) + allowed_keys = set(ModelSettings.__annotations__.keys()) + model_opts = dict(model_options) + + model_settings = _build_model_settings(model_name, model_type, model_opts, allowed_keys) # Load endpoints endpoints = _load_endpoints(site_config) @@ -226,20 +469,56 @@ async def run_eval_suite( # Create tool executor that bridges LLM calls to endpoint execution tool_executor = EndpointToolExecutor(engine, endpoints) + grading_executor: LLMExecutor | None = None + + if grading_model: + grade_model_name, grade_model_type, grade_opts, grade_provider = _create_model_config( + grading_model, user_config + ) + grade_settings = _build_model_settings( + grade_model_name, grade_model_type, dict(grade_opts), allowed_keys + ) + grading_executor = LLMExecutor( + grade_model_name, + grade_model_type, + grade_settings, + [], # no tools needed for grading + tool_executor, + provider_config=grade_provider, + ) logger.info(f"Running eval suite: {suite_name} from {file_path}") logger.info(f"Suite description: {eval_suite.description or 'No description'}") logger.info(f"Model: {model}") logger.info(f"Number of tests: {len(eval_suite.tests)}") + total_tests = len(eval_suite.tests) + try: # Create LLM executor with model config, tool definitions, and tool executor - executor = LLMExecutor(model_config, tool_definitions, tool_executor) + executor = LLMExecutor( + model_name, + model_type, + model_settings, + tool_definitions, + tool_executor, + provider_config=provider_config, + system_prompt=eval_suite.system_prompt, + ) # Run each test tests = [] - for test in eval_suite.tests: + for idx, test in enumerate(eval_suite.tests, 
start=1): test_start = time.time() + if progress_callback: + progress_callback( + f"test:{suite_name}:{idx}", + " ⏳ " + + click.style( + f"[{suite_name}] {idx}/{total_tests} • {test.name}...", + fg="cyan", + ), + ) # Determine user context for this test test_user_context = cli_user_context @@ -257,67 +536,60 @@ async def run_eval_suite( try: # Execute the prompt - response, tool_calls = await executor.execute_prompt( + agent_result = await executor.execute_prompt( test.prompt, user_context=test_user_context ) + response = agent_result.answer + tool_calls = agent_result.tool_calls + execution_error = agent_result.error + # Evaluate assertions - failures = [] - assertions = test.assertions - - # Check must_call assertions - if assertions.must_call: - for expected_call in assertions.must_call: - expected_tool = expected_call.tool - expected_args = expected_call.args or {} - - found = False - for call in tool_calls: - if call["tool"] == expected_tool: - actual_args = call.get("arguments", {}) - if all(actual_args.get(k) == v for k, v in expected_args.items()): - found = True - break - - if not found: - failures.append( - f"Expected call to '{expected_tool}' with args {expected_args} not found" - ) - - # Check must_not_call assertions - if assertions.must_not_call: - for forbidden_tool in assertions.must_not_call: - if any(call["tool"] == forbidden_tool for call in tool_calls): - failures.append( - f"Tool '{forbidden_tool}' was called but should not have been" - ) - - # Check answer_contains assertions - if assertions.answer_contains: - for expected_text in assertions.answer_contains: - if expected_text.lower() not in response.lower(): - failures.append( - f"Expected text '{expected_text}' not found in response" - ) - - # Check answer_not_contains assertions - if assertions.answer_not_contains: - for forbidden_text in assertions.answer_not_contains: - if forbidden_text.lower() in response.lower(): - failures.append(f"Forbidden text '{forbidden_text}' found in response") 
+ grader = grading_executor or executor + failures, evaluation = await _evaluate_test_assertions( + test, response, tool_calls, execution_error, grader + ) test_time = time.time() - test_start + passed = len(failures) == 0 tests.append( { "name": test.name, "description": test.description, - "passed": len(failures) == 0, + "passed": passed, "failures": failures, "time": test_time, - "details": {"response": response, "tool_calls": tool_calls}, + "details": { + "response": response, + "execution_error": execution_error, + "tool_calls": [ + { + "id": call.id, + "tool": call.tool, + "arguments": call.arguments, + "result": call.result, + "error": call.error, + } + for call in tool_calls + ], + "expected_answer": test.assertions.expected_answer, + "expected_answer_evaluation": evaluation, + }, } ) + if progress_callback: + icon = click.style("✓", fg="green") if passed else click.style("✗", fg="red") + progress_callback( + f"test:{suite_name}:{idx}", + " " + + icon + + " " + + click.style( + f"[{suite_name}] {idx}/{total_tests} • {test.name} ({test_time:.2f}s)", + fg="green" if passed else "red", + ), + ) except Exception as e: test_time = time.time() - test_start @@ -330,6 +602,15 @@ async def run_eval_suite( "time": test_time, } ) + if progress_callback: + progress_callback( + f"test:{suite_name}:{idx}", + " ✗ " + + click.style( + f"[{suite_name}] {idx}/{total_tests} • {test.name} errored: {e} ({test_time:.2f}s)", + fg="red", + ), + ) finally: # Clean up runtime environment @@ -352,6 +633,8 @@ async def run_all_evals( profile: str | None, cli_user_context: UserContextModel | None = None, override_model: str | None = None, + progress_callback: Callable[[str, str], None] | None = None, + expected_answer_model: str | None = None, ) -> dict[str, Any]: """Run all eval suites found in the repository. 
@@ -388,9 +671,16 @@ async def run_all_evals( if eval_suite is None: continue suite_name = eval_suite.suite or "unnamed" - # Run the suite + # Run the suite (progress_callback is passed through to run_eval_suite) result = await run_eval_suite( - suite_name, user_config, site_config, profile, cli_user_context, override_model + suite_name, + user_config, + site_config, + profile, + cli_user_context, + override_model, + progress_callback=progress_callback, + expected_answer_model=expected_answer_model, ) # Get relative path @@ -399,8 +689,10 @@ async def run_all_evals( except Exception: relative_path = str(file_path) - # Map new result structure to old structure for backward compatibility - all_passed = result.get("summary", {}).get("failed", 1) == 0 if result else False + # Determine pass/fail + all_passed = bool(result.get("all_passed")) + if not all_passed and result.get("summary"): + all_passed = result["summary"].get("failed", 1) == 0 suites.append( { diff --git a/tests/sdk/evals/test_executor.py b/tests/sdk/evals/test_executor.py index 73561412..2805187c 100644 --- a/tests/sdk/evals/test_executor.py +++ b/tests/sdk/evals/test_executor.py @@ -1,27 +1,62 @@ -"""Tests for mxcp.sdk.evals.executor module.""" - +import asyncio from typing import Any -from unittest.mock import AsyncMock import pytest +from pydantic_ai import ModelSettings +from pydantic_ai.exceptions import ModelRetry from mxcp.sdk.auth import UserContextModel -from mxcp.sdk.evals import ( - ClaudeConfig, - LLMExecutor, - OpenAIConfig, - ParameterDefinition, - ToolDefinition, -) -from mxcp.sdk.validator import TypeSchemaModel +from mxcp.sdk.evals import ParameterDefinition, ToolDefinition +from mxcp.sdk.evals.executor import AgentResult, GradeResult, LLMExecutor, ProviderConfig -class MockToolExecutor: - """Mock tool executor for testing.""" +class FakeRun: + def __init__(self, output: Any) -> None: + self.output = output + + +class FakeAgent: + def __init__( + self, + *, + tools: list[Any], + output: 
Any, + tool_args: dict[str, dict[str, Any]], + tool_callables: dict[str, Any] | None = None, + ) -> None: + self.tools = tools + self.output = output + self.tool_args = tool_args + self.tool_callables = tool_callables or {} + + async def run( + self, _prompt: str, deps: Any | None = None, model_settings: Any | None = None + ) -> FakeRun: + for tool in self.tools: + tool_name = getattr(tool, "name", None) or getattr( + getattr(tool, "tool_def", None), "name", None + ) + # Look up callable from provided map + fn = self.tool_callables.get(tool_name or "") + args = self.tool_args.get(tool_name or "", {}) + if fn: + if asyncio.iscoroutinefunction(fn): + try: + await fn(**args) + except ModelRetry: + continue + else: + try: + fn(**args) + except ModelRetry: + continue + return FakeRun(self.output) + +class MockToolExecutor: def __init__(self, responses: dict[str, Any] | None = None): self.responses = responses or {} - self.calls = [] + self.calls: list[dict[str, Any]] = [] async def execute_tool( self, @@ -29,275 +64,482 @@ async def execute_tool( arguments: dict[str, Any], user_context: UserContextModel | None = None, ) -> Any: - """Mock tool execution that records calls and returns predefined responses.""" self.calls.append( {"tool_name": tool_name, "arguments": arguments, "user_context": user_context} ) - if tool_name in self.responses: - result = self.responses[tool_name] - if isinstance(result, Exception): - raise result - return result - - return f"Mock result for {tool_name}" - - -class TestLLMExecutor: - """Test cases for LLMExecutor.""" - - def setup_method(self): - """Set up test fixtures.""" - self.model_config = ClaudeConfig(name="claude-3-haiku", api_key="test-key") - - self.tools = [ - ToolDefinition( - name="get_weather", - description="Get current weather for a location", - parameters=[ - ParameterDefinition(name="location", type="string", description="City name") - ], - ), - ToolDefinition( - name="calculate", - description="Perform mathematical 
calculations", - parameters=[ - ParameterDefinition( - name="expression", - type="string", - description="Mathematical expression to evaluate", - ) - ], - ), - ] - - self.tool_executor = MockToolExecutor( - {"get_weather": {"temperature": 22, "condition": "sunny"}, "calculate": 42} + value = self.responses[tool_name] + if isinstance(value, Exception): + raise value + return value + return {"echo": arguments} + + +def make_executor( + tools: list[ToolDefinition] | None = None, + responses: dict[str, Any] | None = None, + system_prompt: str | None = None, + agent_retries: int = 3, +) -> LLMExecutor: + default_tools = [ + ToolDefinition( + name="get_weather", + description="Weather lookup", + parameters=[ParameterDefinition(name="location", type="string", required=True)], ) - - self.executor = LLMExecutor(self.model_config, self.tools, self.tool_executor) - - def test_initialization(self): - """Test LLMExecutor initialization.""" - assert self.executor.model_config == self.model_config - assert self.executor.available_tools == self.tools - assert self.executor.tool_executor == self.tool_executor - - def test_format_tools_for_prompt(self): - """Test tool formatting for prompts.""" - formatted = self.executor._format_tools_for_prompt() - - assert "=== AVAILABLE TOOLS ===" in formatted - assert "Tool: get_weather" in formatted - assert "Tool: calculate" in formatted - assert "Description: Get current weather for a location" in formatted - assert "location (string): City name" in formatted - - def test_format_tools_for_prompt_empty(self): - """Test tool formatting with no tools.""" - executor = LLMExecutor(self.model_config, [], self.tool_executor) - formatted = executor._format_tools_for_prompt() - assert formatted == "No tools available." 
- - def test_get_claude_prompt(self): - """Test Claude-specific prompt formatting.""" - prompt = self.executor._get_claude_prompt( - "What's the weather in Paris?", "Mock tools", None + ] + tool_defs = tools or default_tools + default_responses = {"get_weather": {"temperature": 20}} if tools is None else {} + tool_executor = MockToolExecutor(responses or default_responses) + executor = LLMExecutor( + "claude-test", + "anthropic", + ModelSettings(), + tool_defs, + tool_executor, + provider_config=ProviderConfig(api_key="key", base_url="https://api.anthropic.com"), + system_prompt=system_prompt, + agent_retries=agent_retries, + ) + return executor + + +def test_executor_uses_custom_system_prompt() -> None: + custom_prompt = "You are a specialized assistant." + executor = make_executor(system_prompt=custom_prompt) + + assert executor.system_prompt == custom_prompt + + +def test_executor_passes_agent_retries_to_agent() -> None: + observed: list[int | None] = [] + + executor = make_executor(agent_retries=5) + + def agent_factory(**kwargs: Any) -> FakeAgent: + observed.append(kwargs.get("retries")) + return FakeAgent( + tools=kwargs["tools"], + output="ok", + tool_args={"get_weather": {"location": "Paris"}}, + tool_callables=kwargs.get("_tool_callables", {}), ) - assert "You are a helpful assistant" in prompt - assert "Mock tools" in prompt - assert "Human: What's the weather in Paris?" 
in prompt - assert '{"tool": "tool_name"' in prompt - - def test_get_openai_prompt(self): - """Test OpenAI-specific prompt formatting.""" - prompt = self.executor._get_openai_prompt("Calculate 2+2", "Mock tools", None) - - assert "You are a helpful assistant" in prompt - assert "Mock tools" in prompt - assert "User: Calculate 2+2" in prompt - assert '{"tool": "tool_name"' in prompt - - def test_extract_tool_calls_single(self): - """Test extraction of single tool call.""" - response = '{"tool": "get_weather", "arguments": {"location": "Paris"}}' - calls = self.executor._extract_tool_calls(response) - - assert len(calls) == 1 - assert calls[0]["tool"] == "get_weather" - assert calls[0]["arguments"]["location"] == "Paris" - - def test_extract_tool_calls_multiple(self): - """Test extraction of multiple tool calls.""" - response = '[{"tool": "get_weather", "arguments": {"location": "Paris"}}, {"tool": "calculate", "arguments": {"expression": "2+2"}}]' - calls = self.executor._extract_tool_calls(response) - - assert len(calls) == 2 - assert calls[0]["tool"] == "get_weather" - assert calls[1]["tool"] == "calculate" - - def test_extract_tool_calls_none(self): - """Test extraction when no tool calls present.""" - response = "The weather in Paris is sunny and 22 degrees." - calls = self.executor._extract_tool_calls(response) - - assert len(calls) == 0 - - def test_extract_tool_calls_invalid_json(self): - """Test extraction with invalid JSON.""" - response = "Invalid JSON {tool: get_weather}" - calls = self.executor._extract_tool_calls(response) - - assert len(calls) == 0 - - @pytest.mark.asyncio - async def test_execute_prompt_no_tools(self): - """Test prompt execution without tool calls.""" - # Mock the LLM call to return a simple response - self.executor._call_llm = AsyncMock(return_value="Hello! I'm a helpful assistant.") - - response, tool_calls = await self.executor.execute_prompt("Hello") - - assert response == "Hello! I'm a helpful assistant." 
- assert len(tool_calls) == 0 - assert len(self.tool_executor.calls) == 0 - - @pytest.mark.asyncio - async def test_execute_prompt_with_tools(self): - """Test prompt execution with tool calls.""" - # Mock LLM to first return tool call, then final response - self.executor._call_llm = AsyncMock( - side_effect=[ - '{"tool": "get_weather", "arguments": {"location": "Paris"}}', - "The weather in Paris is sunny and 22 degrees.", - ] + executor._agent_cls = agent_factory + + asyncio.run(executor.execute_prompt("Weather?")) + + assert observed == [5] + + +def test_execute_prompt_with_tool_call() -> None: + executor = make_executor() + user_ctx = UserContextModel(provider="test", user_id="u1", username="user") + executor._agent_cls = lambda **kwargs: FakeAgent( + tools=kwargs["tools"], + output="Sunny", + tool_args={"get_weather": {"location": "Paris"}}, + tool_callables=kwargs.get("_tool_callables", {}), + ) + + result = asyncio.run(executor.execute_prompt("Weather?", user_context=user_ctx)) + + assert isinstance(result, AgentResult) + assert result.answer == "Sunny" + assert len(result.tool_calls) == 1 + call = result.tool_calls[0] + assert call.tool == "get_weather" + assert call.arguments["location"] == "Paris" + assert call.result == {"temperature": 20} + assert call.error is None + + +def test_execute_prompt_tool_calls_do_not_leak_between_runs() -> None: + executor = make_executor() + + # First run + executor._agent_cls = lambda **kwargs: FakeAgent( + tools=kwargs["tools"], + output="ok", + tool_args={"get_weather": {"location": "Paris"}}, + tool_callables=kwargs.get("_tool_callables", {}), + ) + first = asyncio.run(executor.execute_prompt("Weather?")) + assert len(first.tool_calls) == 1 + assert first.tool_calls[0].arguments["location"] == "Paris" + + # Second run should still invoke tools and capture history independently + executor._agent_cls = lambda **kwargs: FakeAgent( + tools=kwargs["tools"], + output="ok", + tool_args={"get_weather": {"location": "Rome"}}, + 
tool_callables=kwargs.get("_tool_callables", {}), + ) + second = asyncio.run(executor.execute_prompt("Weather?")) + + assert len(second.tool_calls) == 1 + assert second.tool_calls[0].arguments["location"] == "Rome" + + +def test_execute_prompt_tool_error() -> None: + executor = make_executor() + executor.tool_executor.responses["get_weather"] = ValueError("boom") # type: ignore[attr-defined] + executor._agent_cls = lambda **kwargs: FakeAgent( + tools=kwargs["tools"], + output="Error", + tool_args={"get_weather": {"location": "Rome"}}, + tool_callables=kwargs.get("_tool_callables", {}), + ) + + result = asyncio.run(executor.execute_prompt("Weather?")) + + assert result.tool_calls + error = result.tool_calls[0].error + # Error is now a dict with status, tool, error, and suggestion + assert isinstance(error, dict) + assert error["status"] == "error" + assert error["tool"] == "get_weather" + assert "boom" in error["error"] + + +def test_tool_argument_validation_error_is_captured() -> None: + executor = make_executor() + executor._agent_cls = lambda **kwargs: FakeAgent( + tools=kwargs["tools"], + output="done", + tool_args={"get_weather": {}}, # missing required arg + tool_callables=kwargs.get("_tool_callables", {}), + ) + + result = asyncio.run(executor.execute_prompt("Weather?")) + + assert result.tool_calls + error = result.tool_calls[0].error + # Error is now a dict with status, tool, error, and suggestion + assert isinstance(error, dict) + assert error["status"] == "error" + assert "Field required" in error["error"] + + +def test_tool_model_retry_reinvokes_tool() -> None: + class FlakyToolExecutor: + def __init__(self) -> None: + self.calls = 0 + + async def execute_tool( + self, + tool_name: str, + arguments: dict[str, Any], + user_context: UserContextModel | None = None, + ) -> Any: + self.calls += 1 + if self.calls == 1: + raise ValueError("temporary issue") + return {"status": "ok"} + + class RetryingAgent(FakeAgent): + """Agent that retries tool calls when 
ModelRetry is raised.""" + + def __init__( + self, + *, + tools: list[Any], + output: Any, + tool_args: dict[str, dict[str, Any]], + tool_callables: dict[str, Any] | None = None, + max_retries: int = 1, + ) -> None: + super().__init__( + tools=tools, output=output, tool_args=tool_args, tool_callables=tool_callables + ) + self.max_retries = max_retries + + async def run( # type: ignore[override] + self, _prompt: str, deps: Any | None = None, model_settings: Any | None = None + ) -> FakeRun: + for tool in self.tools: + tool_name = getattr(tool, "name", None) or getattr( + getattr(tool, "tool_def", None), "name", None + ) + fn = self.tool_callables.get(tool_name or "") + args = self.tool_args.get(tool_name or "", {}) + if not fn: + continue + + attempt = 0 + while attempt < self.max_retries: + try: + if asyncio.iscoroutinefunction(fn): + await fn(**args) + else: + fn(**args) + break + except ModelRetry: + attempt += 1 + if attempt >= self.max_retries: + raise + continue + return FakeRun(self.output) + + executor = make_executor() + flaky_executor = FlakyToolExecutor() + executor.tool_executor = flaky_executor # type: ignore[assignment] + executor._agent_cls = lambda **kwargs: RetryingAgent( + tools=kwargs["tools"], + output="ok", + tool_args={"get_weather": {"location": "Paris"}}, + tool_callables=kwargs.get("_tool_callables", {}), + max_retries=kwargs.get("retries", 1), + ) + + result = asyncio.run(executor.execute_prompt("Weather?")) + + # Tool was called twice: first raised ModelRetry, second succeeded + assert flaky_executor.calls == 2 + assert len(result.tool_calls) == 2 + # First call should have error recorded as dict + first_error = result.tool_calls[0].error + assert isinstance(first_error, dict) + assert first_error["status"] == "error" + assert "temporary issue" in first_error["error"] + # Second call should succeed + assert result.tool_calls[1].result == {"status": "ok"} + + +def test_expected_answer_grading() -> None: + executor = make_executor() + 
executor._agent_cls = lambda **kwargs: FakeAgent( + tools=kwargs.get("tools", []), + output=GradeResult(result="correct", comment="ok", reasoning="match"), + tool_args={}, + tool_callables=kwargs.get("_tool_callables", {}), + ) + + result = asyncio.run(executor.evaluate_expected_answer("hello", "hello")) + assert result["result"] == "correct" + assert result["comment"] + + +def test_expected_answer_uses_model_reference() -> None: + executor = make_executor() + observed: list[Any] = [] + + def agent_factory(**kwargs: Any) -> FakeAgent: + observed.append(kwargs.get("model")) + return FakeAgent( + tools=kwargs.get("tools", []), + output=GradeResult(result="correct", comment="ok", reasoning="match"), + tool_args={}, + tool_callables=kwargs.get("_tool_callables", {}), ) - user_context = UserContextModel(provider="test", user_id="test-user", username="testuser") - - response, tool_calls = await self.executor.execute_prompt( - "What's the weather in Paris?", user_context=user_context + executor._agent_cls = agent_factory + + result = asyncio.run(executor.evaluate_expected_answer("value", "value")) + assert result["result"] == "correct" + assert observed == [executor._model_reference] + + +def test_max_turns_limits_tool_calls() -> None: + class MultiCallAgent: + def __init__(self, tools: list[Any], tool_callables: dict[str, Any]) -> None: + self.tools = tools + self.tool_callables = tool_callables + + async def run( + self, _prompt: str, deps: Any | None = None, model_settings: Any | None = None + ) -> FakeRun: + for _ in range(2): + for tool in self.tools: + tool_name = getattr(tool, "name", None) + fn = self.tool_callables.get(tool_name or "") + if fn: + if asyncio.iscoroutinefunction(fn): + try: + await fn() + except ModelRetry: + continue + else: + try: + fn() + except ModelRetry: + continue + return FakeRun("done") + + executor = make_executor() + executor._agent_cls = lambda **kwargs: MultiCallAgent( + kwargs["tools"], kwargs.get("_tool_callables", {}) + ) + + result 
= asyncio.run(executor.execute_prompt("Weather?", max_turns=1)) + + assert len(result.tool_calls) == 2 + assert result.tool_calls[-1].error + + +def test_tool_model_schema_preserves_array_items_type() -> None: + predicates_param = ParameterDefinition( + name="predicates", + type="array", + description="Filters", + required=True, + schema={ + "type": "array", + "description": "Filters", + "items": {"type": "string", "description": "SQL predicate"}, + }, + ) + members_param = ParameterDefinition( + name="members", + type="array", + description="Projection list", + required=True, + schema={"type": "array", "items": {"type": "string"}}, + ) + tools = [ + ToolDefinition( + name="sql_search", + description="Search objects", + parameters=[predicates_param, members_param], ) - - assert response == "The weather in Paris is sunny and 22 degrees." - assert len(tool_calls) == 1 - assert tool_calls[0]["tool"] == "get_weather" - assert tool_calls[0]["arguments"]["location"] == "Paris" - - # Verify tool executor was called correctly - assert len(self.tool_executor.calls) == 1 - call = self.tool_executor.calls[0] - assert call["tool_name"] == "get_weather" - assert call["arguments"]["location"] == "Paris" - assert call["user_context"] == user_context - - @pytest.mark.asyncio - async def test_execute_prompt_tool_error(self): - """Test prompt execution when tool execution fails.""" - # Configure tool executor to raise an error - self.tool_executor.responses["get_weather"] = ValueError("Tool failed") - - # Mock LLM to return tool call, then final response - self.executor._call_llm = AsyncMock( - side_effect=[ - '{"tool": "get_weather", "arguments": {"location": "Paris"}}', - "I'm sorry, I couldn't get the weather information.", - ] + ] + executor = make_executor(tools=tools) + + schema = executor._tool_models["sql_search"].model_json_schema() + props = schema["properties"] + + assert props["predicates"]["type"] == "array" + assert props["predicates"]["items"]["type"] == "string" + 
assert props["predicates"]["items"]["description"] == "SQL predicate" + assert props["members"]["items"]["type"] == "string" + assert "predicates" in schema["required"] + assert "members" in schema["required"] + + +def test_tool_model_schema_supports_optional_object_params() -> None: + context_param = ParameterDefinition( + name="context", + type="object", + description="Optional filters", + required=False, + default={}, + schema={ + "type": "object", + "properties": { + "limit": {"type": "integer", "minimum": 1, "maximum": 100}, + "sort": {"type": "string"}, + }, + "required": ["limit"], + "additionalProperties": False, + }, + ) + tools = [ + ToolDefinition( + name="fetch_objects", + description="Fetch objects", + parameters=[ + ParameterDefinition(name="object_type", type="string", required=True), + context_param, + ], ) - - response, tool_calls = await self.executor.execute_prompt("What's the weather in Paris?") - - assert response == "I'm sorry, I couldn't get the weather information." 
- assert len(tool_calls) == 1 - - # Verify the LLM received the tool error in the conversation - assert self.executor._call_llm.call_count == 2 - - @pytest.mark.asyncio - async def test_execute_prompt_max_iterations(self): - """Test that max iterations prevents infinite loops.""" - # Mock LLM to always return tool calls - self.executor._call_llm = AsyncMock( - return_value='{"tool": "get_weather", "arguments": {"location": "Paris"}}' + ] + executor = make_executor(tools=tools) + + schema = executor._tool_models["fetch_objects"].model_json_schema() + props = schema["properties"] + + assert "context" in props + assert props["context"]["type"] == "object" + assert props["context"]["properties"]["limit"]["minimum"] == 1 + assert props["context"]["properties"]["limit"]["maximum"] == 100 + assert props["context"]["required"] == ["limit"] + assert "context" not in schema.get("required", []) + + +def test_executor_with_empty_tool_list() -> None: + """Test executor handles empty tool list gracefully.""" + # Create executor directly, bypassing make_executor which adds default tools + tool_executor = MockToolExecutor() + executor = LLMExecutor( + "claude-test", + "anthropic", + ModelSettings(), + [], # Empty tool list + tool_executor, + provider_config=ProviderConfig(api_key="key", base_url="https://api.anthropic.com"), + ) + + assert executor.available_tools == [] + assert executor._tool_models == {} + # System prompt should indicate no tools + assert ( + "no tools" in executor.system_prompt.lower() + or "answer directly" in executor.system_prompt.lower() + ) + + +def test_unknown_parameter_type_defaults_to_any() -> None: + """Test that unknown parameter types fall back to Any with a warning.""" + tools = [ + ToolDefinition( + name="custom_tool", + description="Tool with unknown type", + parameters=[ParameterDefinition(name="param", type="unknown_type", required=True)], ) + ] + # This should not raise an error + executor = make_executor(tools=tools) - response, tool_calls = 
await self.executor.execute_prompt("Weather?") + # The tool model should be created + assert "custom_tool" in executor._tool_models - # Should hit max iterations (10) and return the last response - assert len(tool_calls) == 10 - assert self.executor._call_llm.call_count == 10 +def test_execute_prompt_handles_empty_output() -> None: + """Test that empty agent output is handled gracefully.""" + executor = make_executor() -class TestToolDefinition: - """Test cases for ToolDefinition.""" + executor._agent_cls = lambda **kwargs: FakeAgent( + tools=kwargs["tools"], + output="", # Empty output + tool_args={}, + tool_callables=kwargs.get("_tool_callables", {}), + ) - def test_to_prompt_format_basic(self): - """Test basic tool formatting.""" - tool = ToolDefinition(name="test_tool", description="A test tool") + result = asyncio.run(executor.execute_prompt("Test prompt")) - formatted = tool.to_prompt_format() - assert "Tool: test_tool" in formatted - assert "Description: A test tool" in formatted - assert "Parameters: None" in formatted + assert isinstance(result, AgentResult) + assert result.answer == "" + assert result.error is None - def test_to_prompt_format_with_parameters(self): - """Test tool formatting with parameters.""" - tool = ToolDefinition( - name="calculator", - description="Perform calculations", - parameters=[ - ParameterDefinition( - name="expression", type="string", description="Math expression", default="0" - ), - ParameterDefinition(name="precision", type="integer", description="Decimal places"), - ], - return_type=TypeSchemaModel(type="number", description="Result"), - tags=["math", "utility"], - ) - formatted = tool.to_prompt_format() - assert "Tool: calculator" in formatted - assert "Description: Perform calculations" in formatted - assert "expression (string) [default: 0]: Math expression" in formatted - assert "precision (integer): Decimal places" in formatted - assert "Returns: number - Result" in formatted - assert "Tags: math, utility" in 
formatted +def test_provider_config_defaults() -> None: + """Test ProviderConfig uses defaults correctly.""" + from mxcp.sdk.evals.executor import ProviderConfig + config = ProviderConfig() + assert config.api_key is None + assert config.base_url is None + assert config.timeout is None -class TestModelConfigs: - """Test cases for model configurations.""" - def test_claude_config(self): - """Test Claude configuration.""" - config = ClaudeConfig( - name="claude-3-haiku", api_key="test-key", base_url="https://api.custom.com", timeout=60 - ) +def test_provider_config_with_values() -> None: + """Test ProviderConfig accepts and stores values.""" + from mxcp.sdk.evals.executor import ProviderConfig - assert config.get_type() == "claude" - assert config.name == "claude-3-haiku" - assert config.api_key == "test-key" - assert config.base_url == "https://api.custom.com" - assert config.timeout == 60 + config = ProviderConfig( + api_key="test-key", + base_url="https://api.example.com", + timeout=30, + ) + assert config.api_key == "test-key" + assert config.base_url == "https://api.example.com" + assert config.timeout == 30 - def test_openai_config(self): - """Test OpenAI configuration.""" - config = OpenAIConfig( - name="gpt-4", api_key="test-key", base_url="https://api.custom.com", timeout=45 - ) - assert config.get_type() == "openai" - assert config.name == "gpt-4" - assert config.api_key == "test-key" - assert config.base_url == "https://api.custom.com" - assert config.timeout == 45 - - def test_config_defaults(self): - """Test default values for configs.""" - claude = ClaudeConfig(name="claude", api_key="key") - assert claude.base_url == "https://api.anthropic.com" - assert claude.timeout == 30 - - openai = OpenAIConfig(name="gpt", api_key="key") - assert openai.base_url == "https://api.openai.com/v1" - assert openai.timeout == 30 +def test_agent_retries_clamped_to_minimum() -> None: + """Test that agent_retries is clamped to at least 1.""" + executor = 
make_executor(agent_retries=0) + assert executor._agent_retries == 1 + + executor = make_executor(agent_retries=-5) + assert executor._agent_retries == 1 diff --git a/tests/server/test_evals_service.py b/tests/server/test_evals_service.py new file mode 100644 index 00000000..4224971b --- /dev/null +++ b/tests/server/test_evals_service.py @@ -0,0 +1,125 @@ +from pydantic_ai import ModelSettings + +from mxcp.server.definitions.endpoints.models import ParamDefinitionModel, TypeDefinitionModel +from mxcp.server.services.evals.service import ( + _build_model_settings, + _format_expected_answer_failure, + _parameter_definition_from_model, + _type_definition_to_schema, +) + + +def test_model_settings_chat_drops_response_only_keys() -> None: + allowed = set(ModelSettings.__annotations__.keys()) + settings = _build_model_settings( + "gpt-4o", + "openai", + {"body:reasoning": {"effort": "medium"}, "timeout": 30}, + allowed, + ) + + extra_body = settings.get("extra_body") + assert extra_body and "reasoning" in extra_body + assert settings.get("timeout") == 30 + assert settings.get("max_tokens") == 10_000 + + +def test_model_settings_responses_keeps_extras() -> None: + allowed = set(ModelSettings.__annotations__.keys()) + settings = _build_model_settings( + "gpt-5", + "openai", + {"api": "responses", "body:reasoning": {"effort": "medium"}}, + allowed, + ) + + extra_body = settings.get("extra_body") + assert extra_body and "reasoning" in extra_body + assert settings.get("max_tokens") == 10_000 + + +def test_model_settings_anthropic_output_config_and_betas() -> None: + allowed = set(ModelSettings.__annotations__.keys()) + settings = _build_model_settings( + "claude", + "anthropic", + { + "body:output_config": {"effort": "medium"}, + "header:anthropic-beta": ["effort-2025-11-24"], + }, + allowed, + ) + + extra_body = settings.get("extra_body") + assert extra_body and extra_body.get("output_config") == {"effort": "medium"} + headers = settings.get("extra_headers") + assert headers 
and headers.get("anthropic-beta") == "effort-2025-11-24" + assert settings.get("max_tokens") == 10_000 + + +def test_model_settings_respects_user_max_tokens_override() -> None: + allowed = set(ModelSettings.__annotations__.keys()) + settings = _build_model_settings( + "gpt-4o", + "openai", + {"max_tokens": 2048}, + allowed, + ) + + assert settings.get("max_tokens") == 2048 + + +def test_expected_answer_failure_formatting_is_multiline() -> None: + detail = _format_expected_answer_failure( + "Answer", + "Expected", + "wrong", + "bad", + "missed value", + ) + lines = detail.splitlines() + assert lines == [ + "LLM Answer: Answer", + "Expected: Expected", + "Grade: wrong", + "Comment: bad", + "Reasoning: missed value", + ] + + +def test_parameter_definition_from_model_includes_array_items_schema() -> None: + param = ParamDefinitionModel( + name="predicates", + type="array", + description="Filters", + items=TypeDefinitionModel(type="string", description="SQL predicate"), + ) + + definition = _parameter_definition_from_model(param) + + assert definition.required is True + assert definition.default is None + assert definition.schema == { + "type": "array", + "description": "Filters", + "items": {"type": "string", "description": "SQL predicate"}, + } + + +def test_parameter_definition_from_model_marks_optional_when_default_present() -> None: + param = ParamDefinitionModel( + name="limit", + type="integer", + description="Result limit", + default=25, + minimum=1, + maximum=100, + ) + + definition = _parameter_definition_from_model(param) + + assert definition.required is False + assert definition.default == 25 + assert definition.schema["type"] == "integer" + assert definition.schema["minimum"] == 1 + assert definition.schema["maximum"] == 100 diff --git a/tests/server/test_evals_tool_executor.py b/tests/server/test_evals_tool_executor.py index c0e21794..15d99a19 100644 --- a/tests/server/test_evals_tool_executor.py +++ b/tests/server/test_evals_tool_executor.py @@ -1,5 +1,6 
@@ """Tests for EndpointToolExecutor integration.""" +from pathlib import Path from typing import Any import pytest @@ -7,7 +8,7 @@ from mxcp.sdk.auth import UserContextModel from mxcp.sdk.executor import ExecutionContext from mxcp.server.definitions.endpoints.models import EndpointDefinitionModel, SourceDefinitionModel -from mxcp.server.executor.runners.tool import EndpointToolExecutor +from mxcp.server.executor.runners.tool import EndpointToolExecutor, EndpointWithPath class MockExecutionEngine: @@ -47,60 +48,79 @@ def setup_method(self): "weather.py": {"temperature": 22, "condition": "sunny"}, } ) + self._monkeypatch = pytest.MonkeyPatch() + self._monkeypatch.setattr( + "mxcp.server.executor.runners.tool.find_repo_root", lambda: Path.cwd() + ) - self.endpoints: list[EndpointDefinitionModel] = [ - EndpointDefinitionModel.model_validate( - { - "mxcp": 1, - "tool": { - "name": "get_date", - "description": "Get current date", - "parameters": [], - "source": {"code": "SELECT current_date()"}, - }, - } + self.endpoints = [ + EndpointWithPath( + EndpointDefinitionModel.model_validate( + { + "mxcp": 1, + "tool": { + "name": "get_date", + "description": "Get current date", + "parameters": [], + "source": {"code": "SELECT current_date()"}, + }, + } + ), + Path("endpoints/get_date.yml"), ), - EndpointDefinitionModel.model_validate( - { - "mxcp": 1, - "tool": { - "name": "calculate", - "description": "Calculate expression", - "parameters": [{"name": "expr", "type": "string"}], - "source": {"code": "return 2 + 2", "language": "python"}, - }, - } + EndpointWithPath( + EndpointDefinitionModel.model_validate( + { + "mxcp": 1, + "tool": { + "name": "calculate", + "description": "Calculate expression", + "parameters": [{"name": "expr", "type": "string"}], + "source": {"code": "return 2 + 2", "language": "python"}, + }, + } + ), + Path("endpoints/calculate.yml"), ), - EndpointDefinitionModel.model_validate( - { - "mxcp": 1, - "tool": { - "name": "get_weather", - "description": 
"Get weather info", - "parameters": [{"name": "location", "type": "string"}], - "source": {"file": "weather.py", "language": "python"}, - }, - } + EndpointWithPath( + EndpointDefinitionModel.model_validate( + { + "mxcp": 1, + "tool": { + "name": "get_weather", + "description": "Get weather info", + "parameters": [{"name": "location", "type": "string"}], + "source": {"code": "weather.py", "language": "python"}, + }, + } + ), + Path("endpoints/get_weather.yml"), ), - EndpointDefinitionModel.model_validate( - { - "mxcp": 1, - "resource": { - "uri": "data://users", - "description": "User data resource", - "parameters": [{"name": "limit", "type": "integer"}], - "source": {"code": "SELECT * FROM users LIMIT $limit"}, - }, - } + EndpointWithPath( + EndpointDefinitionModel.model_validate( + { + "mxcp": 1, + "resource": { + "uri": "data://users", + "description": "User data resource", + "parameters": [{"name": "limit", "type": "integer"}], + "source": {"code": "SELECT * FROM users LIMIT $limit"}, + }, + } + ), + Path("endpoints/users.yml"), ), ] self.executor = EndpointToolExecutor(self.engine, self.endpoints) + def teardown_method(self): + self._monkeypatch.undo() + def test_initialization(self): """Test EndpointToolExecutor initialization.""" assert self.executor.engine == self.engine - assert self.executor.endpoints == self.endpoints + assert len(self.executor.endpoints) == len(self.endpoints) assert len(self.executor._tool_map) == 4 assert "get_date" in self.executor._tool_map assert "data://users" in self.executor._tool_map @@ -139,6 +159,8 @@ async def test_execute_tool_with_language(self): @pytest.mark.asyncio async def test_execute_tool_with_file(self): """Test executing a tool with file reference.""" + tmp_file = Path("weather.py") + tmp_file.write_text("weather.py") result = await self.executor.execute_tool("get_weather", {"location": "Paris"}) assert result == {"temperature": 22, "condition": "sunny"} @@ -149,6 +171,8 @@ async def 
test_execute_tool_with_file(self): assert call["language"] == "python" assert call["source_code"] == "weather.py" assert call["params"] == {"location": "Paris"} + if tmp_file.exists(): + tmp_file.unlink() @pytest.mark.asyncio async def test_execute_resource(self): @@ -205,43 +229,98 @@ async def test_execute_tool_no_source(self): "source", SourceDefinitionModel.model_construct(code=None, file=None), ) - endpoints_no_source: list[EndpointDefinitionModel] = [endpoint] - + endpoints_no_source = [EndpointWithPath(endpoint, Path("endpoints/broken.yml"))] executor = EndpointToolExecutor(self.engine, endpoints_no_source) with pytest.raises(ValueError) as exc_info: await executor.execute_tool("broken_tool", {}) - assert "No source code or file found in source definition" in str(exc_info.value) + assert "No source found for endpoint" in str(exc_info.value) - def test_get_language_inference(self): - """Test language inference via endpoint execution.""" - # Create endpoints with different language sources - test_endpoints: list[EndpointDefinitionModel] = [ + @pytest.mark.asyncio + async def test_execute_tool_loads_file_content(self, tmp_path, monkeypatch): + """Ensure file-based sources are read and executed with their content.""" + sql_dir = tmp_path / "sql" + sql_dir.mkdir() + sql_file = sql_dir / "hello.sql" + sql_file.write_text("select 1 as val;") + + # Provide mxcp-site.yml so find_repo_root() resolves to tmp_path + (tmp_path / "mxcp-site.yml").write_text("mxcp: 1\nproject: test\nprofile: default\n") + monkeypatch.chdir(tmp_path) + + endpoint = EndpointWithPath( EndpointDefinitionModel.model_validate( - {"mxcp": 1, "tool": {"name": "python_file_tool", "source": {"file": "script.py"}}} + {"mxcp": 1, "tool": {"name": "hello_tool", "source": {"file": "sql/hello.sql"}}} ), + Path("endpoints/hello.yml"), + ) + + engine = MockExecutionEngine({"select 1 as val;": {"val": 1}}) + executor = EndpointToolExecutor(engine, [endpoint]) + + result = await 
executor.execute_tool("hello_tool", {}) + + assert result == {"val": 1} + assert engine.calls[0]["source_code"] == "select 1 as val;" + + @pytest.mark.asyncio + async def test_execute_tool_loads_relative_parent_path(self, tmp_path, monkeypatch): + """Relative paths with '..' should resolve correctly.""" + (tmp_path / "mxcp-site.yml").write_text("mxcp: 1\nproject: test\nprofile: default\n") + sql_dir = tmp_path.parent / "shared-sql" + sql_dir.mkdir(exist_ok=True) + sql_file = sql_dir / "hi.sql" + sql_file.write_text("select 2 as val;") + + # endpoint references ../shared-sql/hi.sql relative to repo root + endpoint = EndpointWithPath( EndpointDefinitionModel.model_validate( - {"mxcp": 1, "tool": {"name": "sql_file_tool", "source": {"file": "query.sql"}}} + {"mxcp": 1, "tool": {"name": "hi_tool", "source": {"file": "../shared-sql/hi.sql"}}} ), + Path("endpoints/hi.yml"), + ) + + engine = MockExecutionEngine({"select 2 as val;": {"val": 2}}) + monkeypatch.chdir(tmp_path) + executor = EndpointToolExecutor(engine, [endpoint]) + + result = await executor.execute_tool("hi_tool", {}) + + assert result == {"val": 2} + + @pytest.mark.asyncio + async def test_python_file_executes_by_path(self, tmp_path, monkeypatch): + """Python sources should be passed as file paths to the engine.""" + (tmp_path / "mxcp-site.yml").write_text("mxcp: 1\nproject: test\nprofile: default\n") + py_dir = tmp_path / "python" + py_dir.mkdir() + script = py_dir / "hello.py" + script.write_text("def python_tool():\n" " return {'message': 'hi'}\n") + + endpoint = EndpointWithPath( EndpointDefinitionModel.model_validate( { "mxcp": 1, "tool": { - "name": "explicit_override_tool", - "source": {"file": "script.py", "language": "sql"}, + "name": "python_tool", + "source": {"file": "python/hello.py", "language": "python"}, }, } ), - EndpointDefinitionModel.model_validate( - {"mxcp": 1, "tool": {"name": "default_sql_tool", "source": {"code": "some code"}}} - ), - ] + Path("endpoints/python.yml"), + ) + + 
monkeypatch.chdir(tmp_path) + engine = MockExecutionEngine() + executor = EndpointToolExecutor(engine, [endpoint]) - test_executor = EndpointToolExecutor(self.engine, test_endpoints) + result = await executor.execute_tool("python_tool", {}) - # Verify the tools were properly registered - assert "python_file_tool" in test_executor._tool_map - assert "sql_file_tool" in test_executor._tool_map - assert "explicit_override_tool" in test_executor._tool_map - assert "default_sql_tool" in test_executor._tool_map + assert result == "Mock result for " + engine.calls[0]["source_code"] + assert engine.calls[0]["language"] == "python" + source_code = engine.calls[0]["source_code"] + file_part, sep, function_name = source_code.partition(":") + assert sep == ":" + assert function_name == "python_tool" + assert Path(file_part).resolve() == script.resolve() diff --git a/tests/server/test_user_config.py b/tests/server/test_user_config.py index c211ba3a..0f2f8bb8 100644 --- a/tests/server/test_user_config.py +++ b/tests/server/test_user_config.py @@ -402,3 +402,36 @@ def test_load_without_resolving_refs(tmp_path): # Clean up the secret file if secret_file.exists(): secret_file.unlink() + + +def test_model_options_allowed(tmp_path): + """Ensure model options field is accepted in user config.""" + config_path = tmp_path / "config.yml" + config_content = """ + mxcp: 1 + models: + default: "gpt-4o" + models: + gpt-4o: + type: "openai" + api_key: "${OPENAI_API_KEY}" + options: + reasoning: "fast" + projects: + test_project: + profiles: + dev: {} + """ + config_path.write_text(config_content) + + os.environ["MXCP_CONFIG"] = str(config_path) + os.environ["OPENAI_API_KEY"] = "secret" + + site_config = make_site_config("test_project", "dev") + + user_config = load_user_config(site_config).model_dump(mode="python") + model_cfg = user_config["models"]["models"]["gpt-4o"] + assert model_cfg["options"]["reasoning"] == "fast" + + del os.environ["MXCP_CONFIG"] + del os.environ["OPENAI_API_KEY"] 
diff --git a/uv.lock b/uv.lock index 08700fd6..82c85ecb 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.12'", @@ -173,6 +173,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, ] +[[package]] +name = "anthropic" +version = "0.75.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "distro" }, + { name = "docstring-parser" }, + { name = "httpx" }, + { name = "jiter" }, + { name = "pydantic" }, + { name = "sniffio" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/04/1f/08e95f4b7e2d35205ae5dcbb4ae97e7d477fc521c275c02609e2931ece2d/anthropic-0.75.0.tar.gz", hash = "sha256:e8607422f4ab616db2ea5baacc215dd5f028da99ce2f022e33c7c535b29f3dfb", size = 439565, upload-time = "2025-11-24T20:41:45.28Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/60/1c/1cd02b7ae64302a6e06724bf80a96401d5313708651d277b1458504a1730/anthropic-0.75.0-py3-none-any.whl", hash = "sha256:ea8317271b6c15d80225a9f3c670152746e88805a7a61e14d4a374577164965b", size = 388164, upload-time = "2025-11-24T20:41:43.587Z" }, +] + [[package]] name = "anyio" version = "4.9.0" @@ -712,6 +731,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" }, ] +[[package]] +name = "docstring-parser" +version = "0.17.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/b2/9d/c3b43da9515bd270df0f80548d9944e389870713cc1fe2b8fb35fe2bcefd/docstring_parser-0.17.0.tar.gz", hash = "sha256:583de4a309722b3315439bb31d64ba3eebada841f2e2cee23b99df001434c912", size = 27442, upload-time = "2025-07-21T07:35:01.868Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/55/e2/2537ebcff11c1ee1ff17d8d0b6f4db75873e3b0fb32c2d4a2ee31ecb310a/docstring_parser-0.17.0-py3-none-any.whl", hash = "sha256:cf2569abd23dce8099b300f9b4fa8191e9582dda731fd533daf54c4551658708", size = 36896, upload-time = "2025-07-21T07:35:00.684Z" }, +] + [[package]] name = "docutils" version = "0.21.2" @@ -753,6 +781,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/30/79/4f544d73fcc0513b71296cb3ebb28a227d22e80dec27204977039b9fa875/duckdb-1.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:280fd663dacdd12bb3c3bf41f3e5b2e5b95e00b88120afabb8b8befa5f335c6f", size = 12336460, upload-time = "2025-10-07T10:37:12.154Z" }, ] +[[package]] +name = "eval-type-backport" +version = "0.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/51/23/079e39571d6dd8d90d7a369ecb55ad766efb6bae4e77389629e14458c280/eval_type_backport-0.3.0.tar.gz", hash = "sha256:1638210401e184ff17f877e9a2fa076b60b5838790f4532a21761cc2be67aea1", size = 9272, upload-time = "2025-11-13T20:56:50.845Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/19/d8/2a1c638d9e0aa7e269269a1a1bf423ddd94267f1a01bbe3ad03432b67dd4/eval_type_backport-0.3.0-py3-none-any.whl", hash = "sha256:975a10a0fe333c8b6260d7fdb637698c9a16c3a9e3b6eb943fee6a6f67a37fe8", size = 6061, upload-time = "2025-11-13T20:56:49.499Z" }, +] + [[package]] name = "exceptiongroup" version = "1.3.0" @@ -874,6 +911,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ee/45/b82e3c16be2182bff01179db177fe144d58b5dc787a7d4492c6ed8b9317f/frozenlist-1.7.0-py3-none-any.whl", hash = 
"sha256:9a5af342e34f7e97caf8c995864c7a396418ae2859cc6fdf1b1073020d516a7e", size = 13106, upload-time = "2025-06-09T23:02:34.204Z" }, ] +[[package]] +name = "genai-prices" +version = "0.0.47" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "eval-type-backport", marker = "python_full_version < '3.11'" }, + { name = "httpx" }, + { name = "pydantic" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b8/47/f25fb84fa40142699dc54ca294628d600625eb3d90fead103a606b4e999a/genai_prices-0.0.47.tar.gz", hash = "sha256:3b8c514f0ce5818b3944a371861586ed9bfe10d02598e62c350b5bd2916d03c2", size = 54501, upload-time = "2025-11-25T18:38:17.695Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4e/84/d50c52d0eeadb9dbf7f2f86da9b6257e162b7c6a791f5b1009bae912c103/genai_prices-0.0.47-py3-none-any.whl", hash = "sha256:735e45950d2299276f2c00cd18075b77a124cd24ee58243f236ee29af3210594", size = 57000, upload-time = "2025-11-25T18:38:16.464Z" }, +] + [[package]] name = "googleapis-common-protos" version = "1.70.0" @@ -886,6 +937,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/86/f1/62a193f0227cf15a920390abe675f386dec35f7ae3ffe6da582d3ade42c7/googleapis_common_protos-1.70.0-py3-none-any.whl", hash = "sha256:b8bfcca8c25a2bb253e0e0b0adaf8c00773e5e6af6fd92397576680b807e0fd8", size = 294530, upload-time = "2025-04-14T10:17:01.271Z" }, ] +[[package]] +name = "griffe" +version = "1.15.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0d/0c/3a471b6e31951dce2360477420d0a8d1e00dea6cf33b70f3e8c3ab6e28e1/griffe-1.15.0.tar.gz", hash = "sha256:7726e3afd6f298fbc3696e67958803e7ac843c1cfe59734b6251a40cdbfb5eea", size = 424112, upload-time = "2025-11-10T15:03:15.52Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9c/83/3b1d03d36f224edded98e9affd0467630fc09d766c0e56fb1498cbb04a9b/griffe-1.15.0-py3-none-any.whl", 
hash = "sha256:6f6762661949411031f5fcda9593f586e6ce8340f0ba88921a0f2ef7a81eb9a3", size = 150705, upload-time = "2025-11-10T15:03:13.549Z" }, +] + [[package]] name = "h11" version = "0.16.0" @@ -968,11 +1031,11 @@ wheels = [ [[package]] name = "httpx-sse" -version = "0.4.1" +version = "0.4.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/6e/fa/66bd985dd0b7c109a3bcb89272ee0bfb7e2b4d06309ad7b38ff866734b2a/httpx_sse-0.4.1.tar.gz", hash = "sha256:8f44d34414bc7b21bf3602713005c5df4917884f76072479b21f68befa4ea26e", size = 12998, upload-time = "2025-06-24T13:21:05.71Z" } +sdist = { url = "https://files.pythonhosted.org/packages/4c/60/8f4281fa9bbf3c8034fd54c0e7412e66edbab6bc74c4996bd616f8d0406e/httpx-sse-0.4.0.tar.gz", hash = "sha256:1e81a3a3070ce322add1d3529ed42eb5f70817f45ed6ec915ab753f961139721", size = 12624, upload-time = "2023-12-22T08:01:21.083Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/25/0a/6269e3473b09aed2dab8aa1a600c70f31f00ae1349bee30658f7e358a159/httpx_sse-0.4.1-py3-none-any.whl", hash = "sha256:cba42174344c3a5b06f255ce65b350880f962d99ead85e776f23c6618a377a37", size = 8054, upload-time = "2025-06-24T13:21:04.772Z" }, + { url = "https://files.pythonhosted.org/packages/e1/9b/a181f281f65d776426002f330c31849b86b31fc9d848db62e16f03ff739f/httpx_sse-0.4.0-py3-none-any.whl", hash = "sha256:f329af6eae57eaa2bdfd962b42524764af68075ea87370a2de920af5341e318f", size = 7819, upload-time = "2023-12-22T08:01:19.89Z" }, ] [[package]] @@ -1098,6 +1161,103 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, ] +[[package]] +name = "jiter" +version = "0.12.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/45/9d/e0660989c1370e25848bb4c52d061c71837239738ad937e83edca174c273/jiter-0.12.0.tar.gz", hash = "sha256:64dfcd7d5c168b38d3f9f8bba7fc639edb3418abcc74f22fdbe6b8938293f30b", size = 168294, upload-time = "2025-11-09T20:49:23.302Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/91/13cb9505f7be74a933f37da3af22e029f6ba64f5669416cb8b2774bc9682/jiter-0.12.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:e7acbaba9703d5de82a2c98ae6a0f59ab9770ab5af5fa35e43a303aee962cf65", size = 316652, upload-time = "2025-11-09T20:46:41.021Z" }, + { url = "https://files.pythonhosted.org/packages/4e/76/4e9185e5d9bb4e482cf6dec6410d5f78dfeb374cfcecbbe9888d07c52daa/jiter-0.12.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:364f1a7294c91281260364222f535bc427f56d4de1d8ffd718162d21fbbd602e", size = 319829, upload-time = "2025-11-09T20:46:43.281Z" }, + { url = "https://files.pythonhosted.org/packages/86/af/727de50995d3a153138139f259baae2379d8cb0522c0c00419957bc478a6/jiter-0.12.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85ee4d25805d4fb23f0a5167a962ef8e002dbfb29c0989378488e32cf2744b62", size = 350568, upload-time = "2025-11-09T20:46:45.075Z" }, + { url = "https://files.pythonhosted.org/packages/6a/c1/d6e9f4b7a3d5ac63bcbdfddeb50b2dcfbdc512c86cffc008584fdc350233/jiter-0.12.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:796f466b7942107eb889c08433b6e31b9a7ed31daceaecf8af1be26fb26c0ca8", size = 369052, upload-time = "2025-11-09T20:46:46.818Z" }, + { url = "https://files.pythonhosted.org/packages/eb/be/00824cd530f30ed73fa8a4f9f3890a705519e31ccb9e929f1e22062e7c76/jiter-0.12.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:35506cb71f47dba416694e67af996bbdefb8e3608f1f78799c2e1f9058b01ceb", size = 481585, upload-time = "2025-11-09T20:46:48.319Z" }, + { url = 
"https://files.pythonhosted.org/packages/74/b6/2ad7990dff9504d4b5052eef64aa9574bd03d722dc7edced97aad0d47be7/jiter-0.12.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:726c764a90c9218ec9e4f99a33d6bf5ec169163f2ca0fc21b654e88c2abc0abc", size = 380541, upload-time = "2025-11-09T20:46:49.643Z" }, + { url = "https://files.pythonhosted.org/packages/b5/c7/f3c26ecbc1adbf1db0d6bba99192143d8fe8504729d9594542ecc4445784/jiter-0.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa47810c5565274810b726b0dc86d18dce5fd17b190ebdc3890851d7b2a0e74", size = 364423, upload-time = "2025-11-09T20:46:51.731Z" }, + { url = "https://files.pythonhosted.org/packages/18/51/eac547bf3a2d7f7e556927278e14c56a0604b8cddae75815d5739f65f81d/jiter-0.12.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f8ec0259d3f26c62aed4d73b198c53e316ae11f0f69c8fbe6682c6dcfa0fcce2", size = 389958, upload-time = "2025-11-09T20:46:53.432Z" }, + { url = "https://files.pythonhosted.org/packages/2c/1f/9ca592e67175f2db156cff035e0d817d6004e293ee0c1d73692d38fcb596/jiter-0.12.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:79307d74ea83465b0152fa23e5e297149506435535282f979f18b9033c0bb025", size = 522084, upload-time = "2025-11-09T20:46:54.848Z" }, + { url = "https://files.pythonhosted.org/packages/83/ff/597d9cdc3028f28224f53e1a9d063628e28b7a5601433e3196edda578cdd/jiter-0.12.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cf6e6dd18927121fec86739f1a8906944703941d000f0639f3eb6281cc601dca", size = 513054, upload-time = "2025-11-09T20:46:56.487Z" }, + { url = "https://files.pythonhosted.org/packages/24/6d/1970bce1351bd02e3afcc5f49e4f7ef3dabd7fb688f42be7e8091a5b809a/jiter-0.12.0-cp310-cp310-win32.whl", hash = "sha256:b6ae2aec8217327d872cbfb2c1694489057b9433afce447955763e6ab015b4c4", size = 206368, upload-time = "2025-11-09T20:46:58.638Z" }, + { url = 
"https://files.pythonhosted.org/packages/e3/6b/eb1eb505b2d86709b59ec06681a2b14a94d0941db091f044b9f0e16badc0/jiter-0.12.0-cp310-cp310-win_amd64.whl", hash = "sha256:c7f49ce90a71e44f7e1aa9e7ec415b9686bbc6a5961e57eab511015e6759bc11", size = 204847, upload-time = "2025-11-09T20:47:00.295Z" }, + { url = "https://files.pythonhosted.org/packages/32/f9/eaca4633486b527ebe7e681c431f529b63fe2709e7c5242fc0f43f77ce63/jiter-0.12.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:d8f8a7e317190b2c2d60eb2e8aa835270b008139562d70fe732e1c0020ec53c9", size = 316435, upload-time = "2025-11-09T20:47:02.087Z" }, + { url = "https://files.pythonhosted.org/packages/10/c1/40c9f7c22f5e6ff715f28113ebaba27ab85f9af2660ad6e1dd6425d14c19/jiter-0.12.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2218228a077e784c6c8f1a8e5d6b8cb1dea62ce25811c356364848554b2056cd", size = 320548, upload-time = "2025-11-09T20:47:03.409Z" }, + { url = "https://files.pythonhosted.org/packages/6b/1b/efbb68fe87e7711b00d2cfd1f26bb4bfc25a10539aefeaa7727329ffb9cb/jiter-0.12.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9354ccaa2982bf2188fd5f57f79f800ef622ec67beb8329903abf6b10da7d423", size = 351915, upload-time = "2025-11-09T20:47:05.171Z" }, + { url = "https://files.pythonhosted.org/packages/15/2d/c06e659888c128ad1e838123d0638f0efad90cc30860cb5f74dd3f2fc0b3/jiter-0.12.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8f2607185ea89b4af9a604d4c7ec40e45d3ad03ee66998b031134bc510232bb7", size = 368966, upload-time = "2025-11-09T20:47:06.508Z" }, + { url = "https://files.pythonhosted.org/packages/6b/20/058db4ae5fb07cf6a4ab2e9b9294416f606d8e467fb74c2184b2a1eeacba/jiter-0.12.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3a585a5e42d25f2e71db5f10b171f5e5ea641d3aa44f7df745aa965606111cc2", size = 482047, upload-time = "2025-11-09T20:47:08.382Z" }, + { url = 
"https://files.pythonhosted.org/packages/49/bb/dc2b1c122275e1de2eb12905015d61e8316b2f888bdaac34221c301495d6/jiter-0.12.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd9e21d34edff5a663c631f850edcb786719c960ce887a5661e9c828a53a95d9", size = 380835, upload-time = "2025-11-09T20:47:09.81Z" }, + { url = "https://files.pythonhosted.org/packages/23/7d/38f9cd337575349de16da575ee57ddb2d5a64d425c9367f5ef9e4612e32e/jiter-0.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a612534770470686cd5431478dc5a1b660eceb410abade6b1b74e320ca98de6", size = 364587, upload-time = "2025-11-09T20:47:11.529Z" }, + { url = "https://files.pythonhosted.org/packages/f0/a3/b13e8e61e70f0bb06085099c4e2462647f53cc2ca97614f7fedcaa2bb9f3/jiter-0.12.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3985aea37d40a908f887b34d05111e0aae822943796ebf8338877fee2ab67725", size = 390492, upload-time = "2025-11-09T20:47:12.993Z" }, + { url = "https://files.pythonhosted.org/packages/07/71/e0d11422ed027e21422f7bc1883c61deba2d9752b720538430c1deadfbca/jiter-0.12.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b1207af186495f48f72529f8d86671903c8c10127cac6381b11dddc4aaa52df6", size = 522046, upload-time = "2025-11-09T20:47:14.6Z" }, + { url = "https://files.pythonhosted.org/packages/9f/59/b968a9aa7102a8375dbbdfbd2aeebe563c7e5dddf0f47c9ef1588a97e224/jiter-0.12.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:ef2fb241de583934c9915a33120ecc06d94aa3381a134570f59eed784e87001e", size = 513392, upload-time = "2025-11-09T20:47:16.011Z" }, + { url = "https://files.pythonhosted.org/packages/ca/e4/7df62002499080dbd61b505c5cb351aa09e9959d176cac2aa8da6f93b13b/jiter-0.12.0-cp311-cp311-win32.whl", hash = "sha256:453b6035672fecce8007465896a25b28a6b59cfe8fbc974b2563a92f5a92a67c", size = 206096, upload-time = "2025-11-09T20:47:17.344Z" }, + { url = 
"https://files.pythonhosted.org/packages/bb/60/1032b30ae0572196b0de0e87dce3b6c26a1eff71aad5fe43dee3082d32e0/jiter-0.12.0-cp311-cp311-win_amd64.whl", hash = "sha256:ca264b9603973c2ad9435c71a8ec8b49f8f715ab5ba421c85a51cde9887e421f", size = 204899, upload-time = "2025-11-09T20:47:19.365Z" }, + { url = "https://files.pythonhosted.org/packages/49/d5/c145e526fccdb834063fb45c071df78b0cc426bbaf6de38b0781f45d956f/jiter-0.12.0-cp311-cp311-win_arm64.whl", hash = "sha256:cb00ef392e7d684f2754598c02c409f376ddcef857aae796d559e6cacc2d78a5", size = 188070, upload-time = "2025-11-09T20:47:20.75Z" }, + { url = "https://files.pythonhosted.org/packages/92/c9/5b9f7b4983f1b542c64e84165075335e8a236fa9e2ea03a0c79780062be8/jiter-0.12.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:305e061fa82f4680607a775b2e8e0bcb071cd2205ac38e6ef48c8dd5ebe1cf37", size = 314449, upload-time = "2025-11-09T20:47:22.999Z" }, + { url = "https://files.pythonhosted.org/packages/98/6e/e8efa0e78de00db0aee82c0cf9e8b3f2027efd7f8a71f859d8f4be8e98ef/jiter-0.12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5c1860627048e302a528333c9307c818c547f214d8659b0705d2195e1a94b274", size = 319855, upload-time = "2025-11-09T20:47:24.779Z" }, + { url = "https://files.pythonhosted.org/packages/20/26/894cd88e60b5d58af53bec5c6759d1292bd0b37a8b5f60f07abf7a63ae5f/jiter-0.12.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:df37577a4f8408f7e0ec3205d2a8f87672af8f17008358063a4d6425b6081ce3", size = 350171, upload-time = "2025-11-09T20:47:26.469Z" }, + { url = "https://files.pythonhosted.org/packages/f5/27/a7b818b9979ac31b3763d25f3653ec3a954044d5e9f5d87f2f247d679fd1/jiter-0.12.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:75fdd787356c1c13a4f40b43c2156276ef7a71eb487d98472476476d803fb2cf", size = 365590, upload-time = "2025-11-09T20:47:27.918Z" }, + { url = 
"https://files.pythonhosted.org/packages/ba/7e/e46195801a97673a83746170b17984aa8ac4a455746354516d02ca5541b4/jiter-0.12.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1eb5db8d9c65b112aacf14fcd0faae9913d07a8afea5ed06ccdd12b724e966a1", size = 479462, upload-time = "2025-11-09T20:47:29.654Z" }, + { url = "https://files.pythonhosted.org/packages/ca/75/f833bfb009ab4bd11b1c9406d333e3b4357709ed0570bb48c7c06d78c7dd/jiter-0.12.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:73c568cc27c473f82480abc15d1301adf333a7ea4f2e813d6a2c7d8b6ba8d0df", size = 378983, upload-time = "2025-11-09T20:47:31.026Z" }, + { url = "https://files.pythonhosted.org/packages/71/b3/7a69d77943cc837d30165643db753471aff5df39692d598da880a6e51c24/jiter-0.12.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4321e8a3d868919bcb1abb1db550d41f2b5b326f72df29e53b2df8b006eb9403", size = 361328, upload-time = "2025-11-09T20:47:33.286Z" }, + { url = "https://files.pythonhosted.org/packages/b0/ac/a78f90caf48d65ba70d8c6efc6f23150bc39dc3389d65bbec2a95c7bc628/jiter-0.12.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0a51bad79f8cc9cac2b4b705039f814049142e0050f30d91695a2d9a6611f126", size = 386740, upload-time = "2025-11-09T20:47:34.703Z" }, + { url = "https://files.pythonhosted.org/packages/39/b6/5d31c2cc8e1b6a6bcf3c5721e4ca0a3633d1ab4754b09bc7084f6c4f5327/jiter-0.12.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:2a67b678f6a5f1dd6c36d642d7db83e456bc8b104788262aaefc11a22339f5a9", size = 520875, upload-time = "2025-11-09T20:47:36.058Z" }, + { url = "https://files.pythonhosted.org/packages/30/b5/4df540fae4e9f68c54b8dab004bd8c943a752f0b00efd6e7d64aa3850339/jiter-0.12.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efe1a211fe1fd14762adea941e3cfd6c611a136e28da6c39272dbb7a1bbe6a86", size = 511457, upload-time = "2025-11-09T20:47:37.932Z" }, + { url = 
"https://files.pythonhosted.org/packages/07/65/86b74010e450a1a77b2c1aabb91d4a91dd3cd5afce99f34d75fd1ac64b19/jiter-0.12.0-cp312-cp312-win32.whl", hash = "sha256:d779d97c834b4278276ec703dc3fc1735fca50af63eb7262f05bdb4e62203d44", size = 204546, upload-time = "2025-11-09T20:47:40.47Z" }, + { url = "https://files.pythonhosted.org/packages/1c/c7/6659f537f9562d963488e3e55573498a442503ced01f7e169e96a6110383/jiter-0.12.0-cp312-cp312-win_amd64.whl", hash = "sha256:e8269062060212b373316fe69236096aaf4c49022d267c6736eebd66bbbc60bb", size = 205196, upload-time = "2025-11-09T20:47:41.794Z" }, + { url = "https://files.pythonhosted.org/packages/21/f4/935304f5169edadfec7f9c01eacbce4c90bb9a82035ac1de1f3bd2d40be6/jiter-0.12.0-cp312-cp312-win_arm64.whl", hash = "sha256:06cb970936c65de926d648af0ed3d21857f026b1cf5525cb2947aa5e01e05789", size = 186100, upload-time = "2025-11-09T20:47:43.007Z" }, + { url = "https://files.pythonhosted.org/packages/3d/a6/97209693b177716e22576ee1161674d1d58029eb178e01866a0422b69224/jiter-0.12.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:6cc49d5130a14b732e0612bc76ae8db3b49898732223ef8b7599aa8d9810683e", size = 313658, upload-time = "2025-11-09T20:47:44.424Z" }, + { url = "https://files.pythonhosted.org/packages/06/4d/125c5c1537c7d8ee73ad3d530a442d6c619714b95027143f1b61c0b4dfe0/jiter-0.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:37f27a32ce36364d2fa4f7fdc507279db604d27d239ea2e044c8f148410defe1", size = 318605, upload-time = "2025-11-09T20:47:45.973Z" }, + { url = "https://files.pythonhosted.org/packages/99/bf/a840b89847885064c41a5f52de6e312e91fa84a520848ee56c97e4fa0205/jiter-0.12.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bbc0944aa3d4b4773e348cda635252824a78f4ba44328e042ef1ff3f6080d1cf", size = 349803, upload-time = "2025-11-09T20:47:47.535Z" }, + { url = 
"https://files.pythonhosted.org/packages/8a/88/e63441c28e0db50e305ae23e19c1d8fae012d78ed55365da392c1f34b09c/jiter-0.12.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:da25c62d4ee1ffbacb97fac6dfe4dcd6759ebdc9015991e92a6eae5816287f44", size = 365120, upload-time = "2025-11-09T20:47:49.284Z" }, + { url = "https://files.pythonhosted.org/packages/0a/7c/49b02714af4343970eb8aca63396bc1c82fa01197dbb1e9b0d274b550d4e/jiter-0.12.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:048485c654b838140b007390b8182ba9774621103bd4d77c9c3f6f117474ba45", size = 479918, upload-time = "2025-11-09T20:47:50.807Z" }, + { url = "https://files.pythonhosted.org/packages/69/ba/0a809817fdd5a1db80490b9150645f3aae16afad166960bcd562be194f3b/jiter-0.12.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:635e737fbb7315bef0037c19b88b799143d2d7d3507e61a76751025226b3ac87", size = 379008, upload-time = "2025-11-09T20:47:52.211Z" }, + { url = "https://files.pythonhosted.org/packages/5f/c3/c9fc0232e736c8877d9e6d83d6eeb0ba4e90c6c073835cc2e8f73fdeef51/jiter-0.12.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e017c417b1ebda911bd13b1e40612704b1f5420e30695112efdbed8a4b389ed", size = 361785, upload-time = "2025-11-09T20:47:53.512Z" }, + { url = "https://files.pythonhosted.org/packages/96/61/61f69b7e442e97ca6cd53086ddc1cf59fb830549bc72c0a293713a60c525/jiter-0.12.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:89b0bfb8b2bf2351fba36bb211ef8bfceba73ef58e7f0c68fb67b5a2795ca2f9", size = 386108, upload-time = "2025-11-09T20:47:54.893Z" }, + { url = "https://files.pythonhosted.org/packages/e9/2e/76bb3332f28550c8f1eba3bf6e5efe211efda0ddbbaf24976bc7078d42a5/jiter-0.12.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:f5aa5427a629a824a543672778c9ce0c5e556550d1569bb6ea28a85015287626", size = 519937, upload-time = "2025-11-09T20:47:56.253Z" }, + { url = 
"https://files.pythonhosted.org/packages/84/d6/fa96efa87dc8bff2094fb947f51f66368fa56d8d4fc9e77b25d7fbb23375/jiter-0.12.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ed53b3d6acbcb0fd0b90f20c7cb3b24c357fe82a3518934d4edfa8c6898e498c", size = 510853, upload-time = "2025-11-09T20:47:58.32Z" }, + { url = "https://files.pythonhosted.org/packages/8a/28/93f67fdb4d5904a708119a6ab58a8f1ec226ff10a94a282e0215402a8462/jiter-0.12.0-cp313-cp313-win32.whl", hash = "sha256:4747de73d6b8c78f2e253a2787930f4fffc68da7fa319739f57437f95963c4de", size = 204699, upload-time = "2025-11-09T20:47:59.686Z" }, + { url = "https://files.pythonhosted.org/packages/c4/1f/30b0eb087045a0abe2a5c9c0c0c8da110875a1d3be83afd4a9a4e548be3c/jiter-0.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:e25012eb0c456fcc13354255d0338cd5397cce26c77b2832b3c4e2e255ea5d9a", size = 204258, upload-time = "2025-11-09T20:48:01.01Z" }, + { url = "https://files.pythonhosted.org/packages/2c/f4/2b4daf99b96bce6fc47971890b14b2a36aef88d7beb9f057fafa032c6141/jiter-0.12.0-cp313-cp313-win_arm64.whl", hash = "sha256:c97b92c54fe6110138c872add030a1f99aea2401ddcdaa21edf74705a646dd60", size = 185503, upload-time = "2025-11-09T20:48:02.35Z" }, + { url = "https://files.pythonhosted.org/packages/39/ca/67bb15a7061d6fe20b9b2a2fd783e296a1e0f93468252c093481a2f00efa/jiter-0.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:53839b35a38f56b8be26a7851a48b89bc47e5d88e900929df10ed93b95fea3d6", size = 317965, upload-time = "2025-11-09T20:48:03.783Z" }, + { url = "https://files.pythonhosted.org/packages/18/af/1788031cd22e29c3b14bc6ca80b16a39a0b10e611367ffd480c06a259831/jiter-0.12.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94f669548e55c91ab47fef8bddd9c954dab1938644e715ea49d7e117015110a4", size = 345831, upload-time = "2025-11-09T20:48:05.55Z" }, + { url = 
"https://files.pythonhosted.org/packages/05/17/710bf8472d1dff0d3caf4ced6031060091c1320f84ee7d5dcbed1f352417/jiter-0.12.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:351d54f2b09a41600ffea43d081522d792e81dcfb915f6d2d242744c1cc48beb", size = 361272, upload-time = "2025-11-09T20:48:06.951Z" }, + { url = "https://files.pythonhosted.org/packages/fb/f1/1dcc4618b59761fef92d10bcbb0b038b5160be653b003651566a185f1a5c/jiter-0.12.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2a5e90604620f94bf62264e7c2c038704d38217b7465b863896c6d7c902b06c7", size = 204604, upload-time = "2025-11-09T20:48:08.328Z" }, + { url = "https://files.pythonhosted.org/packages/d9/32/63cb1d9f1c5c6632a783c0052cde9ef7ba82688f7065e2f0d5f10a7e3edb/jiter-0.12.0-cp313-cp313t-win_arm64.whl", hash = "sha256:88ef757017e78d2860f96250f9393b7b577b06a956ad102c29c8237554380db3", size = 185628, upload-time = "2025-11-09T20:48:09.572Z" }, + { url = "https://files.pythonhosted.org/packages/a8/99/45c9f0dbe4a1416b2b9a8a6d1236459540f43d7fb8883cff769a8db0612d/jiter-0.12.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:c46d927acd09c67a9fb1416df45c5a04c27e83aae969267e98fba35b74e99525", size = 312478, upload-time = "2025-11-09T20:48:10.898Z" }, + { url = "https://files.pythonhosted.org/packages/4c/a7/54ae75613ba9e0f55fcb0bc5d1f807823b5167cc944e9333ff322e9f07dd/jiter-0.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:774ff60b27a84a85b27b88cd5583899c59940bcc126caca97eb2a9df6aa00c49", size = 318706, upload-time = "2025-11-09T20:48:12.266Z" }, + { url = "https://files.pythonhosted.org/packages/59/31/2aa241ad2c10774baf6c37f8b8e1f39c07db358f1329f4eb40eba179c2a2/jiter-0.12.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5433fab222fb072237df3f637d01b81f040a07dcac1cb4a5c75c7aa9ed0bef1", size = 351894, upload-time = "2025-11-09T20:48:13.673Z" }, + { url = 
"https://files.pythonhosted.org/packages/54/4f/0f2759522719133a9042781b18cc94e335b6d290f5e2d3e6899d6af933e3/jiter-0.12.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f8c593c6e71c07866ec6bfb790e202a833eeec885022296aff6b9e0b92d6a70e", size = 365714, upload-time = "2025-11-09T20:48:15.083Z" }, + { url = "https://files.pythonhosted.org/packages/dc/6f/806b895f476582c62a2f52c453151edd8a0fde5411b0497baaa41018e878/jiter-0.12.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:90d32894d4c6877a87ae00c6b915b609406819dce8bc0d4e962e4de2784e567e", size = 478989, upload-time = "2025-11-09T20:48:16.706Z" }, + { url = "https://files.pythonhosted.org/packages/86/6c/012d894dc6e1033acd8db2b8346add33e413ec1c7c002598915278a37f79/jiter-0.12.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:798e46eed9eb10c3adbbacbd3bdb5ecd4cf7064e453d00dbef08802dae6937ff", size = 378615, upload-time = "2025-11-09T20:48:18.614Z" }, + { url = "https://files.pythonhosted.org/packages/87/30/d718d599f6700163e28e2c71c0bbaf6dace692e7df2592fd793ac9276717/jiter-0.12.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b3f1368f0a6719ea80013a4eb90ba72e75d7ea67cfc7846db2ca504f3df0169a", size = 364745, upload-time = "2025-11-09T20:48:20.117Z" }, + { url = "https://files.pythonhosted.org/packages/8f/85/315b45ce4b6ddc7d7fceca24068543b02bdc8782942f4ee49d652e2cc89f/jiter-0.12.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:65f04a9d0b4406f7e51279710b27484af411896246200e461d80d3ba0caa901a", size = 386502, upload-time = "2025-11-09T20:48:21.543Z" }, + { url = "https://files.pythonhosted.org/packages/74/0b/ce0434fb40c5b24b368fe81b17074d2840748b4952256bab451b72290a49/jiter-0.12.0-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:fd990541982a24281d12b67a335e44f117e4c6cbad3c3b75c7dea68bf4ce3a67", size = 519845, upload-time = "2025-11-09T20:48:22.964Z" }, + { url = 
"https://files.pythonhosted.org/packages/e8/a3/7a7a4488ba052767846b9c916d208b3ed114e3eb670ee984e4c565b9cf0d/jiter-0.12.0-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:b111b0e9152fa7df870ecaebb0bd30240d9f7fff1f2003bcb4ed0f519941820b", size = 510701, upload-time = "2025-11-09T20:48:24.483Z" }, + { url = "https://files.pythonhosted.org/packages/c3/16/052ffbf9d0467b70af24e30f91e0579e13ded0c17bb4a8eb2aed3cb60131/jiter-0.12.0-cp314-cp314-win32.whl", hash = "sha256:a78befb9cc0a45b5a5a0d537b06f8544c2ebb60d19d02c41ff15da28a9e22d42", size = 205029, upload-time = "2025-11-09T20:48:25.749Z" }, + { url = "https://files.pythonhosted.org/packages/e4/18/3cf1f3f0ccc789f76b9a754bdb7a6977e5d1d671ee97a9e14f7eb728d80e/jiter-0.12.0-cp314-cp314-win_amd64.whl", hash = "sha256:e1fe01c082f6aafbe5c8faf0ff074f38dfb911d53f07ec333ca03f8f6226debf", size = 204960, upload-time = "2025-11-09T20:48:27.415Z" }, + { url = "https://files.pythonhosted.org/packages/02/68/736821e52ecfdeeb0f024b8ab01b5a229f6b9293bbdb444c27efade50b0f/jiter-0.12.0-cp314-cp314-win_arm64.whl", hash = "sha256:d72f3b5a432a4c546ea4bedc84cce0c3404874f1d1676260b9c7f048a9855451", size = 185529, upload-time = "2025-11-09T20:48:29.125Z" }, + { url = "https://files.pythonhosted.org/packages/30/61/12ed8ee7a643cce29ac97c2281f9ce3956eb76b037e88d290f4ed0d41480/jiter-0.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:e6ded41aeba3603f9728ed2b6196e4df875348ab97b28fc8afff115ed42ba7a7", size = 318974, upload-time = "2025-11-09T20:48:30.87Z" }, + { url = "https://files.pythonhosted.org/packages/2d/c6/f3041ede6d0ed5e0e79ff0de4c8f14f401bbf196f2ef3971cdbe5fd08d1d/jiter-0.12.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a947920902420a6ada6ad51892082521978e9dd44a802663b001436e4b771684", size = 345932, upload-time = "2025-11-09T20:48:32.658Z" }, + { url = 
"https://files.pythonhosted.org/packages/d5/5d/4d94835889edd01ad0e2dbfc05f7bdfaed46292e7b504a6ac7839aa00edb/jiter-0.12.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:add5e227e0554d3a52cf390a7635edaffdf4f8fce4fdbcef3cc2055bb396a30c", size = 367243, upload-time = "2025-11-09T20:48:34.093Z" }, + { url = "https://files.pythonhosted.org/packages/fd/76/0051b0ac2816253a99d27baf3dda198663aff882fa6ea7deeb94046da24e/jiter-0.12.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f9b1cda8fcb736250d7e8711d4580ebf004a46771432be0ae4796944b5dfa5d", size = 479315, upload-time = "2025-11-09T20:48:35.507Z" }, + { url = "https://files.pythonhosted.org/packages/70/ae/83f793acd68e5cb24e483f44f482a1a15601848b9b6f199dacb970098f77/jiter-0.12.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:deeb12a2223fe0135c7ff1356a143d57f95bbf1f4a66584f1fc74df21d86b993", size = 380714, upload-time = "2025-11-09T20:48:40.014Z" }, + { url = "https://files.pythonhosted.org/packages/b1/5e/4808a88338ad2c228b1126b93fcd8ba145e919e886fe910d578230dabe3b/jiter-0.12.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c596cc0f4cb574877550ce4ecd51f8037469146addd676d7c1a30ebe6391923f", size = 365168, upload-time = "2025-11-09T20:48:41.462Z" }, + { url = "https://files.pythonhosted.org/packages/0c/d4/04619a9e8095b42aef436b5aeb4c0282b4ff1b27d1db1508df9f5dc82750/jiter-0.12.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5ab4c823b216a4aeab3fdbf579c5843165756bd9ad87cc6b1c65919c4715f783", size = 387893, upload-time = "2025-11-09T20:48:42.921Z" }, + { url = "https://files.pythonhosted.org/packages/17/ea/d3c7e62e4546fdc39197fa4a4315a563a89b95b6d54c0d25373842a59cbe/jiter-0.12.0-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:e427eee51149edf962203ff8db75a7514ab89be5cb623fb9cea1f20b54f1107b", size = 520828, upload-time = "2025-11-09T20:48:44.278Z" }, + { url = 
"https://files.pythonhosted.org/packages/cc/0b/c6d3562a03fd767e31cb119d9041ea7958c3c80cb3d753eafb19b3b18349/jiter-0.12.0-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:edb868841f84c111255ba5e80339d386d937ec1fdce419518ce1bd9370fac5b6", size = 511009, upload-time = "2025-11-09T20:48:45.726Z" }, + { url = "https://files.pythonhosted.org/packages/aa/51/2cb4468b3448a8385ebcd15059d325c9ce67df4e2758d133ab9442b19834/jiter-0.12.0-cp314-cp314t-win32.whl", hash = "sha256:8bbcfe2791dfdb7c5e48baf646d37a6a3dcb5a97a032017741dea9f817dca183", size = 205110, upload-time = "2025-11-09T20:48:47.033Z" }, + { url = "https://files.pythonhosted.org/packages/b2/c5/ae5ec83dec9c2d1af805fd5fe8f74ebded9c8670c5210ec7820ce0dbeb1e/jiter-0.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:2fa940963bf02e1d8226027ef461e36af472dea85d36054ff835aeed944dd873", size = 205223, upload-time = "2025-11-09T20:48:49.076Z" }, + { url = "https://files.pythonhosted.org/packages/97/9a/3c5391907277f0e55195550cf3fa8e293ae9ee0c00fb402fec1e38c0c82f/jiter-0.12.0-cp314-cp314t-win_arm64.whl", hash = "sha256:506c9708dd29b27288f9f8f1140c3cb0e3d8ddb045956d7757b1fa0e0f39a473", size = 185564, upload-time = "2025-11-09T20:48:50.376Z" }, + { url = "https://files.pythonhosted.org/packages/fe/54/5339ef1ecaa881c6948669956567a64d2670941925f245c434f494ffb0e5/jiter-0.12.0-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:4739a4657179ebf08f85914ce50332495811004cc1747852e8b2041ed2aab9b8", size = 311144, upload-time = "2025-11-09T20:49:10.503Z" }, + { url = "https://files.pythonhosted.org/packages/27/74/3446c652bffbd5e81ab354e388b1b5fc1d20daac34ee0ed11ff096b1b01a/jiter-0.12.0-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:41da8def934bf7bec16cb24bd33c0ca62126d2d45d81d17b864bd5ad721393c3", size = 305877, upload-time = "2025-11-09T20:49:12.269Z" }, + { url = 
"https://files.pythonhosted.org/packages/a1/f4/ed76ef9043450f57aac2d4fbeb27175aa0eb9c38f833be6ef6379b3b9a86/jiter-0.12.0-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c44ee814f499c082e69872d426b624987dbc5943ab06e9bbaa4f81989fdb79e", size = 340419, upload-time = "2025-11-09T20:49:13.803Z" }, + { url = "https://files.pythonhosted.org/packages/21/01/857d4608f5edb0664aa791a3d45702e1a5bcfff9934da74035e7b9803846/jiter-0.12.0-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd2097de91cf03eaa27b3cbdb969addf83f0179c6afc41bbc4513705e013c65d", size = 347212, upload-time = "2025-11-09T20:49:15.643Z" }, + { url = "https://files.pythonhosted.org/packages/cb/f5/12efb8ada5f5c9edc1d4555fe383c1fb2eac05ac5859258a72d61981d999/jiter-0.12.0-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:e8547883d7b96ef2e5fe22b88f8a4c8725a56e7f4abafff20fd5272d634c7ecb", size = 309974, upload-time = "2025-11-09T20:49:17.187Z" }, + { url = "https://files.pythonhosted.org/packages/85/15/d6eb3b770f6a0d332675141ab3962fd4a7c270ede3515d9f3583e1d28276/jiter-0.12.0-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:89163163c0934854a668ed783a2546a0617f71706a2551a4a0666d91ab365d6b", size = 304233, upload-time = "2025-11-09T20:49:18.734Z" }, + { url = "https://files.pythonhosted.org/packages/8c/3e/e7e06743294eea2cf02ced6aa0ff2ad237367394e37a0e2b4a1108c67a36/jiter-0.12.0-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d96b264ab7d34bbb2312dedc47ce07cd53f06835eacbc16dde3761f47c3a9e7f", size = 338537, upload-time = "2025-11-09T20:49:20.317Z" }, + { url = "https://files.pythonhosted.org/packages/2f/9c/6753e6522b8d0ef07d3a3d239426669e984fb0eba15a315cdbc1253904e4/jiter-0.12.0-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:c24e864cb30ab82311c6425655b0cdab0a98c5d973b065c66a3f020740c2324c", size = 346110, upload-time = "2025-11-09T20:49:21.817Z" }, +] + [[package]] name = "jsonschema" version = "4.24.0" @@ -1161,6 +1321,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a1/30/9ec597c962c5249ebd5c580386e4b5f2884cd943af42634291ee3b406415/leather-0.4.0-py2.py3-none-any.whl", hash = "sha256:18290bc93749ae39039af5e31e871fcfad74d26c4c3ea28ea4f681f4571b3a2b", size = 30256, upload-time = "2024-02-23T22:03:34.75Z" }, ] +[[package]] +name = "logfire-api" +version = "4.15.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e2/e6/3895c0ebf9f6a7acee04a816a569ca871c3d3048fdbd6b2a041f980abc54/logfire_api-4.15.1.tar.gz", hash = "sha256:3fbafc5593f4a16a038a3d23c67a7a7ee9da8be9e3b148fa73069d32e1ed4e8e", size = 57614, upload-time = "2025-11-20T15:52:17.019Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f3/2b/851e78a60b85e8e8e8c6ebb9928f8e883df0340a93e34960ed9f0a41fa82/logfire_api-4.15.1-py3-none-any.whl", hash = "sha256:a88b5c4b6e4acbf6f35a3e992a63f271cf2797aefd21e1cfc93d52b21ade65f6", size = 95031, upload-time = "2025-11-20T15:52:14.433Z" }, +] + [[package]] name = "makefun" version = "1.16.0" @@ -1470,6 +1639,7 @@ dependencies = [ { name = "pandas" }, { name = "posthog" }, { name = "psutil" }, + { name = "pydantic-ai-slim", extra = ["anthropic", "openai"] }, { name = "pyyaml" }, { name = "starlette" }, { name = "uvicorn", extra = ["standard"] }, @@ -1557,6 +1727,7 @@ requires-dist = [ { name = "pandas-stubs", marker = "extra == 'dev'" }, { name = "posthog", specifier = ">=3.0.0" }, { name = "psutil", specifier = ">=5.9.0" }, + { name = "pydantic-ai-slim", extras = ["anthropic", "openai"], specifier = ">=1.25.0" }, { name = "pytest", marker = "extra == 'all'" }, { name = "pytest", marker = "extra == 'dev'" }, { name = "pytest-asyncio", marker = "extra == 'all'" }, @@ -1849,6 +2020,25 @@ wheels = [ { 
url = "https://files.pythonhosted.org/packages/c7/1d/91becb8fa0e417c172a5721c06dc403ad2abbbc766e9a8bdeff46bdea6ba/onepassword_sdk-0.3.1-cp313-cp313-win_amd64.whl", hash = "sha256:4a77fa3fdbad03738faf2703387256f53c2c86329bcd9f19ee5725a2075db77b", size = 5506878, upload-time = "2025-06-11T17:24:33.339Z" }, ] +[[package]] +name = "openai" +version = "2.8.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "distro" }, + { name = "httpx" }, + { name = "jiter" }, + { name = "pydantic" }, + { name = "sniffio" }, + { name = "tqdm" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d5/e4/42591e356f1d53c568418dc7e30dcda7be31dd5a4d570bca22acb0525862/openai-2.8.1.tar.gz", hash = "sha256:cb1b79eef6e809f6da326a7ef6038719e35aa944c42d081807bfa1be8060f15f", size = 602490, upload-time = "2025-11-17T22:39:59.549Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/55/4f/dbc0c124c40cb390508a82770fb9f6e3ed162560181a85089191a851c59a/openai-2.8.1-py3-none-any.whl", hash = "sha256:c6c3b5a04994734386e8dad3c00a393f56d3b68a27cd2e8acae91a59e4122463", size = 1022688, upload-time = "2025-11-17T22:39:57.675Z" }, +] + [[package]] name = "opentelemetry-api" version = "1.36.0" @@ -2277,6 +2467,32 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6a/c0/ec2b1c8712ca690e5d61979dee872603e92b8a32f94cc1b72d53beab008a/pydantic-2.11.7-py3-none-any.whl", hash = "sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b", size = 444782, upload-time = "2025-06-14T08:33:14.905Z" }, ] +[[package]] +name = "pydantic-ai-slim" +version = "1.25.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, + { name = "genai-prices" }, + { name = "griffe" }, + { name = "httpx" }, + { name = "opentelemetry-api" }, + { name = "pydantic" }, + { name = "pydantic-graph" }, + { name = "typing-inspection" 
}, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/25/b0/7e3de325bf45d7fbf798ec7c74894f18a6fb4bebb8f250936dd26015d4cf/pydantic_ai_slim-1.25.0-py3-none-any.whl", hash = "sha256:87fd01472939862ffba92dc7f93ae2cb47d6a417c0278846dd24ea7f5164f9a8", size = 420416, upload-time = "2025-11-28T05:04:33.012Z" }, +] + +[package.optional-dependencies] +anthropic = [ + { name = "anthropic" }, +] +openai = [ + { name = "openai" }, +] + [[package]] name = "pydantic-core" version = "2.33.2" @@ -2364,6 +2580,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/32/56/8a7ca5d2cd2cda1d245d34b1c9a942920a718082ae8e54e5f3e5a58b7add/pydantic_core-2.33.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1", size = 2066757, upload-time = "2025-04-23T18:33:30.645Z" }, ] +[[package]] +name = "pydantic-graph" +version = "1.25.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "httpx" }, + { name = "logfire-api" }, + { name = "pydantic" }, + { name = "typing-inspection" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/84/3e/c6f5d0a1a22e8ad968c7fb9ea443a1310f7878a6d0a7682526ee210684c5/pydantic_graph-1.25.0-py3-none-any.whl", hash = "sha256:30f0890729cae49f6967297815d4e226557001c650ffe1500fe7ea517561bc2b", size = 72262, upload-time = "2025-11-28T05:04:36.83Z" }, +] + [[package]] name = "pydantic-settings" version = "2.10.1" @@ -2951,6 +3181,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6e/c2/61d3e0f47e2b74ef40a68b9e6ad5984f6241a942f7cd3bbfbdbd03861ea9/tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc", size = 14257, upload-time = "2024-11-27T22:38:35.385Z" }, ] +[[package]] +name = "tqdm" +version = "4.67.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737, upload-time = "2024-11-24T20:12:22.481Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" }, +] + [[package]] name = "twine" version = "6.1.0"