diff --git a/docs/guides/configuration.md b/docs/guides/configuration.md
index fd660da0..2fa99046 100644
--- a/docs/guides/configuration.md
+++ b/docs/guides/configuration.md
@@ -609,19 +609,14 @@ Add model configuration to your user config file (`~/.mxcp/config.yml`):
 
 ```yaml
 models:
-  default: "claude-4-sonnet"  # Default model to use for evals
+  default: "claude-3-5-sonnet-20240620"  # Default model to use for evals (update to a valid ID)
   models:
-    claude-4-opus:
-      type: "claude"
+    claude-3-5-sonnet-20240620:
+      type: "anthropic"
       api_key: "${ANTHROPIC_API_KEY}"  # Environment variable containing API key
-      timeout: 60  # Request timeout in seconds
+      timeout: 30  # Request timeout in seconds
       max_retries: 3  # Number of retries on failure
     
-    claude-4-sonnet:
-      type: "claude"
-      api_key: "${ANTHROPIC_API_KEY}"
-      timeout: 30
-    
     gpt-4o:
       type: "openai"
       api_key: "${OPENAI_API_KEY}"
@@ -638,11 +633,48 @@ models:
 
 - **default**: The model to use when not specified in eval suite or CLI
 - **models**: Dictionary of model configurations
-  - **type**: Either "claude" or "openai"
+  - **type**: Either "anthropic" or "openai"
   - **api_key**: API key (you can use environment variables references)
-  - **base_url**: Custom API endpoint (optional, for OpenAI-compatible services)
-  - **timeout**: Request timeout in seconds
-  - **max_retries**: Number of retries on failure
+  - **base_url**: Custom API endpoint (optional, for OpenAI-compatible services)
+  - **timeout**: Request timeout in seconds
+  - **max_retries**: Number of retries on failure
+  - **options**: Extra provider-specific options forwarded to the model (e.g. `thinking: false`)
+
+Example with mixed providers and options:
+
+```yaml
+models:
+  default: "gpt-4o"
+  models:
+    gpt-4o:
+      type: "openai"
+      api_key: "${OPENAI_API_KEY}"
+      timeout: 45
+      options:
+        reasoning: "fast"
+    claude-3-5-sonnet-20240620:
+      type: "anthropic"
+      api_key: "${ANTHROPIC_API_KEY}"
+      timeout: 30
+      options:
+        thinking: false
+
+---
+# Using OpenAI Responses API with reasoning: set `api: responses` to route through the Responses endpoint
+models:
+  default: "gpt-5"
+  models:
+    gpt-5:
+      type: "openai"
+      api_key: "${OPENAI_API_KEY}"
+      options:
+        api: "responses"          # Choices: responses (for OpenAI Responses API), chat (default)
+        # Provider-specific fields must be prefixed:
+        # - body:<key> goes into the request body
+        # - header:<key> goes into request headers
+        body:reasoning:
+          effort: "medium"        # Passed via extra_body to the provider
+```
 
 For more information on using evals, see the [LLM Evaluation section](quality.md#llm-evaluation-evals) in the Quality & Testing Guide.
 
diff --git a/docs/guides/quality.md b/docs/guides/quality.md
index 9b732c4f..d52a8c46 100644
--- a/docs/guides/quality.md
+++ b/docs/guides/quality.md
@@ -805,7 +805,7 @@ Create eval files with the suffix `-evals.yml` or `.evals.yml`:
 mxcp: 1
 suite: customer_analysis
 description: "Test LLM's ability to analyze customer data"
-model: claude-3-opus  # Optional: specify model for this suite
+model: claude-3-5-sonnet-20240620  # Optional: specify model for this suite (ensure valid ID)
 
 tests:
   - name: churn_risk_assessment
@@ -819,6 +819,7 @@ tests:
         - tool: get_churn_score
           args:
             customer_id: "ABC"
+      expected_answer: "The customer is high risk of churn"
       answer_contains:
         - "risk"
         - "churn"
@@ -880,6 +881,115 @@ answer_not_contains:
   - "unauthorized"
 ```
 
+#### `expected_answer`
+Checks the model's final answer against an expected answer using the LLM as a grader. The grader
+returns `correct`, `wrong`, or `partially correct` plus a short comment.
+
+```yaml
+expected_answer: "The customer is high risk of churn"
+```
+
+### Complete Eval Example
+
+```yaml
+# faq-evals.yml
+mxcp: 1
+suite: faq_checks
+description: "Make sure the assistant answers FAQs accurately and uses tools when needed"
+model: gpt-4o
+
+tests:
+  - name: tool_usage_for_price_lookup
+    prompt: "What's the current price for SKU-1234?"
+    assertions:
+      must_call:
+        - tool: get_product_price
+          args:
+            sku: "SKU-1234"
+      answer_contains:
+        - "price"
+
+  - name: expected_answer_grading
+    prompt: "What are your support hours?"
+    assertions:
+      expected_answer: "Our support team is available Monday to Friday, 9am-5pm local time."
+      answer_contains:
+        - "Monday"
+        - "Friday"
+```
+
+### Customizing the System Prompt
+
+Each eval suite can override the default LLM instructions to better match your domain or desired behavior. Add a `system_prompt` field at the suite level—if it is omitted, MXCP falls back to the built-in prompt that encourages concise, tool-aware answers.
+
+```yaml
+mxcp: 1
+suite: relationship_navigation
+description: "Ensure the assistant navigates relationships carefully"
+model: gpt-4o
+system_prompt: |
+  You are a Vertec specialist. Always explain which tool you used.
+  If a tool fails, read the error carefully before trying again.
+
+tests:
+  - name: compare_owners
+    prompt: "Are the owners of Project A and Project B the same?"
+    assertions:
+      must_call:
+        - tool: sql_search_objects
+          args:
+            object_type: "Project"
+```
+
+### Model Configuration Example
+
+Add models to your user config (`~/.mxcp/config.yml`) so evals know which providers to call:
+
+```yaml
+models:
+  default: "claude-3-5-sonnet-20240620"
+  models:
+    claude-3-5-sonnet-20240620:
+      type: "anthropic"
+      api_key: "${ANTHROPIC_API_KEY}"
+      timeout: 30
+    gpt-4o:
+      type: "openai"
+      api_key: "${OPENAI_API_KEY}"
+      base_url: "https://api.openai.com/v1"
+      timeout: 45
+      options:
+        reasoning: "fast"  # forwarded to the provider as-is
+---
+# Eval suite file (e.g. faq-evals.yml): use a faster model just for grading expected answers
+mxcp: 1
+suite: faq_checks
+model: gpt-4o                   # primary model used to answer
+expected_answer_model: gpt-4o-mini  # model used only for grading expected answers
+tests:
+  - name: expected_answer_grading
+    prompt: "What are your support hours?"
+    assertions:
+      expected_answer: "Our support team is available Monday to Friday, 9am-5pm local time."
+      # expected_answer_model is useful when:
+      # - Your main model is slow/expensive, but grading can use a lighter model
+      # - You want deterministic, faster grading for many evals
+---
+# User config (~/.mxcp/config.yml): OpenAI Responses API example (reasoning)
+models:
+  default: "gpt-5"
+  models:
+    gpt-5:
+      type: "openai"
+      api_key: "${OPENAI_API_KEY}"
+      options:
+        api: "responses"          # Choices: responses (Responses API) or chat (default)
+        # Provider-specific fields must use prefixes:
+        #   body:<key> for request body, header:<key> for headers
+        body:reasoning:
+          effort: "medium"
+```
+
 ### Running Evals
 
 ```bash
@@ -910,7 +1020,7 @@ models:
   default: claude-3-opus
   models:
     claude-3-opus:
-      type: claude
+      type: anthropic
       api_key: ${ANTHROPIC_API_KEY}
       timeout: 60
       max_retries: 3
@@ -1037,4 +1147,4 @@ Well-tested endpoints with rich metadata provide:
 - Faster debugging
 - Safe AI interactions
 
-Remember: LLMs perform best when they clearly understand what your endpoints do, how to use them, and what to expect in return! 
\ No newline at end of file
+Remember: LLMs perform best when they clearly understand what your endpoints do, how to use them, and what to expect in return! 
diff --git a/docs/reference/cli.md b/docs/reference/cli.md
index 7ef974a0..cf67984a 100644
--- a/docs/reference/cli.md
+++ b/docs/reference/cli.md
@@ -624,4 +624,4 @@ The following environment variables can be used to configure MXCP:
 - `MXCP_TELEMETRY_TRACING_CONSOLE`: Enable console trace export for debugging (`true`/`false`)
 - `MXCP_TELEMETRY_METRICS_INTERVAL`: Metrics export interval in seconds (default: `60`)
 
-For more details on environment variables and their usage, see the [Configuration Guide](../guides/configuration.md) and [Observability Guide](../guides/observability.md). 
\ No newline at end of file
+For more details on environment variables and their usage, see the [Configuration Guide](../guides/configuration.md) and [Observability Guide](../guides/observability.md). 
diff --git a/pyproject.toml b/pyproject.toml
index aa7450bf..01f020e9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,6 +41,7 @@ dependencies = [
     "fastapi>=0.110.0",  # FastAPI for admin API
     "uvicorn[standard]>=0.27.0",  # ASGI server for admin API
     "psutil>=5.9.0",  # System metrics for admin API
+    "pydantic-ai-slim[anthropic,openai]>=1.25.0",
 ]
 
 [project.scripts]
diff --git a/src/mxcp/sdk/evals/__init__.py b/src/mxcp/sdk/evals/__init__.py
index 9f8a91b0..9045c433 100644
--- a/src/mxcp/sdk/evals/__init__.py
+++ b/src/mxcp/sdk/evals/__init__.py
@@ -9,15 +9,13 @@
 - Tool definition types for describing available tools to the LLM
 """
 
-from ._types import ClaudeConfig, ModelConfigType, OpenAIConfig, ParameterDefinition, ToolDefinition
-from .executor import LLMExecutor, ToolExecutor
+from ._types import ParameterDefinition, ToolDefinition
+from .executor import LLMExecutor, ProviderConfig, ToolExecutor
 
 __all__ = [
     "LLMExecutor",
     "ToolExecutor",
     "ToolDefinition",
     "ParameterDefinition",
-    "ModelConfigType",
-    "ClaudeConfig",
-    "OpenAIConfig",
+    "ProviderConfig",
 ]
diff --git a/src/mxcp/sdk/evals/_types.py b/src/mxcp/sdk/evals/_types.py
index 6b9359b7..60bab68e 100644
--- a/src/mxcp/sdk/evals/_types.py
+++ b/src/mxcp/sdk/evals/_types.py
@@ -1,54 +1,16 @@
 """Types for MXCP SDK Evals module.
 
-This module contains type definitions for LLM models, tool definitions,
-and other data structures used in the evaluation framework.
+This module contains type definitions for tool definitions and
+other data structures used in the evaluation framework.
 """
 
-from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 from typing import Any
 
 from mxcp.sdk.validator import TypeSchemaModel
 
-
-# LLM Model configuration types
-@dataclass
-class ModelConfig(ABC):
-    """Base class for LLM model configurations."""
-
-    name: str
-    api_key: str
-
-    @abstractmethod
-    def get_type(self) -> str:
-        """Get the type identifier for this model."""
-        pass
-
-
-@dataclass
-class ClaudeConfig(ModelConfig):
-    """Configuration for Claude models."""
-
-    base_url: str = "https://api.anthropic.com"
-    timeout: int = 30
-
-    def get_type(self) -> str:
-        return "claude"
-
-
-@dataclass
-class OpenAIConfig(ModelConfig):
-    """Configuration for OpenAI models."""
-
-    base_url: str = "https://api.openai.com/v1"
-    timeout: int = 30
-
-    def get_type(self) -> str:
-        return "openai"
-
-
-# Union type for all supported model configurations
-ModelConfigType = ClaudeConfig | OpenAIConfig
+# Type alias for JSON Schema representation
+JsonSchema = dict[str, Any]
 
 
 @dataclass
@@ -60,6 +22,8 @@ class ParameterDefinition:
     description: str = ""
     default: Any | None = None
     required: bool = True
+    schema: JsonSchema | None = None
+    """Optional JSON Schema for complex parameter validation."""
 
 
 @dataclass
diff --git a/src/mxcp/sdk/evals/executor.py b/src/mxcp/sdk/evals/executor.py
index 55a68451..8de4e02e 100644
--- a/src/mxcp/sdk/evals/executor.py
+++ b/src/mxcp/sdk/evals/executor.py
@@ -1,367 +1,423 @@
-"""Core LLM executor for MXCP SDK Evals.
+"""Agent-style LLM executor for MXCP evals."""
 
-This module provides the main LLMExecutor class that handles LLM orchestration
-and tool calling, with tool execution delegated to external implementations.
-"""
+from __future__ import annotations
 
-import json
 import logging
-import re
-from typing import Any, Protocol, cast
-
-import httpx
+from collections.abc import Callable
+from dataclasses import dataclass, field
+from typing import Any, Protocol
+
+from pydantic import BaseModel, Field, create_model
+from pydantic_ai import Agent, ModelSettings, RunContext
+from pydantic_ai.exceptions import ModelRetry, UnexpectedModelBehavior, UsageLimitExceeded
+from pydantic_ai.models.anthropic import AnthropicModel
+from pydantic_ai.models.openai import OpenAIChatModel, OpenAIResponsesModel
+from pydantic_ai.providers.anthropic import AnthropicProvider
+from pydantic_ai.providers.openai import OpenAIProvider
+from pydantic_ai.tools import Tool
+from pydantic_ai.tools import ToolDefinition as AgentToolDefinition
 
 from mxcp.sdk.auth import UserContextModel
 
-from ._types import ModelConfigType, ToolDefinition
+from ._types import ToolDefinition
+
+# Agent/tool retry configuration
+DEFAULT_AGENT_RETRIES = 30
+
+# Type alias for model references (either a model object or a string identifier)
+ModelReference = OpenAIChatModel | OpenAIResponsesModel | AnthropicModel | str
 
 logger = logging.getLogger(__name__)
 
 
 class ToolExecutor(Protocol):
-    """Protocol for tool execution strategies.
-
-    Different contexts can implement this protocol to provide their own
-    tool execution logic (e.g., using ExecutionEngine, HTTP APIs, mocks, etc.).
-    """
+    """Protocol for tool execution strategies."""
 
     async def execute_tool(
         self,
         tool_name: str,
         arguments: dict[str, Any],
         user_context: UserContextModel | None = None,
-    ) -> Any:
-        """Execute a tool and return the result.
+    ) -> Any: ...
+
+
+@dataclass
+class ToolCallRecord:
+    id: str | None
+    tool: str
+    arguments: dict[str, Any]
+    result: Any | None = None
+    error: Any | None = None
+
 
-        Args:
-            tool_name: Name of the tool to execute
-            arguments: Arguments to pass to the tool
-            user_context: Optional user context for execution
+@dataclass
+class AgentResult:
+    answer: str
+    tool_calls: list[ToolCallRecord] = field(default_factory=list)
+    error: str | None = None  # Execution error if agent failed to produce an answer
 
-        Returns:
-            Result of tool execution
 
-        Raises:
-            Exception: If tool execution fails
-        """
-        ...
+class ProviderConfig(BaseModel):
+    api_key: str | None = None
+    base_url: str | None = None
+    timeout: int | None = None
+    model_config = {"extra": "forbid"}
+
+
+class GradeResult(BaseModel):
+    result: str = Field(default="unknown")
+    comment: str = Field(default="")
+    reasoning: str = Field(default="")
 
 
 class LLMExecutor:
-    """Core LLM executor focused on LLM orchestration and tool calling.
-
-    This class handles:
-    - LLM API interactions (Claude, OpenAI, etc.)
-    - Tool call extraction from LLM responses
-    - Multi-turn conversations with tool results
-    - Prompt formatting for different model types
-
-    Tool execution is delegated to an external ToolExecutor implementation,
-    making this class highly testable and reusable across different contexts.
-
-    Example usage:
-        >>> # Create tool definitions (metadata only)
-        >>> tools = [
-        ...     ToolDefinition(
-        ...         name="get_weather",
-        ...         description="Get current weather for a location",
-        ...         parameters=[
-        ...             ParameterDefinition(name="location", type="string", description="City name")
-        ...         ]
-        ...     )
-        ... ]
-        >>>
-        >>> # Create model config
-        >>> model = ClaudeConfig(name="claude-3-haiku", api_key="...")
-        >>>
-        >>> # Create tool executor (implemented by context)
-        >>> tool_executor = MyToolExecutor(...)
-        >>>
-        >>> # Create LLM executor
-        >>> executor = LLMExecutor(model, tools, tool_executor)
-        >>>
-        >>> # Execute a prompt
-        >>> response, tool_calls = await executor.execute_prompt(
-        ...     "What's the weather in Paris?",
-        ...     user_context=user_context
-        ... )
-    """
+    """Pydantic-based agent loop with tool support."""
 
     def __init__(
         self,
-        model_config: ModelConfigType,
+        model_name: str,
+        model_type: str,
+        model_settings: ModelSettings,
         available_tools: list[ToolDefinition],
         tool_executor: ToolExecutor,
+        provider_config: ProviderConfig | None = None,
+        system_prompt: str | None = None,
+        agent_retries: int = DEFAULT_AGENT_RETRIES,
     ):
-        """Initialize LLM executor.
-
-        Args:
-            model_config: Configuration for the LLM model (Claude, OpenAI, etc.)
-            available_tools: List of tool definitions available to the LLM
-            tool_executor: Implementation for executing tools
-        """
-        self.model_config = model_config
         self.available_tools = available_tools
         self.tool_executor = tool_executor
+        self.model_name = model_name
+        self.model_type = model_type
+        self.provider_config = provider_config or ProviderConfig()
+        self._agent_cls: Callable[..., Any] = Agent
+        self._model_settings = model_settings
+        self._tool_models = self._build_tool_models(available_tools)
+        self._tool_schemas: dict[str, dict[str, Any]] = {}
+        self.system_prompt = system_prompt or self._build_system_prompt(available_tools)
+        self._agent_retries = max(1, agent_retries)
+        self._model_reference = self._build_model_reference()
 
         logger.info(
-            f"LLM executor initialized with model: {model_config.name} ({model_config.get_type()})"
+            "LLM executor initialized with model %s (%s) and %d tools",
+            self.model_name,
+            self.model_type,
+            len(available_tools),
         )
-        logger.info(f"Available tools: {len(available_tools)}")
-
-    def _format_tools_for_prompt(self) -> str:
-        """Format all available tools for inclusion in the prompt."""
-        if not self.available_tools:
-            return "No tools available."
-
-        tool_sections = []
-        for tool in self.available_tools:
-            tool_sections.append(tool.to_prompt_format())
-
-        return "=== AVAILABLE TOOLS ===\n\n" + "\n\n".join(tool_sections)
-
-    def _get_model_prompt(
-        self, user_prompt: str, conversation_history: list[dict[str, str]] | None = None
-    ) -> str:
-        """Get model-specific prompt format"""
-        available_tools = self._format_tools_for_prompt()
-        model_type = self.model_config.get_type()
-
-        if model_type == "claude":
-            return self._get_claude_prompt(user_prompt, available_tools, conversation_history)
-        elif model_type == "openai":
-            return self._get_openai_prompt(user_prompt, available_tools, conversation_history)
-        else:
-            return self._get_default_prompt(user_prompt, available_tools, conversation_history)
-
-    def _get_claude_prompt(
-        self,
-        user_prompt: str,
-        available_tools: str,
-        conversation_history: list[dict[str, str]] | None = None,
-    ) -> str:
-        """Claude-specific prompt format"""
-        system_prompt = f"""You are a helpful assistant with access to the following tools:
-
-{available_tools}
-
-To use a tool, respond with a JSON object:
-{{"tool": "tool_name", "arguments": {{"param": "value"}}}}
-
-For multiple tool calls, use an array:
-[{{"tool": "tool1", "arguments": {{}}}}, {{"tool": "tool2", "arguments": {{}}}}]
-
-Only output JSON when calling tools. Otherwise respond with regular text."""
-
-        messages = []
-        if conversation_history:
-            for msg in conversation_history:
-                messages.append(f"{msg['role']}: {msg['content']}")
-        messages.append(f"Human: {user_prompt}")
-
-        return system_prompt + "\n\n" + "\n\n".join(messages)
-
-    def _get_openai_prompt(
-        self,
-        user_prompt: str,
-        available_tools: str,
-        conversation_history: list[dict[str, str]] | None = None,
-    ) -> str:
-        """OpenAI-specific prompt format"""
-        system_prompt = f"""You are a helpful assistant with access to the following tools:
-
-{available_tools}
-
-To use a tool, respond with a JSON object:
-{{"tool": "tool_name", "arguments": {{"param": "value"}}}}
-
-For multiple tool calls, use an array:
-[{{"tool": "tool1", "arguments": {{}}}}, {{"tool": "tool2", "arguments": {{}}}}]
-
-Only output JSON when calling tools. Otherwise respond with regular text."""
-
-        messages = []
-        if conversation_history:
-            for msg in conversation_history:
-                messages.append(f"{msg['role']}: {msg['content']}")
-        messages.append(f"User: {user_prompt}")
-
-        return system_prompt + "\n\n" + "\n\n".join(messages)
-
-    def _get_default_prompt(
-        self,
-        user_prompt: str,
-        available_tools: str,
-        conversation_history: list[dict[str, str]] | None = None,
-    ) -> str:
-        """Default prompt format"""
-        return self._get_claude_prompt(user_prompt, available_tools, conversation_history)
 
     async def execute_prompt(
-        self, prompt: str, user_context: UserContextModel | None = None
-    ) -> tuple[str, list[dict[str, Any]]]:
-        """Execute a prompt and return the response and tool calls made.
+        self, prompt: str, user_context: UserContextModel | None = None, max_turns: int = 20
+    ) -> AgentResult:
+        """Run the agent loop for a prompt using pydantic-ai Agent."""
+        history: list[ToolCallRecord] = []
+        # Local callable mapping for this execution (passed to agent factory for testing)
+        tool_callables: dict[str, Callable[..., Any]] = {}
+
+        def _make_tool(tool_def: ToolDefinition) -> Tool:
+            args_model = self._tool_models.get(tool_def.name)
+            schema = self._tool_schemas.get(tool_def.name)
+            if schema is None:
+                schema = (
+                    args_model.model_json_schema()
+                    if args_model
+                    else {"type": "object", "properties": {}, "required": []}
+                )
+                self._tool_schemas[tool_def.name] = schema
+
+            async def _fn(**kwargs: Any) -> Any:
+                if max_turns is not None and len(history) >= max_turns:
+                    error_msg = f"Maximum tool calls exceeded ({max_turns})"
+                    history.append(
+                        ToolCallRecord(
+                            id=None, tool=tool_def.name, arguments=kwargs, error=error_msg
+                        )
+                    )
+                    raise RuntimeError(error_msg)
 
-        Args:
-            prompt: The user prompt to execute
-            user_context: Optional user context for tool execution
+                record = ToolCallRecord(id=None, tool=tool_def.name, arguments=kwargs)
+                try:
+                    validated = (
+                        args_model.model_validate(kwargs).model_dump() if args_model else kwargs
+                    )
+                    record.arguments = validated
+                    result = await self.tool_executor.execute_tool(
+                        tool_def.name, validated, user_context
+                    )
+                    record.result = result
+                    return result
+                except ModelRetry as exc:
+                    error_response = self._build_tool_error_response(tool_def.name, exc.message)
+                    record.error = error_response
+                    raise
+                except Exception as exc:  # noqa: BLE001
+                    error_response = self._build_tool_error_response(tool_def.name, str(exc))
+                    record.error = error_response
+                    retry_message = self._format_tool_retry_message(error_response)
+                    raise ModelRetry(retry_message) from exc
+                finally:
+                    history.append(record)
+
+            async def _prepare(
+                _ctx: RunContext[Any], _tool_def: AgentToolDefinition
+            ) -> AgentToolDefinition:
+                return AgentToolDefinition(
+                    name=tool_def.name,
+                    description=tool_def.description,
+                    parameters_json_schema=schema,
+                    strict=True,
+                )
+
+            tool = Tool(
+                _fn,
+                name=tool_def.name,
+                description=tool_def.description,
+                prepare=_prepare,
+            )
+            tool_callables[tool_def.name] = _fn
+            return tool
 
-        Returns:
-            Tuple of (final_response, list_of_tool_calls_made)
-        """
-        conversation_history: list[dict[str, Any]] = []
-        tool_calls_made: list[dict[str, Any]] = []
-        max_iterations = 10  # Prevent infinite loops
+        agent_tools = [_make_tool(t) for t in self.available_tools]
 
-        for _iteration in range(max_iterations):
-            # Get model-specific prompt
-            full_prompt = self._get_model_prompt(prompt, conversation_history)
+        # Build agent kwargs - only pass _tool_callables for test agents (not real pydantic-ai Agent)
+        agent_kwargs: dict[str, Any] = {
+            "model": self._model_reference,
+            "instructions": self.system_prompt,
+            "tools": agent_tools,
+            "retries": self._agent_retries,
+        }
+        if self._agent_cls is not Agent:
+            # Test agent factory - pass tool callables for invocation
+            agent_kwargs["_tool_callables"] = tool_callables
 
-            # Call the LLM
-            response = await self._call_llm(full_prompt)
+        agent = self._agent_cls(**agent_kwargs)
 
-            # Check if response contains tool calls
-            tool_calls = self._extract_tool_calls(response)
+        try:
+            agent_run = await agent.run(
+                prompt, deps=user_context, model_settings=self._model_settings
+            )
 
-            if not tool_calls:
-                # No more tool calls, return final response
-                return response, tool_calls_made
+            answer = getattr(agent_run, "output", "")
 
-            # Execute tool calls
-            tool_results = []
-            for tool_call in tool_calls:
-                tool_calls_made.append(tool_call)
+            # Log detailed info about the result
+            logger.debug(
+                "Agent completed: answer_length=%d, tool_calls=%d, raw_output_type=%s",
+                len(str(answer)) if answer else 0,
+                len(history),
+                type(answer).__name__,
+            )
 
-                try:
-                    tool_name = tool_call["tool"]
-                    arguments = tool_call.get("arguments", {})
+            if not answer:
+                logger.warning(
+                    "Agent returned empty output after %d tool calls. "
+                    "Check conversation history above for details.",
+                    len(history),
+                )
+            return AgentResult(answer=str(answer), tool_calls=history)
+
+        except UnexpectedModelBehavior as exc:
+            error_msg = f"Agent exhausted retries ({self._agent_retries}): {exc}"
+            logger.error(
+                "Agent failed after exhausting retries (retries=%d, tool_calls=%d): %s",
+                self._agent_retries,
+                len(history),
+                exc,
+            )
+            if logger.isEnabledFor(logging.DEBUG):
+                logger.debug("Tool call history on failure: %s", [tc.tool for tc in history])
+            return AgentResult(answer="", tool_calls=history, error=error_msg)
+        except UsageLimitExceeded as exc:
+            error_msg = f"Usage limit exceeded: {exc}"
+            logger.error("Agent hit usage limit after %d tool calls: %s", len(history), exc)
+            return AgentResult(answer="", tool_calls=history, error=error_msg)
+        except RuntimeError as exc:
+            error_msg = f"Execution aborted: {exc}"
+            logger.error("LLM execution aborted after %d tool calls: %s", len(history), exc)
+            return AgentResult(answer="", tool_calls=history, error=error_msg)
+        except Exception as exc:
+            error_msg = f"Unexpected error ({type(exc).__name__}): {exc}"
+            logger.error(
+                "Unexpected error during agent execution after %d tool calls: %s: %s",
+                len(history),
+                type(exc).__name__,
+                exc,
+            )
+            return AgentResult(answer="", tool_calls=history, error=error_msg)
+
+    async def evaluate_expected_answer(self, answer: str, expected_answer: str) -> dict[str, str]:
+        """Ask the model to grade an answer against an expected value."""
+        logger.debug(
+            "Grading answer:\n  Candidate: %s\n  Expected: %s",
+            answer[:200] + "..." if len(answer) > 200 else answer,
+            expected_answer[:200] + "..." if len(expected_answer) > 200 else expected_answer,
+        )
 
-                    # Execute the tool using external executor
-                    result = await self.tool_executor.execute_tool(
-                        tool_name, arguments, user_context
-                    )
+        grader_system = (
+            "You check if the candidate answer CONTAINS the expected information.\n\n"
+            "GRADING RULES:\n"
+            "- 'correct': The expected fact(s) appear in the candidate answer. "
+            "Extra details, context, or longer explanations are FINE and do not affect the grade.\n"
+            "- 'partially correct': Only use when the expected answer has MULTIPLE facts and some are missing.\n"
+            "- 'wrong': The expected information is absent, contradicted, or the candidate says it's unavailable.\n\n"
+            "IMPORTANT: If the expected answer is a single value (e.g., a name, status, role) and that exact value "
+            "appears anywhere in the candidate answer, grade it as 'correct' regardless of surrounding text.\n\n"
+            'Return JSON: {"result": "correct|wrong|partially correct", "comment": "...", "reasoning": "..."}'
+        )
+        grader_prompt = (
+            "Compare the candidate answer to the expected answer (semantic match, not exact string).\n"
+            "Candidate answer:\n"
+            f"{answer}\n\n"
+            "Expected answer:\n"
+            f"{expected_answer}\n\n"
+            "Respond with JSON like "
+            '{"result":"correct|wrong|partially correct","comment":"short","reasoning":"short"}'
+        )
 
-                    tool_results.append({"tool": tool_name, "result": result})
+        agent = self._agent_cls(
+            model=self._model_reference,
+            instructions=grader_system,
+            tools=(),
+            output_type=GradeResult,
+            retries=self._agent_retries,
+        )
 
-                except Exception as e:
-                    tool_results.append({"tool": tool_call.get("tool", "unknown"), "error": str(e)})
+        try:
+            run = await agent.run(grader_prompt, model_settings=self._model_settings)
+            out: GradeResult = getattr(run, "output", GradeResult())
+            result = out.model_dump()
 
-            # Add tool results to conversation
-            conversation_history.append({"role": "assistant", "content": response})
-            conversation_history.append(
-                {"role": "system", "content": f"Tool results: {json.dumps(tool_results)}"}
+            logger.debug(
+                "Grading result: %s (comment: %s, reasoning: %s)",
+                result.get("result", "unknown"),
+                result.get("comment", ""),
+                result.get("reasoning", ""),
             )
 
-            # Continue conversation with tool results
-            prompt = "Please incorporate the tool results into your response."
+            return result
+        except Exception as exc:
+            logger.error("Grading failed with error: %s: %s", type(exc).__name__, exc)
+            return {"result": "unknown", "comment": f"Grading error: {exc}", "reasoning": ""}
+
+    def _build_system_prompt(self, tools: list[ToolDefinition]) -> str:
+        """Return the agent system prompt; tool guidance is added only when tools are given."""
+        if not tools:
+            return "You are an AI assistant. If no tools are suitable, answer directly."
+        guidance = (
+            "You are an AI assistant that uses tools to answer questions accurately. "
+            f"Available tools: {', '.join(entry.name for entry in tools)}.\n\n"
+            "IMPORTANT GUIDELINES:\n"
+            "1. If a tool call fails, READ THE ERROR MESSAGE CAREFULLY. "
+            "It often contains hints about what went wrong and how to fix it.\n"
+            "2. If you don't know the correct parameters (like field names or schema), "
+            "look for tools that can help you discover this information first.\n"
+            "3. Be persistent: try different approaches if one doesn't work.\n"
+            "4. YOU MUST ALWAYS PROVIDE A FINAL ANSWER. Even if tools fail, "
+            "provide the best answer you can with the information available, "
+            "or explain what information you were unable to retrieve."
+        )
+        return guidance
 
-        # If we reach here, we hit the max iterations
-        return response, tool_calls_made
+    def _build_tool_models(self, tools: list[ToolDefinition]) -> dict[str, type[BaseModel]]:
+        """Build one Pydantic argument model per tool, keyed by tool name, from its parameters."""
+        models: dict[str, type[BaseModel]] = {}
+        for tool in tools:
+            fields: dict[str, Any] = {}
+            for param in tool.parameters:
+                py_type = self._map_param_type(param.type)
+                field_kwargs: dict[str, Any] = {}
+                if param.description:
+                    field_kwargs["description"] = param.description
+                # Forward remaining schema keywords; "type" is dropped because py_type carries it.
+                if getattr(param, "schema", None):
+                    schema_extra = dict(param.schema or {})
+                    schema_extra.pop("type", None)
+                    if schema_extra:
+                        field_kwargs["json_schema_extra"] = schema_extra
+                # An explicit default wins; otherwise required -> Ellipsis, optional -> None.
+                if param.default is not None:
+                    default_value: Any = param.default
+                elif param.required:
+                    default_value = ...
+                else:
+                    # Optional without a default: widen the type so an explicit null validates.
+                    default_value, py_type = None, py_type | None
+                field_info = Field(default_value, **field_kwargs)
+                fields[param.name] = (py_type, field_info)
+            # The generated model is named after the tool: "<tool name>_Args".
+            models[tool.name] = create_model(f"{tool.name}_Args", **fields)
+        return models
+
+    def _build_tool_error_response(self, tool_name: str, error_message: str) -> dict[str, Any]:
+        """Assemble the structured payload describing a failed tool call plus recovery advice."""
+        recovery_hint = (
+            "This tool call failed. Read the error message carefully - it often "
+            "contains hints about what went wrong. Consider: (1) calling this tool "
+            "with corrected arguments, (2) using a different tool to discover the "
+            "correct parameters first, or (3) trying a different approach."
+        )
+        # Keys are inserted in the order: status, tool, error, suggestion.
+        payload: dict[str, Any] = {"status": "error", "tool": tool_name}
+        payload["error"] = error_message
+        payload["suggestion"] = recovery_hint
+        return payload
+
+    def _format_tool_retry_message(self, error_response: dict[str, Any]) -> str:
+        """Render a structured tool error as the message text handed to ModelRetry."""
+        failed_tool = error_response.get("tool", "unknown")
+        reason = error_response.get("error", "Unknown error")
+        headline = f"Tool '{failed_tool}' failed with error: {reason}"
+        hint = error_response.get("suggestion")
+        # The recovery hint is appended only when it is present and non-empty.
+        message = f"{headline}. {hint}" if hint else headline
+        return message
+
+    def _build_model_reference(self) -> ModelReference:
+        """Instantiate a model object for providers that support direct configuration."""
+        model_type = (self.model_type or "").lower()  # tolerate a missing or mixed-case type
+        provider_kwargs = self._provider_kwargs()
 
-    def _extract_tool_calls(self, response: str) -> list[dict[str, Any]]:
-        """Extract tool calls from LLM response"""
         try:
-            # Try to parse as JSON (single tool call)
-            tool_call = json.loads(response.strip())
-            if isinstance(tool_call, dict) and "tool" in tool_call:
-                return [tool_call]
-            elif isinstance(tool_call, list):
-                # Multiple tool calls
-                return [tc for tc in tool_call if isinstance(tc, dict) and "tool" in tc]
-        except json.JSONDecodeError:
-            pass
-
-        # If not pure JSON, look for JSON in the response
-
-        json_pattern = r'\{[^}]*"tool"[^}]*\}'
-        matches = re.findall(json_pattern, response)
-
-        tool_calls = []
-        for match in matches:
-            try:
-                tool_call = json.loads(match)
-                if "tool" in tool_call:
-                    tool_calls.append(tool_call)
-            except json.JSONDecodeError:
-                continue
-
-        return tool_calls
-
-    async def _call_llm(self, prompt: str) -> str:
-        """Call the actual LLM API using the configured model"""
-
-        # Log the full prompt in debug mode
-        logger.debug(f"=== LLM Request to {self.model_config.name} ===")
-        logger.debug(f"Full prompt:\n{prompt}")
-        logger.debug("=== End of prompt ===")
-
-        model_type = self.model_config.get_type()
-
-        if model_type == "claude":
-            return await self._call_claude(prompt)
-        elif model_type == "openai":
-            return await self._call_openai(prompt)
-        else:
-            raise ValueError(f"Unknown model type: {model_type}")
-
-    async def _call_claude(self, prompt: str) -> str:
-        """Call Claude API"""
-
-        async with httpx.AsyncClient() as client:
-            response = await client.post(
-                f"{self.model_config.base_url}/v1/messages",
-                headers={
-                    "x-api-key": self.model_config.api_key,
-                    "anthropic-version": "2023-06-01",
-                    "content-type": "application/json",
-                },
-                json={
-                    "model": self.model_config.name,
-                    "messages": [{"role": "user", "content": prompt}],
-                    "max_tokens": 4096,
-                },
-                timeout=self.model_config.timeout,
-            )
-
-            response.raise_for_status()
-            data = response.json()
-
-            # Log response in debug mode
-            logger.debug(f"=== LLM Response from {self.model_config.name} ===")
-            logger.debug(f"Response: {data['content'][0]['text'][:500]}...")  # First 500 chars
-            logger.debug("=== End of response ===")
-
-            return cast(str, data["content"][0]["text"])
-
-    async def _call_openai(self, prompt: str) -> str:
-        """Call OpenAI API"""
-
-        async with httpx.AsyncClient() as client:
-            response = await client.post(
-                f"{self.model_config.base_url}/chat/completions",
-                headers={
-                    "Authorization": f"Bearer {self.model_config.api_key}",
-                    "Content-Type": "application/json",
-                },
-                json={
-                    "model": self.model_config.name,
-                    "messages": [
-                        {"role": "system", "content": "You are a helpful assistant."},
-                        {"role": "user", "content": prompt},
-                    ],
-                    "max_tokens": 4096,
-                },
-                timeout=self.model_config.timeout,
+            if model_type in {"openai", "openai-chat"}:  # both names select the chat model class
+                return OpenAIChatModel(self.model_name, provider=OpenAIProvider(**provider_kwargs))
+            if model_type == "openai-responses":
+                return OpenAIResponsesModel(
+                    self.model_name, provider=OpenAIProvider(**provider_kwargs)
+                )
+            if model_type.startswith("anthropic"):  # prefix match: any "anthropic*" type name
+                return AnthropicModel(
+                    self.model_name, provider=AnthropicProvider(**provider_kwargs)
+                )
+        except Exception as exc:  # noqa: BLE001
+            logger.warning(
+                "Failed to build custom provider for model '%s' (%s): %s. Falling back to string reference.",
+                self.model_name,
+                self.model_type,
+                exc,
             )
 
-            response.raise_for_status()
-            data = response.json()
-
-            # Log response in debug mode
-            logger.debug(f"=== LLM Response from {self.model_config.name} ===")
-            logger.debug(
-                f"Response: {data['choices'][0]['message']['content'][:500]}..."
-            )  # First 500 chars
-            logger.debug("=== End of response ===")
-
-            return cast(str, data["choices"][0]["message"]["content"])
+        return f"{self.model_type}:{self.model_name}"  # no branch matched or construction failed
+
+    def _provider_kwargs(self) -> dict[str, Any]:
+        """Collect the configured connection settings (base_url, api_key) for the provider."""
+        kwargs: dict[str, Any] = {}
+        if self.provider_config.base_url:
+            kwargs["base_url"] = self.provider_config.base_url
+        if self.provider_config.api_key:
+            kwargs["api_key"] = self.provider_config.api_key
+        # timeout is omitted: the provider constructors take no such argument; it is a model setting.
+        return kwargs
+
+    def _map_param_type(self, param_type: str) -> Any:
+        """Translate a declared tool parameter type name into a Python type (Any if unknown)."""
+        normalized = param_type.lower()
+        alias_groups: tuple[tuple[tuple[str, ...], Any], ...] = (
+            (("string", "str", "text"), str),
+            (("integer", "int"), int),
+            (("number", "float", "double"), float),
+            (("boolean", "bool"), bool),
+            (("object", "map", "dict"), dict[str, Any]),
+            (("array", "list"), list[Any]),
+        )
+        resolved = next((target for names, target in alias_groups if normalized in names), None)
+        if resolved is None:
+            logger.warning("Unknown tool parameter type '%s'; defaulting to Any", param_type)
+            return Any
+        return resolved
diff --git a/src/mxcp/sdk/executor/plugins/python.py b/src/mxcp/sdk/executor/plugins/python.py
index 6308cf77..0ee1d1e8 100644
--- a/src/mxcp/sdk/executor/plugins/python.py
+++ b/src/mxcp/sdk/executor/plugins/python.py
@@ -411,7 +411,7 @@ async def execute(
                     return result
                 except (ImportError, SyntaxError) as e:
                     # These are executor-level errors that should be wrapped
-                    logger.error(f"Python execution failed: {e}")
+                    logger.debug(f"Python execution failed: {e}")
                     # Record error metrics
                     record_counter(
                         "mxcp.python.executions_total",
@@ -496,7 +496,7 @@ async def _execute_from_file(
             return await self._execute_function(func, params, context)
 
         except Exception as e:
-            logger.error(f"Failed to execute file {file_path}: {e}")
+            logger.debug(f"Failed to execute file {file_path}: {e}")
             raise
 
     async def _execute_inline(
@@ -566,7 +566,7 @@ async def _execute_inline(
             raise ValueError("No suitable function found in inline code")
 
         except Exception as e:
-            logger.error(f"Failed to execute inline code: {e}")
+            logger.debug(f"Failed to execute inline code: {e}")
             raise
 
     async def _execute_function(
@@ -606,5 +606,5 @@ def sync_function_wrapper() -> Any:
             return result
 
         except Exception as e:
-            logger.error(f"Function execution failed: {e}")
+            logger.debug(f"Function execution failed: {e}")
             raise
diff --git a/src/mxcp/server/core/config/models.py b/src/mxcp/server/core/config/models.py
index 98622b4c..e50a84dd 100644
--- a/src/mxcp/server/core/config/models.py
+++ b/src/mxcp/server/core/config/models.py
@@ -392,11 +392,12 @@ def _apply_defaults(self) -> UserAuthConfigModel:
 class UserModelConfigModel(BaseModel):
     model_config = ConfigDict(extra="forbid", frozen=True)
 
-    type: Literal["claude", "openai"]
+    type: Literal["anthropic", "openai"]  # provider family; the former "claude" value is rejected
     api_key: str | None = None
     base_url: str | None = None
     timeout: int | None = None
     max_retries: int | None = None
+    options: dict[str, Any] = Field(default_factory=dict)  # free-form per-model options, default {}
 
 
 class UserModelsConfigModel(BaseModel):
diff --git a/src/mxcp/server/definitions/endpoints/utils.py b/src/mxcp/server/definitions/endpoints/utils.py
index c9ca9cbf..ac2410c2 100644
--- a/src/mxcp/server/definitions/endpoints/utils.py
+++ b/src/mxcp/server/definitions/endpoints/utils.py
@@ -58,11 +58,7 @@ def get_endpoint_source_code(
         return source.code
 
     if source.file is not None:
-        source_path = Path(source.file)
-        if source_path.is_absolute():
-            full_path = repo_root / source_path.relative_to("/")
-        else:
-            full_path = endpoint_file_path.parent / source_path
+        full_path = resolve_file_path(source.file, endpoint_file_path, repo_root)
         return full_path.read_text()
     raise ValueError("No source code found in endpoint definition")
 
@@ -124,9 +120,24 @@ def resolve_file_path(file_path: str, endpoint_file_path: Path, repo_root: Path)
     """
     source_path = Path(file_path)
     if source_path.is_absolute():
-        return repo_root / source_path.relative_to("/")
+        return source_path
+
+    repo_candidate = (repo_root / source_path).resolve(strict=False)
+
+    endpoint_path = endpoint_file_path
+    if not endpoint_path.is_absolute():
+        endpoint_path = (repo_root / endpoint_path).resolve(strict=False)
     else:
-        return endpoint_file_path.parent / source_path
+        endpoint_path = endpoint_path.resolve(strict=False)
+    endpoint_candidate = (endpoint_path.parent / source_path).resolve(strict=False)
+
+    if repo_candidate.exists():
+        return repo_candidate
+    if endpoint_candidate.exists():
+        return endpoint_candidate
+
+    # Default to repo-relative path to keep behavior predictable even if file is missing
+    return repo_candidate
 
 
 def get_endpoint_name_or_uri(
@@ -193,14 +204,14 @@ def prepare_source_for_execution(
         if not tool_def:
             raise ValueError("No tool definition found")
         source = tool_def.source
-        language = tool_def.language
+        language = (source.language if source else None) or tool_def.language
         function_name = tool_def.name
     elif endpoint_type == "resource":
         resource_def = endpoint_definition.resource
         if not resource_def:
             raise ValueError("No resource definition found")
         source = resource_def.source
-        language = resource_def.language
+        language = (source.language if source else None) or resource_def.language
     else:
         raise ValueError("Prompts don't have source code")
 
@@ -236,4 +247,4 @@ def prepare_source_for_execution(
                 endpoint_definition, endpoint_type, endpoint_file_path, repo_root
             )
             return (language, source_code)
-    raise ValueError("No source code or file specified in endpoint definition")
+    raise ValueError("No source found for endpoint")
diff --git a/src/mxcp/server/definitions/evals/models.py b/src/mxcp/server/definitions/evals/models.py
index 78e489a0..cbed8dff 100644
--- a/src/mxcp/server/definitions/evals/models.py
+++ b/src/mxcp/server/definitions/evals/models.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from contextlib import suppress
-from typing import Any, Literal
+from typing import Any
 
 from pydantic import BaseModel, ConfigDict, field_validator, model_validator
 
@@ -22,6 +22,7 @@ class EvalAssertionsModel(EvalBaseModel):
     must_not_call: list[str] | None = None
     answer_contains: list[str] | None = None
     answer_not_contains: list[str] | None = None
+    expected_answer: str | None = None
 
 
 class EvalTestModel(EvalBaseModel):
@@ -49,7 +50,9 @@ class EvalSuiteModel(EvalBaseModel):
     mxcp: int = 1
     suite: str
     description: str | None = None
-    model: Literal["claude-4-opus", "claude-4-sonnet", "gpt-4o", "gpt-4.1"] | None = None
+    model: str | None = None
+    expected_answer_model: str | None = None
+    system_prompt: str | None = None
     tests: list[EvalTestModel]
 
     @field_validator("suite")
diff --git a/src/mxcp/server/executor/runners/tool.py b/src/mxcp/server/executor/runners/tool.py
index d1f4ae67..cfcd1eec 100644
--- a/src/mxcp/server/executor/runners/tool.py
+++ b/src/mxcp/server/executor/runners/tool.py
@@ -6,16 +6,25 @@
 """
 
 import logging
+from dataclasses import dataclass
+from pathlib import Path
 from typing import Any
 
 from mxcp.sdk.auth import UserContextModel
 from mxcp.sdk.executor import ExecutionContext, ExecutionEngine
+from mxcp.server.core.config.site_config import find_repo_root
 from mxcp.server.definitions.endpoints.models import EndpointDefinitionModel
-from mxcp.server.definitions.endpoints.utils import detect_language_from_source, extract_source_info
+from mxcp.server.definitions.endpoints.utils import prepare_source_for_execution
 
 logger = logging.getLogger(__name__)
 
 
+@dataclass(frozen=True)
+class EndpointWithPath:  # immutable pairing of an endpoint definition with a filesystem path
+    definition: EndpointDefinitionModel  # the endpoint definition (tool or resource)
+    path: Path  # handed to prepare_source_for_execution as the endpoint file path
+
+
 class EndpointToolExecutor:
     """Tool executor that executes tools via SDK ExecutionEngine and endpoints.
 
@@ -41,7 +50,7 @@ class EndpointToolExecutor:
         >>> llm_executor = LLMExecutor(model_config, tool_definitions, tool_executor)
     """
 
-    def __init__(self, engine: ExecutionEngine, endpoints: list[EndpointDefinitionModel]):
+    def __init__(self, engine: ExecutionEngine, endpoints: list[EndpointWithPath]):
         """Initialize the endpoint tool executor.
 
         Args:
@@ -49,15 +58,16 @@ def __init__(self, engine: ExecutionEngine, endpoints: list[EndpointDefinitionMo
             endpoints: List of endpoint definitions
         """
         self.engine = engine
-        self.endpoints = endpoints
+        self.endpoints = [entry.definition for entry in endpoints]
 
         # Create lookup map for faster tool resolution
-        self._tool_map: dict[str, EndpointDefinitionModel] = {}
-        for endpoint_def in endpoints:
+        self._tool_map: dict[str, tuple[EndpointDefinitionModel, Path]] = {}
+        for entry in endpoints:
+            endpoint_def, path = entry.definition, entry.path
             if endpoint_def.tool:
-                self._tool_map[endpoint_def.tool.name] = endpoint_def
+                self._tool_map[endpoint_def.tool.name] = (endpoint_def, path)
             elif endpoint_def.resource:
-                self._tool_map[endpoint_def.resource.uri] = endpoint_def
+                self._tool_map[endpoint_def.resource.uri] = (endpoint_def, path)
 
         logger.info(f"EndpointToolExecutor initialized with {len(endpoints)} endpoints")
 
@@ -82,60 +92,43 @@ async def execute_tool(
             Exception: If execution fails
         """
         # Find the endpoint
-        endpoint_def = self._tool_map.get(tool_name)
-        if not endpoint_def:
+        entry = self._tool_map.get(tool_name)
+        if not entry:
             available_tools = list(self._tool_map.keys())
             raise ValueError(f"Tool '{tool_name}' not found. Available tools: {available_tools}")
+        endpoint_def, endpoint_path = entry
 
         # Create execution context
         context = ExecutionContext(user_context=user_context)
 
         # Determine the source code and language
-        source_info = self._get_source_code(endpoint_def, tool_name)
-        language = self._get_language(endpoint_def, tool_name, source_info)
+        if endpoint_def.tool:
+            endpoint_type = "tool"
+        elif endpoint_def.resource:
+            endpoint_type = "resource"
+        else:
+            raise ValueError(f"Endpoint '{tool_name}' has no tool or resource definition")
+
+        repo_root = find_repo_root()
+        language, source_payload = prepare_source_for_execution(
+            endpoint_def,
+            endpoint_type,
+            endpoint_path,
+            repo_root,
+            include_function_name=True,
+        )
 
         logger.debug(f"Executing tool '{tool_name}' with language '{language}'")
 
         try:
             # Execute using the SDK engine
             result = await self.engine.execute(
-                language=language, source_code=source_info, params=arguments, context=context
+                language=language, source_code=source_payload, params=arguments, context=context
             )
 
             logger.debug(f"Tool '{tool_name}' executed successfully")
             return result
 
         except Exception as e:
-            logger.error(f"Tool '{tool_name}' execution failed: {e}")
+            logger.debug(f"Tool '{tool_name}' execution failed: {e}")
             raise
-
-    def _get_source_code(self, endpoint_def: EndpointDefinitionModel, tool_name: str) -> str:
-        """Extract source code from endpoint definition."""
-        # Get the tool or resource definition
-        source = None
-        if endpoint_def.tool:
-            source = endpoint_def.tool.source
-        elif endpoint_def.resource:
-            source = endpoint_def.resource.source
-
-        if not source:
-            raise ValueError(f"No source found for endpoint '{tool_name}'")
-
-        source_type, source_value = extract_source_info(source)
-        return source_value
-
-    def _get_language(
-        self, endpoint_def: EndpointDefinitionModel, tool_name: str, source_info: str
-    ) -> str:
-        """Determine the programming language for the endpoint."""
-        # Get the tool or resource definition
-        source = None
-        if endpoint_def.tool:
-            source = endpoint_def.tool.source
-        elif endpoint_def.resource:
-            source = endpoint_def.resource.source
-
-        if not source:
-            raise ValueError(f"No source found for endpoint '{tool_name}'")
-
-        return detect_language_from_source(source, source_info)
diff --git a/src/mxcp/server/interfaces/cli/evals.py b/src/mxcp/server/interfaces/cli/evals.py
index 5d944843..1ad0dad4 100644
--- a/src/mxcp/server/interfaces/cli/evals.py
+++ b/src/mxcp/server/interfaces/cli/evals.py
@@ -1,4 +1,7 @@
 import json
+import logging
+import re
+import sys
 import time
 from pathlib import Path
 from typing import Any
@@ -18,6 +21,93 @@
 )
 from mxcp.server.services.evals import run_all_evals, run_eval_suite
 
+_NOISY_EVAL_LOGGERS = (  # third-party loggers that _suppress_noisy_eval_logs caps at WARNING
+    "openai",
+    "openai._base_client",
+    "openai._client",
+    "openai._streaming",
+    "httpx",
+    "httpcore",
+    "httpcore.connection",
+    "httpcore.connectionpool",
+    "urllib3",
+    "urllib3.connectionpool",
+)
+# Matches ANSI CSI escape sequences so styling can be stripped before inspecting message text
+_ANSI_ESCAPE = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]")
+
+# Cursor control: move to the start of the previous line (ESC[F), then erase that line (ESC[2K)
+_ANSI_MOVE_UP_CLEAR = "\033[F\033[2K"
+
+# Characters that indicate a final (completed) progress message
+_FINAL_INDICATORS = ("✓", "✗")
+
+
+class ProgressRenderer:
+    """Live progress display: in-progress lines are redrawn in place on a TTY; results persist."""
+
+    def __init__(self, is_tty: bool = True) -> None:
+        self._lines: dict[str, str] = {}  # latest in-progress message per key
+        self._order: list[str] = []  # keys in first-seen order; this is the draw order
+        self._lines_printed: int = 0  # terminal lines currently occupied by the progress area
+        self._is_tty = is_tty
+
+    def clear_all(self) -> None:
+        """Erase the drawn progress area; a no-op off-TTY or when nothing is currently drawn."""
+        if not self._is_tty or self._lines_printed == 0:
+            return
+        for _ in range(self._lines_printed):
+            sys.stdout.write(_ANSI_MOVE_UP_CLEAR)
+        sys.stdout.flush()
+        self._lines_printed = 0
+
+    def _render(self) -> None:
+        """Draw every in-progress message and record how many terminal lines were written."""
+        if not self._is_tty or not self._order:
+            return
+        rendered = [self._lines.get(key, "") for key in self._order]
+        sys.stdout.write("".join(line + "\n" for line in rendered))
+        sys.stdout.flush()
+        # Count embedded newlines too, so clear_all() fully erases multi-line messages.
+        self._lines_printed = sum(line.count("\n") + 1 for line in rendered)
+
+    def _is_final_message(self, msg: str) -> bool:
+        """Return True when msg, minus ANSI styling and leading space, starts with a final mark."""
+        clean = _ANSI_ESCAPE.sub("", msg).lstrip()
+        return any(clean.startswith(indicator) for indicator in _FINAL_INDICATORS)
+
+    def update(self, key: str, msg: str) -> None:
+        """Update progress for a key. Final messages are printed permanently."""
+        if not self._is_tty:
+            click.echo(msg)  # no cursor control off-TTY: every message is simply printed
+            return
+
+        if self._is_final_message(msg):
+            # Clear progress, print final result, re-render remaining
+            self.clear_all()
+            self._order = [k for k in self._order if k != key]
+            self._lines.pop(key, None)
+            click.echo(msg)
+            self._render()
+        else:
+            # Update or add progress line
+            self.clear_all()
+            if key not in self._order:
+                self._order.append(key)
+            self._lines[key] = msg
+            self._render()
+
+
+def _suppress_noisy_eval_logs(debug: bool) -> None:
+    """Raise known-chatty third-party loggers to WARNING; does nothing when debug is enabled."""
+    if not debug:
+        # Each listed logger is capped at WARNING while still propagating to parent handlers,
+        # so warnings and errors from these libraries remain visible.
+        for logger_name in _NOISY_EVAL_LOGGERS:
+            third_party = logging.getLogger(logger_name)
+            third_party.setLevel(logging.WARNING)
+            third_party.propagate = True
+
 
 def format_eval_results(results: dict[str, Any], debug: bool = False) -> str:
     """Format eval results for human-readable output"""
@@ -65,7 +155,14 @@ def format_eval_results(results: dict[str, Any], debug: bool = False) -> str:
 
                 failures = test.get("failures", [])
                 for failure in failures:
-                    output.append(f"    {click.style('💡', fg='yellow')} {failure}")
+                    lines = failure.splitlines()
+                    if not lines:
+                        continue
+                    indent = " " * 4
+                    continuation_indent = indent + " " * 3
+                    output.append(f"{indent}{click.style('💡', fg='yellow')} {lines[0]}")
+                    for line in lines[1:]:
+                        output.append(f"{continuation_indent}{line}")
 
                 if debug and "details" in test:
                     output.append(f"    {click.style('Debug info:', fg='yellow')}")
@@ -173,7 +270,14 @@ def format_eval_results(results: dict[str, Any], debug: bool = False) -> str:
                                 f"      {click.style('Error:', fg='red')} {test['error']}"
                             )
                         for failure in test.get("failures", []):
-                            output.append(f"      {click.style('💡', fg='yellow')} {failure}")
+                            lines = failure.splitlines()
+                            if not lines:
+                                continue
+                            indent = " " * 6
+                            continuation_indent = indent + " " * 3
+                            output.append(f"{indent}{click.style('💡', fg='yellow')} {lines[0]}")
+                            for line in lines[1:]:
+                                output.append(f"{continuation_indent}{line}")
 
         # Show passed suites
         passed = [s for s in suites if s.get("status") == "passed"]
@@ -268,6 +372,7 @@ def evals(
 
         # Configure logging
         configure_logging_from_config(user_config=user_config, debug=debug)
+        _suppress_noisy_eval_logs(debug)
         # Run async implementation
         run_async_cli(
             _evals_impl(
@@ -339,6 +444,11 @@ async def _evals_impl(
 
     # Run evals
     start_time = time.time()
+
+    # Create progress renderer for TTY-aware output
+    is_tty = click.get_text_stream("stdout").isatty()
+    progress = ProgressRenderer(is_tty=is_tty)
+
     if suite_name:
         results = await run_eval_suite(
             suite_name,
@@ -347,6 +457,7 @@ async def _evals_impl(
             profile,
             cli_user_context=cli_user_context,
             override_model=model,
+            progress_callback=progress.update,
         )
     else:
         results = await run_all_evals(
@@ -355,6 +466,7 @@ async def _evals_impl(
             profile,
             cli_user_context=cli_user_context,
             override_model=model,
+            progress_callback=progress.update,
         )
     elapsed_time = time.time() - start_time
     results["elapsed_time"] = elapsed_time
@@ -362,6 +474,7 @@ async def _evals_impl(
     if json_output:
         output_result(results, json_output, debug)
     else:
+        progress.clear_all()
         click.echo(format_eval_results(results, debug))
 
     # Exit with error code if any tests failed
diff --git a/src/mxcp/server/services/evals/service.py b/src/mxcp/server/services/evals/service.py
index 43870b33..0d5b2468 100644
--- a/src/mxcp/server/services/evals/service.py
+++ b/src/mxcp/server/services/evals/service.py
@@ -1,37 +1,36 @@
 import logging
 import time
+from collections.abc import Callable
 from typing import Any
 
+import click
+from pydantic_ai import ModelSettings
+
 from mxcp.sdk.auth import UserContextModel
-from mxcp.sdk.evals import (
-    ClaudeConfig,
-    LLMExecutor,
-    ModelConfigType,
-    OpenAIConfig,
-    ParameterDefinition,
-    ToolDefinition,
-)
+from mxcp.sdk.evals import LLMExecutor, ParameterDefinition, ProviderConfig, ToolDefinition
 from mxcp.sdk.validator import TypeSchemaModel
 from mxcp.server.core.config.models import SiteConfigModel, UserConfigModel
 from mxcp.server.core.config.site_config import find_repo_root
 from mxcp.server.definitions.endpoints.loader import EndpointLoader
-from mxcp.server.definitions.endpoints.models import EndpointDefinitionModel
+from mxcp.server.definitions.endpoints.models import ParamDefinitionModel, TypeDefinitionModel
 from mxcp.server.definitions.evals.loader import discover_eval_files, load_eval_suite
 from mxcp.server.executor.engine import create_runtime_environment
-from mxcp.server.executor.runners.tool import EndpointToolExecutor
+from mxcp.server.executor.runners.tool import EndpointToolExecutor, EndpointWithPath
 
 logger = logging.getLogger(__name__)
 
 
-def _create_model_config(model: str, user_config: UserConfigModel) -> ModelConfigType:
-    """Create a model configuration from user config.
+def _create_model_config(
+    model: str, user_config: UserConfigModel
+) -> tuple[str, str, dict[str, Any], ProviderConfig]:
+    """Create a model configuration tuple from user config.
 
     Args:
         model: Model name to use
         user_config: User configuration containing model settings
 
     Returns:
-        Configured model object
+        Tuple of (model_name, model_type, options, provider_config)
 
     Raises:
         ValueError: If model is not configured or has invalid type
@@ -46,45 +45,171 @@ def _create_model_config(model: str, user_config: UserConfigModel) -> ModelConfi
 
     model_type = model_config.type
     api_key = model_config.api_key
+    options = dict(model_config.options or {})
+    api_mode = options.get("api") or options.get("endpoint")
 
     if not api_key:
         raise ValueError(f"No API key configured for model '{model}'")
 
-    if model_type == "claude":
-        base_url = model_config.base_url or "https://api.anthropic.com"
-        timeout = model_config.timeout or 30
-        return ClaudeConfig(name=model, api_key=api_key, base_url=base_url, timeout=timeout)
-    elif model_type == "openai":
-        base_url = model_config.base_url or "https://api.openai.com/v1"
-        timeout = model_config.timeout or 30
-        return OpenAIConfig(name=model, api_key=api_key, base_url=base_url, timeout=timeout)
-    else:
+    if model_type not in {"anthropic", "openai"}:
         raise ValueError(f"Unknown model type: {model_type}")
 
-
-def _load_endpoints(site_config: SiteConfigModel) -> list[EndpointDefinitionModel]:
+    effective_model_type = (
+        "openai-responses" if model_type == "openai" and api_mode == "responses" else model_type
+    )
+
+    base_url = model_config.base_url
+    timeout = model_config.timeout
+
+    # Ensure timeout also flows through options if present
+    if timeout and "timeout" not in options:
+        options["timeout"] = timeout
+
+    provider_config = ProviderConfig(api_key=api_key, base_url=base_url, timeout=timeout)
+
+    return model, effective_model_type, options, provider_config
+
+
+def _build_model_settings(
+    model_name: str, model_type: str, model_options: dict[str, Any], allowed_keys: set[str]
+) -> ModelSettings:
+    """Translate user-configured model options into pydantic-ai ``ModelSettings``.
+
+    Keys listed in ``allowed_keys`` pass through unchanged; ``body:<key>`` and
+    ``header:<key>`` entries are merged into ``extra_body`` / ``extra_headers``.
+    Any other key is dropped with a warning. ``api``/``endpoint`` are stripped
+    first, ``max_tokens`` defaults to 10_000, and ``model_type`` is unused here.
+    """
+    candidates = {k: v for k, v in model_options.items() if k not in ("api", "endpoint")}
+    settings = {k: v for k, v in candidates.items() if k in allowed_keys}
+    extra_body: dict[str, Any] = dict(settings.get("extra_body") or {})
+    extra_headers: dict[str, str] = dict(settings.get("extra_headers") or {})
+    unknown: list[str] = []
+
+    for name, raw in candidates.items():
+        if name in allowed_keys:
+            continue
+        prefix, sep, suffix = name.partition(":")
+        if sep and prefix == "body":
+            extra_body[suffix] = raw
+        elif sep and prefix == "header":
+            # A list value collapses into one comma-separated header string.
+            parts = raw if isinstance(raw, list) else [raw]
+            extra_headers[suffix] = ",".join(str(part) for part in parts)
+        else:
+            unknown.append(name)
+
+    if unknown:
+        logger.warning(
+            "Ignoring unprefixed model options for model '%s': %s. "
+            "Use 'body:<key>' or 'header:<key>' prefixes.",
+            model_name,
+            sorted(unknown),
+        )
+
+    if extra_body:
+        settings["extra_body"] = extra_body
+    if extra_headers:
+        settings["extra_headers"] = extra_headers
+    settings.setdefault("max_tokens", 10_000)
+
+    return ModelSettings(**settings)  # type: ignore[typeddict-item,no-any-return]
+
+
+def _type_definition_to_schema(type_definition: TypeDefinitionModel) -> dict[str, Any]:
+    """Render a type definition as a JSON-Schema-style dictionary.
+
+    Common annotations are copied first, then only the constraints that belong
+    to the declared ``type``. Nested ``items`` and ``properties`` are converted
+    recursively; an array without ``items`` falls back to string items.
+
+    Args:
+        type_definition: Type definition model to convert
+
+    Returns:
+        Schema dictionary that always contains a ``type`` key
+    """
+    kind = type_definition.type
+    schema: dict[str, Any] = {"type": kind}
+
+    def emit(*names: str, truthy: bool = False, as_list: bool = False) -> None:
+        # Copy each named attribute into the schema under the same key. With
+        # ``truthy``/``as_list`` empty values are skipped, otherwise only ``None``.
+        for name in names:
+            value = getattr(type_definition, name)
+            if value is None or ((truthy or as_list) and not value):
+                continue
+            schema[name] = list(value) if as_list else value
+
+    # Annotations shared by every type
+    emit("description", truthy=True)
+    emit("default")
+    emit("enum", "examples", as_list=True)
+
+    # Constraints specific to the declared type
+    match kind:
+        case "string":
+            emit("format", truthy=True)
+            emit("minLength", "maxLength")
+            emit("pattern", truthy=True)
+        case "number" | "integer":
+            emit("minimum", "maximum", "exclusiveMinimum", "exclusiveMaximum", "multipleOf")
+        case "array":
+            element = type_definition.items
+            if element is not None:
+                schema["items"] = _type_definition_to_schema(element)
+            else:
+                schema["items"] = {"type": "string"}
+            emit("minItems", "maxItems", "uniqueItems")
+        case "object":
+            fields = type_definition.properties
+            if fields:
+                schema["properties"] = {
+                    key: _type_definition_to_schema(value) for key, value in fields.items()
+                }
+            emit("required", as_list=True)
+            emit("additionalProperties")
+
+    return schema
+
+
+def _parameter_definition_from_model(param: ParamDefinitionModel) -> ParameterDefinition:
+    """Convert an endpoint parameter model into an LLM-facing ``ParameterDefinition``."""
+    optional = "default" in param.model_fields_set  # explicit default => not required
+    json_schema = {k: v for k, v in _type_definition_to_schema(param).items() if k != "name"}
+    return ParameterDefinition(
+        name=param.name,
+        type=param.type,
+        description=param.description or "",
+        default=param.default if optional else None,
+        required=not optional,
+        schema=json_schema or None,
+    )
+
+
+def _load_endpoints(site_config: SiteConfigModel) -> list[EndpointWithPath]:
     """Load all available endpoints.
 
     Args:
         site_config: Site configuration for endpoint discovery
 
     Returns:
-        List of endpoint definitions
+        List of EndpointWithPath entries (endpoint definition paired with its file path)
     """
     loader = EndpointLoader(site_config)
-    endpoints: list[EndpointDefinitionModel] = []
+    endpoints: list[EndpointWithPath] = []
     discovered = loader.discover_endpoints()
 
-    for _path, endpoint_def, error in discovered:
+    for path, endpoint_def, error in discovered:
         if error is None and endpoint_def and (endpoint_def.tool or endpoint_def.resource):
             # Only include endpoints that have a tool or resource definition
-            endpoints.append(endpoint_def)
+            endpoints.append(EndpointWithPath(endpoint_def, path))
 
     return endpoints
 
 
 def _convert_endpoints_to_tool_definitions(
-    endpoints: list[EndpointDefinitionModel],
+    endpoints: list[EndpointWithPath],
 ) -> list[ToolDefinition]:
     """Convert endpoint definitions to ToolDefinition objects for the LLM.
 
@@ -96,22 +221,14 @@ def _convert_endpoints_to_tool_definitions(
     """
     tool_definitions = []
 
-    for endpoint_def in endpoints:
+    for entry in endpoints:
+        endpoint_def = entry.definition
         if endpoint_def.tool:
             tool = endpoint_def.tool
 
-            tool_parameters = []
-            for param in tool.parameters or []:
-                has_default = "default" in param.model_fields_set
-                tool_parameters.append(
-                    ParameterDefinition(
-                        name=param.name,
-                        type=param.type,
-                        description=param.description or "",
-                        default=param.default if has_default else None,
-                        required=not has_default,
-                    )
-                )
+            tool_parameters = [
+                _parameter_definition_from_model(param) for param in (tool.parameters or [])
+            ]
 
             return_type = None
             if tool.return_:
@@ -138,18 +255,9 @@ def _convert_endpoints_to_tool_definitions(
 
         elif endpoint_def.resource:
             resource = endpoint_def.resource
-            resource_parameters = []
-            for param in resource.parameters or []:
-                has_default = "default" in param.model_fields_set
-                resource_parameters.append(
-                    ParameterDefinition(
-                        name=param.name,
-                        type=param.type,
-                        description=param.description or "",
-                        default=param.default if has_default else None,
-                        required=not has_default,
-                    )
-                )
+            resource_parameters = [
+                _parameter_definition_from_model(param) for param in (resource.parameters or [])
+            ]
 
             return_type = None
             if resource.return_:
@@ -171,6 +279,132 @@ def _convert_endpoints_to_tool_definitions(
     return tool_definitions
 
 
+def _format_expected_answer_failure(
+    response: str,
+    expected: str,
+    grade: str | None,
+    comment: str | None,
+    reasoning: str | None,
+) -> str:
+    """Render the expected-answer failure report as one ``Label: value`` line per field."""
+    fields = (
+        ("LLM Answer", response),
+        ("Expected", expected),
+        ("Grade", grade or "unknown"),
+        ("Comment", comment or "n/a"),
+        ("Reasoning", reasoning or "n/a"),
+    )
+    return "\n".join(f"{label}: {value}" for label, value in fields)
+
+
+async def _evaluate_test_assertions(
+    test: Any,
+    response: str,
+    tool_calls: list[Any],
+    execution_error: str | None,
+    grader: LLMExecutor,
+) -> tuple[list[str], dict[str, Any] | None]:
+    """Evaluate all assertions for a test.
+
+    Args:
+        test: Eval test; its ``name`` and ``assertions`` are read
+        response: Final answer text produced by the agent
+        tool_calls: Recorded calls exposing ``tool``, ``arguments`` and ``error``
+        execution_error: Agent-level error message, reported as a failure if set
+        grader: Executor used to grade the ``expected_answer`` assertion
+
+    Returns:
+        Tuple of (failures list, expected_answer_evaluation dict or None)
+    """
+    failures: list[str] = []
+    evaluation: dict[str, Any] | None = None
+    assertions = test.assertions
+
+    # If the agent failed to execute, report it clearly
+    if execution_error:
+        failures.append(f"Agent execution failed: {execution_error}")
+
+    # Tool errors are only logged; they do not fail the test by themselves
+    for call in tool_calls:
+        if call.error:
+            logger.debug(
+                "Tool '%s' failed during test '%s': %s",
+                call.tool,
+                test.name,
+                call.error,
+            )
+
+    # Check must_call assertions: expected args must be a subset of the actual args
+    for expected_call in assertions.must_call or []:
+        expected_tool = expected_call.tool
+        expected_args = expected_call.args or {}
+        found = False
+        for call in tool_calls:
+            if call.tool != expected_tool:
+                continue
+            actual_args = call.arguments or {}
+            if all(actual_args.get(k) == v for k, v in expected_args.items()):
+                found = True
+                break
+
+        if not found:
+            failures.append(
+                f"Expected call to '{expected_tool}' with args {expected_args} not found"
+            )
+
+    # Check must_not_call assertions
+    for forbidden_tool in assertions.must_not_call or []:
+        if any(call.tool == forbidden_tool for call in tool_calls):
+            failures.append(f"Tool '{forbidden_tool}' was called but should not have been")
+
+    # Check answer_contains / answer_not_contains (case-insensitive substring match)
+    if assertions.answer_contains or assertions.answer_not_contains:
+        # Lower-case the response once instead of once per assertion
+        response_lower = response.lower()
+        for expected_text in assertions.answer_contains or []:
+            if expected_text.lower() not in response_lower:
+                failures.append(f"Expected text '{expected_text}' not found in response")
+        for forbidden_text in assertions.answer_not_contains or []:
+            if forbidden_text.lower() in response_lower:
+                failures.append(f"Forbidden text '{forbidden_text}' found in response")
+
+    if assertions.expected_answer:
+        expected = assertions.expected_answer
+        logger.debug(
+            "Evaluating expected_answer assertion for test '%s': response_length=%d, expected='%s'",
+            test.name,
+            len(response),
+            expected[:100] + "..." if len(expected) > 100 else expected,
+        )
+        evaluation = await grader.evaluate_expected_answer(response, expected)
+        grade = (evaluation.get("result") or "").lower()
+        comment = evaluation.get("comment") or "Model answer did not match expected"
+        reasoning = evaluation.get("reasoning") or ""
+
+        logger.debug(
+            "Expected answer evaluation for '%s': grade=%s, response='%s'",
+            test.name,
+            grade,
+            response[:150] + "..." if len(response) > 150 else response,
+        )
+
+        if grade != "correct":
+            logger.info(
+                "Test '%s' failed expected_answer check: grade=%s, comment=%s",
+                test.name,
+                grade,
+                comment,
+            )
+            # Build the detail block only for failures; it is unused on success
+            failures.append(
+                _format_expected_answer_failure(
+                    response, expected, grade or "unknown", comment, reasoning
+                )
+            )
+
+    return failures, evaluation
+
+
 async def run_eval_suite(
     suite_name: str,
     user_config: UserConfigModel,
@@ -178,6 +412,8 @@ async def run_eval_suite(
     profile: str | None,
     cli_user_context: UserContextModel | None = None,
     override_model: str | None = None,
+    progress_callback: Callable[[str, str], None] | None = None,
+    expected_answer_model: str | None = None,
 ) -> dict[str, Any]:
     """Run a specific eval suite by name.
 
@@ -210,9 +446,16 @@ async def run_eval_suite(
             "error": "No model specified. Set 'model' in eval suite or configure a default model.",
             "suite": suite_name,
         }
+    grading_model = expected_answer_model or getattr(eval_suite, "expected_answer_model", None)
 
     # Create model configuration
-    model_config = _create_model_config(model, user_config)
+    model_name, model_type, model_options, provider_config = _create_model_config(
+        model, user_config
+    )
+    allowed_keys = set(ModelSettings.__annotations__.keys())
+    model_opts = dict(model_options)
+
+    model_settings = _build_model_settings(model_name, model_type, model_opts, allowed_keys)
 
     # Load endpoints
     endpoints = _load_endpoints(site_config)
@@ -226,20 +469,56 @@ async def run_eval_suite(
 
     # Create tool executor that bridges LLM calls to endpoint execution
     tool_executor = EndpointToolExecutor(engine, endpoints)
+    grading_executor: LLMExecutor | None = None
+
+    if grading_model:
+        grade_model_name, grade_model_type, grade_opts, grade_provider = _create_model_config(
+            grading_model, user_config
+        )
+        grade_settings = _build_model_settings(
+            grade_model_name, grade_model_type, dict(grade_opts), allowed_keys
+        )
+        grading_executor = LLMExecutor(
+            grade_model_name,
+            grade_model_type,
+            grade_settings,
+            [],  # no tools needed for grading
+            tool_executor,
+            provider_config=grade_provider,
+        )
 
     logger.info(f"Running eval suite: {suite_name} from {file_path}")
     logger.info(f"Suite description: {eval_suite.description or 'No description'}")
     logger.info(f"Model: {model}")
     logger.info(f"Number of tests: {len(eval_suite.tests)}")
 
+    total_tests = len(eval_suite.tests)
+
     try:
         # Create LLM executor with model config, tool definitions, and tool executor
-        executor = LLMExecutor(model_config, tool_definitions, tool_executor)
+        executor = LLMExecutor(
+            model_name,
+            model_type,
+            model_settings,
+            tool_definitions,
+            tool_executor,
+            provider_config=provider_config,
+            system_prompt=eval_suite.system_prompt,
+        )
 
         # Run each test
         tests = []
-        for test in eval_suite.tests:
+        for idx, test in enumerate(eval_suite.tests, start=1):
             test_start = time.time()
+            if progress_callback:
+                progress_callback(
+                    f"test:{suite_name}:{idx}",
+                    "  ⏳ "
+                    + click.style(
+                        f"[{suite_name}] {idx}/{total_tests} • {test.name}...",
+                        fg="cyan",
+                    ),
+                )
 
             # Determine user context for this test
             test_user_context = cli_user_context
@@ -257,67 +536,60 @@ async def run_eval_suite(
 
             try:
                 # Execute the prompt
-                response, tool_calls = await executor.execute_prompt(
+                agent_result = await executor.execute_prompt(
                     test.prompt, user_context=test_user_context
                 )
 
+                response = agent_result.answer
+                tool_calls = agent_result.tool_calls
+                execution_error = agent_result.error
+
                 # Evaluate assertions
-                failures = []
-                assertions = test.assertions
-
-                # Check must_call assertions
-                if assertions.must_call:
-                    for expected_call in assertions.must_call:
-                        expected_tool = expected_call.tool
-                        expected_args = expected_call.args or {}
-
-                        found = False
-                        for call in tool_calls:
-                            if call["tool"] == expected_tool:
-                                actual_args = call.get("arguments", {})
-                                if all(actual_args.get(k) == v for k, v in expected_args.items()):
-                                    found = True
-                                    break
-
-                        if not found:
-                            failures.append(
-                                f"Expected call to '{expected_tool}' with args {expected_args} not found"
-                            )
-
-                # Check must_not_call assertions
-                if assertions.must_not_call:
-                    for forbidden_tool in assertions.must_not_call:
-                        if any(call["tool"] == forbidden_tool for call in tool_calls):
-                            failures.append(
-                                f"Tool '{forbidden_tool}' was called but should not have been"
-                            )
-
-                # Check answer_contains assertions
-                if assertions.answer_contains:
-                    for expected_text in assertions.answer_contains:
-                        if expected_text.lower() not in response.lower():
-                            failures.append(
-                                f"Expected text '{expected_text}' not found in response"
-                            )
-
-                # Check answer_not_contains assertions
-                if assertions.answer_not_contains:
-                    for forbidden_text in assertions.answer_not_contains:
-                        if forbidden_text.lower() in response.lower():
-                            failures.append(f"Forbidden text '{forbidden_text}' found in response")
+                grader = grading_executor or executor
+                failures, evaluation = await _evaluate_test_assertions(
+                    test, response, tool_calls, execution_error, grader
+                )
 
                 test_time = time.time() - test_start
 
+                passed = len(failures) == 0
                 tests.append(
                     {
                         "name": test.name,
                         "description": test.description,
-                        "passed": len(failures) == 0,
+                        "passed": passed,
                         "failures": failures,
                         "time": test_time,
-                        "details": {"response": response, "tool_calls": tool_calls},
+                        "details": {
+                            "response": response,
+                            "execution_error": execution_error,
+                            "tool_calls": [
+                                {
+                                    "id": call.id,
+                                    "tool": call.tool,
+                                    "arguments": call.arguments,
+                                    "result": call.result,
+                                    "error": call.error,
+                                }
+                                for call in tool_calls
+                            ],
+                            "expected_answer": test.assertions.expected_answer,
+                            "expected_answer_evaluation": evaluation,
+                        },
                     }
                 )
+                if progress_callback:
+                    icon = click.style("✓", fg="green") if passed else click.style("✗", fg="red")
+                    progress_callback(
+                        f"test:{suite_name}:{idx}",
+                        "  "
+                        + icon
+                        + " "
+                        + click.style(
+                            f"[{suite_name}] {idx}/{total_tests} • {test.name} ({test_time:.2f}s)",
+                            fg="green" if passed else "red",
+                        ),
+                    )
 
             except Exception as e:
                 test_time = time.time() - test_start
@@ -330,6 +602,15 @@ async def run_eval_suite(
                         "time": test_time,
                     }
                 )
+                if progress_callback:
+                    progress_callback(
+                        f"test:{suite_name}:{idx}",
+                        "  ✗ "
+                        + click.style(
+                            f"[{suite_name}] {idx}/{total_tests} • {test.name} errored: {e} ({test_time:.2f}s)",
+                            fg="red",
+                        ),
+                    )
 
     finally:
         # Clean up runtime environment
@@ -352,6 +633,8 @@ async def run_all_evals(
     profile: str | None,
     cli_user_context: UserContextModel | None = None,
     override_model: str | None = None,
+    progress_callback: Callable[[str, str], None] | None = None,
+    expected_answer_model: str | None = None,
 ) -> dict[str, Any]:
     """Run all eval suites found in the repository.
 
@@ -388,9 +671,16 @@ async def run_all_evals(
             if eval_suite is None:
                 continue
             suite_name = eval_suite.suite or "unnamed"
-            # Run the suite
+            # Run the suite (progress_callback is passed through to run_eval_suite)
             result = await run_eval_suite(
-                suite_name, user_config, site_config, profile, cli_user_context, override_model
+                suite_name,
+                user_config,
+                site_config,
+                profile,
+                cli_user_context,
+                override_model,
+                progress_callback=progress_callback,
+                expected_answer_model=expected_answer_model,
             )
 
             # Get relative path
@@ -399,8 +689,10 @@ async def run_all_evals(
             except Exception:
                 relative_path = str(file_path)
 
-            # Map new result structure to old structure for backward compatibility
-            all_passed = result.get("summary", {}).get("failed", 1) == 0 if result else False
+            # Determine pass/fail
+            all_passed = bool(result.get("all_passed"))
+            if not all_passed and result.get("summary"):
+                all_passed = result["summary"].get("failed", 1) == 0
 
             suites.append(
                 {
diff --git a/tests/sdk/evals/test_executor.py b/tests/sdk/evals/test_executor.py
index 73561412..2805187c 100644
--- a/tests/sdk/evals/test_executor.py
+++ b/tests/sdk/evals/test_executor.py
@@ -1,27 +1,62 @@
-"""Tests for mxcp.sdk.evals.executor module."""
-
+import asyncio
 from typing import Any
-from unittest.mock import AsyncMock
 
 import pytest
+from pydantic_ai import ModelSettings
+from pydantic_ai.exceptions import ModelRetry
 
 from mxcp.sdk.auth import UserContextModel
-from mxcp.sdk.evals import (
-    ClaudeConfig,
-    LLMExecutor,
-    OpenAIConfig,
-    ParameterDefinition,
-    ToolDefinition,
-)
-from mxcp.sdk.validator import TypeSchemaModel
+from mxcp.sdk.evals import ParameterDefinition, ToolDefinition
+from mxcp.sdk.evals.executor import AgentResult, GradeResult, LLMExecutor, ProviderConfig
 
 
-class MockToolExecutor:
-    """Mock tool executor for testing."""
+class FakeRun:  # Result stub returned by FakeAgent.run; exposes only ``output``
+    def __init__(self, output: Any) -> None:
+        self.output = output
+
+
+class FakeAgent:
+    """Agent double: runs the mapped callable of each tool, then returns a canned output."""
+
+    def __init__(
+        self,
+        *,
+        tools: list[Any],
+        output: Any,
+        tool_args: dict[str, dict[str, Any]],
+        tool_callables: dict[str, Any] | None = None,
+    ) -> None:
+        self.tools = tools
+        self.output = output
+        self.tool_args = tool_args
+        self.tool_callables = tool_callables or {}
+
+    async def run(
+        self, _prompt: str, deps: Any | None = None, model_settings: Any | None = None
+    ) -> FakeRun:
+        """Call each tool's callable with its configured args, ignoring ``ModelRetry``."""
+        for tool in self.tools:
+            name = getattr(tool, "name", None) or getattr(
+                getattr(tool, "tool_def", None), "name", None
+            )
+            handler = self.tool_callables.get(name or "")
+            kwargs = self.tool_args.get(name or "", {})
+            if not handler:
+                continue
+            try:
+                if asyncio.iscoroutinefunction(handler):
+                    await handler(**kwargs)
+                else:
+                    handler(**kwargs)
+            except ModelRetry:
+                continue
+        return FakeRun(self.output)
+
 
+class MockToolExecutor:
     def __init__(self, responses: dict[str, Any] | None = None):
         self.responses = responses or {}
-        self.calls = []
+        self.calls: list[dict[str, Any]] = []
 
     async def execute_tool(
         self,
@@ -29,275 +64,482 @@ async def execute_tool(
         arguments: dict[str, Any],
         user_context: UserContextModel | None = None,
     ) -> Any:
-        """Mock tool execution that records calls and returns predefined responses."""
         self.calls.append(
             {"tool_name": tool_name, "arguments": arguments, "user_context": user_context}
         )
-
         if tool_name in self.responses:
-            result = self.responses[tool_name]
-            if isinstance(result, Exception):
-                raise result
-            return result
-
-        return f"Mock result for {tool_name}"
-
-
-class TestLLMExecutor:
-    """Test cases for LLMExecutor."""
-
-    def setup_method(self):
-        """Set up test fixtures."""
-        self.model_config = ClaudeConfig(name="claude-3-haiku", api_key="test-key")
-
-        self.tools = [
-            ToolDefinition(
-                name="get_weather",
-                description="Get current weather for a location",
-                parameters=[
-                    ParameterDefinition(name="location", type="string", description="City name")
-                ],
-            ),
-            ToolDefinition(
-                name="calculate",
-                description="Perform mathematical calculations",
-                parameters=[
-                    ParameterDefinition(
-                        name="expression",
-                        type="string",
-                        description="Mathematical expression to evaluate",
-                    )
-                ],
-            ),
-        ]
-
-        self.tool_executor = MockToolExecutor(
-            {"get_weather": {"temperature": 22, "condition": "sunny"}, "calculate": 42}
+            value = self.responses[tool_name]
+            if isinstance(value, Exception):
+                raise value
+            return value
+        return {"echo": arguments}
+
+
+def make_executor(
+    tools: list[ToolDefinition] | None = None,
+    responses: dict[str, Any] | None = None,
+    system_prompt: str | None = None,
+    agent_retries: int = 3,
+) -> LLMExecutor:
+    default_tools = [
+        ToolDefinition(
+            name="get_weather",
+            description="Weather lookup",
+            parameters=[ParameterDefinition(name="location", type="string", required=True)],
         )
-
-        self.executor = LLMExecutor(self.model_config, self.tools, self.tool_executor)
-
-    def test_initialization(self):
-        """Test LLMExecutor initialization."""
-        assert self.executor.model_config == self.model_config
-        assert self.executor.available_tools == self.tools
-        assert self.executor.tool_executor == self.tool_executor
-
-    def test_format_tools_for_prompt(self):
-        """Test tool formatting for prompts."""
-        formatted = self.executor._format_tools_for_prompt()
-
-        assert "=== AVAILABLE TOOLS ===" in formatted
-        assert "Tool: get_weather" in formatted
-        assert "Tool: calculate" in formatted
-        assert "Description: Get current weather for a location" in formatted
-        assert "location (string): City name" in formatted
-
-    def test_format_tools_for_prompt_empty(self):
-        """Test tool formatting with no tools."""
-        executor = LLMExecutor(self.model_config, [], self.tool_executor)
-        formatted = executor._format_tools_for_prompt()
-        assert formatted == "No tools available."
-
-    def test_get_claude_prompt(self):
-        """Test Claude-specific prompt formatting."""
-        prompt = self.executor._get_claude_prompt(
-            "What's the weather in Paris?", "Mock tools", None
+    ]
+    tool_defs = tools or default_tools
+    default_responses = {"get_weather": {"temperature": 20}} if tools is None else {}
+    tool_executor = MockToolExecutor(responses or default_responses)
+    executor = LLMExecutor(
+        "claude-test",
+        "anthropic",
+        ModelSettings(),
+        tool_defs,
+        tool_executor,
+        provider_config=ProviderConfig(api_key="key", base_url="https://api.anthropic.com"),
+        system_prompt=system_prompt,
+        agent_retries=agent_retries,
+    )
+    return executor
+
+
+def test_executor_uses_custom_system_prompt() -> None:
+    """A system prompt supplied at construction is exposed unchanged on the executor."""
+    custom_prompt = "You are a specialized assistant."
+    executor = make_executor(system_prompt=custom_prompt)
+    assert executor.system_prompt == custom_prompt
+
+
+def test_executor_passes_agent_retries_to_agent() -> None:
+    observed: list[int | None] = []
+
+    executor = make_executor(agent_retries=5)
+
+    def agent_factory(**kwargs: Any) -> FakeAgent:
+        observed.append(kwargs.get("retries"))
+        return FakeAgent(
+            tools=kwargs["tools"],
+            output="ok",
+            tool_args={"get_weather": {"location": "Paris"}},
+            tool_callables=kwargs.get("_tool_callables", {}),
         )
 
-        assert "You are a helpful assistant" in prompt
-        assert "Mock tools" in prompt
-        assert "Human: What's the weather in Paris?" in prompt
-        assert '{"tool": "tool_name"' in prompt
-
-    def test_get_openai_prompt(self):
-        """Test OpenAI-specific prompt formatting."""
-        prompt = self.executor._get_openai_prompt("Calculate 2+2", "Mock tools", None)
-
-        assert "You are a helpful assistant" in prompt
-        assert "Mock tools" in prompt
-        assert "User: Calculate 2+2" in prompt
-        assert '{"tool": "tool_name"' in prompt
-
-    def test_extract_tool_calls_single(self):
-        """Test extraction of single tool call."""
-        response = '{"tool": "get_weather", "arguments": {"location": "Paris"}}'
-        calls = self.executor._extract_tool_calls(response)
-
-        assert len(calls) == 1
-        assert calls[0]["tool"] == "get_weather"
-        assert calls[0]["arguments"]["location"] == "Paris"
-
-    def test_extract_tool_calls_multiple(self):
-        """Test extraction of multiple tool calls."""
-        response = '[{"tool": "get_weather", "arguments": {"location": "Paris"}}, {"tool": "calculate", "arguments": {"expression": "2+2"}}]'
-        calls = self.executor._extract_tool_calls(response)
-
-        assert len(calls) == 2
-        assert calls[0]["tool"] == "get_weather"
-        assert calls[1]["tool"] == "calculate"
-
-    def test_extract_tool_calls_none(self):
-        """Test extraction when no tool calls present."""
-        response = "The weather in Paris is sunny and 22 degrees."
-        calls = self.executor._extract_tool_calls(response)
-
-        assert len(calls) == 0
-
-    def test_extract_tool_calls_invalid_json(self):
-        """Test extraction with invalid JSON."""
-        response = "Invalid JSON {tool: get_weather}"
-        calls = self.executor._extract_tool_calls(response)
-
-        assert len(calls) == 0
-
-    @pytest.mark.asyncio
-    async def test_execute_prompt_no_tools(self):
-        """Test prompt execution without tool calls."""
-        # Mock the LLM call to return a simple response
-        self.executor._call_llm = AsyncMock(return_value="Hello! I'm a helpful assistant.")
-
-        response, tool_calls = await self.executor.execute_prompt("Hello")
-
-        assert response == "Hello! I'm a helpful assistant."
-        assert len(tool_calls) == 0
-        assert len(self.tool_executor.calls) == 0
-
-    @pytest.mark.asyncio
-    async def test_execute_prompt_with_tools(self):
-        """Test prompt execution with tool calls."""
-        # Mock LLM to first return tool call, then final response
-        self.executor._call_llm = AsyncMock(
-            side_effect=[
-                '{"tool": "get_weather", "arguments": {"location": "Paris"}}',
-                "The weather in Paris is sunny and 22 degrees.",
-            ]
+    executor._agent_cls = agent_factory
+
+    asyncio.run(executor.execute_prompt("Weather?"))
+
+    assert observed == [5]
+
+
+def test_execute_prompt_with_tool_call() -> None:
+    """A successful tool call is recorded with its arguments and result, and no error."""
+    executor = make_executor()
+    user_ctx = UserContextModel(provider="test", user_id="u1", username="user")
+    executor._agent_cls = lambda **kwargs: FakeAgent(
+        tools=kwargs["tools"],
+        output="Sunny",
+        tool_args={"get_weather": {"location": "Paris"}},
+        tool_callables=kwargs.get("_tool_callables", {}),
+    )
+    result = asyncio.run(executor.execute_prompt("Weather?", user_context=user_ctx))
+
+    assert isinstance(result, AgentResult)
+    assert result.answer == "Sunny"
+    assert len(result.tool_calls) == 1
+    call = result.tool_calls[0]
+    assert call.tool == "get_weather"
+    assert call.arguments["location"] == "Paris"
+    assert call.result == {"temperature": 20}  # make_executor's default canned response
+    assert call.error is None
+
+
+def test_execute_prompt_tool_calls_do_not_leak_between_runs() -> None:
+    """Each execute_prompt call on the same executor reports only its own tool calls."""
+    executor = make_executor()
+    # First run
+    executor._agent_cls = lambda **kwargs: FakeAgent(
+        tools=kwargs["tools"],
+        output="ok",
+        tool_args={"get_weather": {"location": "Paris"}},
+        tool_callables=kwargs.get("_tool_callables", {}),
+    )
+    first = asyncio.run(executor.execute_prompt("Weather?"))
+    assert len(first.tool_calls) == 1
+    assert first.tool_calls[0].arguments["location"] == "Paris"
+
+    # Second run should still invoke tools and capture history independently
+    executor._agent_cls = lambda **kwargs: FakeAgent(
+        tools=kwargs["tools"],
+        output="ok",
+        tool_args={"get_weather": {"location": "Rome"}},
+        tool_callables=kwargs.get("_tool_callables", {}),
+    )
+    second = asyncio.run(executor.execute_prompt("Weather?"))
+
+    assert len(second.tool_calls) == 1
+    assert second.tool_calls[0].arguments["location"] == "Rome"
+
+
+def test_execute_prompt_tool_error() -> None:
+    """An exception raised by the tool executor is recorded as a structured error."""
+    executor = make_executor()
+    executor.tool_executor.responses["get_weather"] = ValueError("boom")  # type: ignore[attr-defined]
+    executor._agent_cls = lambda **kwargs: FakeAgent(
+        tools=kwargs["tools"],
+        output="Error",
+        tool_args={"get_weather": {"location": "Rome"}},
+        tool_callables=kwargs.get("_tool_callables", {}),
+    )
+    result = asyncio.run(executor.execute_prompt("Weather?"))
+
+    assert result.tool_calls
+    error = result.tool_calls[0].error
+    # Error is a dict with status, tool, error, and suggestion (suggestion is not asserted)
+    assert isinstance(error, dict)
+    assert error["status"] == "error"
+    assert error["tool"] == "get_weather"
+    assert "boom" in error["error"]
+
+
+def test_tool_argument_validation_error_is_captured() -> None:
+    """Calling a tool without a required argument is recorded as an error, not raised."""
+    executor = make_executor()
+    executor._agent_cls = lambda **kwargs: FakeAgent(
+        tools=kwargs["tools"],
+        output="done",
+        tool_args={"get_weather": {}},  # missing required arg
+        tool_callables=kwargs.get("_tool_callables", {}),
+    )
+    result = asyncio.run(executor.execute_prompt("Weather?"))
+
+    assert result.tool_calls
+    error = result.tool_calls[0].error
+    # Error is a dict; only its status and error message are checked here
+    assert isinstance(error, dict)
+    assert error["status"] == "error"
+    assert "Field required" in error["error"]
+
+
+def test_tool_model_retry_reinvokes_tool() -> None:
+    class FlakyToolExecutor:  # raises ValueError on the first call only, then succeeds
+        def __init__(self) -> None:
+            self.calls = 0
+
+        async def execute_tool(
+            self,
+            tool_name: str,
+            arguments: dict[str, Any],
+            user_context: UserContextModel | None = None,
+        ) -> Any:
+            self.calls += 1
+            if self.calls == 1:
+                raise ValueError("temporary issue")
+            return {"status": "ok"}
+
+    class RetryingAgent(FakeAgent):
+        """Agent that re-invokes a tool on ModelRetry, up to ``max_retries`` attempts in total."""
+
+        def __init__(
+            self,
+            *,
+            tools: list[Any],
+            output: Any,
+            tool_args: dict[str, dict[str, Any]],
+            tool_callables: dict[str, Any] | None = None,
+            max_retries: int = 1,
+        ) -> None:
+            super().__init__(
+                tools=tools, output=output, tool_args=tool_args, tool_callables=tool_callables
+            )
+            self.max_retries = max_retries
+
+        async def run(  # type: ignore[override]
+            self, _prompt: str, deps: Any | None = None, model_settings: Any | None = None
+        ) -> FakeRun:
+            for tool in self.tools:
+                tool_name = getattr(tool, "name", None) or getattr(
+                    getattr(tool, "tool_def", None), "name", None
+                )
+                fn = self.tool_callables.get(tool_name or "")
+                args = self.tool_args.get(tool_name or "", {})
+                if not fn:
+                    continue
+
+                attempt = 0
+                while attempt < self.max_retries:
+                    try:
+                        if asyncio.iscoroutinefunction(fn):
+                            await fn(**args)
+                        else:
+                            fn(**args)
+                        break
+                    except ModelRetry:
+                        attempt += 1
+                        if attempt >= self.max_retries:
+                            raise  # attempts exhausted: let the ModelRetry propagate
+                        continue
+            return FakeRun(self.output)
+
+    executor = make_executor()
+    flaky_executor = FlakyToolExecutor()
+    executor.tool_executor = flaky_executor  # type: ignore[assignment]
+    executor._agent_cls = lambda **kwargs: RetryingAgent(
+        tools=kwargs["tools"],
+        output="ok",
+        tool_args={"get_weather": {"location": "Paris"}},
+        tool_callables=kwargs.get("_tool_callables", {}),
+        max_retries=kwargs.get("retries", 1),
+    )
+
+    result = asyncio.run(executor.execute_prompt("Weather?"))
+
+    # Executor ran twice: its first ValueError reached the agent as ModelRetry, then success
+    assert flaky_executor.calls == 2
+    assert len(result.tool_calls) == 2
+    # First call should have error recorded as dict
+    first_error = result.tool_calls[0].error
+    assert isinstance(first_error, dict)
+    assert first_error["status"] == "error"
+    assert "temporary issue" in first_error["error"]
+    # Second call should succeed
+    assert result.tool_calls[1].result == {"status": "ok"}
+
+
+def test_expected_answer_grading() -> None:
+    """evaluate_expected_answer exposes the grader's ``result`` and ``comment`` by key."""
+    executor = make_executor()
+    executor._agent_cls = lambda **kwargs: FakeAgent(
+        tools=kwargs.get("tools", []),
+        output=GradeResult(result="correct", comment="ok", reasoning="match"),
+        tool_args={},
+        tool_callables=kwargs.get("_tool_callables", {}),
+    )
+    result = asyncio.run(executor.evaluate_expected_answer("hello", "hello"))
+    assert result["result"] == "correct"
+    assert result["comment"]
+
+
+def test_expected_answer_uses_model_reference() -> None:
+    executor = make_executor()
+    observed: list[Any] = []
+
+    def agent_factory(**kwargs: Any) -> FakeAgent:
+        observed.append(kwargs.get("model"))
+        return FakeAgent(
+            tools=kwargs.get("tools", []),
+            output=GradeResult(result="correct", comment="ok", reasoning="match"),
+            tool_args={},
+            tool_callables=kwargs.get("_tool_callables", {}),
         )
 
-        user_context = UserContextModel(provider="test", user_id="test-user", username="testuser")
-
-        response, tool_calls = await self.executor.execute_prompt(
-            "What's the weather in Paris?", user_context=user_context
+    executor._agent_cls = agent_factory
+
+    result = asyncio.run(executor.evaluate_expected_answer("value", "value"))
+    assert result["result"] == "correct"
+    assert observed == [executor._model_reference]
+
+
+def test_max_turns_limits_tool_calls() -> None:
+    class MultiCallAgent:
+        def __init__(self, tools: list[Any], tool_callables: dict[str, Any]) -> None:
+            self.tools = tools
+            self.tool_callables = tool_callables
+
+        async def run(
+            self, _prompt: str, deps: Any | None = None, model_settings: Any | None = None
+        ) -> FakeRun:
+            for _ in range(2):
+                for tool in self.tools:
+                    tool_name = getattr(tool, "name", None)
+                    fn = self.tool_callables.get(tool_name or "")
+                    if fn:
+                        if asyncio.iscoroutinefunction(fn):
+                            try:
+                                await fn()
+                            except ModelRetry:
+                                continue
+                        else:
+                            try:
+                                fn()
+                            except ModelRetry:
+                                continue
+            return FakeRun("done")
+
+    executor = make_executor()
+    executor._agent_cls = lambda **kwargs: MultiCallAgent(
+        kwargs["tools"], kwargs.get("_tool_callables", {})
+    )
+
+    result = asyncio.run(executor.execute_prompt("Weather?", max_turns=1))
+    # NOTE(review): the fake calls tools with no args; confirm the error is the turn limit.
+    assert len(result.tool_calls) == 2
+    assert result.tool_calls[-1].error
+
+
+def test_tool_model_schema_preserves_array_items_type() -> None:
+    predicates_param = ParameterDefinition(
+        name="predicates",
+        type="array",
+        description="Filters",
+        required=True,
+        schema={
+            "type": "array",
+            "description": "Filters",
+            "items": {"type": "string", "description": "SQL predicate"},
+        },
+    )
+    members_param = ParameterDefinition(
+        name="members",
+        type="array",
+        description="Projection list",
+        required=True,
+        schema={"type": "array", "items": {"type": "string"}},
+    )
+    tools = [
+        ToolDefinition(
+            name="sql_search",
+            description="Search objects",
+            parameters=[predicates_param, members_param],
         )
-
-        assert response == "The weather in Paris is sunny and 22 degrees."
-        assert len(tool_calls) == 1
-        assert tool_calls[0]["tool"] == "get_weather"
-        assert tool_calls[0]["arguments"]["location"] == "Paris"
-
-        # Verify tool executor was called correctly
-        assert len(self.tool_executor.calls) == 1
-        call = self.tool_executor.calls[0]
-        assert call["tool_name"] == "get_weather"
-        assert call["arguments"]["location"] == "Paris"
-        assert call["user_context"] == user_context
-
-    @pytest.mark.asyncio
-    async def test_execute_prompt_tool_error(self):
-        """Test prompt execution when tool execution fails."""
-        # Configure tool executor to raise an error
-        self.tool_executor.responses["get_weather"] = ValueError("Tool failed")
-
-        # Mock LLM to return tool call, then final response
-        self.executor._call_llm = AsyncMock(
-            side_effect=[
-                '{"tool": "get_weather", "arguments": {"location": "Paris"}}',
-                "I'm sorry, I couldn't get the weather information.",
-            ]
+    ]
+    executor = make_executor(tools=tools)
+
+    schema = executor._tool_models["sql_search"].model_json_schema()
+    props = schema["properties"]
+
+    assert props["predicates"]["type"] == "array"
+    assert props["predicates"]["items"]["type"] == "string"
+    assert props["predicates"]["items"]["description"] == "SQL predicate"
+    assert props["members"]["items"]["type"] == "string"
+    assert "predicates" in schema["required"]
+    assert "members" in schema["required"]
+
+
+def test_tool_model_schema_supports_optional_object_params() -> None:
+    context_param = ParameterDefinition(
+        name="context",
+        type="object",
+        description="Optional filters",
+        required=False,
+        default={},
+        schema={
+            "type": "object",
+            "properties": {
+                "limit": {"type": "integer", "minimum": 1, "maximum": 100},
+                "sort": {"type": "string"},
+            },
+            "required": ["limit"],
+            "additionalProperties": False,
+        },
+    )
+    tools = [
+        ToolDefinition(
+            name="fetch_objects",
+            description="Fetch objects",
+            parameters=[
+                ParameterDefinition(name="object_type", type="string", required=True),
+                context_param,
+            ],
         )
-
-        response, tool_calls = await self.executor.execute_prompt("What's the weather in Paris?")
-
-        assert response == "I'm sorry, I couldn't get the weather information."
-        assert len(tool_calls) == 1
-
-        # Verify the LLM received the tool error in the conversation
-        assert self.executor._call_llm.call_count == 2
-
-    @pytest.mark.asyncio
-    async def test_execute_prompt_max_iterations(self):
-        """Test that max iterations prevents infinite loops."""
-        # Mock LLM to always return tool calls
-        self.executor._call_llm = AsyncMock(
-            return_value='{"tool": "get_weather", "arguments": {"location": "Paris"}}'
+    ]
+    executor = make_executor(tools=tools)
+
+    schema = executor._tool_models["fetch_objects"].model_json_schema()
+    props = schema["properties"]
+
+    assert "context" in props
+    assert props["context"]["type"] == "object"
+    assert props["context"]["properties"]["limit"]["minimum"] == 1
+    assert props["context"]["properties"]["limit"]["maximum"] == 100
+    assert props["context"]["required"] == ["limit"]
+    assert "context" not in schema.get("required", [])
+
+
+def test_executor_with_empty_tool_list() -> None:
+    """Test executor handles empty tool list gracefully."""
+    # Build the executor directly: make_executor swaps an empty list for its default tools
+    tool_executor = MockToolExecutor()
+    executor = LLMExecutor(
+        "claude-test",
+        "anthropic",
+        ModelSettings(),
+        [],  # Empty tool list
+        tool_executor,
+        provider_config=ProviderConfig(api_key="key", base_url="https://api.anthropic.com"),
+    )
+
+    assert executor.available_tools == []
+    assert executor._tool_models == {}
+    # Either phrasing in the system prompt is accepted as signalling that no tools exist
+    assert (
+        "no tools" in executor.system_prompt.lower()
+        or "answer directly" in executor.system_prompt.lower()
+    )
+
+
+def test_unknown_parameter_type_defaults_to_any() -> None:
+    """Test that an unknown parameter type does not prevent the tool model being built."""
+    tools = [
+        ToolDefinition(
+            name="custom_tool",
+            description="Tool with unknown type",
+            parameters=[ParameterDefinition(name="param", type="unknown_type", required=True)],
         )
+    ]
+    # This should not raise an error
+    executor = make_executor(tools=tools)
 
-        response, tool_calls = await self.executor.execute_prompt("Weather?")
+    # Only model creation is checked; the Any fallback / warning is not asserted here
+    assert "custom_tool" in executor._tool_models
 
-        # Should hit max iterations (10) and return the last response
-        assert len(tool_calls) == 10
-        assert self.executor._call_llm.call_count == 10
 
+def test_execute_prompt_handles_empty_output() -> None:
+    """Test that empty agent output is handled gracefully."""
+    executor = make_executor()
 
-class TestToolDefinition:
-    """Test cases for ToolDefinition."""
+    executor._agent_cls = lambda **kwargs: FakeAgent(
+        tools=kwargs["tools"],
+        output="",  # Empty output
+        tool_args={},
+        tool_callables=kwargs.get("_tool_callables", {}),
+    )
 
-    def test_to_prompt_format_basic(self):
-        """Test basic tool formatting."""
-        tool = ToolDefinition(name="test_tool", description="A test tool")
+    result = asyncio.run(executor.execute_prompt("Test prompt"))
 
-        formatted = tool.to_prompt_format()
-        assert "Tool: test_tool" in formatted
-        assert "Description: A test tool" in formatted
-        assert "Parameters: None" in formatted
+    assert isinstance(result, AgentResult)
+    assert result.answer == ""
+    assert result.error is None
 
-    def test_to_prompt_format_with_parameters(self):
-        """Test tool formatting with parameters."""
-        tool = ToolDefinition(
-            name="calculator",
-            description="Perform calculations",
-            parameters=[
-                ParameterDefinition(
-                    name="expression", type="string", description="Math expression", default="0"
-                ),
-                ParameterDefinition(name="precision", type="integer", description="Decimal places"),
-            ],
-            return_type=TypeSchemaModel(type="number", description="Result"),
-            tags=["math", "utility"],
-        )
 
-        formatted = tool.to_prompt_format()
-        assert "Tool: calculator" in formatted
-        assert "Description: Perform calculations" in formatted
-        assert "expression (string) [default: 0]: Math expression" in formatted
-        assert "precision (integer): Decimal places" in formatted
-        assert "Returns: number - Result" in formatted
-        assert "Tags: math, utility" in formatted
+def test_provider_config_defaults() -> None:
+    """Test ProviderConfig uses defaults correctly."""
+    # With no arguments, every field checked below is expected to be None.
 
+    config = ProviderConfig()
+    assert config.api_key is None
+    assert config.base_url is None
+    assert config.timeout is None
 
-class TestModelConfigs:
-    """Test cases for model configurations."""
 
-    def test_claude_config(self):
-        """Test Claude configuration."""
-        config = ClaudeConfig(
-            name="claude-3-haiku", api_key="test-key", base_url="https://api.custom.com", timeout=60
-        )
+def test_provider_config_with_values() -> None:
+    """Test ProviderConfig accepts and stores values."""
+    # Each constructor argument should read back from the attribute of the same name.
 
-        assert config.get_type() == "claude"
-        assert config.name == "claude-3-haiku"
-        assert config.api_key == "test-key"
-        assert config.base_url == "https://api.custom.com"
-        assert config.timeout == 60
+    config = ProviderConfig(
+        api_key="test-key",
+        base_url="https://api.example.com",
+        timeout=30,
+    )
+    assert config.api_key == "test-key"
+    assert config.base_url == "https://api.example.com"
+    assert config.timeout == 30
 
-    def test_openai_config(self):
-        """Test OpenAI configuration."""
-        config = OpenAIConfig(
-            name="gpt-4", api_key="test-key", base_url="https://api.custom.com", timeout=45
-        )
 
-        assert config.get_type() == "openai"
-        assert config.name == "gpt-4"
-        assert config.api_key == "test-key"
-        assert config.base_url == "https://api.custom.com"
-        assert config.timeout == 45
-
-    def test_config_defaults(self):
-        """Test default values for configs."""
-        claude = ClaudeConfig(name="claude", api_key="key")
-        assert claude.base_url == "https://api.anthropic.com"
-        assert claude.timeout == 30
-
-        openai = OpenAIConfig(name="gpt", api_key="key")
-        assert openai.base_url == "https://api.openai.com/v1"
-        assert openai.timeout == 30
+def test_agent_retries_clamped_to_minimum() -> None:
+    """Non-positive ``agent_retries`` values are raised to 1."""
+    # Exercise both the zero boundary and a negative value.
+    for requested in (0, -5):
+        executor = make_executor(agent_retries=requested)
+        clamped = executor._agent_retries
+        assert clamped == 1
diff --git a/tests/server/test_evals_service.py b/tests/server/test_evals_service.py
new file mode 100644
index 00000000..4224971b
--- /dev/null
+++ b/tests/server/test_evals_service.py
@@ -0,0 +1,125 @@
+from pydantic_ai import ModelSettings
+
+from mxcp.server.definitions.endpoints.models import ParamDefinitionModel, TypeDefinitionModel
+from mxcp.server.services.evals.service import (
+    _build_model_settings,
+    _format_expected_answer_failure,
+    _parameter_definition_from_model,
+    _type_definition_to_schema,
+)
+
+
+def test_model_settings_chat_drops_response_only_keys() -> None:
+    allowed = set(ModelSettings.__annotations__.keys())
+    settings = _build_model_settings(
+        "gpt-4o",
+        "openai",
+        {"body:reasoning": {"effort": "medium"}, "timeout": 30},
+        allowed,
+    )
+    # NOTE(review): nothing below asserts that a key is dropped, despite the test name.
+    extra_body = settings.get("extra_body")
+    assert extra_body and "reasoning" in extra_body
+    assert settings.get("timeout") == 30
+    assert settings.get("max_tokens") == 10_000
+
+
+def test_model_settings_responses_keeps_extras() -> None:
+    """With ``api: responses`` the ``body:`` option still ends up in ``extra_body``."""
+    allowed = set(ModelSettings.__annotations__.keys())
+    settings = _build_model_settings(
+        "gpt-5",
+        "openai",
+        {"api": "responses", "body:reasoning": {"effort": "medium"}},
+        allowed,
+    )
+    extra_body = settings.get("extra_body")
+    assert extra_body and "reasoning" in extra_body
+    assert settings.get("max_tokens") == 10_000
+
+
+def test_model_settings_anthropic_output_config_and_betas() -> None:
+    """``body:`` options go to ``extra_body`` and ``header:`` options to ``extra_headers``."""
+    allowed = set(ModelSettings.__annotations__.keys())
+    settings = _build_model_settings(
+        "claude",
+        "anthropic",
+        {
+            "body:output_config": {"effort": "medium"},
+            "header:anthropic-beta": ["effort-2025-11-24"],
+        },
+        allowed,
+    )
+    extra_body = settings.get("extra_body")
+    assert extra_body and extra_body.get("output_config") == {"effort": "medium"}
+    headers = settings.get("extra_headers")
+    assert headers and headers.get("anthropic-beta") == "effort-2025-11-24"
+    assert settings.get("max_tokens") == 10_000
+
+
+def test_model_settings_respects_user_max_tokens_override() -> None:
+    """An explicit ``max_tokens`` option is returned as given."""
+    allowed = set(ModelSettings.__annotations__.keys())
+    settings = _build_model_settings(
+        "gpt-4o",
+        "openai",
+        {"max_tokens": 2048},
+        allowed,
+    )
+    assert settings.get("max_tokens") == 2048
+
+
+def test_expected_answer_failure_formatting_is_multiline() -> None:
+    """The failure detail renders one labelled field per line, in a fixed order."""
+    detail = _format_expected_answer_failure(
+        "Answer",
+        "Expected",
+        "wrong",
+        "bad",
+        "missed value",
+    )
+    assert detail.splitlines() == [
+        "LLM Answer: Answer",
+        "Expected: Expected",
+        "Grade: wrong",
+        "Comment: bad",
+        "Reasoning: missed value",
+    ]
+
+
+def test_parameter_definition_from_model_includes_array_items_schema() -> None:
+    """An array param with no default is required and its schema carries the items type."""
+    param = ParamDefinitionModel(
+        name="predicates",
+        type="array",
+        description="Filters",
+        items=TypeDefinitionModel(type="string", description="SQL predicate"),
+    )
+    definition = _parameter_definition_from_model(param)
+
+    assert definition.required is True
+    assert definition.default is None
+    assert definition.schema == {
+        "type": "array",
+        "description": "Filters",
+        "items": {"type": "string", "description": "SQL predicate"},
+    }
+
+
+def test_parameter_definition_from_model_marks_optional_when_default_present() -> None:
+    """A param with a default is optional; its default and numeric bounds are preserved."""
+    param = ParamDefinitionModel(
+        name="limit",
+        type="integer",
+        description="Result limit",
+        default=25,
+        minimum=1,
+        maximum=100,
+    )
+    definition = _parameter_definition_from_model(param)
+
+    assert definition.required is False
+    assert definition.default == 25
+    assert definition.schema["type"] == "integer"
+    assert definition.schema["minimum"] == 1
+    assert definition.schema["maximum"] == 100
diff --git a/tests/server/test_evals_tool_executor.py b/tests/server/test_evals_tool_executor.py
index c0e21794..15d99a19 100644
--- a/tests/server/test_evals_tool_executor.py
+++ b/tests/server/test_evals_tool_executor.py
@@ -1,5 +1,6 @@
 """Tests for EndpointToolExecutor integration."""
 
+from pathlib import Path
 from typing import Any
 
 import pytest
@@ -7,7 +8,7 @@
 from mxcp.sdk.auth import UserContextModel
 from mxcp.sdk.executor import ExecutionContext
 from mxcp.server.definitions.endpoints.models import EndpointDefinitionModel, SourceDefinitionModel
-from mxcp.server.executor.runners.tool import EndpointToolExecutor
+from mxcp.server.executor.runners.tool import EndpointToolExecutor, EndpointWithPath
 
 
 class MockExecutionEngine:
@@ -47,60 +48,79 @@ def setup_method(self):
                 "weather.py": {"temperature": 22, "condition": "sunny"},
             }
         )
+        self._monkeypatch = pytest.MonkeyPatch()
+        self._monkeypatch.setattr(
+            "mxcp.server.executor.runners.tool.find_repo_root", lambda: Path.cwd()
+        )
 
-        self.endpoints: list[EndpointDefinitionModel] = [
-            EndpointDefinitionModel.model_validate(
-                {
-                    "mxcp": 1,
-                    "tool": {
-                        "name": "get_date",
-                        "description": "Get current date",
-                        "parameters": [],
-                        "source": {"code": "SELECT current_date()"},
-                    },
-                }
+        self.endpoints = [
+            EndpointWithPath(
+                EndpointDefinitionModel.model_validate(
+                    {
+                        "mxcp": 1,
+                        "tool": {
+                            "name": "get_date",
+                            "description": "Get current date",
+                            "parameters": [],
+                            "source": {"code": "SELECT current_date()"},
+                        },
+                    }
+                ),
+                Path("endpoints/get_date.yml"),
             ),
-            EndpointDefinitionModel.model_validate(
-                {
-                    "mxcp": 1,
-                    "tool": {
-                        "name": "calculate",
-                        "description": "Calculate expression",
-                        "parameters": [{"name": "expr", "type": "string"}],
-                        "source": {"code": "return 2 + 2", "language": "python"},
-                    },
-                }
+            EndpointWithPath(
+                EndpointDefinitionModel.model_validate(
+                    {
+                        "mxcp": 1,
+                        "tool": {
+                            "name": "calculate",
+                            "description": "Calculate expression",
+                            "parameters": [{"name": "expr", "type": "string"}],
+                            "source": {"code": "return 2 + 2", "language": "python"},
+                        },
+                    }
+                ),
+                Path("endpoints/calculate.yml"),
             ),
-            EndpointDefinitionModel.model_validate(
-                {
-                    "mxcp": 1,
-                    "tool": {
-                        "name": "get_weather",
-                        "description": "Get weather info",
-                        "parameters": [{"name": "location", "type": "string"}],
-                        "source": {"file": "weather.py", "language": "python"},
-                    },
-                }
+            EndpointWithPath(
+                EndpointDefinitionModel.model_validate(
+                    {
+                        "mxcp": 1,
+                        "tool": {
+                            "name": "get_weather",
+                            "description": "Get weather info",
+                            "parameters": [{"name": "location", "type": "string"}],
+                            "source": {"code": "weather.py", "language": "python"},
+                        },
+                    }
+                ),
+                Path("endpoints/get_weather.yml"),
             ),
-            EndpointDefinitionModel.model_validate(
-                {
-                    "mxcp": 1,
-                    "resource": {
-                        "uri": "data://users",
-                        "description": "User data resource",
-                        "parameters": [{"name": "limit", "type": "integer"}],
-                        "source": {"code": "SELECT * FROM users LIMIT $limit"},
-                    },
-                }
+            EndpointWithPath(
+                EndpointDefinitionModel.model_validate(
+                    {
+                        "mxcp": 1,
+                        "resource": {
+                            "uri": "data://users",
+                            "description": "User data resource",
+                            "parameters": [{"name": "limit", "type": "integer"}],
+                            "source": {"code": "SELECT * FROM users LIMIT $limit"},
+                        },
+                    }
+                ),
+                Path("endpoints/users.yml"),
             ),
         ]
 
         self.executor = EndpointToolExecutor(self.engine, self.endpoints)
 
+    def teardown_method(self):
+        self._monkeypatch.undo()  # revert the find_repo_root patch installed in setup_method
+
     def test_initialization(self):
         """Test EndpointToolExecutor initialization."""
         assert self.executor.engine == self.engine
-        assert self.executor.endpoints == self.endpoints
+        assert len(self.executor.endpoints) == len(self.endpoints)
         assert len(self.executor._tool_map) == 4
         assert "get_date" in self.executor._tool_map
         assert "data://users" in self.executor._tool_map
@@ -139,6 +159,8 @@ async def test_execute_tool_with_language(self):
     @pytest.mark.asyncio
     async def test_execute_tool_with_file(self):
         """Test executing a tool with file reference."""
+        tmp_file = Path("weather.py")
+        tmp_file.write_text("weather.py")
         result = await self.executor.execute_tool("get_weather", {"location": "Paris"})
 
         assert result == {"temperature": 22, "condition": "sunny"}
@@ -149,6 +171,8 @@ async def test_execute_tool_with_file(self):
         assert call["language"] == "python"
         assert call["source_code"] == "weather.py"
         assert call["params"] == {"location": "Paris"}
+        if tmp_file.exists():
+            tmp_file.unlink()
 
     @pytest.mark.asyncio
     async def test_execute_resource(self):
@@ -205,43 +229,98 @@ async def test_execute_tool_no_source(self):
             "source",
             SourceDefinitionModel.model_construct(code=None, file=None),
         )
-        endpoints_no_source: list[EndpointDefinitionModel] = [endpoint]
-
+        endpoints_no_source = [EndpointWithPath(endpoint, Path("endpoints/broken.yml"))]
         executor = EndpointToolExecutor(self.engine, endpoints_no_source)
 
         with pytest.raises(ValueError) as exc_info:
             await executor.execute_tool("broken_tool", {})
 
-        assert "No source code or file found in source definition" in str(exc_info.value)
+        assert "No source found for endpoint" in str(exc_info.value)
 
-    def test_get_language_inference(self):
-        """Test language inference via endpoint execution."""
-        # Create endpoints with different language sources
-        test_endpoints: list[EndpointDefinitionModel] = [
+    @pytest.mark.asyncio
+    async def test_execute_tool_loads_file_content(self, tmp_path, monkeypatch):
+        """Ensure file-based sources are read and executed with their content."""
+        sql_dir = tmp_path / "sql"
+        sql_dir.mkdir()
+        sql_file = sql_dir / "hello.sql"
+        sql_file.write_text("select 1 as val;")
+
+        # Provide mxcp-site.yml so find_repo_root() resolves to tmp_path
+        (tmp_path / "mxcp-site.yml").write_text("mxcp: 1\nproject: test\nprofile: default\n")
+        monkeypatch.chdir(tmp_path)  # setup_method patches find_repo_root to return cwd
+
+        endpoint = EndpointWithPath(
             EndpointDefinitionModel.model_validate(
-                {"mxcp": 1, "tool": {"name": "python_file_tool", "source": {"file": "script.py"}}}
+                {"mxcp": 1, "tool": {"name": "hello_tool", "source": {"file": "sql/hello.sql"}}}
             ),
+            Path("endpoints/hello.yml"),
+        )
+
+        engine = MockExecutionEngine({"select 1 as val;": {"val": 1}})
+        executor = EndpointToolExecutor(engine, [endpoint])
+
+        result = await executor.execute_tool("hello_tool", {})
+
+        assert result == {"val": 1}
+        assert engine.calls[0]["source_code"] == "select 1 as val;"  # file text, not the path
+
+    @pytest.mark.asyncio
+    async def test_execute_tool_loads_relative_parent_path(self, tmp_path, monkeypatch):
+        """Relative paths with '..' should resolve correctly."""
+        # Keep repo root and its sibling dir under tmp_path so no files leak outside it.
+        repo_root = tmp_path / "repo"
+        repo_root.mkdir()
+        (repo_root / "mxcp-site.yml").write_text("mxcp: 1\nproject: test\nprofile: default\n")
+        sql_dir = tmp_path / "shared-sql"
+        sql_dir.mkdir()
+        (sql_dir / "hi.sql").write_text("select 2 as val;")
+
+        # endpoint references ../shared-sql/hi.sql relative to repo root
+        endpoint = EndpointWithPath(
             EndpointDefinitionModel.model_validate(
-                {"mxcp": 1, "tool": {"name": "sql_file_tool", "source": {"file": "query.sql"}}}
+                {"mxcp": 1, "tool": {"name": "hi_tool", "source": {"file": "../shared-sql/hi.sql"}}}
             ),
+            Path("endpoints/hi.yml"),
+        )
+
+        engine = MockExecutionEngine({"select 2 as val;": {"val": 2}})
+        monkeypatch.chdir(repo_root)
+        executor = EndpointToolExecutor(engine, [endpoint])
+        result = await executor.execute_tool("hi_tool", {})
+        assert result == {"val": 2}
+
+    @pytest.mark.asyncio
+    async def test_python_file_executes_by_path(self, tmp_path, monkeypatch):
+        """Python sources should be passed as file paths to the engine."""
+        (tmp_path / "mxcp-site.yml").write_text("mxcp: 1\nproject: test\nprofile: default\n")
+        py_dir = tmp_path / "python"
+        py_dir.mkdir()
+        script = py_dir / "hello.py"
+        script.write_text("def python_tool():\n" "    return {'message': 'hi'}\n")
+
+        endpoint = EndpointWithPath(
             EndpointDefinitionModel.model_validate(
                 {
                     "mxcp": 1,
                     "tool": {
-                        "name": "explicit_override_tool",
-                        "source": {"file": "script.py", "language": "sql"},
+                        "name": "python_tool",
+                        "source": {"file": "python/hello.py", "language": "python"},
                     },
                 }
             ),
-            EndpointDefinitionModel.model_validate(
-                {"mxcp": 1, "tool": {"name": "default_sql_tool", "source": {"code": "some code"}}}
-            ),
-        ]
+            Path("endpoints/python.yml"),
+        )
+
+        monkeypatch.chdir(tmp_path)
+        engine = MockExecutionEngine()  # constructed without any canned responses
+        executor = EndpointToolExecutor(engine, [endpoint])
 
-        test_executor = EndpointToolExecutor(self.engine, test_endpoints)
+        result = await executor.execute_tool("python_tool", {})
 
-        # Verify the tools were properly registered
-        assert "python_file_tool" in test_executor._tool_map
-        assert "sql_file_tool" in test_executor._tool_map
-        assert "explicit_override_tool" in test_executor._tool_map
-        assert "default_sql_tool" in test_executor._tool_map
+        assert result == "Mock result for " + engine.calls[0]["source_code"]
+        assert engine.calls[0]["language"] == "python"
+        source_code = engine.calls[0]["source_code"]
+        file_part, sep, function_name = source_code.partition(":")
+        assert sep == ":"  # source_code has the form "<file path>:<function name>"
+        assert function_name == "python_tool"
+        assert Path(file_part).resolve() == script.resolve()
diff --git a/tests/server/test_user_config.py b/tests/server/test_user_config.py
index c211ba3a..0f2f8bb8 100644
--- a/tests/server/test_user_config.py
+++ b/tests/server/test_user_config.py
@@ -402,3 +402,36 @@ def test_load_without_resolving_refs(tmp_path):
         # Clean up the secret file
         if secret_file.exists():
             secret_file.unlink()
+
+
+def test_model_options_allowed(tmp_path, monkeypatch):
+    """Ensure model options field is accepted in user config.
+
+    Env vars go through ``monkeypatch`` so they are restored even if the test fails.
+    """
+    config_path = tmp_path / "config.yml"
+    config_content = """
+    mxcp: 1
+    models:
+      default: "gpt-4o"
+      models:
+        gpt-4o:
+          type: "openai"
+          api_key: "${OPENAI_API_KEY}"
+          options:
+            reasoning: "fast"
+    projects:
+      test_project:
+        profiles:
+          dev: {}
+    """
+    config_path.write_text(config_content)
+
+    monkeypatch.setenv("MXCP_CONFIG", str(config_path))
+    monkeypatch.setenv("OPENAI_API_KEY", "secret")
+
+    site_config = make_site_config("test_project", "dev")
+
+    user_config = load_user_config(site_config).model_dump(mode="python")
+    model_cfg = user_config["models"]["models"]["gpt-4o"]
+    assert model_cfg["options"]["reasoning"] == "fast"
diff --git a/uv.lock b/uv.lock
index 08700fd6..82c85ecb 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,5 +1,5 @@
 version = 1
-revision = 2
+revision = 3
 requires-python = ">=3.10"
 resolution-markers = [
     "python_full_version >= '3.12'",
@@ -173,6 +173,25 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" },
 ]
 
+[[package]]
+name = "anthropic"
+version = "0.75.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "anyio" },
+    { name = "distro" },
+    { name = "docstring-parser" },
+    { name = "httpx" },
+    { name = "jiter" },
+    { name = "pydantic" },
+    { name = "sniffio" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/04/1f/08e95f4b7e2d35205ae5dcbb4ae97e7d477fc521c275c02609e2931ece2d/anthropic-0.75.0.tar.gz", hash = "sha256:e8607422f4ab616db2ea5baacc215dd5f028da99ce2f022e33c7c535b29f3dfb", size = 439565, upload-time = "2025-11-24T20:41:45.28Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/60/1c/1cd02b7ae64302a6e06724bf80a96401d5313708651d277b1458504a1730/anthropic-0.75.0-py3-none-any.whl", hash = "sha256:ea8317271b6c15d80225a9f3c670152746e88805a7a61e14d4a374577164965b", size = 388164, upload-time = "2025-11-24T20:41:43.587Z" },
+]
+
 [[package]]
 name = "anyio"
 version = "4.9.0"
@@ -712,6 +731,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" },
 ]
 
+[[package]]
+name = "docstring-parser"
+version = "0.17.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/b2/9d/c3b43da9515bd270df0f80548d9944e389870713cc1fe2b8fb35fe2bcefd/docstring_parser-0.17.0.tar.gz", hash = "sha256:583de4a309722b3315439bb31d64ba3eebada841f2e2cee23b99df001434c912", size = 27442, upload-time = "2025-07-21T07:35:01.868Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/55/e2/2537ebcff11c1ee1ff17d8d0b6f4db75873e3b0fb32c2d4a2ee31ecb310a/docstring_parser-0.17.0-py3-none-any.whl", hash = "sha256:cf2569abd23dce8099b300f9b4fa8191e9582dda731fd533daf54c4551658708", size = 36896, upload-time = "2025-07-21T07:35:00.684Z" },
+]
+
 [[package]]
 name = "docutils"
 version = "0.21.2"
@@ -753,6 +781,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/30/79/4f544d73fcc0513b71296cb3ebb28a227d22e80dec27204977039b9fa875/duckdb-1.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:280fd663dacdd12bb3c3bf41f3e5b2e5b95e00b88120afabb8b8befa5f335c6f", size = 12336460, upload-time = "2025-10-07T10:37:12.154Z" },
 ]
 
+[[package]]
+name = "eval-type-backport"
+version = "0.3.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/51/23/079e39571d6dd8d90d7a369ecb55ad766efb6bae4e77389629e14458c280/eval_type_backport-0.3.0.tar.gz", hash = "sha256:1638210401e184ff17f877e9a2fa076b60b5838790f4532a21761cc2be67aea1", size = 9272, upload-time = "2025-11-13T20:56:50.845Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/19/d8/2a1c638d9e0aa7e269269a1a1bf423ddd94267f1a01bbe3ad03432b67dd4/eval_type_backport-0.3.0-py3-none-any.whl", hash = "sha256:975a10a0fe333c8b6260d7fdb637698c9a16c3a9e3b6eb943fee6a6f67a37fe8", size = 6061, upload-time = "2025-11-13T20:56:49.499Z" },
+]
+
 [[package]]
 name = "exceptiongroup"
 version = "1.3.0"
@@ -874,6 +911,20 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ee/45/b82e3c16be2182bff01179db177fe144d58b5dc787a7d4492c6ed8b9317f/frozenlist-1.7.0-py3-none-any.whl", hash = "sha256:9a5af342e34f7e97caf8c995864c7a396418ae2859cc6fdf1b1073020d516a7e", size = 13106, upload-time = "2025-06-09T23:02:34.204Z" },
 ]
 
+[[package]]
+name = "genai-prices"
+version = "0.0.47"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "eval-type-backport", marker = "python_full_version < '3.11'" },
+    { name = "httpx" },
+    { name = "pydantic" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b8/47/f25fb84fa40142699dc54ca294628d600625eb3d90fead103a606b4e999a/genai_prices-0.0.47.tar.gz", hash = "sha256:3b8c514f0ce5818b3944a371861586ed9bfe10d02598e62c350b5bd2916d03c2", size = 54501, upload-time = "2025-11-25T18:38:17.695Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/4e/84/d50c52d0eeadb9dbf7f2f86da9b6257e162b7c6a791f5b1009bae912c103/genai_prices-0.0.47-py3-none-any.whl", hash = "sha256:735e45950d2299276f2c00cd18075b77a124cd24ee58243f236ee29af3210594", size = 57000, upload-time = "2025-11-25T18:38:16.464Z" },
+]
+
 [[package]]
 name = "googleapis-common-protos"
 version = "1.70.0"
@@ -886,6 +937,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/86/f1/62a193f0227cf15a920390abe675f386dec35f7ae3ffe6da582d3ade42c7/googleapis_common_protos-1.70.0-py3-none-any.whl", hash = "sha256:b8bfcca8c25a2bb253e0e0b0adaf8c00773e5e6af6fd92397576680b807e0fd8", size = 294530, upload-time = "2025-04-14T10:17:01.271Z" },
 ]
 
+[[package]]
+name = "griffe"
+version = "1.15.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/0d/0c/3a471b6e31951dce2360477420d0a8d1e00dea6cf33b70f3e8c3ab6e28e1/griffe-1.15.0.tar.gz", hash = "sha256:7726e3afd6f298fbc3696e67958803e7ac843c1cfe59734b6251a40cdbfb5eea", size = 424112, upload-time = "2025-11-10T15:03:15.52Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/9c/83/3b1d03d36f224edded98e9affd0467630fc09d766c0e56fb1498cbb04a9b/griffe-1.15.0-py3-none-any.whl", hash = "sha256:6f6762661949411031f5fcda9593f586e6ce8340f0ba88921a0f2ef7a81eb9a3", size = 150705, upload-time = "2025-11-10T15:03:13.549Z" },
+]
+
 [[package]]
 name = "h11"
 version = "0.16.0"
@@ -968,11 +1031,11 @@ wheels = [
 
 [[package]]
 name = "httpx-sse"
-version = "0.4.1"
+version = "0.4.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/6e/fa/66bd985dd0b7c109a3bcb89272ee0bfb7e2b4d06309ad7b38ff866734b2a/httpx_sse-0.4.1.tar.gz", hash = "sha256:8f44d34414bc7b21bf3602713005c5df4917884f76072479b21f68befa4ea26e", size = 12998, upload-time = "2025-06-24T13:21:05.71Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/4c/60/8f4281fa9bbf3c8034fd54c0e7412e66edbab6bc74c4996bd616f8d0406e/httpx-sse-0.4.0.tar.gz", hash = "sha256:1e81a3a3070ce322add1d3529ed42eb5f70817f45ed6ec915ab753f961139721", size = 12624, upload-time = "2023-12-22T08:01:21.083Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/25/0a/6269e3473b09aed2dab8aa1a600c70f31f00ae1349bee30658f7e358a159/httpx_sse-0.4.1-py3-none-any.whl", hash = "sha256:cba42174344c3a5b06f255ce65b350880f962d99ead85e776f23c6618a377a37", size = 8054, upload-time = "2025-06-24T13:21:04.772Z" },
+    { url = "https://files.pythonhosted.org/packages/e1/9b/a181f281f65d776426002f330c31849b86b31fc9d848db62e16f03ff739f/httpx_sse-0.4.0-py3-none-any.whl", hash = "sha256:f329af6eae57eaa2bdfd962b42524764af68075ea87370a2de920af5341e318f", size = 7819, upload-time = "2023-12-22T08:01:19.89Z" },
 ]
 
 [[package]]
@@ -1098,6 +1161,103 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" },
 ]
 
+[[package]]
+name = "jiter"
+version = "0.12.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/45/9d/e0660989c1370e25848bb4c52d061c71837239738ad937e83edca174c273/jiter-0.12.0.tar.gz", hash = "sha256:64dfcd7d5c168b38d3f9f8bba7fc639edb3418abcc74f22fdbe6b8938293f30b", size = 168294, upload-time = "2025-11-09T20:49:23.302Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/3b/91/13cb9505f7be74a933f37da3af22e029f6ba64f5669416cb8b2774bc9682/jiter-0.12.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:e7acbaba9703d5de82a2c98ae6a0f59ab9770ab5af5fa35e43a303aee962cf65", size = 316652, upload-time = "2025-11-09T20:46:41.021Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/76/4e9185e5d9bb4e482cf6dec6410d5f78dfeb374cfcecbbe9888d07c52daa/jiter-0.12.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:364f1a7294c91281260364222f535bc427f56d4de1d8ffd718162d21fbbd602e", size = 319829, upload-time = "2025-11-09T20:46:43.281Z" },
+    { url = "https://files.pythonhosted.org/packages/86/af/727de50995d3a153138139f259baae2379d8cb0522c0c00419957bc478a6/jiter-0.12.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85ee4d25805d4fb23f0a5167a962ef8e002dbfb29c0989378488e32cf2744b62", size = 350568, upload-time = "2025-11-09T20:46:45.075Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/c1/d6e9f4b7a3d5ac63bcbdfddeb50b2dcfbdc512c86cffc008584fdc350233/jiter-0.12.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:796f466b7942107eb889c08433b6e31b9a7ed31daceaecf8af1be26fb26c0ca8", size = 369052, upload-time = "2025-11-09T20:46:46.818Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/be/00824cd530f30ed73fa8a4f9f3890a705519e31ccb9e929f1e22062e7c76/jiter-0.12.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:35506cb71f47dba416694e67af996bbdefb8e3608f1f78799c2e1f9058b01ceb", size = 481585, upload-time = "2025-11-09T20:46:48.319Z" },
+    { url = "https://files.pythonhosted.org/packages/74/b6/2ad7990dff9504d4b5052eef64aa9574bd03d722dc7edced97aad0d47be7/jiter-0.12.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:726c764a90c9218ec9e4f99a33d6bf5ec169163f2ca0fc21b654e88c2abc0abc", size = 380541, upload-time = "2025-11-09T20:46:49.643Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/c7/f3c26ecbc1adbf1db0d6bba99192143d8fe8504729d9594542ecc4445784/jiter-0.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa47810c5565274810b726b0dc86d18dce5fd17b190ebdc3890851d7b2a0e74", size = 364423, upload-time = "2025-11-09T20:46:51.731Z" },
+    { url = "https://files.pythonhosted.org/packages/18/51/eac547bf3a2d7f7e556927278e14c56a0604b8cddae75815d5739f65f81d/jiter-0.12.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f8ec0259d3f26c62aed4d73b198c53e316ae11f0f69c8fbe6682c6dcfa0fcce2", size = 389958, upload-time = "2025-11-09T20:46:53.432Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/1f/9ca592e67175f2db156cff035e0d817d6004e293ee0c1d73692d38fcb596/jiter-0.12.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:79307d74ea83465b0152fa23e5e297149506435535282f979f18b9033c0bb025", size = 522084, upload-time = "2025-11-09T20:46:54.848Z" },
+    { url = "https://files.pythonhosted.org/packages/83/ff/597d9cdc3028f28224f53e1a9d063628e28b7a5601433e3196edda578cdd/jiter-0.12.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cf6e6dd18927121fec86739f1a8906944703941d000f0639f3eb6281cc601dca", size = 513054, upload-time = "2025-11-09T20:46:56.487Z" },
+    { url = "https://files.pythonhosted.org/packages/24/6d/1970bce1351bd02e3afcc5f49e4f7ef3dabd7fb688f42be7e8091a5b809a/jiter-0.12.0-cp310-cp310-win32.whl", hash = "sha256:b6ae2aec8217327d872cbfb2c1694489057b9433afce447955763e6ab015b4c4", size = 206368, upload-time = "2025-11-09T20:46:58.638Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/6b/eb1eb505b2d86709b59ec06681a2b14a94d0941db091f044b9f0e16badc0/jiter-0.12.0-cp310-cp310-win_amd64.whl", hash = "sha256:c7f49ce90a71e44f7e1aa9e7ec415b9686bbc6a5961e57eab511015e6759bc11", size = 204847, upload-time = "2025-11-09T20:47:00.295Z" },
+    { url = "https://files.pythonhosted.org/packages/32/f9/eaca4633486b527ebe7e681c431f529b63fe2709e7c5242fc0f43f77ce63/jiter-0.12.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:d8f8a7e317190b2c2d60eb2e8aa835270b008139562d70fe732e1c0020ec53c9", size = 316435, upload-time = "2025-11-09T20:47:02.087Z" },
+    { url = "https://files.pythonhosted.org/packages/10/c1/40c9f7c22f5e6ff715f28113ebaba27ab85f9af2660ad6e1dd6425d14c19/jiter-0.12.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2218228a077e784c6c8f1a8e5d6b8cb1dea62ce25811c356364848554b2056cd", size = 320548, upload-time = "2025-11-09T20:47:03.409Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/1b/efbb68fe87e7711b00d2cfd1f26bb4bfc25a10539aefeaa7727329ffb9cb/jiter-0.12.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9354ccaa2982bf2188fd5f57f79f800ef622ec67beb8329903abf6b10da7d423", size = 351915, upload-time = "2025-11-09T20:47:05.171Z" },
+    { url = "https://files.pythonhosted.org/packages/15/2d/c06e659888c128ad1e838123d0638f0efad90cc30860cb5f74dd3f2fc0b3/jiter-0.12.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8f2607185ea89b4af9a604d4c7ec40e45d3ad03ee66998b031134bc510232bb7", size = 368966, upload-time = "2025-11-09T20:47:06.508Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/20/058db4ae5fb07cf6a4ab2e9b9294416f606d8e467fb74c2184b2a1eeacba/jiter-0.12.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3a585a5e42d25f2e71db5f10b171f5e5ea641d3aa44f7df745aa965606111cc2", size = 482047, upload-time = "2025-11-09T20:47:08.382Z" },
+    { url = "https://files.pythonhosted.org/packages/49/bb/dc2b1c122275e1de2eb12905015d61e8316b2f888bdaac34221c301495d6/jiter-0.12.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd9e21d34edff5a663c631f850edcb786719c960ce887a5661e9c828a53a95d9", size = 380835, upload-time = "2025-11-09T20:47:09.81Z" },
+    { url = "https://files.pythonhosted.org/packages/23/7d/38f9cd337575349de16da575ee57ddb2d5a64d425c9367f5ef9e4612e32e/jiter-0.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a612534770470686cd5431478dc5a1b660eceb410abade6b1b74e320ca98de6", size = 364587, upload-time = "2025-11-09T20:47:11.529Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/a3/b13e8e61e70f0bb06085099c4e2462647f53cc2ca97614f7fedcaa2bb9f3/jiter-0.12.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3985aea37d40a908f887b34d05111e0aae822943796ebf8338877fee2ab67725", size = 390492, upload-time = "2025-11-09T20:47:12.993Z" },
+    { url = "https://files.pythonhosted.org/packages/07/71/e0d11422ed027e21422f7bc1883c61deba2d9752b720538430c1deadfbca/jiter-0.12.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b1207af186495f48f72529f8d86671903c8c10127cac6381b11dddc4aaa52df6", size = 522046, upload-time = "2025-11-09T20:47:14.6Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/59/b968a9aa7102a8375dbbdfbd2aeebe563c7e5dddf0f47c9ef1588a97e224/jiter-0.12.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:ef2fb241de583934c9915a33120ecc06d94aa3381a134570f59eed784e87001e", size = 513392, upload-time = "2025-11-09T20:47:16.011Z" },
+    { url = "https://files.pythonhosted.org/packages/ca/e4/7df62002499080dbd61b505c5cb351aa09e9959d176cac2aa8da6f93b13b/jiter-0.12.0-cp311-cp311-win32.whl", hash = "sha256:453b6035672fecce8007465896a25b28a6b59cfe8fbc974b2563a92f5a92a67c", size = 206096, upload-time = "2025-11-09T20:47:17.344Z" },
+    { url = "https://files.pythonhosted.org/packages/bb/60/1032b30ae0572196b0de0e87dce3b6c26a1eff71aad5fe43dee3082d32e0/jiter-0.12.0-cp311-cp311-win_amd64.whl", hash = "sha256:ca264b9603973c2ad9435c71a8ec8b49f8f715ab5ba421c85a51cde9887e421f", size = 204899, upload-time = "2025-11-09T20:47:19.365Z" },
+    { url = "https://files.pythonhosted.org/packages/49/d5/c145e526fccdb834063fb45c071df78b0cc426bbaf6de38b0781f45d956f/jiter-0.12.0-cp311-cp311-win_arm64.whl", hash = "sha256:cb00ef392e7d684f2754598c02c409f376ddcef857aae796d559e6cacc2d78a5", size = 188070, upload-time = "2025-11-09T20:47:20.75Z" },
+    { url = "https://files.pythonhosted.org/packages/92/c9/5b9f7b4983f1b542c64e84165075335e8a236fa9e2ea03a0c79780062be8/jiter-0.12.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:305e061fa82f4680607a775b2e8e0bcb071cd2205ac38e6ef48c8dd5ebe1cf37", size = 314449, upload-time = "2025-11-09T20:47:22.999Z" },
+    { url = "https://files.pythonhosted.org/packages/98/6e/e8efa0e78de00db0aee82c0cf9e8b3f2027efd7f8a71f859d8f4be8e98ef/jiter-0.12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5c1860627048e302a528333c9307c818c547f214d8659b0705d2195e1a94b274", size = 319855, upload-time = "2025-11-09T20:47:24.779Z" },
+    { url = "https://files.pythonhosted.org/packages/20/26/894cd88e60b5d58af53bec5c6759d1292bd0b37a8b5f60f07abf7a63ae5f/jiter-0.12.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:df37577a4f8408f7e0ec3205d2a8f87672af8f17008358063a4d6425b6081ce3", size = 350171, upload-time = "2025-11-09T20:47:26.469Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/27/a7b818b9979ac31b3763d25f3653ec3a954044d5e9f5d87f2f247d679fd1/jiter-0.12.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:75fdd787356c1c13a4f40b43c2156276ef7a71eb487d98472476476d803fb2cf", size = 365590, upload-time = "2025-11-09T20:47:27.918Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/7e/e46195801a97673a83746170b17984aa8ac4a455746354516d02ca5541b4/jiter-0.12.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1eb5db8d9c65b112aacf14fcd0faae9913d07a8afea5ed06ccdd12b724e966a1", size = 479462, upload-time = "2025-11-09T20:47:29.654Z" },
+    { url = "https://files.pythonhosted.org/packages/ca/75/f833bfb009ab4bd11b1c9406d333e3b4357709ed0570bb48c7c06d78c7dd/jiter-0.12.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:73c568cc27c473f82480abc15d1301adf333a7ea4f2e813d6a2c7d8b6ba8d0df", size = 378983, upload-time = "2025-11-09T20:47:31.026Z" },
+    { url = "https://files.pythonhosted.org/packages/71/b3/7a69d77943cc837d30165643db753471aff5df39692d598da880a6e51c24/jiter-0.12.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4321e8a3d868919bcb1abb1db550d41f2b5b326f72df29e53b2df8b006eb9403", size = 361328, upload-time = "2025-11-09T20:47:33.286Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/ac/a78f90caf48d65ba70d8c6efc6f23150bc39dc3389d65bbec2a95c7bc628/jiter-0.12.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0a51bad79f8cc9cac2b4b705039f814049142e0050f30d91695a2d9a6611f126", size = 386740, upload-time = "2025-11-09T20:47:34.703Z" },
+    { url = "https://files.pythonhosted.org/packages/39/b6/5d31c2cc8e1b6a6bcf3c5721e4ca0a3633d1ab4754b09bc7084f6c4f5327/jiter-0.12.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:2a67b678f6a5f1dd6c36d642d7db83e456bc8b104788262aaefc11a22339f5a9", size = 520875, upload-time = "2025-11-09T20:47:36.058Z" },
+    { url = "https://files.pythonhosted.org/packages/30/b5/4df540fae4e9f68c54b8dab004bd8c943a752f0b00efd6e7d64aa3850339/jiter-0.12.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efe1a211fe1fd14762adea941e3cfd6c611a136e28da6c39272dbb7a1bbe6a86", size = 511457, upload-time = "2025-11-09T20:47:37.932Z" },
+    { url = "https://files.pythonhosted.org/packages/07/65/86b74010e450a1a77b2c1aabb91d4a91dd3cd5afce99f34d75fd1ac64b19/jiter-0.12.0-cp312-cp312-win32.whl", hash = "sha256:d779d97c834b4278276ec703dc3fc1735fca50af63eb7262f05bdb4e62203d44", size = 204546, upload-time = "2025-11-09T20:47:40.47Z" },
+    { url = "https://files.pythonhosted.org/packages/1c/c7/6659f537f9562d963488e3e55573498a442503ced01f7e169e96a6110383/jiter-0.12.0-cp312-cp312-win_amd64.whl", hash = "sha256:e8269062060212b373316fe69236096aaf4c49022d267c6736eebd66bbbc60bb", size = 205196, upload-time = "2025-11-09T20:47:41.794Z" },
+    { url = "https://files.pythonhosted.org/packages/21/f4/935304f5169edadfec7f9c01eacbce4c90bb9a82035ac1de1f3bd2d40be6/jiter-0.12.0-cp312-cp312-win_arm64.whl", hash = "sha256:06cb970936c65de926d648af0ed3d21857f026b1cf5525cb2947aa5e01e05789", size = 186100, upload-time = "2025-11-09T20:47:43.007Z" },
+    { url = "https://files.pythonhosted.org/packages/3d/a6/97209693b177716e22576ee1161674d1d58029eb178e01866a0422b69224/jiter-0.12.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:6cc49d5130a14b732e0612bc76ae8db3b49898732223ef8b7599aa8d9810683e", size = 313658, upload-time = "2025-11-09T20:47:44.424Z" },
+    { url = "https://files.pythonhosted.org/packages/06/4d/125c5c1537c7d8ee73ad3d530a442d6c619714b95027143f1b61c0b4dfe0/jiter-0.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:37f27a32ce36364d2fa4f7fdc507279db604d27d239ea2e044c8f148410defe1", size = 318605, upload-time = "2025-11-09T20:47:45.973Z" },
+    { url = "https://files.pythonhosted.org/packages/99/bf/a840b89847885064c41a5f52de6e312e91fa84a520848ee56c97e4fa0205/jiter-0.12.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bbc0944aa3d4b4773e348cda635252824a78f4ba44328e042ef1ff3f6080d1cf", size = 349803, upload-time = "2025-11-09T20:47:47.535Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/88/e63441c28e0db50e305ae23e19c1d8fae012d78ed55365da392c1f34b09c/jiter-0.12.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:da25c62d4ee1ffbacb97fac6dfe4dcd6759ebdc9015991e92a6eae5816287f44", size = 365120, upload-time = "2025-11-09T20:47:49.284Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/7c/49b02714af4343970eb8aca63396bc1c82fa01197dbb1e9b0d274b550d4e/jiter-0.12.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:048485c654b838140b007390b8182ba9774621103bd4d77c9c3f6f117474ba45", size = 479918, upload-time = "2025-11-09T20:47:50.807Z" },
+    { url = "https://files.pythonhosted.org/packages/69/ba/0a809817fdd5a1db80490b9150645f3aae16afad166960bcd562be194f3b/jiter-0.12.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:635e737fbb7315bef0037c19b88b799143d2d7d3507e61a76751025226b3ac87", size = 379008, upload-time = "2025-11-09T20:47:52.211Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/c3/c9fc0232e736c8877d9e6d83d6eeb0ba4e90c6c073835cc2e8f73fdeef51/jiter-0.12.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e017c417b1ebda911bd13b1e40612704b1f5420e30695112efdbed8a4b389ed", size = 361785, upload-time = "2025-11-09T20:47:53.512Z" },
+    { url = "https://files.pythonhosted.org/packages/96/61/61f69b7e442e97ca6cd53086ddc1cf59fb830549bc72c0a293713a60c525/jiter-0.12.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:89b0bfb8b2bf2351fba36bb211ef8bfceba73ef58e7f0c68fb67b5a2795ca2f9", size = 386108, upload-time = "2025-11-09T20:47:54.893Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/2e/76bb3332f28550c8f1eba3bf6e5efe211efda0ddbbaf24976bc7078d42a5/jiter-0.12.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:f5aa5427a629a824a543672778c9ce0c5e556550d1569bb6ea28a85015287626", size = 519937, upload-time = "2025-11-09T20:47:56.253Z" },
+    { url = "https://files.pythonhosted.org/packages/84/d6/fa96efa87dc8bff2094fb947f51f66368fa56d8d4fc9e77b25d7fbb23375/jiter-0.12.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ed53b3d6acbcb0fd0b90f20c7cb3b24c357fe82a3518934d4edfa8c6898e498c", size = 510853, upload-time = "2025-11-09T20:47:58.32Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/28/93f67fdb4d5904a708119a6ab58a8f1ec226ff10a94a282e0215402a8462/jiter-0.12.0-cp313-cp313-win32.whl", hash = "sha256:4747de73d6b8c78f2e253a2787930f4fffc68da7fa319739f57437f95963c4de", size = 204699, upload-time = "2025-11-09T20:47:59.686Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/1f/30b0eb087045a0abe2a5c9c0c0c8da110875a1d3be83afd4a9a4e548be3c/jiter-0.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:e25012eb0c456fcc13354255d0338cd5397cce26c77b2832b3c4e2e255ea5d9a", size = 204258, upload-time = "2025-11-09T20:48:01.01Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/f4/2b4daf99b96bce6fc47971890b14b2a36aef88d7beb9f057fafa032c6141/jiter-0.12.0-cp313-cp313-win_arm64.whl", hash = "sha256:c97b92c54fe6110138c872add030a1f99aea2401ddcdaa21edf74705a646dd60", size = 185503, upload-time = "2025-11-09T20:48:02.35Z" },
+    { url = "https://files.pythonhosted.org/packages/39/ca/67bb15a7061d6fe20b9b2a2fd783e296a1e0f93468252c093481a2f00efa/jiter-0.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:53839b35a38f56b8be26a7851a48b89bc47e5d88e900929df10ed93b95fea3d6", size = 317965, upload-time = "2025-11-09T20:48:03.783Z" },
+    { url = "https://files.pythonhosted.org/packages/18/af/1788031cd22e29c3b14bc6ca80b16a39a0b10e611367ffd480c06a259831/jiter-0.12.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94f669548e55c91ab47fef8bddd9c954dab1938644e715ea49d7e117015110a4", size = 345831, upload-time = "2025-11-09T20:48:05.55Z" },
+    { url = "https://files.pythonhosted.org/packages/05/17/710bf8472d1dff0d3caf4ced6031060091c1320f84ee7d5dcbed1f352417/jiter-0.12.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:351d54f2b09a41600ffea43d081522d792e81dcfb915f6d2d242744c1cc48beb", size = 361272, upload-time = "2025-11-09T20:48:06.951Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/f1/1dcc4618b59761fef92d10bcbb0b038b5160be653b003651566a185f1a5c/jiter-0.12.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2a5e90604620f94bf62264e7c2c038704d38217b7465b863896c6d7c902b06c7", size = 204604, upload-time = "2025-11-09T20:48:08.328Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/32/63cb1d9f1c5c6632a783c0052cde9ef7ba82688f7065e2f0d5f10a7e3edb/jiter-0.12.0-cp313-cp313t-win_arm64.whl", hash = "sha256:88ef757017e78d2860f96250f9393b7b577b06a956ad102c29c8237554380db3", size = 185628, upload-time = "2025-11-09T20:48:09.572Z" },
+    { url = "https://files.pythonhosted.org/packages/a8/99/45c9f0dbe4a1416b2b9a8a6d1236459540f43d7fb8883cff769a8db0612d/jiter-0.12.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:c46d927acd09c67a9fb1416df45c5a04c27e83aae969267e98fba35b74e99525", size = 312478, upload-time = "2025-11-09T20:48:10.898Z" },
+    { url = "https://files.pythonhosted.org/packages/4c/a7/54ae75613ba9e0f55fcb0bc5d1f807823b5167cc944e9333ff322e9f07dd/jiter-0.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:774ff60b27a84a85b27b88cd5583899c59940bcc126caca97eb2a9df6aa00c49", size = 318706, upload-time = "2025-11-09T20:48:12.266Z" },
+    { url = "https://files.pythonhosted.org/packages/59/31/2aa241ad2c10774baf6c37f8b8e1f39c07db358f1329f4eb40eba179c2a2/jiter-0.12.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5433fab222fb072237df3f637d01b81f040a07dcac1cb4a5c75c7aa9ed0bef1", size = 351894, upload-time = "2025-11-09T20:48:13.673Z" },
+    { url = "https://files.pythonhosted.org/packages/54/4f/0f2759522719133a9042781b18cc94e335b6d290f5e2d3e6899d6af933e3/jiter-0.12.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f8c593c6e71c07866ec6bfb790e202a833eeec885022296aff6b9e0b92d6a70e", size = 365714, upload-time = "2025-11-09T20:48:15.083Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/6f/806b895f476582c62a2f52c453151edd8a0fde5411b0497baaa41018e878/jiter-0.12.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:90d32894d4c6877a87ae00c6b915b609406819dce8bc0d4e962e4de2784e567e", size = 478989, upload-time = "2025-11-09T20:48:16.706Z" },
+    { url = "https://files.pythonhosted.org/packages/86/6c/012d894dc6e1033acd8db2b8346add33e413ec1c7c002598915278a37f79/jiter-0.12.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:798e46eed9eb10c3adbbacbd3bdb5ecd4cf7064e453d00dbef08802dae6937ff", size = 378615, upload-time = "2025-11-09T20:48:18.614Z" },
+    { url = "https://files.pythonhosted.org/packages/87/30/d718d599f6700163e28e2c71c0bbaf6dace692e7df2592fd793ac9276717/jiter-0.12.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b3f1368f0a6719ea80013a4eb90ba72e75d7ea67cfc7846db2ca504f3df0169a", size = 364745, upload-time = "2025-11-09T20:48:20.117Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/85/315b45ce4b6ddc7d7fceca24068543b02bdc8782942f4ee49d652e2cc89f/jiter-0.12.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:65f04a9d0b4406f7e51279710b27484af411896246200e461d80d3ba0caa901a", size = 386502, upload-time = "2025-11-09T20:48:21.543Z" },
+    { url = "https://files.pythonhosted.org/packages/74/0b/ce0434fb40c5b24b368fe81b17074d2840748b4952256bab451b72290a49/jiter-0.12.0-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:fd990541982a24281d12b67a335e44f117e4c6cbad3c3b75c7dea68bf4ce3a67", size = 519845, upload-time = "2025-11-09T20:48:22.964Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/a3/7a7a4488ba052767846b9c916d208b3ed114e3eb670ee984e4c565b9cf0d/jiter-0.12.0-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:b111b0e9152fa7df870ecaebb0bd30240d9f7fff1f2003bcb4ed0f519941820b", size = 510701, upload-time = "2025-11-09T20:48:24.483Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/16/052ffbf9d0467b70af24e30f91e0579e13ded0c17bb4a8eb2aed3cb60131/jiter-0.12.0-cp314-cp314-win32.whl", hash = "sha256:a78befb9cc0a45b5a5a0d537b06f8544c2ebb60d19d02c41ff15da28a9e22d42", size = 205029, upload-time = "2025-11-09T20:48:25.749Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/18/3cf1f3f0ccc789f76b9a754bdb7a6977e5d1d671ee97a9e14f7eb728d80e/jiter-0.12.0-cp314-cp314-win_amd64.whl", hash = "sha256:e1fe01c082f6aafbe5c8faf0ff074f38dfb911d53f07ec333ca03f8f6226debf", size = 204960, upload-time = "2025-11-09T20:48:27.415Z" },
+    { url = "https://files.pythonhosted.org/packages/02/68/736821e52ecfdeeb0f024b8ab01b5a229f6b9293bbdb444c27efade50b0f/jiter-0.12.0-cp314-cp314-win_arm64.whl", hash = "sha256:d72f3b5a432a4c546ea4bedc84cce0c3404874f1d1676260b9c7f048a9855451", size = 185529, upload-time = "2025-11-09T20:48:29.125Z" },
+    { url = "https://files.pythonhosted.org/packages/30/61/12ed8ee7a643cce29ac97c2281f9ce3956eb76b037e88d290f4ed0d41480/jiter-0.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:e6ded41aeba3603f9728ed2b6196e4df875348ab97b28fc8afff115ed42ba7a7", size = 318974, upload-time = "2025-11-09T20:48:30.87Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/c6/f3041ede6d0ed5e0e79ff0de4c8f14f401bbf196f2ef3971cdbe5fd08d1d/jiter-0.12.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a947920902420a6ada6ad51892082521978e9dd44a802663b001436e4b771684", size = 345932, upload-time = "2025-11-09T20:48:32.658Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/5d/4d94835889edd01ad0e2dbfc05f7bdfaed46292e7b504a6ac7839aa00edb/jiter-0.12.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:add5e227e0554d3a52cf390a7635edaffdf4f8fce4fdbcef3cc2055bb396a30c", size = 367243, upload-time = "2025-11-09T20:48:34.093Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/76/0051b0ac2816253a99d27baf3dda198663aff882fa6ea7deeb94046da24e/jiter-0.12.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f9b1cda8fcb736250d7e8711d4580ebf004a46771432be0ae4796944b5dfa5d", size = 479315, upload-time = "2025-11-09T20:48:35.507Z" },
+    { url = "https://files.pythonhosted.org/packages/70/ae/83f793acd68e5cb24e483f44f482a1a15601848b9b6f199dacb970098f77/jiter-0.12.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:deeb12a2223fe0135c7ff1356a143d57f95bbf1f4a66584f1fc74df21d86b993", size = 380714, upload-time = "2025-11-09T20:48:40.014Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/5e/4808a88338ad2c228b1126b93fcd8ba145e919e886fe910d578230dabe3b/jiter-0.12.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c596cc0f4cb574877550ce4ecd51f8037469146addd676d7c1a30ebe6391923f", size = 365168, upload-time = "2025-11-09T20:48:41.462Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/d4/04619a9e8095b42aef436b5aeb4c0282b4ff1b27d1db1508df9f5dc82750/jiter-0.12.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5ab4c823b216a4aeab3fdbf579c5843165756bd9ad87cc6b1c65919c4715f783", size = 387893, upload-time = "2025-11-09T20:48:42.921Z" },
+    { url = "https://files.pythonhosted.org/packages/17/ea/d3c7e62e4546fdc39197fa4a4315a563a89b95b6d54c0d25373842a59cbe/jiter-0.12.0-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:e427eee51149edf962203ff8db75a7514ab89be5cb623fb9cea1f20b54f1107b", size = 520828, upload-time = "2025-11-09T20:48:44.278Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/0b/c6d3562a03fd767e31cb119d9041ea7958c3c80cb3d753eafb19b3b18349/jiter-0.12.0-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:edb868841f84c111255ba5e80339d386d937ec1fdce419518ce1bd9370fac5b6", size = 511009, upload-time = "2025-11-09T20:48:45.726Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/51/2cb4468b3448a8385ebcd15059d325c9ce67df4e2758d133ab9442b19834/jiter-0.12.0-cp314-cp314t-win32.whl", hash = "sha256:8bbcfe2791dfdb7c5e48baf646d37a6a3dcb5a97a032017741dea9f817dca183", size = 205110, upload-time = "2025-11-09T20:48:47.033Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/c5/ae5ec83dec9c2d1af805fd5fe8f74ebded9c8670c5210ec7820ce0dbeb1e/jiter-0.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:2fa940963bf02e1d8226027ef461e36af472dea85d36054ff835aeed944dd873", size = 205223, upload-time = "2025-11-09T20:48:49.076Z" },
+    { url = "https://files.pythonhosted.org/packages/97/9a/3c5391907277f0e55195550cf3fa8e293ae9ee0c00fb402fec1e38c0c82f/jiter-0.12.0-cp314-cp314t-win_arm64.whl", hash = "sha256:506c9708dd29b27288f9f8f1140c3cb0e3d8ddb045956d7757b1fa0e0f39a473", size = 185564, upload-time = "2025-11-09T20:48:50.376Z" },
+    { url = "https://files.pythonhosted.org/packages/fe/54/5339ef1ecaa881c6948669956567a64d2670941925f245c434f494ffb0e5/jiter-0.12.0-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:4739a4657179ebf08f85914ce50332495811004cc1747852e8b2041ed2aab9b8", size = 311144, upload-time = "2025-11-09T20:49:10.503Z" },
+    { url = "https://files.pythonhosted.org/packages/27/74/3446c652bffbd5e81ab354e388b1b5fc1d20daac34ee0ed11ff096b1b01a/jiter-0.12.0-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:41da8def934bf7bec16cb24bd33c0ca62126d2d45d81d17b864bd5ad721393c3", size = 305877, upload-time = "2025-11-09T20:49:12.269Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/f4/ed76ef9043450f57aac2d4fbeb27175aa0eb9c38f833be6ef6379b3b9a86/jiter-0.12.0-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c44ee814f499c082e69872d426b624987dbc5943ab06e9bbaa4f81989fdb79e", size = 340419, upload-time = "2025-11-09T20:49:13.803Z" },
+    { url = "https://files.pythonhosted.org/packages/21/01/857d4608f5edb0664aa791a3d45702e1a5bcfff9934da74035e7b9803846/jiter-0.12.0-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd2097de91cf03eaa27b3cbdb969addf83f0179c6afc41bbc4513705e013c65d", size = 347212, upload-time = "2025-11-09T20:49:15.643Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/f5/12efb8ada5f5c9edc1d4555fe383c1fb2eac05ac5859258a72d61981d999/jiter-0.12.0-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:e8547883d7b96ef2e5fe22b88f8a4c8725a56e7f4abafff20fd5272d634c7ecb", size = 309974, upload-time = "2025-11-09T20:49:17.187Z" },
+    { url = "https://files.pythonhosted.org/packages/85/15/d6eb3b770f6a0d332675141ab3962fd4a7c270ede3515d9f3583e1d28276/jiter-0.12.0-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:89163163c0934854a668ed783a2546a0617f71706a2551a4a0666d91ab365d6b", size = 304233, upload-time = "2025-11-09T20:49:18.734Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/3e/e7e06743294eea2cf02ced6aa0ff2ad237367394e37a0e2b4a1108c67a36/jiter-0.12.0-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d96b264ab7d34bbb2312dedc47ce07cd53f06835eacbc16dde3761f47c3a9e7f", size = 338537, upload-time = "2025-11-09T20:49:20.317Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/9c/6753e6522b8d0ef07d3a3d239426669e984fb0eba15a315cdbc1253904e4/jiter-0.12.0-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c24e864cb30ab82311c6425655b0cdab0a98c5d973b065c66a3f020740c2324c", size = 346110, upload-time = "2025-11-09T20:49:21.817Z" },
+]
+
 [[package]]
 name = "jsonschema"
 version = "4.24.0"
@@ -1161,6 +1321,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/a1/30/9ec597c962c5249ebd5c580386e4b5f2884cd943af42634291ee3b406415/leather-0.4.0-py2.py3-none-any.whl", hash = "sha256:18290bc93749ae39039af5e31e871fcfad74d26c4c3ea28ea4f681f4571b3a2b", size = 30256, upload-time = "2024-02-23T22:03:34.75Z" },
 ]
 
+[[package]]
+name = "logfire-api"
+version = "4.15.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e2/e6/3895c0ebf9f6a7acee04a816a569ca871c3d3048fdbd6b2a041f980abc54/logfire_api-4.15.1.tar.gz", hash = "sha256:3fbafc5593f4a16a038a3d23c67a7a7ee9da8be9e3b148fa73069d32e1ed4e8e", size = 57614, upload-time = "2025-11-20T15:52:17.019Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f3/2b/851e78a60b85e8e8e8c6ebb9928f8e883df0340a93e34960ed9f0a41fa82/logfire_api-4.15.1-py3-none-any.whl", hash = "sha256:a88b5c4b6e4acbf6f35a3e992a63f271cf2797aefd21e1cfc93d52b21ade65f6", size = 95031, upload-time = "2025-11-20T15:52:14.433Z" },
+]
+
 [[package]]
 name = "makefun"
 version = "1.16.0"
@@ -1470,6 +1639,7 @@ dependencies = [
     { name = "pandas" },
     { name = "posthog" },
     { name = "psutil" },
+    { name = "pydantic-ai-slim", extra = ["anthropic", "openai"] },
     { name = "pyyaml" },
     { name = "starlette" },
     { name = "uvicorn", extra = ["standard"] },
@@ -1557,6 +1727,7 @@ requires-dist = [
     { name = "pandas-stubs", marker = "extra == 'dev'" },
     { name = "posthog", specifier = ">=3.0.0" },
     { name = "psutil", specifier = ">=5.9.0" },
+    { name = "pydantic-ai-slim", extras = ["anthropic", "openai"], specifier = ">=1.25.0" },
     { name = "pytest", marker = "extra == 'all'" },
     { name = "pytest", marker = "extra == 'dev'" },
     { name = "pytest-asyncio", marker = "extra == 'all'" },
@@ -1849,6 +2020,25 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c7/1d/91becb8fa0e417c172a5721c06dc403ad2abbbc766e9a8bdeff46bdea6ba/onepassword_sdk-0.3.1-cp313-cp313-win_amd64.whl", hash = "sha256:4a77fa3fdbad03738faf2703387256f53c2c86329bcd9f19ee5725a2075db77b", size = 5506878, upload-time = "2025-06-11T17:24:33.339Z" },
 ]
 
+[[package]]
+name = "openai"
+version = "2.8.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "anyio" },
+    { name = "distro" },
+    { name = "httpx" },
+    { name = "jiter" },
+    { name = "pydantic" },
+    { name = "sniffio" },
+    { name = "tqdm" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/d5/e4/42591e356f1d53c568418dc7e30dcda7be31dd5a4d570bca22acb0525862/openai-2.8.1.tar.gz", hash = "sha256:cb1b79eef6e809f6da326a7ef6038719e35aa944c42d081807bfa1be8060f15f", size = 602490, upload-time = "2025-11-17T22:39:59.549Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/55/4f/dbc0c124c40cb390508a82770fb9f6e3ed162560181a85089191a851c59a/openai-2.8.1-py3-none-any.whl", hash = "sha256:c6c3b5a04994734386e8dad3c00a393f56d3b68a27cd2e8acae91a59e4122463", size = 1022688, upload-time = "2025-11-17T22:39:57.675Z" },
+]
+
 [[package]]
 name = "opentelemetry-api"
 version = "1.36.0"
@@ -2277,6 +2467,32 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/6a/c0/ec2b1c8712ca690e5d61979dee872603e92b8a32f94cc1b72d53beab008a/pydantic-2.11.7-py3-none-any.whl", hash = "sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b", size = 444782, upload-time = "2025-06-14T08:33:14.905Z" },
 ]
 
+[[package]]
+name = "pydantic-ai-slim"
+version = "1.25.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "exceptiongroup", marker = "python_full_version < '3.11'" },
+    { name = "genai-prices" },
+    { name = "griffe" },
+    { name = "httpx" },
+    { name = "opentelemetry-api" },
+    { name = "pydantic" },
+    { name = "pydantic-graph" },
+    { name = "typing-inspection" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/25/b0/7e3de325bf45d7fbf798ec7c74894f18a6fb4bebb8f250936dd26015d4cf/pydantic_ai_slim-1.25.0-py3-none-any.whl", hash = "sha256:87fd01472939862ffba92dc7f93ae2cb47d6a417c0278846dd24ea7f5164f9a8", size = 420416, upload-time = "2025-11-28T05:04:33.012Z" },
+]
+
+[package.optional-dependencies]
+anthropic = [
+    { name = "anthropic" },
+]
+openai = [
+    { name = "openai" },
+]
+
 [[package]]
 name = "pydantic-core"
 version = "2.33.2"
@@ -2364,6 +2580,20 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/32/56/8a7ca5d2cd2cda1d245d34b1c9a942920a718082ae8e54e5f3e5a58b7add/pydantic_core-2.33.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1", size = 2066757, upload-time = "2025-04-23T18:33:30.645Z" },
 ]
 
+[[package]]
+name = "pydantic-graph"
+version = "1.25.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "httpx" },
+    { name = "logfire-api" },
+    { name = "pydantic" },
+    { name = "typing-inspection" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/84/3e/c6f5d0a1a22e8ad968c7fb9ea443a1310f7878a6d0a7682526ee210684c5/pydantic_graph-1.25.0-py3-none-any.whl", hash = "sha256:30f0890729cae49f6967297815d4e226557001c650ffe1500fe7ea517561bc2b", size = 72262, upload-time = "2025-11-28T05:04:36.83Z" },
+]
+
 [[package]]
 name = "pydantic-settings"
 version = "2.10.1"
@@ -2951,6 +3181,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/6e/c2/61d3e0f47e2b74ef40a68b9e6ad5984f6241a942f7cd3bbfbdbd03861ea9/tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc", size = 14257, upload-time = "2024-11-27T22:38:35.385Z" },
 ]
 
+[[package]]
+name = "tqdm"
+version = "4.67.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737, upload-time = "2024-11-24T20:12:22.481Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" },
+]
+
 [[package]]
 name = "twine"
 version = "6.1.0"